Last active
November 26, 2024 02:08
-
-
Save naufalso/94f88a7c9d8717629a154f85bfd06841 to your computer and use it in GitHub Desktop.
Push markdown to github
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import base64 | |
import argparse | |
from typing import List, Dict, Any | |
from datasets import Dataset | |
def embed_images_in_markdown(markdown_text: str, base_path: str = ".") -> str:
    """
    Embed local images referenced in markdown text as base64 data URIs.

    Every ``![alt](path)`` reference whose path, joined onto ``base_path``,
    points at an existing file is rewritten to
    ``![alt](data:<mime>;base64,<payload>)``. References that do not resolve
    to a regular file (for example a missing file or a directory) are left
    exactly as they were.

    Args:
        markdown_text (str): The markdown text containing image links.
        base_path (str): The base path to resolve relative image paths.

    Returns:
        str: The markdown text with resolvable images embedded as base64 strings.
    """
    # Imported locally so the block does not depend on a new file-level import.
    import mimetypes

    def encode_image_to_base64(image_path: str) -> str:
        """Return the content of ``image_path`` as a base64 ASCII string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def replace_image(match: re.Match) -> str:
        """Build the replacement for one ``![alt](path)`` match."""
        alt_text, image_path = match.group(1), match.group(2)
        full_image_path = os.path.join(base_path, image_path)

        # isfile rather than exists: a directory exists but cannot be opened
        # for reading, and would otherwise abort the whole substitution.
        if not os.path.isfile(full_image_path):
            return match.group(0)  # keep the original markdown untouched

        # Derive the media type from the file extension; fall back to a
        # generic binary type when the extension is unknown.
        mime_type, _ = mimetypes.guess_type(full_image_path)
        if mime_type is None:
            mime_type = "application/octet-stream"

        encoded_image = encode_image_to_base64(full_image_path)
        return f"![{alt_text}](data:{mime_type};base64,{encoded_image})"

    # Regex to find markdown image syntax: ![alt text](image path)
    image_regex = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
    return image_regex.sub(replace_image, markdown_text)
def process_markdown_files(base_dir: str, include_emb_image: bool) -> List[Dict[str, Any]]:
    """
    Collect every ``index.md`` under a directory tree together with metadata.

    The tree rooted at ``base_dir`` is walked recursively. For each directory
    that contains a file named exactly ``index.md`` one record is produced
    with the keys ``title`` (name of the containing directory), ``path``,
    ``text``, ``total_chars`` and ``file_size_mb``. When ``include_emb_image``
    is true the record also carries ``text_w_embed_image``, the text after
    passing it through ``embed_images_in_markdown`` with the containing
    directory as base path.

    Args:
        base_dir (str): The base directory to search for markdown files.
        include_emb_image (bool): Whether to include embedded images in the output.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing metadata and processed text.
    """
    md_files = []
    for root, dirs, files in os.walk(base_dir):
        # Sorting in place steers os.walk's descent, so records come out in
        # the same order on every filesystem.
        dirs.sort()

        if "index.md" not in files:
            continue

        md_path = os.path.join(root, "index.md")
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale default, which mis-reads non-ASCII markdown on some systems.
        with open(md_path, "r", encoding="utf-8") as f:
            text = f.read()

        data = {
            "title": os.path.basename(root),
            "path": md_path,
            "text": text,
            "total_chars": len(text),
            # bytes -> mebibytes
            "file_size_mb": os.path.getsize(md_path) / 1024 / 1024,
        }
        if include_emb_image:
            data["text_w_embed_image"] = embed_images_in_markdown(text, base_path=root)
        md_files.append(data)
    return md_files
def main():
    """
    Command-line entry point.

    Parses the command-line flags, gathers the ``index.md`` records found
    under ``--base_dir`` via ``process_markdown_files``, turns them into a
    ``Dataset`` and pushes it to the Hub under ``--dataset_name``.
    """
    cli = argparse.ArgumentParser(description="Process markdown files and push to Hugging Face Hub.")

    # (flag, add_argument keyword arguments), registered in declaration order.
    flag_specs = (
        ("--base_dir", {
            "type": str,
            "default": ".",
            "help": "Base directory to search for markdown files.",
        }),
        ("--dataset_name", {
            "type": str,
            "required": True,
            "help": "Name of the dataset to push to Hugging Face Hub.",
        }),
        ("--private", {
            "action": "store_true",
            "help": "Set the dataset visibility to private.",
        }),
        ("--include_emb_image", {
            "action": "store_true",
            "help": "Include embedded images in the output.",
        }),
    )
    for flag, spec in flag_specs:
        cli.add_argument(flag, **spec)
    options = cli.parse_args()

    records = process_markdown_files(options.base_dir, options.include_emb_image)
    hub_dataset = Dataset.from_list(records)
    hub_dataset.push_to_hub(options.dataset_name, private=options.private)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment