Skip to content

Instantly share code, notes, and snippets.

@naufalso
Last active November 26, 2024 02:08
Show Gist options
  • Save naufalso/94f88a7c9d8717629a154f85bfd06841 to your computer and use it in GitHub Desktop.
Save naufalso/94f88a7c9d8717629a154f85bfd06841 to your computer and use it in GitHub Desktop.
Push markdown to GitHub
import os
import re
import base64
import argparse
from typing import List, Dict, Any
from datasets import Dataset
def embed_images_in_markdown(markdown_text: str, base_path: str = ".") -> str:
    """
    Embed locally stored images in markdown text as base64 data URIs.

    Every ``![alt](target)`` occurrence whose target, joined onto
    ``base_path``, is an existing regular file is rewritten to
    ``![alt](data:<mime>;base64,<payload>)``. Any other occurrence (missing
    file, directory, remote URL, ...) is returned unchanged.

    Args:
        markdown_text (str): The markdown text containing image links.
        base_path (str): The base path to resolve relative image paths.

    Returns:
        str: The markdown text with images embedded as base64 strings.
    """
    # Imported here so the function carries its own dependency.
    import mimetypes

    def encode_image_to_base64(image_path: str) -> str:
        """Return the raw bytes of ``image_path`` encoded as base64 text."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def replace_image(match: re.Match) -> str:
        """Build the replacement for one markdown image occurrence."""
        alt_text, image_path = match.group(1), match.group(2)
        full_image_path = os.path.join(base_path, image_path)

        # isfile rather than exists: a directory target cannot be read as an
        # image and must be left as it was written.
        if not os.path.isfile(full_image_path):
            return match.group(0)  # return the original markdown if image not found

        # Label the payload with the real image type instead of always
        # claiming PNG; keep PNG as the fallback for unrecognised extensions.
        mime_type, _ = mimetypes.guess_type(full_image_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"

        encoded_image = encode_image_to_base64(full_image_path)
        return f"![{alt_text}](data:{mime_type};base64,{encoded_image})"

    # Regex to find markdown image syntax ![alt text](image_path)
    image_regex = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
    return image_regex.sub(replace_image, markdown_text)
def process_markdown_files(base_dir: str, include_emb_image: bool) -> List[Dict[str, Any]]:
    """
    Collect every ``index.md`` below ``base_dir`` together with its metadata.

    Each returned dictionary holds the keys ``title`` (name of the directory
    containing the file), ``path``, ``text``, ``total_chars`` and
    ``file_size_mb``. When ``include_emb_image`` is true it additionally holds
    ``text_w_embed_image``, the text with local images inlined as base64.

    Args:
        base_dir (str): The base directory to search for markdown files.
        include_emb_image (bool): Whether to include embedded images in the output.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing metadata and processed text.
    """
    md_files = []
    for root, dirs, files in os.walk(base_dir):
        # Sorting in place makes os.walk descend in a reproducible order
        # instead of whatever order the filesystem happens to report.
        dirs.sort()
        if "index.md" not in files:
            continue

        md_path = os.path.join(root, "index.md")
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default text encoding.
        with open(md_path, "r", encoding="utf-8") as f:
            text = f.read()

        data = {
            # abspath first: a root of "." or one ending in a path separator
            # would otherwise give "." or an empty title.
            "title": os.path.basename(os.path.abspath(root)),
            "path": md_path,
            "text": text,
            "total_chars": len(text),
            "file_size_mb": os.path.getsize(md_path) / 1024 / 1024,
        }
        if include_emb_image:
            # Image links are resolved relative to the markdown file's directory.
            data["text_w_embed_image"] = embed_images_in_markdown(text, base_path=root)
        md_files.append(data)
    return md_files
def main():
    """Command-line entry point.

    Reads the options from the command line, gathers the markdown records
    with ``process_markdown_files`` and pushes them as a dataset under the
    name given by ``--dataset_name``.
    """
    cli = argparse.ArgumentParser(
        description="Process markdown files and push to Hugging Face Hub."
    )
    cli.add_argument(
        "--base_dir",
        type=str,
        default=".",
        help="Base directory to search for markdown files.",
    )
    cli.add_argument(
        "--dataset_name",
        type=str,
        required=True,
        help="Name of the dataset to push to Hugging Face Hub.",
    )
    cli.add_argument(
        "--private",
        action="store_true",
        help="Set the dataset visibility to private.",
    )
    cli.add_argument(
        "--include_emb_image",
        action="store_true",
        help="Include embedded images in the output.",
    )
    options = cli.parse_args()

    records = process_markdown_files(options.base_dir, options.include_emb_image)
    Dataset.from_list(records).push_to_hub(
        options.dataset_name, private=options.private
    )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment