naufalso · November 26, 2024 02:08
diff --git a/push_to_hub.py b/push_to_hub.py
 import os
 import re
 import base64
 import argparse
 from typing import List, Dict, Any
 from datasets import Dataset

 def embed_images_in_markdown(markdown_text: str, base_path: str = ".") -> str:
    """
    Embed local images referenced in markdown text as base64 data URIs.

    Every ``![alt](target)`` whose target, joined onto ``base_path``, is a
    regular file is rewritten to ``![alt](data:<mime>;base64,<payload>)``.
    The MIME type is guessed from the file extension; when the extension is
    unknown or does not map to an ``image/*`` type, ``image/png`` is used.
    Targets that do not resolve to a regular file (missing files,
    directories, remote URLs, ...) are left exactly as written.

    Args:
        markdown_text (str): The markdown text containing image links.
        base_path (str): The base path to resolve relative image paths.

    Returns:
        str: The markdown text with images embedded as base64 strings.
    """
    # Imported here so the function does not depend on a file-level import.
    import mimetypes

    def encode_image_to_base64(image_path: str) -> str:
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("ascii")

    def replace_image(match: re.Match) -> str:
        alt_text, image_path = match.group(1), match.group(2)
        full_image_path = os.path.join(base_path, image_path)

        # isfile() rather than exists(): a target that resolves to a
        # directory would otherwise make open() raise.
        if not os.path.isfile(full_image_path):
            return match.group(0)  # keep the original markdown untouched

        # Guess from the file name only, so the directory part of the path
        # cannot be mistaken for a URL scheme.
        mime_type, _ = mimetypes.guess_type(os.path.basename(full_image_path))
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"

        encoded_image = encode_image_to_base64(full_image_path)
        return f"![{alt_text}](data:{mime_type};base64,{encoded_image})"

    # Regex to find markdown image syntax ![alt text](image_path)
    image_regex = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
    return image_regex.sub(replace_image, markdown_text)

 def process_markdown_files(base_dir: str, include_emb_image: bool) -> List[Dict[str, Any]]:
    """
    Collect every ``index.md`` under ``base_dir`` together with its metadata.

    The tree is walked top-down with sub-directories visited in sorted
    order, so the result order is deterministic. Files are decoded as UTF-8.

    Args:
        base_dir (str): The base directory to search for markdown files.
        include_emb_image (bool): Whether to include embedded images in the output.

    Returns:
        List[Dict[str, Any]]: One dictionary per ``index.md`` with the keys
        ``title`` (name of the containing directory), ``path``, ``text``,
        ``total_chars``, ``file_size_mb`` and, when ``include_emb_image`` is
        true, ``text_w_embed_image`` (the text with local images inlined).
    """
    md_files = []
    for root, dirs, files in os.walk(base_dir):
        # Sorting in place steers os.walk's descent, making the output order
        # independent of the filesystem's directory listing order.
        dirs.sort()
        if "index.md" not in files:
            continue

        path = os.path.join(root, "index.md")
        # Explicit encoding: the default depends on the platform locale.
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        data = {
            # abspath() so "." or a trailing slash still yields the real
            # directory name instead of "." or an empty string.
            "title": os.path.basename(os.path.abspath(root)),
            "path": path,
            "text": text,
            "total_chars": len(text),
            "file_size_mb": os.path.getsize(path) / 1024 / 1024,
        }

        if include_emb_image:
            data['text_w_embed_image'] = embed_images_in_markdown(text, base_path=root)

        md_files.append(data)
    return md_files

 def main(argv=None):
    """
    Command-line entry point: collect ``index.md`` files and push them to the Hub.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads the process command line, so a bare ``main()``
            call behaves as before.

    Exits through ``parser.error`` when no ``index.md`` is found, so an
    empty dataset is never pushed.
    """
    parser = argparse.ArgumentParser(description="Process markdown files and push to Hugging Face Hub.")
    parser.add_argument("--base_dir", type=str, default=".", help="Base directory to search for markdown files.")
    parser.add_argument("--dataset_name", type=str, required=True, help="Name of the dataset to push to Hugging Face Hub.")
    parser.add_argument("--private", action="store_true", help="Set the dataset visibility to private.")
    parser.add_argument("--include_emb_image", action="store_true", help="Include embedded images in the output.")
    args = parser.parse_args(argv)

    md_files = process_markdown_files(args.base_dir, args.include_emb_image)
    if not md_files:
        # Most likely a wrong --base_dir; fail loudly instead of uploading nothing.
        parser.error(f"no index.md files found under {args.base_dir!r}")

    dataset = Dataset.from_list(md_files)
    dataset.push_to_hub(args.dataset_name, private=args.private)

 # Run the command-line interface only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	import os
	import re
	import base64
	import argparse
	from typing import List, Dict, Any
	from datasets import Dataset

	def embed_images_in_markdown(markdown_text: str, base_path: str = ".") -> str:
	    """
	    Embed local images referenced in markdown text as base64 data URIs.

	    Every ``![alt](target)`` whose target, joined onto ``base_path``, is a
	    regular file is rewritten to ``![alt](data:<mime>;base64,<payload>)``.
	    The MIME type is guessed from the file extension; when the extension is
	    unknown or does not map to an ``image/*`` type, ``image/png`` is used.
	    Targets that do not resolve to a regular file (missing files,
	    directories, remote URLs, ...) are left exactly as written.

	    Args:
	        markdown_text (str): The markdown text containing image links.
	        base_path (str): The base path to resolve relative image paths.

	    Returns:
	        str: The markdown text with images embedded as base64 strings.
	    """
	    # Imported here so the function does not depend on a file-level import.
	    import mimetypes

	    def encode_image_to_base64(image_path: str) -> str:
	        with open(image_path, "rb") as image_file:
	            return base64.b64encode(image_file.read()).decode("ascii")

	    def replace_image(match: re.Match) -> str:
	        alt_text, image_path = match.group(1), match.group(2)
	        full_image_path = os.path.join(base_path, image_path)

	        # isfile() rather than exists(): a target that resolves to a
	        # directory would otherwise make open() raise.
	        if not os.path.isfile(full_image_path):
	            return match.group(0)  # keep the original markdown untouched

	        # Guess from the file name only, so the directory part of the path
	        # cannot be mistaken for a URL scheme.
	        mime_type, _ = mimetypes.guess_type(os.path.basename(full_image_path))
	        if mime_type is None or not mime_type.startswith("image/"):
	            mime_type = "image/png"

	        encoded_image = encode_image_to_base64(full_image_path)
	        return f"![{alt_text}](data:{mime_type};base64,{encoded_image})"

	    # Regex to find markdown image syntax ![alt text](image_path)
	    image_regex = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
	    return image_regex.sub(replace_image, markdown_text)

	def process_markdown_files(base_dir: str, include_emb_image: bool) -> List[Dict[str, Any]]:
	    """
	    Collect every ``index.md`` under ``base_dir`` together with its metadata.

	    The tree is walked top-down with sub-directories visited in sorted
	    order, so the result order is deterministic. Files are decoded as UTF-8.

	    Args:
	        base_dir (str): The base directory to search for markdown files.
	        include_emb_image (bool): Whether to include embedded images in the output.

	    Returns:
	        List[Dict[str, Any]]: One dictionary per ``index.md`` with the keys
	        ``title`` (name of the containing directory), ``path``, ``text``,
	        ``total_chars``, ``file_size_mb`` and, when ``include_emb_image`` is
	        true, ``text_w_embed_image`` (the text with local images inlined).
	    """
	    md_files = []
	    for root, dirs, files in os.walk(base_dir):
	        # Sorting in place steers os.walk's descent, making the output order
	        # independent of the filesystem's directory listing order.
	        dirs.sort()
	        if "index.md" not in files:
	            continue

	        path = os.path.join(root, "index.md")
	        # Explicit encoding: the default depends on the platform locale.
	        with open(path, "r", encoding="utf-8") as f:
	            text = f.read()

	        data = {
	            # abspath() so "." or a trailing slash still yields the real
	            # directory name instead of "." or an empty string.
	            "title": os.path.basename(os.path.abspath(root)),
	            "path": path,
	            "text": text,
	            "total_chars": len(text),
	            "file_size_mb": os.path.getsize(path) / 1024 / 1024,
	        }

	        if include_emb_image:
	            data['text_w_embed_image'] = embed_images_in_markdown(text, base_path=root)

	        md_files.append(data)
	    return md_files

	def main(argv=None):
	    """
	    Command-line entry point: collect ``index.md`` files and push them to the Hub.

	    Args:
	        argv: Optional list of argument strings. When ``None`` (the default),
	            argparse reads the process command line, so a bare ``main()``
	            call behaves as before.

	    Exits through ``parser.error`` when no ``index.md`` is found, so an
	    empty dataset is never pushed.
	    """
	    parser = argparse.ArgumentParser(description="Process markdown files and push to Hugging Face Hub.")
	    parser.add_argument("--base_dir", type=str, default=".", help="Base directory to search for markdown files.")
	    parser.add_argument("--dataset_name", type=str, required=True, help="Name of the dataset to push to Hugging Face Hub.")
	    parser.add_argument("--private", action="store_true", help="Set the dataset visibility to private.")
	    parser.add_argument("--include_emb_image", action="store_true", help="Include embedded images in the output.")
	    args = parser.parse_args(argv)

	    md_files = process_markdown_files(args.base_dir, args.include_emb_image)
	    if not md_files:
	        # Most likely a wrong --base_dir; fail loudly instead of uploading nothing.
	        parser.error(f"no index.md files found under {args.base_dir!r}")

	    dataset = Dataset.from_list(md_files)
	    dataset.push_to_hub(args.dataset_name, private=args.private)

	# Run the command-line interface only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()