Created February 13, 2025 07:07
Describe images inside markdown with VLM
#!/usr/bin/env python3
"""
Markdown Image Descriptor Script

This script processes a markdown file, extracts images, encodes them in base64,
and sends them with contextual text to a Visual Language Model (VLM) for description extraction.
The image markdown references are then replaced with the structured descriptions provided by the model.

If an image is deemed non-important (e.g., logos), it is replaced with a markdown comment.
Additionally, if an image contains text information (e.g., terminal screenshots, tables, code snippets),
the VLM is instructed to extract and return only the text content using markdown formatting.
"""
import os
import re
import base64
import logging

import requests
from tqdm import tqdm
from openai import OpenAI  # Ensure you have the appropriate client/library installed.
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
def load_and_chunk_markdown(file_path: str, text_splitter: MarkdownHeaderTextSplitter):
    """
    Loads a markdown file and splits it into chunks using the provided text splitter.

    Parameters:
        file_path (str): Path to the markdown file.
        text_splitter (MarkdownHeaderTextSplitter): A text splitter instance.

    Returns:
        tuple: (full_content, chunks)
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except Exception as e:
        logging.error(f"Error reading file {file_path}: {e}")
        raise

    chunks = text_splitter.split_text(content)
    return content, chunks
def encode_image(image_path: str) -> str:
    """
    Encodes an image to a base64 string.

    If the image_path is a URL, the image will be downloaded; otherwise, it is read
    from the local filesystem.

    Parameters:
        image_path (str): The URL or local path to the image file.

    Returns:
        str: A base64-encoded string formatted as "data:image/{image_type};base64,{encoded_data}".
    """
    try:
        if image_path.startswith("http://") or image_path.startswith("https://"):
            response = requests.get(image_path, timeout=30)
            response.raise_for_status()
            image_data = response.content
            # Extract image type from URL, handling possible query strings
            image_type = image_path.split(".")[-1].split("?")[0]
        else:
            with open(image_path, "rb") as image_file:
                image_data = image_file.read()
            image_type = image_path.split(".")[-1]
    except Exception as e:
        logging.error(f"Failed to load image from {image_path}: {e}")
        raise

    # Normalize the common "jpg" extension to the registered "jpeg" MIME subtype.
    if image_type.lower() == "jpg":
        image_type = "jpeg"

    base64_str = base64.b64encode(image_data).decode("utf-8")
    return f"data:image/{image_type};base64,{base64_str}"
def is_non_important_image(alt_text: str, link: str) -> bool:
    """
    Determines if an image is non-important based on its alt text or filename.

    Parameters:
        alt_text (str): The alt text from the markdown image.
        link (str): The image URL or path.

    Returns:
        bool: True if the image is considered non-important, False otherwise.
    """
    non_important_keywords = ["logo", "favicon", "icon"]
    alt_text_lower = alt_text.lower() if alt_text else ""
    filename_lower = os.path.basename(link).lower()
    return any(keyword in alt_text_lower or keyword in filename_lower for keyword in non_important_keywords)
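
# Example (illustrative): is_non_important_image("Company logo", "assets/header.png")
# returns True because "logo" appears in the alt text, even though the filename
# itself contains no keyword.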
def find_image_in_chunk(chunk_text: str, base_path: str) -> dict:
    """
    Extracts image references from the provided markdown text chunk.

    Searches for image markdown syntax (e.g., ``![alt text](image.png)``) and,
    for each found image, returns a dictionary with:
        - "alt_text": the alternate text for the image.
        - "prompt": the chunk text with the image reference replaced by "[IMAGE]".
        - "original_ref": the original markdown image reference.
        - "image_base64": the base64 encoded string of the image (local paths are resolved against base_path).

    Parameters:
        chunk_text (str): Markdown text containing potential image references.
        base_path (str): Base directory for resolving relative image file paths.

    Returns:
        dict: Keys are the image links and values are details dictionaries.
    """
    contexts = {}
    # Regex to match markdown image syntax: ![alt_text](link)
    pattern = r"!\[(.*?)\]\((.*?)\)"
    for match in re.finditer(pattern, chunk_text):
        alt_text, link = match.groups()
        original_ref = match.group(0)
        prompt = chunk_text.replace(original_ref, "[IMAGE]")

        # Resolve relative paths for local images
        if not (link.startswith("http://") or link.startswith("https://")):
            image_full_path = os.path.join(base_path, link)
        else:
            image_full_path = link

        try:
            image_base64 = encode_image(image_full_path)
        except Exception:
            logging.error(f"Skipping image {image_full_path} due to an encoding error.")
            image_base64 = ""

        contexts[link] = {
            "alt_text": alt_text,
            "prompt": prompt,
            "original_ref": original_ref,
            "image_base64": image_base64,
        }
    return contexts
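
# Example (illustrative): for a chunk containing "See ![architecture](img/arch.png).",
# the returned dict maps "img/arch.png" to its alt text ("architecture"), the chunk
# text with the reference swapped for "[IMAGE]", the original reference string, and
# the image's base64 data URI.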
def describe_images(full_content: str,
                    chunks: list,
                    client: OpenAI,
                    system_message: str,
                    base_path: str,
                    progress_desc: str = "Describing images") -> str:
    """
    Processes markdown content to extract and describe images.

    For each chunk, it identifies image references and sends the context along with the base64
    encoded image to a Visual Language Model. The resulting description replaces the original image reference.
    Non-important images (e.g., logos) are replaced with a markdown comment.
    For images that contain text information (e.g., terminal screenshots, tables, code snippets),
    the VLM is instructed to extract only the text content in markdown format.

    Parameters:
        full_content (str): The full markdown text.
        chunks (list): A list of markdown text chunks.
        client (OpenAI): An instance of the OpenAI client used for image description.
        system_message (str): The system prompt for the VLM.
        base_path (str): Base path for resolving local image paths.
        progress_desc (str, optional): Label for the progress bar.

    Returns:
        str: The updated markdown content.
    """
    for chunk in tqdm(chunks, desc=progress_desc):
        # MarkdownHeaderTextSplitter.split_text returns Document objects, so use
        # their text content; fall back to the chunk itself if it is already a string.
        chunk_text = getattr(chunk, "page_content", chunk)
        contexts = find_image_in_chunk(chunk_text, base_path)
        if contexts:
            for image_link, context in contexts.items():
                # Check if the image is non-important (e.g., a logo, favicon, or icon)
                if is_non_important_image(context["alt_text"], image_link):
                    logging.info(f"Image '{image_link}' deemed non-important. Replacing with markdown comment.")
                    full_content = full_content.replace(context["original_ref"], "<!-- logo -->")
                    continue

                if not context["image_base64"]:
                    continue  # Skip images that failed to encode

                try:
                    response = client.chat.completions.create(
                        model="mistralai/Pixtral-Large-Instruct-2411",  # Update as needed.
                        messages=[
                            {"role": "system", "content": system_message},
                            {
                                "role": "user",
                                "content": [
                                    {"type": "text", "text": context["prompt"]},
                                    {
                                        "type": "image_url",
                                        "image_url": {"url": context["image_base64"]},
                                    },
                                ],
                            },
                        ],
                        max_tokens=32000,
                    )
                    described_text = response.choices[0].message.content.strip()
                except Exception as e:
                    logging.error(f"Error describing image {image_link}: {e}")
                    described_text = "<!--- Error describing image --->"

                full_content = full_content.replace(context["original_ref"], described_text)
    return full_content
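
# Note on the design: str.replace swaps every occurrence of an identical image
# reference, so if the same ![alt](link) string appears more than once in the
# document, each occurrence receives the same generated description.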
def parse_arguments():
    """
    Parses command line arguments.

    Returns:
        argparse.Namespace: Parsed arguments.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Process a markdown file to describe embedded images using a Visual Language Model."
    )
    parser.add_argument("--input", "-i", required=True, help="Path to the input markdown file.")
    parser.add_argument("--output", "-o", required=True, help="Path to the output markdown file.")
    parser.add_argument("--model_endpoint", required=False, help="Model endpoint URL (if not using default).")
    parser.add_argument("--api_key", required=False, help="API key for the OpenAI client.")
    return parser.parse_args()
def main():
    args = parse_arguments()
    input_markdown = args.input
    output_markdown = args.output
    model_endpoint = args.model_endpoint
    api_key = args.api_key if args.api_key else "Dummy"

    # Define the system message for the Visual Language Model with extended guidelines.
    system_message = (
        "You are a Visual Language Model (VLM). Carefully extract information from images of any type—"
        "such as diagrams, figures, terminal screenshots, user interfaces, tables, code snippets, and other visuals—and output structured text "
        "directly in the specified format without any additional explanations or notes. Your goal is to help make the document "
        "readable for those who cannot see the image. You will be given a chunk of the preceding and following texts as context for the figure.\n\n"
        "Below are the extraction guidelines:\n\n"
        "1. For all content types, extract:\n"
        "- **Title**: If present.\n"
        "- **Description**: A concise description of the content.\n"
        "- **Key Points**: A summary of the main ideas or important details.\n"
        "- **Data**: Important data, values, or text displayed, if present.\n"
        "- **Annotations**: Any comments or labels present, if applicable.\n\n"
        "2. **Output Format:**\n"
        "- Wrap the entire output with markdown triple backticks.\n"
        "- Use the following structure for visual contents:\n\n"
        "```markdown\n"
        "### Content Type: [Content Type]\n"
        "#### Title: [Title]\n"
        "#### Description:\n"
        "- [Description of the content]\n"
        "#### Key Points:\n"
        "- [List of key points]\n"
        "#### Data:\n"
        "- [Optional, if there's extracted data, values, or text]\n"
        "#### Annotations:\n"
        "- [Optional, any annotations or comments]\n"
        "```\n\n"
        "3. **For Images Containing Text Information (e.g., terminal screenshots, tables, code snippets):**\n"
        "- Extract only the text content present in the image.\n"
        "- Preserve the original formatting by using appropriate markdown formatting such as triple backticks for code or markdown table formatting for tables.\n"
        "- Do not include additional explanations or structure beyond the extracted text.\n\n"
        "4. **Multiple Contents:**\n"
        "- If the image contains multiple distinct contents, separate them clearly using the same format for each.\n"
        "5. **Unreadable Content:**\n"
        "- For any unreadable or uninterpretable sections, directly output: \"<!--- Figure unreadable --->\" without any other details and without backticks.\n"
        "6. **Unimportant Content:**\n"
        "- If the image is not important (e.g., a logo) and does not add additional details, output: \"<!-- logo -->\".\n"
        "7. **Direct Output Only:**\n"
        "- Provide only the extracted information in the specified format without explanations or extra notes."
    )
    # Initialize the markdown splitter with header rules.
    headers_to_split_on = [
        ("#", "Title"),
        ("##", "Chapter"),
        ("###", "Subchapter"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
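
    # With strip_headers=False the header lines remain inside each chunk, so the
    # section titles become part of the context the VLM sees around every image.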
    # Load and chunk the markdown file.
    logging.info(f"Loading markdown file from {input_markdown}")
    full_content, chunks = load_and_chunk_markdown(input_markdown, markdown_splitter)

    # Determine base path for resolving relative image paths.
    base_path = os.path.dirname(os.path.abspath(input_markdown))

    # Initialize the OpenAI client; add the model endpoint if provided.
    client_kwargs = {}
    if model_endpoint:
        client_kwargs["base_url"] = model_endpoint
    if api_key:
        client_kwargs["api_key"] = api_key
    client = OpenAI(**client_kwargs)

    # Process the markdown to describe images.
    logging.info("Processing images in the markdown content...")
    updated_content = describe_images(full_content, chunks, client, system_message, base_path)

    # Write the updated markdown to the output file.
    try:
        with open(output_markdown, "w", encoding="utf-8") as out_file:
            out_file.write(updated_content)
        logging.info(f"Processed markdown saved to {output_markdown}")
    except Exception as e:
        logging.error(f"Failed to write output file {output_markdown}: {e}")


if __name__ == "__main__":
    main()