Created February 13, 2025 07:07
Describe images inside markdown with VLM
#!/usr/bin/env python3
"""
Markdown Image Descriptor Script

This script processes a markdown file, extracts images, encodes them in base64,
and sends them with contextual text to a Visual Language Model (VLM) for description extraction.
The image markdown references are then replaced with the structured descriptions provided by the model.

If an image is deemed non-important (e.g., logos), it is replaced with a markdown comment.
Additionally, if an image contains text information (e.g., terminal screenshots, tables, code snippets),
the VLM is instructed to extract and return only the text content using markdown formatting.
"""
import os
import re
import base64
import logging

import requests
from tqdm import tqdm
from openai import OpenAI  # Ensure you have the appropriate client/library installed.
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
def load_and_chunk_markdown(file_path: str, text_splitter: MarkdownHeaderTextSplitter):
    """
    Loads a markdown file and splits it into chunks using the provided text splitter.

    Parameters:
        file_path (str): Path to the markdown file.
        text_splitter (MarkdownHeaderTextSplitter): A text splitter instance.

    Returns:
        tuple: (full_content, chunks)
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except Exception as e:
        logging.error(f"Error reading file {file_path}: {e}")
        raise

    chunks = text_splitter.split_text(content)
    return content, chunks
def encode_image(image_path: str) -> str:
    """
    Encodes an image to a base64 string.

    If the image_path is a URL, the image will be downloaded; otherwise, it is read
    from the local filesystem.

    Parameters:
        image_path (str): The URL or local path to the image file.

    Returns:
        str: A base64-encoded string formatted as "data:image/{image_type};base64,{encoded_data}".
    """
    try:
        if image_path.startswith("http://") or image_path.startswith("https://"):
            response = requests.get(image_path, timeout=30)
            response.raise_for_status()
            image_data = response.content
            # Extract image type from URL, handling possible query strings
            image_type = image_path.split(".")[-1].split("?")[0]
        else:
            with open(image_path, "rb") as image_file:
                image_data = image_file.read()
            image_type = image_path.split(".")[-1]
    except Exception as e:
        logging.error(f"Failed to load image from {image_path}: {e}")
        raise

    # Normalize the common "jpg" extension to the registered "jpeg" MIME subtype.
    if image_type.lower() == "jpg":
        image_type = "jpeg"

    base64_str = base64.b64encode(image_data).decode("utf-8")
    return f"data:image/{image_type};base64,{base64_str}"
def is_non_important_image(alt_text: str, link: str) -> bool:
    """
    Determines if an image is non-important based on its alt text or filename.

    Parameters:
        alt_text (str): The alt text from the markdown image.
        link (str): The image URL or path.

    Returns:
        bool: True if the image is considered non-important, False otherwise.
    """
    non_important_keywords = ["logo", "favicon", "icon"]
    alt_text_lower = alt_text.lower() if alt_text else ""
    filename_lower = os.path.basename(link).lower()
    return any(keyword in alt_text_lower or keyword in filename_lower for keyword in non_important_keywords)
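
# Example (illustrative): is_non_important_image("Company logo", "assets/header.png")
# returns True because "logo" appears in the alt text, even though the filename
# itself contains no keyword.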
def find_image_in_chunk(chunk_text: str, base_path: str) -> dict:
    """
    Extracts image references from the provided markdown text chunk.

    Searches for image markdown syntax (e.g., ``![alt text](image.png)``) and,
    for each found image, returns a dictionary with:
        - "alt_text": the alternate text for the image.
        - "prompt": the chunk text with the image reference replaced by "[IMAGE]".
        - "original_ref": the original markdown image reference.
        - "image_base64": the base64 encoded string of the image (local paths are resolved against base_path).

    Parameters:
        chunk_text (str): Markdown text containing potential image references.
        base_path (str): Base directory for resolving relative image file paths.

    Returns:
        dict: Keys are the image links and values are details dictionaries.
    """
    contexts = {}
    # Regex to match markdown image syntax: ![alt_text](link)
    pattern = r"!\[(.*?)\]\((.*?)\)"
    for match in re.finditer(pattern, chunk_text):
        alt_text, link = match.groups()
        original_ref = match.group(0)
        prompt = chunk_text.replace(original_ref, "[IMAGE]")

        # Resolve relative paths for local images
        if not (link.startswith("http://") or link.startswith("https://")):
            image_full_path = os.path.join(base_path, link)
        else:
            image_full_path = link

        try:
            image_base64 = encode_image(image_full_path)
        except Exception:
            logging.error(f"Skipping image {image_full_path} due to an encoding error.")
            image_base64 = ""

        contexts[link] = {
            "alt_text": alt_text,
            "prompt": prompt,
            "original_ref": original_ref,
            "image_base64": image_base64,
        }
    return contexts
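
# Example (illustrative): for a chunk containing "See ![architecture](img/arch.png).",
# the returned dict maps "img/arch.png" to its alt text ("architecture"), the chunk
# text with the reference swapped for "[IMAGE]", the original reference string, and
# the image's base64 data URI.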
def describe_images(full_content: str,
                    chunks: list,
                    client: OpenAI,
                    system_message: str,
                    base_path: str,
                    progress_desc: str = "Describing images") -> str:
    """
    Processes markdown content to extract and describe images.

    For each chunk, it identifies image references and sends the context along with the base64
    encoded image to a Visual Language Model. The resulting description replaces the original image reference.
    Non-important images (e.g., logos) are replaced with a markdown comment.
    For images that contain text information (e.g., terminal screenshots, tables, code snippets),
    the VLM is instructed to extract only the text content in markdown format.

    Parameters:
        full_content (str): The full markdown text.
        chunks (list): A list of markdown text chunks.
        client (OpenAI): An instance of the OpenAI client used for image description.
        system_message (str): The system prompt for the VLM.
        base_path (str): Base path for resolving local image paths.
        progress_desc (str, optional): Label for the progress bar.

    Returns:
        str: The updated markdown content.
    """
    for chunk in tqdm(chunks, desc=progress_desc):
        # MarkdownHeaderTextSplitter.split_text returns Document objects, so use
        # their text content; fall back to the chunk itself if it is already a string.
        chunk_text = getattr(chunk, "page_content", chunk)
        contexts = find_image_in_chunk(chunk_text, base_path)
        if contexts:
            for image_link, context in contexts.items():
                # Check if the image is non-important (e.g., a logo, favicon, or icon)
                if is_non_important_image(context["alt_text"], image_link):
                    logging.info(f"Image '{image_link}' deemed non-important. Replacing with markdown comment.")
                    full_content = full_content.replace(context["original_ref"], "<!-- logo -->")
                    continue

                if not context["image_base64"]:
                    continue  # Skip images that failed to encode

                try:
                    response = client.chat.completions.create(
                        model="mistralai/Pixtral-Large-Instruct-2411",  # Update as needed.
                        messages=[
                            {"role": "system", "content": system_message},
                            {
                                "role": "user",
                                "content": [
                                    {"type": "text", "text": context["prompt"]},
                                    {
                                        "type": "image_url",
                                        "image_url": {"url": context["image_base64"]},
                                    },
                                ],
                            },
                        ],
                        max_tokens=32000,
                    )
                    described_text = response.choices[0].message.content.strip()
                except Exception as e:
                    logging.error(f"Error describing image {image_link}: {e}")
                    described_text = "<!--- Error describing image --->"

                full_content = full_content.replace(context["original_ref"], described_text)
    return full_content
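
# Note on the design: str.replace swaps every occurrence of an identical image
# reference, so if the same ![alt](link) string appears more than once in the
# document, each occurrence receives the same generated description.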
def parse_arguments():
    """
    Parses command line arguments.

    Returns:
        argparse.Namespace: Parsed arguments.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Process a markdown file to describe embedded images using a Visual Language Model."
    )
    parser.add_argument("--input", "-i", required=True, help="Path to the input markdown file.")
    parser.add_argument("--output", "-o", required=True, help="Path to the output markdown file.")
    parser.add_argument("--model_endpoint", required=False, help="Model endpoint URL (if not using default).")
    parser.add_argument("--api_key", required=False, help="API key for the OpenAI client.")
    return parser.parse_args()
def main():
    args = parse_arguments()
    input_markdown = args.input
    output_markdown = args.output
    model_endpoint = args.model_endpoint
    api_key = args.api_key if args.api_key else "Dummy"

    # Define the system message for the Visual Language Model with extended guidelines.
    system_message = (
        "You are a Visual Language Model (VLM). Carefully extract information from images of any type—"
        "such as diagrams, figures, terminal screenshots, user interfaces, tables, code snippets, and other visuals—and output structured text "
        "directly in the specified format without any additional explanations or notes. Your goal is to help make the document "
        "readable for those who cannot see the image. You will be given a chunk of the preceding and following texts as context for the figure.\n\n"
        "Below are the extraction guidelines:\n\n"
        "1. For all content types, extract:\n"
        "- **Title**: If present.\n"
        "- **Description**: A concise description of the content.\n"
        "- **Key Points**: A summary of the main ideas or important details.\n"
        "- **Data**: Important data, values, or text displayed, if present.\n"
        "- **Annotations**: Any comments or labels present, if applicable.\n\n"
        "2. **Output Format:**\n"
        "- Wrap the entire output with markdown triple backticks.\n"
        "- Use the following structure for visual contents:\n\n"
        "```markdown\n"
        "### Content Type: [Content Type]\n"
        "#### Title: [Title]\n"
        "#### Description:\n"
        "- [Description of the content]\n"
        "#### Key Points:\n"
        "- [List of key points]\n"
        "#### Data:\n"
        "- [Optional, if there's extracted data, values, or text]\n"
        "#### Annotations:\n"
        "- [Optional, any annotations or comments]\n"
        "```\n\n"
        "3. **For Images Containing Text Information (e.g., terminal screenshots, tables, code snippets):**\n"
        "- Extract only the text content present in the image.\n"
        "- Preserve the original formatting by using appropriate markdown formatting such as triple backticks for code or markdown table formatting for tables.\n"
        "- Do not include additional explanations or structure beyond the extracted text.\n\n"
        "4. **Multiple Contents:**\n"
        "- If the image contains multiple distinct contents, separate them clearly using the same format for each.\n"
        "5. **Unreadable Content:**\n"
        "- For any unreadable or uninterpretable sections, directly output: \"<!--- Figure unreadable --->\" without any other details and without backticks.\n"
        "6. **Unimportant Content:**\n"
        "- If the image is not important (e.g., a logo) and does not add additional details, output: \"<!-- logo -->\".\n"
        "7. **Direct Output Only:**\n"
        "- Provide only the extracted information in the specified format without explanations or extra notes."
    )
    # Initialize the markdown splitter with header rules.
    headers_to_split_on = [
        ("#", "Title"),
        ("##", "Chapter"),
        ("###", "Subchapter"),
    ]
    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
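
    # With strip_headers=False the header lines remain inside each chunk, so the
    # section titles become part of the context the VLM sees around every image.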
    # Load and chunk the markdown file.
    logging.info(f"Loading markdown file from {input_markdown}")
    full_content, chunks = load_and_chunk_markdown(input_markdown, markdown_splitter)

    # Determine base path for resolving relative image paths.
    base_path = os.path.dirname(os.path.abspath(input_markdown))

    # Initialize the OpenAI client; add the model endpoint if provided.
    client_kwargs = {}
    if model_endpoint:
        client_kwargs["base_url"] = model_endpoint
    if api_key:
        client_kwargs["api_key"] = api_key
    client = OpenAI(**client_kwargs)

    # Process the markdown to describe images.
    logging.info("Processing images in the markdown content...")
    updated_content = describe_images(full_content, chunks, client, system_message, base_path)

    # Write the updated markdown to the output file.
    try:
        with open(output_markdown, "w", encoding="utf-8") as out_file:
            out_file.write(updated_content)
        logging.info(f"Processed markdown saved to {output_markdown}")
    except Exception as e:
        logging.error(f"Failed to write output file {output_markdown}: {e}")


if __name__ == "__main__":
    main()