Skip to content

Instantly share code, notes, and snippets.

@hitorilabs
Last active March 24, 2025 15:19
Show Gist options
  • Save hitorilabs/e194e8e012318108549806be673b7bc1 to your computer and use it in GitHub Desktop.
Save hitorilabs/e194e8e012318108549806be673b7bc1 to your computer and use it in GitHub Desktop.

A crappy little CLI I used to process textbooks while exploring what the Mistral OCR API can do.

To extract the markdown from the OCR output, run: cat data/output.jsonl | jq -r .markdown > output.md

uv pip install mistralai click Pillow

pymupdf is used to render PDF pages into images (needed by extract_images)... not really sure what else to use, but it looks a little sketchy lol

uv pip install pymupdf
import os
from pathlib import Path
import click
from PIL import Image
from mistralai import Mistral, OCRPageObject
# The API key is read from the environment at import time; a missing
# MISTRAL_API_KEY raises KeyError before any command runs.
api_key = os.environ["MISTRAL_API_KEY"]
# Shared Mistral client used by the commands defined in this file.
client = Mistral(api_key=api_key)
@click.group()
def cli():
    """Process PDFs with the Mistral OCR API: upload, OCR, and crop out images."""
@cli.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False))
def upload(file):
    """Upload FILE to Mistral for OCR and print the id of the uploaded file.

    FILE must be an existing regular file; click rejects anything else with
    a usage error before the API is contacted.
    """
    # Open the file in a context manager so the handle is closed even when
    # the upload raises.
    with open(file, "rb") as content:
        uploaded_pdf = client.files.upload(
            file={
                "file_name": file,
                "content": content,
            },
            purpose="ocr",
        )
    print(uploaded_pdf.id)
@cli.command()
@click.argument("id")
@click.option("--output_path", required=False, type=click.Path())
def process(id, output_path):
    """Run OCR on the uploaded file ID and write the pages as JSON Lines.

    Each OCR page is serialised with ``model_dump_json`` onto its own line.
    When --output_path is omitted the result goes to ``data/output.jsonl``
    under the current working directory.
    """
    signed_url = client.files.get_signed_url(file_id=id)
    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": signed_url.url,
        },
    )

    # The option is optional so that this default is actually reachable.
    if output_path:
        destination = Path(output_path)
    else:
        destination = Path.cwd() / "data" / "output.jsonl"
    # Make sure the target directory exists (the default "data" usually won't).
    destination.parent.mkdir(parents=True, exist_ok=True)

    print(f"writing to {destination}")
    # Explicit UTF-8: OCR text may contain characters the locale encoding
    # cannot represent.
    with open(destination, "w", encoding="utf-8") as f:
        f.writelines(page.model_dump_json() + "\n" for page in ocr_response.pages)
@cli.command()
@click.argument("data_file")
@click.argument("pdf_file")
@click.option("--output_path", required=False, type=click.Path())
def extract_images(data_file, pdf_file, output_path):
    """Crop the images listed in DATA_FILE out of the pages of PDF_FILE.

    DATA_FILE holds one JSON-serialised OCR page per line. For every page
    that lists images and has dimensions, the matching PDF page is rendered
    at the recorded DPI and each image's bounding box is cropped and saved
    under its image id. Output goes to --output_path, or to
    ``images/<DATA_FILE stem>`` under the current working directory.
    """
    import io

    import pymupdf

    data_file_path = Path(data_file)
    # click hands the option over as a str, so convert it before using
    # Path-only operations such as mkdir() and the "/" operator.
    if output_path:
        output_dir = Path(output_path)
    else:
        output_dir = Path.cwd() / "images" / data_file_path.stem
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(data_file, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) instead of failing validation.
        docs = [OCRPageObject.model_validate_json(line) for line in f if line.strip()]

    # The context manager closes the PDF even if rendering or saving fails.
    with pymupdf.open(pdf_file) as pdf_document:
        for doc in docs:
            if not doc.images or not doc.dimensions:
                continue

            # Render the page at the DPI recorded with the OCR result.
            # NOTE(review): assumes the image coordinates are pixel positions
            # at that DPI — confirm against the OCR API documentation.
            page = pdf_document.load_page(doc.index)
            pix = page.get_pixmap(dpi=doc.dimensions.dpi)
            image = Image.open(io.BytesIO(pix.tobytes("ppm")))

            for img_data in doc.images:
                box = (
                    img_data.top_left_x,
                    img_data.top_left_y,
                    img_data.bottom_right_x,
                    img_data.bottom_right_y,
                )
                # An image without a complete bounding box cannot be cropped.
                if any(coord is None for coord in box):
                    print(f"skipping {img_data.id}: missing coordinates")
                    continue
                cropped_image = image.crop(box)
                output_path_id = output_dir / img_data.id
                print(f"writing image to {output_path_id}")
                cropped_image.save(output_path_id)
# Run the click command group only when executed as a script, not on import.
if __name__ == "__main__":
    cli()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment