hitorilabs · March 24, 2025 15:19
diff --git a/README.md b/README.md
diff --git a/MISTRAL_OCR_CLI.py b/MISTRAL_OCR_CLI.py
 import os
 from pathlib import Path

 import click
 from PIL import Image

 from mistralai import Mistral, OCRPageObject

 # Read the API key at import time; a missing MISTRAL_API_KEY raises KeyError
 # here, before any command body runs.
 api_key = os.environ["MISTRAL_API_KEY"]
 # Module-level client used by the upload and process commands.
 client = Mistral(api_key=api_key)


 @click.group()
 def cli():
    """Command-line helpers for Mistral OCR.

    Upload a document, run OCR on it, and crop the detected images out of
    the original PDF.
    """
    # No body needed: click uses this function only as the command group,
    # and shows the docstring above as the top-level --help text.


 @cli.command()
 @click.argument("file", type=click.Path(exists=True, dir_okay=False))
 def upload(file):
    """Upload FILE to Mistral for OCR and print the id of the uploaded file.

    FILE must be an existing regular file; click rejects anything else with
    a usage error before the upload is attempted.
    """
    # Open inside a context manager so the handle is released even when the
    # upload raises; otherwise it stays open for the life of the process.
    with open(file, "rb") as pdf_handle:
        uploaded_pdf = client.files.upload(
            file={
                "file_name": file,
                "content": pdf_handle,
            },
            purpose="ocr",
        )
    print(uploaded_pdf.id)


 @cli.command()
 @click.argument("id")
 @click.option("--output_path", required=False, type=click.Path())
 def process(id, output_path):
    """Run Mistral OCR on an uploaded file and save the pages as JSON Lines.

    ID is the id of a previously uploaded file. Each OCR page is written as
    one JSON object per line to --output_path, which defaults to
    data/output.jsonl under the current directory when omitted.
    """
    # Optional on purpose: with a required option this fallback could never
    # be reached.
    output_path = output_path or Path.cwd() / "data" / "output.jsonl"
    # Create the target directory up front so an unusable path fails before
    # the OCR request is made rather than after.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    signed_url = client.files.get_signed_url(file_id=id)

    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": signed_url.url,
        },
    )

    print(f"writing to {output_path}")
    # Explicit UTF-8 because OCR text may contain non-ASCII characters and
    # the platform default encoding is not guaranteed to handle them. Plain
    # "w" is enough: the file is never read back here.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(page.model_dump_json() + "\n" for page in ocr_response.pages)


 @cli.command()
 @click.argument("data_file")
 @click.argument("pdf_file")
 @click.option("--output_path", required=False, type=click.Path())
 def extract_images(data_file, pdf_file, output_path):
    """Crop the images listed in DATA_FILE out of PDF_FILE and save them.

    DATA_FILE is a JSON Lines file with one OCR page object per line. Every
    page that lists images and has dimensions is rendered from PDF_FILE at
    the DPI recorded for that page, then each image's bounding box is
    cropped from the render and saved under its image id. Output goes to
    --output_path, or to images/<DATA_FILE stem> under the current
    directory when the option is omitted.
    """
    import io

    import pymupdf

    # click.Path() hands back a plain str, so wrap it: calling .mkdir() or
    # using "/" on the raw value raises whenever --output_path is supplied.
    if output_path:
        output_dir = Path(output_path)
    else:
        output_dir = Path.cwd() / "images" / Path(data_file).stem
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse every page up front; blank lines (e.g. a trailing newline) are
    # skipped because they are not valid JSON documents.
    with open(data_file, "r", encoding="utf-8") as f:
        docs = [
            OCRPageObject.model_validate_json(line) for line in f if line.strip()
        ]

    # Context manager so the PDF is closed even if rendering or saving fails.
    with pymupdf.open(pdf_file) as pdf_document:
        for doc in docs:
            if not doc.images or not doc.dimensions:
                continue
            page = pdf_document.load_page(doc.index)
            # Render at the page's recorded DPI; presumably this makes the
            # OCR bounding boxes line up with pixel coordinates (confirm
            # against the OCR API documentation).
            pix = page.get_pixmap(dpi=doc.dimensions.dpi)
            image = Image.open(io.BytesIO(pix.tobytes("ppm")))
            for img_data in doc.images:
                box = (
                    img_data.top_left_x,
                    img_data.top_left_y,
                    img_data.bottom_right_x,
                    img_data.bottom_right_y,
                )
                output_path_id = output_dir / img_data.id
                print(f"writing image to {output_path_id}")
                image.crop(box).save(output_path_id)


 # Run the click command group only when executed as a script, not on import.
 if __name__ == "__main__":
    cli()
	import os
	from pathlib import Path

	import click
	from PIL import Image

	from mistralai import Mistral, OCRPageObject

	# Read the API key at import time; a missing MISTRAL_API_KEY raises KeyError
	# here, before any command body runs.
	api_key = os.environ["MISTRAL_API_KEY"]
	# Module-level client used by the upload and process commands.
	client = Mistral(api_key=api_key)


	@click.group()
	def cli():
	    """Command-line helpers for Mistral OCR.

	    Upload a document, run OCR on it, and crop the detected images out of
	    the original PDF.
	    """
	    # No body needed: click uses this function only as the command group,
	    # and shows the docstring above as the top-level --help text.


	@cli.command()
	@click.argument("file", type=click.Path(exists=True, dir_okay=False))
	def upload(file):
	    """Upload FILE to Mistral for OCR and print the id of the uploaded file.

	    FILE must be an existing regular file; click rejects anything else with
	    a usage error before the upload is attempted.
	    """
	    # Open inside a context manager so the handle is released even when the
	    # upload raises; otherwise it stays open for the life of the process.
	    with open(file, "rb") as pdf_handle:
	        uploaded_pdf = client.files.upload(
	            file={
	                "file_name": file,
	                "content": pdf_handle,
	            },
	            purpose="ocr",
	        )
	    print(uploaded_pdf.id)


	@cli.command()
	@click.argument("id")
	@click.option("--output_path", required=False, type=click.Path())
	def process(id, output_path):
	    """Run Mistral OCR on an uploaded file and save the pages as JSON Lines.

	    ID is the id of a previously uploaded file. Each OCR page is written as
	    one JSON object per line to --output_path, which defaults to
	    data/output.jsonl under the current directory when omitted.
	    """
	    # Optional on purpose: with a required option this fallback could never
	    # be reached.
	    output_path = output_path or Path.cwd() / "data" / "output.jsonl"
	    # Create the target directory up front so an unusable path fails before
	    # the OCR request is made rather than after.
	    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

	    signed_url = client.files.get_signed_url(file_id=id)

	    ocr_response = client.ocr.process(
	        model="mistral-ocr-latest",
	        document={
	            "type": "document_url",
	            "document_url": signed_url.url,
	        },
	    )

	    print(f"writing to {output_path}")
	    # Explicit UTF-8 because OCR text may contain non-ASCII characters and
	    # the platform default encoding is not guaranteed to handle them. Plain
	    # "w" is enough: the file is never read back here.
	    with open(output_path, "w", encoding="utf-8") as f:
	        f.writelines(page.model_dump_json() + "\n" for page in ocr_response.pages)


	@cli.command()
	@click.argument("data_file")
	@click.argument("pdf_file")
	@click.option("--output_path", required=False, type=click.Path())
	def extract_images(data_file, pdf_file, output_path):
	    """Crop the images listed in DATA_FILE out of PDF_FILE and save them.

	    DATA_FILE is a JSON Lines file with one OCR page object per line. Every
	    page that lists images and has dimensions is rendered from PDF_FILE at
	    the DPI recorded for that page, then each image's bounding box is
	    cropped from the render and saved under its image id. Output goes to
	    --output_path, or to images/<DATA_FILE stem> under the current
	    directory when the option is omitted.
	    """
	    import io

	    import pymupdf

	    # click.Path() hands back a plain str, so wrap it: calling .mkdir() or
	    # using "/" on the raw value raises whenever --output_path is supplied.
	    if output_path:
	        output_dir = Path(output_path)
	    else:
	        output_dir = Path.cwd() / "images" / Path(data_file).stem
	    output_dir.mkdir(parents=True, exist_ok=True)

	    # Parse every page up front; blank lines (e.g. a trailing newline) are
	    # skipped because they are not valid JSON documents.
	    with open(data_file, "r", encoding="utf-8") as f:
	        docs = [
	            OCRPageObject.model_validate_json(line) for line in f if line.strip()
	        ]

	    # Context manager so the PDF is closed even if rendering or saving fails.
	    with pymupdf.open(pdf_file) as pdf_document:
	        for doc in docs:
	            if not doc.images or not doc.dimensions:
	                continue
	            page = pdf_document.load_page(doc.index)
	            # Render at the page's recorded DPI; presumably this makes the
	            # OCR bounding boxes line up with pixel coordinates (confirm
	            # against the OCR API documentation).
	            pix = page.get_pixmap(dpi=doc.dimensions.dpi)
	            image = Image.open(io.BytesIO(pix.tobytes("ppm")))
	            for img_data in doc.images:
	                box = (
	                    img_data.top_left_x,
	                    img_data.top_left_y,
	                    img_data.bottom_right_x,
	                    img_data.bottom_right_y,
	                )
	                output_path_id = output_dir / img_data.id
	                print(f"writing image to {output_path_id}")
	                image.crop(box).save(output_path_id)


	# Run the click command group only when executed as a script, not on import.
	if __name__ == "__main__":
	    cli()