|
import os |
|
from pathlib import Path |
|
|
|
import click |
|
from PIL import Image |
|
|
|
from mistralai import Mistral, OCRPageObject |
|
|
|
# Read the API key from the environment; a missing MISTRAL_API_KEY raises
# KeyError as soon as this module is imported.
api_key = os.environ["MISTRAL_API_KEY"]
# Module-level Mistral client used by the commands defined in this file.
client = Mistral(api_key=api_key)
|
|
|
|
|
@click.group()
def cli() -> None:
    """Upload PDFs to Mistral, run OCR on them, and extract the detected images."""
    # The group has no behaviour of its own; it only hosts the subcommands.
    # The docstring above is what click prints as the description in `--help`.
|
|
|
|
|
@cli.command()
@click.argument("file", type=click.Path())
def upload(file: str) -> None:
    """Upload FILE to Mistral for OCR and print the id of the uploaded file."""
    # Open the file in a context manager so the handle is closed as soon as
    # the upload call returns (or raises) instead of leaking until exit.
    with open(file, "rb") as content:
        uploaded_pdf = client.files.upload(
            file={
                "file_name": file,
                "content": content,
            },
            purpose="ocr",
        )
    # The printed id is the argument expected by the `process` command.
    print(uploaded_pdf.id)
|
|
|
|
|
@cli.command()
@click.argument("id")
@click.option("--output_path", required=True, type=click.Path())
def process(id: str, output_path: str) -> None:
    """Run Mistral OCR on the uploaded file ID and write one JSON page per line."""
    # NOTE: the parameter name `id` shadows the builtin, but it is tied to the
    # click argument declared above, so it is kept for compatibility.
    signed_url = client.files.get_signed_url(file_id=id)

    ocr_response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": signed_url.url,
        },
    )

    # --output_path is required, so this fallback only applies when an empty
    # string is passed explicitly.
    output_path = output_path or Path.cwd() / "data" / "output.jsonl"
    # Create missing parent directories so a completed OCR run is not lost to
    # a FileNotFoundError at write time.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    print(f"writing to {output_path}")
    # Write UTF-8 explicitly: the serialized pages may contain non-ASCII text,
    # and the platform default encoding is not guaranteed to handle it.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(page.model_dump_json() + "\n" for page in ocr_response.pages)
|
|
|
|
|
@cli.command()
@click.argument("data_file")
@click.argument("pdf_file")
@click.option("--output_path", required=False, type=click.Path())
def extract_images(data_file: str, pdf_file: str, output_path: str | None) -> None:
    """Crop the images listed in DATA_FILE out of PDF_FILE and save them.

    DATA_FILE is a JSON-lines file with one OCR page per line. Images are
    written to --output_path, or to ./images/<DATA_FILE stem>/ when the option
    is omitted.
    """
    import io

    import pymupdf

    # click.Path() hands back a plain string, so wrap it in Path before using
    # Path-only operations such as mkdir() and the "/" operator.
    if output_path:
        output_dir = Path(output_path)
    else:
        output_dir = Path.cwd() / "images" / Path(data_file).stem
    output_dir.mkdir(parents=True, exist_ok=True)

    # The context manager closes the PDF document even if a page fails.
    with pymupdf.open(pdf_file) as pdf_document:
        with open(data_file, "r", encoding="utf-8") as f:
            # Skip blank lines (e.g. a trailing newline); they are not valid JSON.
            docs = [
                OCRPageObject.model_validate_json(line)
                for line in f
                if line.strip()
            ]

        for doc in docs:
            # Pages without images or without dimensions cannot be cropped.
            if not doc.images or not doc.dimensions:
                continue

            # Render the page at the DPI recorded for it; presumably this makes
            # the image coordinates below line up with rendered pixels.
            page = pdf_document.load_page(doc.index)
            pix = page.get_pixmap(dpi=doc.dimensions.dpi)
            image = Image.open(io.BytesIO(pix.tobytes("ppm")))

            for img_data in doc.images:
                cropped_image = image.crop(
                    (
                        img_data.top_left_x,
                        img_data.top_left_y,
                        img_data.bottom_right_x,
                        img_data.bottom_right_y,
                    )
                )
                # The image id is used as the output file name.
                output_path_id = output_dir / img_data.id
                print(f"writing image to {output_path_id}")
                cropped_image.save(output_path_id)
|
|
|
|
|
# Run the click command group when this file is executed as a script.
if __name__ == "__main__":
    cli()