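# Setup (macOS with Homebrew)
#
# 1. Install the Python dependencies with pip:
#        pip install pytesseract pdf2image python-docx pypdf
# 2. Install the Tesseract language packs (required for lang='rus'):
#        brew install tesseract-lang
# 3. Install Poppler (pdf2image uses it to render PDF pages to images):
#        brew install poppler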
# OCR a scanned Russian-language PDF and write the recognized text to a DOCX
# file: each PDF page is rendered to an image, run through Tesseract, and
# appended to the document as one paragraph.
import pytesseract
from pdf2image import convert_from_path
from docx import Document

# Ensure pytesseract knows where the tesseract executable is located
# (the Homebrew install location).
pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'

# Path to the PDF file
pdf_path = './Stepanov_I_Parizhskaya_kommuna_1871_goda_i_voprosy_taktiki_proletarskoy_revolyutsii.pdf'

# Convert PDF to images, one per page, rendered at 344 DPI.
# NOTE: all pages are held in memory at once.
pages = convert_from_path(pdf_path, dpi=344)

# Create a new document
doc = Document()

# Iterate over all pages
for page in pages:
    # Recognize text in Cyrillic (needs the Russian Tesseract language data)
    text = pytesseract.image_to_string(page, lang='rus')

    # Tesseract terminates each page with a form feed (its default page
    # separator). Control characters are not valid in XML, so drop them
    # before the text goes into the DOCX body.
    text = text.replace('\x0c', '')

    # Add text to the document
    doc.add_paragraph(text)

# Save the DOCX file once every page has been processed
doc.save("output.docx")