atomlab · August 31, 2024 11:40
diff --git a/README.md b/README.md
diff --git a/pdf2doc.py b/pdf2doc.py
 import pytesseract
 from pdf2image import convert_from_path
 from docx import Document

 # Tell pytesseract which tesseract binary to launch (machine-specific path)
 pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'

 # Source PDF that will be run through OCR
 pdf_path = './Stepanov_I_Parizhskaya_kommuna_1871_goda_i_voprosy_taktiki_proletarskoy_revolyutsii.pdf'

 # Rasterize the PDF into page images; 344 is the rendering DPI
 pages = convert_from_path(pdf_path, dpi=344)

 # Empty Word document that collects the recognized text
 doc = Document()

 # OCR every page with the Russian language data and append the result
 # to the document, one paragraph per page
 for page in pages:
     doc.add_paragraph(pytesseract.image_to_string(page, lang='rus'))

 # Write the collected text to disk
 doc.save("output.docx")
	import pytesseract
	from pdf2image import convert_from_path
	from docx import Document

	# Ensure pytesseract knows where the tesseract executable is located
	# (machine-specific path; adjust it if tesseract lives elsewhere)
	pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'

	# Path to the PDF file
	pdf_path = './Stepanov_I_Parizhskaya_kommuna_1871_goda_i_voprosy_taktiki_proletarskoy_revolyutsii.pdf'

	# Convert PDF to images (second argument is the rendering DPI)
	pages = convert_from_path(pdf_path, 344)

	# Create a new document
	doc = Document()

	# Iterate over all pages; the two statements below must be nested in the
	# loop so that every page is recognized and appended, not just one
	for page in pages:
		# Recognize text in Cyrillic (Russian language data)
		text = pytesseract.image_to_string(page, lang='rus')

		# Add the page's text to the document as its own paragraph
		doc.add_paragraph(text)

	# Save the DOCX file
	doc.save("output.docx")