This script processes PDF files, extracts their text, and splits it into chunks for machine learning applications. It reads PDFs from an input directory, cleans and trims the text, splits it into chunks of a specified maximum size, and writes the results in JSON Lines format.
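For reference, the script below might be invoked like this (the script name chunk_pdfs.py and the ./papers directory are illustrative only, not part of the gist):

    python chunk_pdfs.py ./papers -s 5000 -o chunks.jsonl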
import sys
import os
import argparse
import json
import re

import fitz  # PyMuPDF
import nltk

# Fetch the sentence tokenizer data into the project's virtualenv and make sure
# NLTK searches that directory (the relative path assumes the script is run
# from a subdirectory of the project root).
nltk.download('punkt_tab', download_dir='../.venv/nltk_data', quiet=True)
nltk.data.path.append('../.venv/nltk_data')


def read_pdf(file_path):
    """Return the concatenated text of every page in a PDF."""
    doc = fitz.open(file_path)
    text = ''
    for page in doc:
        text += page.get_text().encode('unicode_escape').decode('unicode_escape')
    doc.close()
    return text


def trim_text(text):
    """Normalize extracted text: decode escapes, collapse whitespace and repeats."""
    # Decode literal \uXXXX escape sequences back into characters.
    text = re.sub(r'\\[uU][0-9a-fA-F]{2,4}',
                  lambda m: m.group(0).encode().decode('unicode_escape'), text)
    # Collapse runs of whitespace into single spaces.
    text = re.sub(r'\s+', ' ', text)
    # Collapse repeated punctuation ("!!!" -> "!").
    text = re.sub(r'([!.?])\1+', r'\1', text)
    # Collapse runs of repeated letters (note: this also shortens legitimate
    # double letters such as "ll" -> "l", as in the original).
    text = re.sub(r'([a-zA-Z])\1+', r'\1', text)
    return text.strip()


def chunk_data(text, size):
    """Split text into chunks of roughly `size` characters along sentence boundaries."""
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_size = 0
    for sentence in sentences:
        # Start a new chunk once adding this sentence would exceed the limit.
        if current_size + len(sentence) > size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_size = 0
        current_chunk.append(sentence)
        current_size += len(sentence)
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks


def escape_text(text):
    """Replace newlines and tabs with spaces so each record stays on one line."""
    return text.replace('\n', ' ').replace('\t', ' ')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Break PDF contents into chunks for machine learning tasks. This script reads PDF files from a directory, processes their contents into chunks of a specified size, and writes the chunks to a JSON Lines file.')
    parser.add_argument('input_dir', type=str, help='Path to the directory containing PDF files')
    parser.add_argument('-s', '--size', type=int, default=10000, help='Chunk size in characters')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output file path')
    args = parser.parse_args()

    if not os.path.isdir(args.input_dir):
        print('Error: Input directory does not exist')
        sys.exit(1)

    # Collect all PDF files under the input directory (recursively).
    files = []
    for root, _, filenames in os.walk(args.input_dir):
        for filename in filenames:
            if filename.endswith('.pdf'):
                files.append(os.path.join(root, filename))

    # Chunk each PDF and build one record per non-empty chunk.
    data = []
    for file in files:
        text = read_pdf(file)
        chunks = chunk_data(text, args.size)
        for chunk in chunks:
            if chunk:
                tokens = nltk.word_tokenize(chunk)
                data.append({
                    'file': os.path.basename(file)[:-4],  # file name without the .pdf extension
                    'text': escape_text(trim_text(chunk)),
                    'tokens': len(tokens),
                    'timestamp': os.path.getmtime(file),
                })

    # Write one JSON object per line (JSON Lines).
    with open(args.output, 'w') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')
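As a rough sketch of what to expect, each line of the output file is a standalone JSON object with the fields written above; the values here are illustrative only:

    {"file": "example-paper", "text": "Abstract. We propose ...", "tokens": 1874, "timestamp": 1731939660.0}

Assuming the output was written to chunks.jsonl, the records can be loaded back for downstream use with a few lines of Python:

    import json

    # Read one JSON object per line into a list of dicts.
    with open('chunks.jsonl') as f:
        records = [json.loads(line) for line in f]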