@david-rodriguez
Created November 18, 2024 14:21
This script processes PDF files, extracts their text, and splits it into chunks for machine learning applications. It reads PDFs from an input directory, cleans and trims the extracted text, splits it into chunks of a specified size, and writes the results in JSON Lines format.
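Example usage (the script name, paths, and values here are purely illustrative): if the file is saved as chunk_pdfs.py, running python chunk_pdfs.py ./papers -s 2000 -o chunks.jsonl walks ./papers for PDF files and writes one JSON object per chunk to chunks.jsonl, each roughly of the form {"file": "report", "text": "...", "tokens": 312, "timestamp": 1731938460.0}.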
import sys
import os
import fitz  # PyMuPDF
import argparse
import json
import nltk
import re

# Download the Punkt tokenizer data into the project's virtualenv and make sure
# NLTK searches that directory at runtime.
nltk.download('punkt_tab', download_dir='../.venv/nltk_data', quiet=True)
nltk.data.path.append('../.venv/nltk_data')

def read_pdf(file_path):
    """Extract the text of every page in a PDF as a single string."""
    doc = fitz.open(file_path)
    text = ''
    for page in doc:
        # Append the page text, run through a unicode_escape round trip to normalise escape sequences.
        text += page.get_text().encode('unicode_escape').decode('unicode_escape')
    doc.close()
    return text

def trim_text(text):
    """Clean up extraction artifacts and collapse whitespace."""
    # Decode stray literal \uXXXX escape sequences left in the extracted text.
    text = re.sub(r'\\[uU][0-9a-fA-F]{2,4}', lambda m: m.group(0).encode().decode('unicode_escape'), text)
    # Collapse runs of whitespace to a single space.
    text = re.sub(r'\s+', ' ', text)
    # Collapse repeated sentence-ending punctuation (e.g. '!!!' becomes '!').
    text = re.sub(r'([!.?])\1+', r'\1', text)
    # Collapse runs of repeated letters (note: this also shortens legitimate double letters).
    text = re.sub(r'([a-zA-Z])\1+', r'\1', text)
    return text.strip()
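
# Illustrative example (hypothetical input string):
#   trim_text('Helloo   world!!!  \\u00e9cole')  ->  'Helo world! \u00e9cole' ('Helo world! école')
# The stray \u00e9 escape is decoded, whitespace and '!!!' are collapsed, and the
# doubled letters in 'Helloo' are reduced to single letters.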

def chunk_data(text, size):
    """Group sentences into chunks of roughly `size` characters."""
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_size = 0
    for sentence in sentences:
        # Flush the current chunk once adding this sentence would exceed the limit.
        if current_chunk and current_size + len(sentence) > size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_size = 0
        current_chunk.append(sentence)
        current_size += len(sentence)
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
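
# Illustrative behaviour (hypothetical input, size=40): the sentences
#   'First sentence here.' (20 chars), 'Second sentence here.' (21) and 'Third one.' (10)
# come out as ['First sentence here.', 'Second sentence here. Third one.'],
# since a chunk is flushed as soon as the next sentence would push it past the limit.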

def escape_text(text):
    return text.replace('\n', ' ').replace('\t', ' ')

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Break PDF contents into chunks for machine learning tasks. Reads PDF files from a directory, splits their text into chunks of a specified size, and writes the chunks to a JSON Lines file.')
    parser.add_argument('input_dir', type=str, help='Path to the directory containing PDF files')
    parser.add_argument('-s', '--size', type=int, default=10000, help='Maximum chunk size in characters')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output file path')
    args = parser.parse_args()

    if not os.path.isdir(args.input_dir):
        print('Error: Input directory does not exist')
        sys.exit(1)

    # Collect every PDF under the input directory (recursively).
    files = []
    for root, _, filenames in os.walk(args.input_dir):
        for filename in filenames:
            if filename.endswith('.pdf'):
                files.append(os.path.join(root, filename))

    # Extract, clean, and chunk each file, keeping a token count and the file's modification time.
    data = []
    for file in files:
        text = read_pdf(file)
        chunks = chunk_data(text, args.size)
        for chunk in chunks:
            if chunk:
                tokens = nltk.word_tokenize(chunk)
                data.append({
                    'file': os.path.basename(file)[:-4],  # file name without the '.pdf' extension
                    'text': escape_text(trim_text(chunk)),
                    'tokens': len(tokens),
                    'timestamp': os.path.getmtime(file),
                })

    # Write one JSON object per line (JSON Lines).
    with open(args.output, 'w') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')