This script processes PDF files, extracts their text, and splits it into chunks for machine learning applications. It reads PDFs from an input directory, cleans and trims the text, splits it into chunks of a specified maximum size, and writes the results in JSON Lines format.
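For reference, the script below might be invoked like this (the script name chunk_pdfs.py and the ./papers directory are illustrative only, not part of the gist):

    python chunk_pdfs.py ./papers -s 5000 -o chunks.jsonl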
import sys
import os
import argparse
import json
import re

import fitz  # PyMuPDF
import nltk

# Fetch the sentence tokenizer data into the project's virtualenv and make sure
# NLTK searches that directory (the relative path assumes the script is run
# from a subdirectory of the project root).
nltk.download('punkt_tab', download_dir='../.venv/nltk_data', quiet=True)
nltk.data.path.append('../.venv/nltk_data')


def read_pdf(file_path):
    """Return the concatenated text of every page in a PDF."""
    doc = fitz.open(file_path)
    text = ''
    for page in doc:
        text += page.get_text().encode('unicode_escape').decode('unicode_escape')
    doc.close()
    return text


def trim_text(text):
    """Normalize extracted text: decode escapes, collapse whitespace and repeats."""
    # Decode literal \uXXXX escape sequences back into characters.
    text = re.sub(r'\\[uU][0-9a-fA-F]{2,4}',
                  lambda m: m.group(0).encode().decode('unicode_escape'), text)
    # Collapse runs of whitespace into single spaces.
    text = re.sub(r'\s+', ' ', text)
    # Collapse repeated punctuation ("!!!" -> "!").
    text = re.sub(r'([!.?])\1+', r'\1', text)
    # Collapse runs of repeated letters (note: this also shortens legitimate
    # double letters such as "ll" -> "l", as in the original).
    text = re.sub(r'([a-zA-Z])\1+', r'\1', text)
    return text.strip()


def chunk_data(text, size):
    """Split text into chunks of roughly `size` characters along sentence boundaries."""
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_size = 0
    for sentence in sentences:
        # Start a new chunk once adding this sentence would exceed the limit.
        if current_size + len(sentence) > size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_size = 0
        current_chunk.append(sentence)
        current_size += len(sentence)
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks


def escape_text(text):
    """Replace newlines and tabs with spaces so each record stays on one line."""
    return text.replace('\n', ' ').replace('\t', ' ')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Break PDF contents into chunks for machine learning tasks. This script reads PDF files from a directory, processes their contents into chunks of a specified size, and writes the chunks to a JSON Lines file.')
    parser.add_argument('input_dir', type=str, help='Path to the directory containing PDF files')
    parser.add_argument('-s', '--size', type=int, default=10000, help='Chunk size in characters')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output file path')
    args = parser.parse_args()

    if not os.path.isdir(args.input_dir):
        print('Error: Input directory does not exist')
        sys.exit(1)

    # Collect all PDF files under the input directory (recursively).
    files = []
    for root, _, filenames in os.walk(args.input_dir):
        for filename in filenames:
            if filename.endswith('.pdf'):
                files.append(os.path.join(root, filename))

    # Chunk each PDF and build one record per non-empty chunk.
    data = []
    for file in files:
        text = read_pdf(file)
        chunks = chunk_data(text, args.size)
        for chunk in chunks:
            if chunk:
                tokens = nltk.word_tokenize(chunk)
                data.append({
                    'file': os.path.basename(file)[:-4],  # file name without the .pdf extension
                    'text': escape_text(trim_text(chunk)),
                    'tokens': len(tokens),
                    'timestamp': os.path.getmtime(file),
                })

    # Write one JSON object per line (JSON Lines).
    with open(args.output, 'w') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')
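As a rough sketch of what to expect, each line of the output file is a standalone JSON object with the fields written above; the values here are illustrative only:

    {"file": "example-paper", "text": "Abstract. We propose ...", "tokens": 1874, "timestamp": 1731939660.0}

Assuming the output was written to chunks.jsonl, the records can be loaded back for downstream use with a few lines of Python:

    import json

    # Read one JSON object per line into a list of dicts.
    with open('chunks.jsonl') as f:
        records = [json.loads(line) for line in f]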