sudo apt-get update && sudo apt-get upgrade -y
sudo apt-get install -y poppler-utils ttf-mscorefonts-installer msttcorefonts fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools
git clone https://github.com/allenai/olmocr.git --depth 1
cd olmocr
pip install -q ninja
pip install -e ".[gpu]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
# clean up
pip cache purge && sudo apt-get autoremove -y
URL="https://www.survivorlibrary.com/library/Beekeeping.zip"
wget "$URL"
unzip -j -o *.zip -d /workspace/input-pdfs && rm *.zip
python -m olmocr.pipeline /workspace/outputworkspace --pdfs /workspace/input-pdfs/*.pdf \
--workers 4 --apply_filter
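
A quick sanity check before post-processing (the results/ subdirectory and output_*.jsonl naming are what the Python snippet below expects):

# confirm the pipeline actually wrote JSONL result files
ls /workspace/outputworkspace/results/output_*.jsonl | head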
"""
pip install datasets mdformat
"""
import os
from pathlib import Path
import mdformat
from datasets import load_dataset
# Path to your olmOCR results (JSONL files)
results_path = Path("/workspace/outputworkspace/results")
assert results_path.is_dir()
# Load the JSONL files into a Hugging Face dataset
output_files = list(results_path.glob("output_*.jsonl"))
output_files_str = [str(file) for file in output_files]
ds = load_dataset("json", data_files=output_files_str)
# clean up text field
ds = ds.map(
    lambda x: {
        "text": mdformat.text(
            x["text"],
            options={
                "number": True,
                "wrap": "no",
            },
        )
    },
    num_proc=min(os.cpu_count(), 8),
)
# Now you have a dataset object
print(ds)

Fix for the apt_pkg import error

Relevant for the runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04 image.

You have Python 3.11.11 installed, but the apt_pkg module was compiled for Python 3.10 (indicated by the .cpython-310-x86_64-linux-gnu.so file extension). This version mismatch is causing the import failure.

Here's how to fix it:

  1. Create a symbolic link to make the module available to Python 3.11:
     cd /usr/lib/python3/dist-packages
     ln -s apt_pkg.cpython-310-x86_64-linux-gnu.so apt_pkg.cpython-311-x86_64-linux-gnu.so
  2. Verify the link was created and try to import the module:
     ls -la | grep apt_pkg
     python3 -c "import apt_pkg; print('Module loaded successfully')"
  3. Then finish the package configuration:
     dpkg --configure -a
     apt-get -f install

If the symbolic link approach doesn't work, a more aggressive workaround is to replace the problematic script, but start with this simpler fix, since it addresses the core Python version mismatch.
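
To script the steps above, here is a minimal sketch that derives the interpreter's version tag instead of hard-coding 311; it assumes the 3.10 build of apt_pkg is the one present under /usr/lib/python3/dist-packages:

# sketch: create the symlink only if a matching apt_pkg build is missing
# (assumes only the cpython-310 build exists in this directory)
PYTAG="cpython-$(python3 -c 'import sys; print(f"{sys.version_info[0]}{sys.version_info[1]}")')-x86_64-linux-gnu"
cd /usr/lib/python3/dist-packages
if [ ! -e "apt_pkg.${PYTAG}.so" ] && [ -e apt_pkg.cpython-310-x86_64-linux-gnu.so ]; then
    ln -s apt_pkg.cpython-310-x86_64-linux-gnu.so "apt_pkg.${PYTAG}.so"
fi
python3 -c "import apt_pkg; print('apt_pkg OK')"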

#!/bin/bash
set -e

if [ $# -ne 1 ]; then
    echo "Usage: $0 directory_path"
    exit 1
fi

DIR_PATH=$1

# Check if directory exists
if [ ! -d "$DIR_PATH" ]; then
    echo "Error: Directory '$DIR_PATH' does not exist"
    exit 1
fi

# Function to split a PDF
split_pdf() {
    local INPUT=$1

    # First check if the file exists and is readable
    if [ ! -r "$INPUT" ]; then
        echo "Error: Cannot read file '$INPUT'"
        return 1
    fi

    # Get page count (qpdf preferred, pdfinfo as fallback)
    local TOTAL=0
    if command -v qpdf &> /dev/null; then
        TOTAL=$(qpdf --show-npages "$INPUT")
    elif command -v pdfinfo &> /dev/null; then
        TOTAL=$(pdfinfo "$INPUT" | grep Pages | awk '{print $2}')
    else
        echo "Error: Neither qpdf nor pdfinfo is available"
        return 1
    fi

    if [ -z "$TOTAL" ] || [ "$TOTAL" -eq 0 ]; then
        echo "Error: Could not determine page count for '$INPUT'"
        return 1
    fi

    echo "Processing: $INPUT"
    echo "Total pages: $TOTAL"

    # Check if PDF has more than 3000 pages
    if [ "$TOTAL" -le 3000 ]; then
        echo "Skipping: $INPUT (has $TOTAL pages, not more than 3000)"
        return 0
    fi

    local PAGES_PER_PART=$(( (TOTAL + 3) / 4 ))
    echo "Pages per part: $PAGES_PER_PART"

    # Create output filenames
    local PART1="${INPUT%.pdf}_part1.pdf"
    local PART2="${INPUT%.pdf}_part2.pdf"
    local PART3="${INPUT%.pdf}_part3.pdf"
    local PART4="${INPUT%.pdf}_part4.pdf"

    # Split the PDF
    gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dFirstPage=1 -dLastPage=$PAGES_PER_PART -sOutputFile="$PART1" "$INPUT"
    gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dFirstPage=$(( PAGES_PER_PART + 1 )) -dLastPage=$(( PAGES_PER_PART * 2 )) -sOutputFile="$PART2" "$INPUT"
    gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dFirstPage=$(( PAGES_PER_PART * 2 + 1 )) -dLastPage=$(( PAGES_PER_PART * 3 )) -sOutputFile="$PART3" "$INPUT"
    gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dFirstPage=$(( PAGES_PER_PART * 3 + 1 )) -dLastPage=$TOTAL -sOutputFile="$PART4" "$INPUT"

    # Check output files
    local SUCCESS=true
    for part in "$PART1" "$PART2" "$PART3" "$PART4"; do
        if [ -f "$part" ]; then
            echo "Created: $(realpath "$part")"
        else
            echo "Failed to create: $part"
            SUCCESS=false
        fi
    done

    # Remove original only if all parts were created successfully
    if [ "$SUCCESS" = true ]; then
        echo "Removing original file: $INPUT"
        rm "$INPUT"
        echo "Successfully split $INPUT into 4 parts and removed the original"
    else
        echo "Warning: Not removing original file due to split errors"
    fi
}

# Process all PDFs in the directory
echo "Looking for PDFs in: $DIR_PATH"
count=0
for pdf in "$DIR_PATH"/*.pdf; do
    if [ -f "$pdf" ]; then
        split_pdf "$pdf"
        count=$((count + 1))
    fi
done

if [ $count -eq 0 ]; then
    echo "No PDF files found in $DIR_PATH"
else
    echo "Processed $count PDF files"
fi
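
Usage sketch for the directory version (the script filename is an assumption; adjust it to whatever you saved the script as). It only splits PDFs with more than 3000 pages and leaves smaller files untouched:

# assumed filename: split_large_pdfs.sh
bash split_large_pdfs.sh /workspace/input-pdfs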
#!/bin/bash
set -e

if [ $# -ne 1 ]; then
    echo "Usage: $0 input.pdf"
    exit 1
fi

INPUT=$1

# First check if the file exists and is readable
if [ ! -r "$INPUT" ]; then
    echo "Error: Cannot read file '$INPUT'. Check if file exists and you have read permissions."
    exit 1
fi

# Use qpdf for page count instead of gs as it's more reliable in containers
if command -v qpdf &> /dev/null; then
    TOTAL=$(qpdf --show-npages "$INPUT")
elif command -v pdfinfo &> /dev/null; then
    # Fallback to pdfinfo if qpdf is not available
    TOTAL=$(pdfinfo "$INPUT" | grep Pages | awk '{print $2}')
else
    echo "Error: Neither qpdf nor pdfinfo is available. Please install one of them."
    exit 1
fi

if [ -z "$TOTAL" ] || [ "$TOTAL" -eq 0 ]; then
    echo "Error: Could not determine page count for '$INPUT'"
    exit 1
fi

PAGES_PER_PART=$(( (TOTAL + 3) / 4 ))
echo "Total pages: $TOTAL"
echo "Pages per part: $PAGES_PER_PART"

# Create output filenames
PART1="${INPUT%.pdf}_part1.pdf"
PART2="${INPUT%.pdf}_part2.pdf"
PART3="${INPUT%.pdf}_part3.pdf"
PART4="${INPUT%.pdf}_part4.pdf"

# Split the PDF
gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dFirstPage=1 -dLastPage=$PAGES_PER_PART -sOutputFile="$PART1" "$INPUT"
gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dFirstPage=$(( PAGES_PER_PART + 1 )) -dLastPage=$(( PAGES_PER_PART * 2 )) -sOutputFile="$PART2" "$INPUT"
gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dFirstPage=$(( PAGES_PER_PART * 2 + 1 )) -dLastPage=$(( PAGES_PER_PART * 3 )) -sOutputFile="$PART3" "$INPUT"
gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -dFirstPage=$(( PAGES_PER_PART * 3 + 1 )) -dLastPage=$TOTAL -sOutputFile="$PART4" "$INPUT"

# Display real paths of output files
echo "Output files:"
for part in "$PART1" "$PART2" "$PART3" "$PART4"; do
    if [ -f "$part" ]; then
        echo "$(realpath "$part")"
    else
        echo "Failed to create: $part"
    fi
done

rm "$INPUT"
echo "Done splitting $INPUT into 4 parts."