Skip to content

Instantly share code, notes, and snippets.

@aaronedev
Last active July 15, 2025 08:54
Show Gist options
  • Save aaronedev/16fd8f7ba468eba1ea88550b39627c19 to your computer and use it in GitHub Desktop.
Save aaronedev/16fd8f7ba468eba1ea88550b39627c19 to your computer and use it in GitHub Desktop.
Automatically OCR all PDF files in the current directory
# File name: pdf_auto_ocr.sh
# Author: aaronedev | https://github.com/aaronedev
# Date created: 2025-07-15 10:54:00
# Date modified: 2025-07-15 10:54:22
# ------
# Copyright 2025
set -euo pipefail
# --- Configuration ---
# ANSI color escape sequences used by the print_* helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Default values
DEFAULT_LANG="eng"
# Fall back to one job when nproc is unavailable: without the fallback,
# `set -e` would abort here before check_dependencies can report the
# missing tool with a readable message.
DEFAULT_JOBS=$(nproc 2>/dev/null || echo 1)
LOG_FILE="ocr_processing.log"
# Scratch directory, suffixed with this shell's PID ($$) so that
# concurrent runs do not share (and delete) each other's directory.
TEMP_DIR="/tmp/pdf_ocr_$$"
FORCE_YES=false
# --- Helper Functions ---
# Function to print colored output
# Print an informational message prefixed with a blue "[INFO]" tag.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text (backslash escapes are interpreted)
# Outputs:   the tagged message on stdout
print_status() {
  local message=$1
  printf '%b\n' "${BLUE}[INFO]${NC} ${message}"
}
# Print a success message prefixed with a green "[SUCCESS]" tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text (backslash escapes are interpreted)
# Outputs:   the tagged message on stdout
print_success() {
  local message=$1
  printf '%b\n' "${GREEN}[SUCCESS]${NC} ${message}"
}
# Print a warning message prefixed with a yellow "[WARNING]" tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text (backslash escapes are interpreted)
# Outputs:   the tagged message on stdout
print_warning() {
  local message=$1
  printf '%b\n' "${YELLOW}[WARNING]${NC} ${message}"
}
# Print an error message prefixed with a red "[ERROR]" tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed verbatim
# Outputs:   the tagged message on stderr, so errors stay visible when
#            stdout is piped or redirected
print_error() {
  local message=$1
  # Only the color variables go through %b; the message uses %s so that
  # backslashes in user-supplied text (options, file names) are not
  # interpreted as escape sequences.
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$message" >&2
}
# Function to show help message
# Print the usage/help text.
# Globals:   DEFAULT_JOBS (read; shown as the default for --jobs)
# Arguments: none ($0 is used for the script name in the usage line)
# Outputs:   the help text on stdout
show_help() {
  # Unquoted delimiter: $0 and $DEFAULT_JOBS are expanded inside the text.
  cat <<EOF
PDF OCR Script v2.0

Usage: $0 -l <LANG> [options]

Options:
 -l, --lang <LANG> (Required) Specify the language for OCR (e.g., 'eng', 'deu').
 -j, --jobs <NUM> Set the maximum number of parallel jobs. Defaults to the number of CPU cores ($DEFAULT_JOBS).
 -y, --yes Assume 'yes' to all prompts and run non-interactively.
 -h, --help Show this help message.
 -v, --version Show version information.

This script automatically:
 1. Finds all PDF files in the current directory (excluding existing '*_OCR.pdf' files).
 2. Performs OCR with the specified language.
 3. Runs jobs in parallel for efficiency.
 4. Outputs files with an '_OCR' suffix.

Requirements:
 - ocrmypdf, tesseract, poppler (pdfinfo, pdftotext)

Install on Arch Linux:
 sudo pacman -S ocrmypdf tesseract-data-eng poppler
EOF
}
# Function to check dependencies
# Verify that every external tool the script relies on is on PATH.
# Arguments: none
# Outputs:   via print_error, one message listing all missing tools
# Returns:   0 when nothing is missing; otherwise exits the shell with 1
check_dependencies() {
  # Parallel tables: the command to probe and the label shown to the user.
  local -a tools=(ocrmypdf tesseract pdfinfo pdftotext nproc)
  local -a labels=(
    "ocrmypdf"
    "tesseract"
    "poppler (pdfinfo)"
    "poppler (pdftotext)"
    "nproc (coreutils)"
  )
  local -a absent=()
  local idx

  for idx in "${!tools[@]}"; do
    if ! command -v "${tools[idx]}" >/dev/null 2>&1; then
      absent+=("${labels[idx]}")
    fi
  done

  if ((${#absent[@]} > 0)); then
    print_error "Missing dependencies: ${absent[*]}"
    exit 1
  fi
}
# Function to check if tesseract language data is available
# Verify that tesseract has language data for the requested language(s).
# Arguments: $1 - a language code ("eng") or several codes joined with
#                 '+' ("eng+deu"), the multi-language form ocrmypdf accepts
# Outputs:   via print_error, which code is missing and a package hint
# Returns:   0 when every code is installed; otherwise exits the shell with 1
check_tesseract_lang() {
  local lang_code="$1"
  local available
  local -a requested=("")
  local part

  # Capture the list once instead of piping into `grep -q`: under
  # `set -o pipefail` an early grep exit can make the pipeline fail via
  # SIGPIPE even though the language was found. stderr is merged in
  # because some older tesseract releases reportedly print the list
  # there (TODO confirm); exact-line matching ignores any extra noise.
  available=$(tesseract --list-langs 2>&1) || available=""

  # An empty argument keeps the single empty entry, which is rejected below.
  if [ -n "$lang_code" ]; then
    IFS='+' read -r -a requested <<<"$lang_code"
  fi

  for part in "${requested[@]}"; do
    # -F -x: compare the code literally against whole lines, so regex
    # metacharacters in the argument (e.g. "e.g") cannot match "eng".
    if [ -z "$part" ] || ! grep -Fxq -- "$part" <<<"$available"; then
      print_error "Tesseract language data for '$part' not found."
      print_error "Please install the appropriate tesseract data package (e.g., 'tesseract-data-$part')."
      exit 1
    fi
  done
}
# Function to detect if PDF already has text
# Report whether a PDF already contains extractable text.
# A PDF counts as "having text" when pdftotext yields more than 100 bytes
# once all whitespace has been removed.
# Arguments: $1 - path to the PDF file
# Returns:   0 if the PDF has text, 1 otherwise
has_text() {
  local pdf_file="$1"
  local non_ws_bytes

  # `|| true`: a pdftotext failure must not abort the caller; wc still
  # reports the byte count (0 when nothing was extracted).
  non_ws_bytes=$(pdftotext "$pdf_file" - 2>/dev/null | tr -d '[:space:]' | wc -c) || true

  if [ "$non_ws_bytes" -gt 100 ]; then
    return 0
  fi
  return 1
}
# Function to process a single PDF
# This function is designed to be run in a subshell for parallel execution
# OCR a single PDF and write the result as "<name>_OCR.pdf" in the
# current directory.
# Arguments: $1 - path to the input PDF
#            $2 - OCR language passed to ocrmypdf --language
# Outputs:   progress lines on stdout ("Starting OCR for: ...", then
#            "SUCCESS: OCR for ..." or "ERROR: OCR failed for ...")
# Returns:   0 when ocrmypdf succeeds, 1 otherwise
process_pdf() {
  local pdf_file="$1"
  local lang_code="$2"
  local base_name
  local output_file
  local original_size
  local ocr_size

  base_name=$(basename -- "$pdf_file" .pdf)
  output_file="${base_name}_OCR.pdf"

  # Plain (uncolored) lines: whoever invokes this function decides where
  # its output goes.
  printf '%s\n' "Starting OCR for: $pdf_file with language '$lang_code'"

  # `--` ends option parsing. The output name has no directory prefix, so
  # an input such as "./-scan.pdf" produces "-scan_OCR.pdf", which would
  # otherwise be read as a command-line option.
  if ocrmypdf --language "$lang_code" --output-type pdfa --optimize 1 \
    --jpeg-quality 85 --png-quality 85 -- "$pdf_file" "$output_file"; then
    original_size=$(du -h -- "$pdf_file" | cut -f1)
    ocr_size=$(du -h -- "$output_file" | cut -f1)
    printf '%s\n' "SUCCESS: OCR for $pdf_file. Size: $original_size -> $ocr_size. Output: $output_file"
    return 0
  fi

  printf '%s\n' "ERROR: OCR failed for $pdf_file."
  return 1
}
# Function to cleanup temporary files
# Remove the scratch directory if it exists.
# Globals:   TEMP_DIR (read)
# Returns:   0 when there is nothing to remove, otherwise rm's status
cleanup() {
  [ -d "$TEMP_DIR" ] || return 0
  rm -rf "$TEMP_DIR"
}
# --- Main Execution ---
# Parse options, list the PDFs in the current directory, OCR them in
# parallel and print a summary.
# Globals:   DEFAULT_JOBS, TEMP_DIR (read); FORCE_YES (written);
#            LOG_FILE (read, exported, and the file itself is overwritten)
# Arguments: the script's command-line arguments (see show_help)
# Outputs:   status messages on the terminal; job output goes to LOG_FILE
# Returns:   0 when every file was processed; 1 when any job failed.
#            Exits 1 on usage or dependency errors, 0 for --help or when
#            there is nothing to do.
main() {
  local lang=""
  local max_jobs="$DEFAULT_JOBS"
  local xargs_pid=""

  # Parse command-line arguments
  while [ $# -gt 0 ]; do
    case "$1" in
      -l | --lang)
        # Check for the value explicitly: reading a missing "$2" under
        # `set -u` aborts with a cryptic "unbound variable" error.
        if [ $# -lt 2 ]; then
          print_error "Option $1 requires an argument."
          exit 1
        fi
        lang="$2"
        shift 2
        ;;
      -j | --jobs)
        if [ $# -lt 2 ]; then
          print_error "Option $1 requires an argument."
          exit 1
        fi
        if ! [[ "$2" =~ ^[0-9]+$ ]] || [ "$2" -eq 0 ]; then
          print_error "Invalid number of jobs: $2. Must be a positive integer."
          exit 1
        fi
        max_jobs="$2"
        shift 2
        ;;
      -y | --yes)
        FORCE_YES=true
        shift 1
        ;;
      -h | --help)
        show_help
        exit 0
        ;;
      -v | --version)
        show_help # Version is at the top of help
        exit 0
        ;;
      *)
        print_error "Unknown option: $1"
        show_help
        exit 1
        ;;
    esac
  done

  # Language parameter is mandatory
  if [ -z "$lang" ]; then
    print_error "Language not specified. Use -l <LANG>."
    show_help
    exit 1
  fi

  check_dependencies
  check_tesseract_lang "$lang"

  # Setup temp dir and cleanup trap. The EXIT trap does the cleanup; the
  # signal traps must end the script themselves, otherwise bash runs the
  # handler and then carries on where it was interrupted. They also stop
  # xargs so that no further jobs are launched (jobs that are already
  # running are not signalled here).
  mkdir -p "$TEMP_DIR"
  trap cleanup EXIT
  trap 'if [ -n "${xargs_pid:-}" ]; then kill "$xargs_pid" 2>/dev/null || true; fi; exit 130' INT
  trap 'if [ -n "${xargs_pid:-}" ]; then kill "$xargs_pid" 2>/dev/null || true; fi; exit 143' TERM

  # Initialize log file
  echo "OCR Processing Log - $(date)" >"$LOG_FILE"
  print_status "Full output is being logged to $LOG_FILE"

  # Collect candidates NUL-delimited so unusual file names (embedded
  # newlines, leading/trailing blanks) stay intact.
  local pdf_files=()
  local found
  while IFS= read -r -d '' found; do
    pdf_files+=("$found")
  done < <(find . -maxdepth 1 -name "*.pdf" -type f ! -name "*_OCR.pdf" -print0 | sort -z)

  if [ ${#pdf_files[@]} -eq 0 ]; then
    print_warning "No PDF files found to process."
    exit 0
  fi

  local files_to_process=()
  local pdf
  if [ "$FORCE_YES" = true ]; then
    files_to_process=("${pdf_files[@]}")
    print_status "Found ${#files_to_process[@]} PDF(s) to process."
  else
    print_status "Found ${#pdf_files[@]} PDF file(s):"
    for pdf in "${pdf_files[@]}"; do
      if has_text "$pdf"; then
        echo " - $(basename "$pdf") (Warning: Already has text)"
      else
        echo " - $(basename "$pdf")"
      fi
    done
    echo
    read -p "Process all ${#pdf_files[@]} file(s)? (Y/n): " -n 1 -r
    echo
    # Anything except an explicit n/N (including plain Enter) means yes.
    if [[ ! $REPLY =~ ^[Nn]$ ]]; then
      files_to_process=("${pdf_files[@]}")
    else
      print_status "Processing cancelled by user."
      exit 0
    fi
  fi

  if [ ${#files_to_process[@]} -eq 0 ]; then
    print_status "No files selected for processing."
    exit 0
  fi

  local total_start_time
  total_start_time=$(date +%s)

  # The worker shells started by xargs need the function definition.
  export -f process_pdf
  export LOG_FILE

  # Run jobs in parallel and log their output. The file name and the
  # language are handed to the worker as positional parameters rather
  # than spliced into the command string, so quotes or other shell
  # metacharacters in a file name cannot break or inject into the command.
  printf "%s\0" "${files_to_process[@]}" |
    xargs -0 -P "$max_jobs" -I {} bash -c 'process_pdf "$1" "$2"' _ {} "$lang" >>"$LOG_FILE" 2>&1 &
  xargs_pid=$!
  print_status "Started parallel processing (PID: $xargs_pid). Monitoring progress..."

  # Simple progress indicator: one dot every two seconds while xargs lives.
  while kill -0 "$xargs_pid" 2>/dev/null; do
    echo -n "."
    sleep 2
  done
  echo

  # xargs exits non-zero when any job fails; capture that status instead
  # of letting `set -e` end the script before the summary is printed.
  local xargs_status=0
  wait "$xargs_pid" || xargs_status=$?

  local total_end_time
  local total_duration
  total_end_time=$(date +%s)
  total_duration=$((total_end_time - total_start_time))

  # Grep log file for success/error counts. grep -c exits 1 when the
  # count is 0, hence the `|| true`.
  local processed
  local failed
  processed=$(grep -c "SUCCESS: OCR for" "$LOG_FILE" || true)
  failed=$(grep -c "ERROR: OCR failed for" "$LOG_FILE" || true)

  echo "========================================"
  print_success "Processing summary:"
  print_status " - Successfully processed: $processed"
  if [ "$failed" -gt 0 ]; then
    print_warning " - Failed files: $failed"
  fi
  print_status " - Total time: ${total_duration}s"
  print_status " - A detailed log is available at: $LOG_FILE"
  echo "========================================"

  # Report failure to the caller when any job (or xargs itself) failed.
  if [ "$failed" -gt 0 ] || [ "$xargs_status" -ne 0 ]; then
    return 1
  fi
  return 0
}
# Script entry point
# Run main only when this file is executed directly; when it is sourced,
# BASH_SOURCE[0] differs from $0 and nothing is run.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment