#!/usr/bin/env bash
# auto OCR pdf files in current directory
# Source: GitHub gist aaronedev/16fd8f7ba468eba1ea88550b39627c19 (last active July 15, 2025 08:54).
# File name: pdf_auto_ocr.sh
# Author: aaronedev | https://github.com/aaronedev
# Date created: 2025-07-15 10:54:00
# Date modified: 2025-07-15 10:54:22
# ------
# Copyright 2025
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# --- Configuration ---

# Colors for output. Stored as literal backslash sequences; they are expanded
# only when printed.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Default values
DEFAULT_LANG="eng"
# Fall back to a single job when nproc is unavailable; otherwise `set -e` would
# abort right here, before the script gets a chance to report what is missing.
DEFAULT_JOBS=$(nproc 2>/dev/null || echo 1)
LOG_FILE="ocr_processing.log"
# Per-run scratch directory, keyed by the PID of this shell ($$) so that
# concurrent runs do not share (and delete) each other's directory.
TEMP_DIR="/tmp/pdf_ocr_$$"
FORCE_YES=false
# --- Helper Functions ---

# Print an informational message tagged with a blue "[INFO]" prefix.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stdout
print_status() {
  # %b expands the escape sequences held in the color variables; %s prints the
  # message verbatim so backslashes in it (e.g. in file names) stay literal.
  printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"
}
# Print a success message tagged with a green "[SUCCESS]" prefix.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stdout
print_success() {
  # Colors go through %b (escape expansion); the message goes through %s so
  # any backslashes it contains are printed as-is.
  printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "$1"
}
# Print a warning message tagged with a yellow "[WARNING]" prefix.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stdout
print_warning() {
  # Colors go through %b (escape expansion); the message goes through %s so
  # any backslashes it contains are printed as-is.
  printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1"
}
# Print an error message tagged with a red "[ERROR]" prefix.
# Globals:   RED, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stderr, so errors stay visible when stdout is
#            redirected or captured
print_error() {
  # Colors go through %b (escape expansion); the message goes through %s so
  # any backslashes it contains are printed as-is.
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
}
# Print the usage/help text (its first line doubles as the version banner).
# Globals:   DEFAULT_JOBS (read)
# Outputs:   help text on stdout
show_help() {
  # Unquoted here-doc delimiter: $0 and $DEFAULT_JOBS are expanded, everything
  # else is emitted literally.
  cat <<EOF
PDF OCR Script v2.0

Usage: $0 -l <LANG> [options]

Options:
 -l, --lang <LANG> (Required) Specify the language for OCR (e.g., 'eng', 'deu').
 -j, --jobs <NUM> Set the maximum number of parallel jobs. Defaults to the number of CPU cores ($DEFAULT_JOBS).
 -y, --yes Assume 'yes' to all prompts and run non-interactively.
 -h, --help Show this help message.
 -v, --version Show version information.

This script automatically:
 1. Finds all PDF files in the current directory (excluding existing '*_OCR.pdf' files).
 2. Performs OCR with the specified language.
 3. Runs jobs in parallel for efficiency.
 4. Outputs files with an '_OCR' suffix.

Requirements:
 - ocrmypdf, tesseract, poppler (pdfinfo, pdftotext)

Install on Arch Linux:
 sudo pacman -S ocrmypdf tesseract-data-eng poppler
EOF
}
# Verify that every external command the script relies on is on PATH.
# Outputs:   via print_error, a single line listing everything that is missing
# Exits:     with status 1 if at least one command is missing
check_dependencies() {
  local -a missing_deps=()
  local spec

  # Each entry is "<command>=<label shown to the user>".
  for spec in \
    "ocrmypdf=ocrmypdf" \
    "tesseract=tesseract" \
    "pdfinfo=poppler (pdfinfo)" \
    "pdftotext=poppler (pdftotext)" \
    "nproc=nproc (coreutils)"; do
    command -v "${spec%%=*}" >/dev/null 2>&1 || missing_deps+=("${spec#*=}")
  done

  if ((${#missing_deps[@]} > 0)); then
    print_error "Missing dependencies: ${missing_deps[*]}"
    exit 1
  fi
}
# Verify that tesseract has data installed for the requested language(s).
# Arguments: $1 - language code, or several codes joined with '+' (e.g. "eng+deu")
# Outputs:   via print_error, which language is missing and how to install it
# Exits:     with status 1 if any requested language is not listed by
#            `tesseract --list-langs`
check_tesseract_lang() {
  local lang_code="$1"
  local available code
  local -a requested=()

  # Capture the list before matching: piping straight into `grep -q` can fail
  # spuriously under `pipefail` when grep exits early and the writer gets SIGPIPE.
  available=$(tesseract --list-langs 2>/dev/null) || available=""

  # Split a combined spec such as "eng+deu" into its individual codes.
  IFS='+' read -r -a requested <<<"$lang_code"
  ((${#requested[@]} > 0)) || requested=("")

  for code in "${requested[@]}"; do
    # -F/-x: compare the code literally against whole lines, so regex
    # metacharacters in the argument cannot produce a false match.
    if [[ -z "$code" ]] || ! grep -Fxq -- "$code" <<<"$available"; then
      print_error "Tesseract language data for '$code' not found."
      print_error "Please install the appropriate tesseract data package (e.g., 'tesseract-data-$code')."
      exit 1
    fi
  done
}
# Detect whether a PDF already carries a text layer.
# Arguments: $1 - path to the PDF file
# Returns:   0 if pdftotext extracts more than 100 bytes of non-whitespace
#            characters, 1 otherwise (including when nothing can be extracted)
has_text() {
  local pdf_file="$1"
  local -r min_bytes=100
  local byte_count

  # `|| true` is deliberate: a pdftotext failure should simply yield whatever
  # count was produced (normally 0) rather than abort the caller.
  byte_count=$(pdftotext "$pdf_file" - 2>/dev/null | tr -d '[:space:]' | wc -c) || true

  ((${byte_count:-0} > min_bytes))
}
# OCR a single PDF, writing "<name>_OCR.pdf" into the current directory.
# Exported with `export -f` and invoked from child bash processes for parallel
# execution; its stdout is appended to the main log file, and the exact
# "SUCCESS: OCR for" / "ERROR: OCR failed for" prefixes are counted there.
# Arguments: $1 - path to the input PDF
#            $2 - OCR language code passed to ocrmypdf
# Outputs:   progress and result lines on stdout
# Returns:   0 if ocrmypdf succeeded, 1 otherwise
process_pdf() {
  local pdf_file="$1"
  local lang_code="$2"
  local base_name output_file original_size ocr_size

  base_name=$(basename -- "$pdf_file" .pdf)
  output_file="${base_name}_OCR.pdf"

  printf '%s\n' "Starting OCR for: $pdf_file with language '$lang_code'"

  # `--` ends option parsing so a file name beginning with '-' is never
  # mistaken for an option.
  if ! ocrmypdf --language "$lang_code" --output-type pdfa --optimize 1 \
    --jpeg-quality 85 --png-quality 85 -- "$pdf_file" "$output_file"; then
    printf '%s\n' "ERROR: OCR failed for $pdf_file."
    return 1
  fi

  # Human-readable sizes, for the log only.
  original_size=$(du -h -- "$pdf_file" | cut -f1)
  ocr_size=$(du -h -- "$output_file" | cut -f1)
  printf '%s\n' "SUCCESS: OCR for $pdf_file. Size: $original_size -> $ocr_size. Output: $output_file"
  return 0
}
# Remove the temporary working directory, if it exists.
# Globals:   TEMP_DIR (read)
# Returns:   0 when there is nothing to remove; otherwise the status of rm
cleanup() {
  local dir="${TEMP_DIR:-}"

  # Nothing to do when the variable is unset/empty or the directory is gone;
  # return success so a trap handler never fails on a no-op.
  if [[ -z "$dir" || ! -d "$dir" ]]; then
    return 0
  fi
  rm -rf -- "$dir"
}
# --- Main Execution ---

# Parse options, confirm with the user, OCR every PDF in the current directory
# in parallel, and print a summary.
# Globals:   DEFAULT_JOBS, LOG_FILE, TEMP_DIR (read); FORCE_YES (written)
# Arguments: the script's command-line arguments
# Outputs:   status messages on stdout; per-file output appended to LOG_FILE
# Returns:   0 if every file was processed successfully, 1 if any file failed
# Exits:     1 on usage errors or missing prerequisites, 0 for --help/--version,
#            when there is nothing to do, or when the user declines
main() {
  local lang=""
  local max_jobs="$DEFAULT_JOBS"
  local xargs_pid=""

  # Parse command-line arguments
  while (($# > 0)); do
    case "$1" in
      -l | --lang)
        # Check explicitly: under `set -u` a missing value would otherwise die
        # with an "unbound variable" message.
        if (($# < 2)); then
          print_error "Option $1 requires an argument."
          exit 1
        fi
        lang="$2"
        shift 2
        ;;
      -j | --jobs)
        if (($# < 2)); then
          print_error "Option $1 requires an argument."
          exit 1
        fi
        if ! [[ "$2" =~ ^[0-9]+$ ]] || [ "$2" -eq 0 ]; then
          print_error "Invalid number of jobs: $2. Must be a positive integer."
          exit 1
        fi
        max_jobs="$2"
        shift 2
        ;;
      -y | --yes)
        FORCE_YES=true
        shift 1
        ;;
      -h | --help)
        show_help
        exit 0
        ;;
      -v | --version)
        show_help # Version is at the top of help
        exit 0
        ;;
      *)
        print_error "Unknown option: $1"
        show_help
        exit 1
        ;;
    esac
  done

  # Language parameter is mandatory
  if [[ -z "$lang" ]]; then
    print_error "Language not specified. Use -l <LANG>."
    show_help
    exit 1
  fi

  check_dependencies
  check_tesseract_lang "$lang"

  # Setup temp dir and cleanup trap. Cleanup hangs off EXIT only; INT/TERM
  # stop the xargs dispatcher (if started) and then exit, which in turn fires
  # the EXIT trap. A handler that merely returned would let the script carry on.
  mkdir -p "$TEMP_DIR"
  trap cleanup EXIT
  trap '[[ -z "${xargs_pid:-}" ]] || kill "$xargs_pid" 2>/dev/null || true; exit 130' INT
  trap '[[ -z "${xargs_pid:-}" ]] || kill "$xargs_pid" 2>/dev/null || true; exit 143' TERM

  # Initialize log file
  echo "OCR Processing Log - $(date)" >"$LOG_FILE"
  print_status "Full output is being logged to $LOG_FILE"

  # Collect candidate PDFs NUL-delimited so unusual file names survive intact.
  local -a pdf_files=()
  local found
  while IFS= read -r -d '' found; do
    pdf_files+=("$found")
  done < <(find . -maxdepth 1 -type f -name '*.pdf' ! -name '*_OCR.pdf' -print0 | sort -z)

  if ((${#pdf_files[@]} == 0)); then
    print_warning "No PDF files found to process."
    exit 0
  fi

  if [[ "$FORCE_YES" == true ]]; then
    print_status "Found ${#pdf_files[@]} PDF(s) to process."
  else
    local pdf
    local reply=""
    print_status "Found ${#pdf_files[@]} PDF file(s):"
    for pdf in "${pdf_files[@]}"; do
      if has_text "$pdf"; then
        echo " - $(basename "$pdf") (Warning: Already has text)"
      else
        echo " - $(basename "$pdf")"
      fi
    done
    echo
    # read fails on EOF (e.g. stdin is not a terminal); report that instead of
    # letting `set -e` end the script without explanation.
    if ! read -r -n 1 -p "Process all ${#pdf_files[@]} file(s)? (Y/n): " reply; then
      echo
      print_error "No input available for confirmation. Use -y to run non-interactively."
      exit 1
    fi
    echo
    if [[ "$reply" =~ ^[Nn]$ ]]; then
      print_status "Processing cancelled by user."
      exit 0
    fi
  fi

  local total_start_time
  total_start_time=$(date +%s)

  export -f process_pdf
  export LOG_FILE

  # Run jobs in parallel and log their output. The file name and language are
  # handed to the child shell as positional parameters ($1 = language, $2 =
  # file appended by xargs) rather than spliced into the command string, so
  # quotes in file names cannot break or inject into the command.
  printf '%s\0' "${pdf_files[@]}" |
    xargs -0 -n 1 -P "$max_jobs" bash -c 'process_pdf "$2" "$1"' _ "$lang" >>"$LOG_FILE" 2>&1 &
  xargs_pid=$!
  print_status "Started parallel processing (PID: $xargs_pid). Monitoring progress..."

  # Simple progress indicator
  while kill -0 "$xargs_pid" 2>/dev/null; do
    printf '.'
    sleep 2
  done
  echo

  # xargs exits non-zero when any job failed; capture that instead of letting
  # `set -e` terminate the script before the summary is printed.
  local xargs_status=0
  wait "$xargs_pid" || xargs_status=$?
  xargs_pid=""

  local total_end_time total_duration
  total_end_time=$(date +%s)
  total_duration=$((total_end_time - total_start_time))

  # Grep log file for success/error counts (grep -c exits 1 on a zero count).
  local processed failed
  processed=$(grep -c "SUCCESS: OCR for" "$LOG_FILE" || true)
  failed=$(grep -c "ERROR: OCR failed for" "$LOG_FILE" || true)
  processed=${processed:-0}
  failed=${failed:-0}

  echo "========================================"
  print_success "Processing summary:"
  print_status " - Successfully processed: $processed"
  if ((failed > 0)); then
    print_warning " - Failed files: $failed"
  fi
  print_status " - Total time: ${total_duration}s"
  print_status " - A detailed log is available at: $LOG_FILE"
  echo "========================================"

  if ((failed > 0 || xargs_status != 0)); then
    return 1
  fi
  return 0
}
# Script entry point: run main only when this file is executed directly, so
# it can also be sourced (e.g. to reuse or test its functions) without side effects.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  main "$@"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment