aaronedev · July 15, 2025 08:54
diff --git a/pdf_auto_ocr.sh b/pdf_auto_ocr.sh
 # File name: pdf_auto_ocr.sh
 # Author: aaronedev | https://github.com/aaronedev
 # Date created: 2025-07-15 10:54:00
 # Date modified: 2025-07-15 10:54:22
 # ------
 # Copyright 2025

 # Strict mode: abort on errors, unset variables and failed pipeline stages.
 set -euo pipefail

 # --- Configuration ---
 # ANSI colour escape sequences used by the print_* helpers
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 BLUE='\033[0;34m'
 NC='\033[0m' # No Color

 # Default values
 DEFAULT_LANG="eng"
 # Fall back to one job when nproc is unavailable; a bare $(nproc) would abort
 # the script here (set -e) before check_dependencies can report the problem.
 DEFAULT_JOBS=$(nproc 2>/dev/null || echo 1)
 LOG_FILE="ocr_processing.log"
 # "$$" is the shell's PID, so concurrent runs do not share (and delete)
 # each other's scratch directory.
 TEMP_DIR="/tmp/pdf_ocr_$$"
 FORCE_YES=false

 # --- Helper Functions ---

 # Print an informational message on stdout, prefixed with a blue [INFO] tag.
 # Arguments: $1 - message text (backslash escapes are expanded)
 print_status() {
 	printf '%b\n' "${BLUE}[INFO]${NC} $1"
 }

 # Print a success message on stdout, prefixed with a green [SUCCESS] tag.
 # Arguments: $1 - message text (backslash escapes are expanded)
 print_success() {
 	printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
 }

 # Print a warning on stdout, prefixed with a yellow [WARNING] tag.
 # Arguments: $1 - message text (backslash escapes are expanded)
 print_warning() {
 	printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
 }

 # Print an error message on stderr, prefixed with a red [ERROR] tag.
 # Arguments: $1 - message text, printed verbatim
 # Only the colour codes go through %b; the message uses %s so that backslashes
 # in user-supplied text (options, file names) are not expanded as escapes.
 print_error() {
 	printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
 }

 # Print the usage text (banner, options, workflow, requirements) on stdout.
 # Globals: DEFAULT_JOBS (read) - shown as the default for --jobs
 # The text is emitted by one printf call, one argument per output line.
 show_help() {
 	printf '%s\n' \
 		"PDF OCR Script v2.0" \
 		"" \
 		"Usage: $0 -l <LANG> [options]" \
 		"" \
 		"Options:" \
 		"  -l, --lang <LANG>    (Required) Specify the language for OCR (e.g., 'eng', 'deu')." \
 		"  -j, --jobs <NUM>     Set the maximum number of parallel jobs. Defaults to the number of CPU cores ($DEFAULT_JOBS)." \
 		"  -y, --yes            Assume 'yes' to all prompts and run non-interactively." \
 		"  -h, --help           Show this help message." \
 		"  -v, --version        Show version information." \
 		"" \
 		"This script automatically:" \
 		"  1. Finds all PDF files in the current directory (excluding existing '*_OCR.pdf' files)." \
 		"  2. Performs OCR with the specified language." \
 		"  3. Runs jobs in parallel for efficiency." \
 		"  4. Outputs files with an '_OCR' suffix." \
 		"" \
 		"Requirements:" \
 		"  - ocrmypdf, tesseract, poppler (pdfinfo, pdftotext)" \
 		"" \
 		"Install on Arch Linux:" \
 		"  sudo pacman -S ocrmypdf tesseract-data-eng poppler"
 }

 # Verify that every external tool the script relies on is on PATH.
 # Outputs: one error line (via print_error) naming all missing tools
 # Returns: 0 when everything is present; exits the script with 1 otherwise
 check_dependencies() {
 	# tools[i] is the executable to look up, labels[i] the name reported for it.
 	local -a tools=(ocrmypdf tesseract pdfinfo pdftotext nproc)
 	local -a labels=("ocrmypdf" "tesseract" "poppler (pdfinfo)" "poppler (pdftotext)" "nproc (coreutils)")
 	local -a missing_deps=()
 	local i

 	for i in "${!tools[@]}"; do
 		if ! command -v "${tools[i]}" >/dev/null 2>&1; then
 			missing_deps+=("${labels[i]}")
 		fi
 	done

 	if ((${#missing_deps[@]} > 0)); then
 		print_error "Missing dependencies: ${missing_deps[*]}"
 		exit 1
 	fi
 }

 # Verify that tesseract has data for every requested OCR language.
 # Arguments: $1 - language code, or several joined with '+' (e.g. "eng+deu")
 # Outputs:   error lines (via print_error) for the first language not found
 # Returns:   0 when all languages are installed; exits the script with 1 otherwise
 check_tesseract_lang() {
 	local lang_code="$1"
 	local available=""
 	local -a requested=()
 	local one

 	# Capture the list first instead of piping into 'grep -q': under pipefail an
 	# early grep exit can kill tesseract with SIGPIPE and fake a "not found".
 	available=$(tesseract --list-langs 2>/dev/null) || available=""

 	IFS='+' read -r -a requested <<<"$lang_code"
 	if [ "${#requested[@]}" -eq 0 ]; then
 		requested=("")
 	fi

 	for one in "${requested[@]}"; do
 		# -F -x: compare as a literal whole line, so characters such as '.' in
 		# the code are not treated as regex operators.
 		if [ -z "$one" ] || ! grep -Fxq -- "$one" <<<"$available"; then
 			print_error "Tesseract language data for '$one' not found."
 			print_error "Please install the appropriate tesseract data package (e.g., 'tesseract-data-$one')."
 			exit 1
 		fi
 	done
 }

 # Tell whether a PDF already carries a text layer.
 # Arguments: $1 - path of the PDF to inspect
 # Returns:   0 when pdftotext extracts more than 100 bytes of non-whitespace
 #            characters, 1 otherwise (including when extraction fails)
 has_text() {
 	local pdf_file="$1"
 	local char_count

 	# '|| true': a failing pdftotext must only yield a low count, never abort.
 	char_count=$(pdftotext "$pdf_file" - 2>/dev/null | tr -d '[:space:]' | wc -c) || true

 	[ "$char_count" -gt 100 ] && return 0
 	return 1
 }

 # Run OCR on a single PDF and report the outcome on stdout.
 # Meant to be started in a child shell by the parallel runner; it logs with
 # plain echo and leaves it to the caller to decide where that output goes.
 #
 # Arguments: $1 - path of the input PDF
 #            $2 - language code passed to ocrmypdf --language
 # Outputs:   a start line plus a SUCCESS:/ERROR: line on stdout; the result is
 #            written to "<name>_OCR.pdf" in the current directory
 # Returns:   0 when ocrmypdf succeeded, 1 otherwise
 process_pdf() {
 	local pdf_file="$1"
 	local lang_code="$2"
 	local stem target size_before size_after

 	stem=$(basename "$pdf_file" .pdf)
 	target="${stem}_OCR.pdf"

 	echo "Starting OCR for: $pdf_file with language '$lang_code'"

 	# Guard clause: report and leave as soon as the conversion fails.
 	if ! ocrmypdf --language "$lang_code" --output-type pdfa --optimize 1 --jpeg-quality 85 --png-quality 85 "$pdf_file" "$target"; then
 		echo "ERROR: OCR failed for $pdf_file."
 		return 1
 	fi

 	# Human-readable sizes of input and output, for the log line.
 	size_before=$(du -h "$pdf_file" | cut -f1)
 	size_after=$(du -h "$target" | cut -f1)
 	echo "SUCCESS: OCR for $pdf_file. Size: $size_before -> $size_after. Output: $target"
 	return 0
 }

 # Remove the scratch directory named by TEMP_DIR, if it exists.
 # Globals: TEMP_DIR (read)
 cleanup() {
 	[ -d "$TEMP_DIR" ] || return 0
 	rm -rf "$TEMP_DIR"
 }

 # --- Main Execution ---
 # Parses the command line, verifies tools and language data, collects the PDFs
 # of the current directory, optionally asks for confirmation, then runs
 # process_pdf in parallel and prints a summary.
 #
 # Arguments: "$@" - command-line options (see show_help)
 # Globals:   DEFAULT_JOBS, LOG_FILE, TEMP_DIR (read); FORCE_YES (written)
 # Returns:   0 when every job succeeded, 1 when at least one failed;
 #            exits 1 on usage, dependency or language-data errors
 main() {
 	local lang=""
 	local max_jobs="$DEFAULT_JOBS"

 	# Parse command-line arguments
 	while [ $# -gt 0 ]; do
 		case "$1" in
 		-l | --lang)
 			# Check for the value explicitly: with 'set -u' a missing one would
 			# otherwise abort with a bare "unbound variable" message.
 			if [ $# -lt 2 ] || [ -z "$2" ]; then
 				print_error "Option $1 requires a language code."
 				exit 1
 			fi
 			lang="$2"
 			shift 2
 			;;
 		-j | --jobs)
 			if [ $# -lt 2 ]; then
 				print_error "Option $1 requires a number."
 				exit 1
 			fi
 			if ! [[ "$2" =~ ^[0-9]+$ ]] || [ "$2" -eq 0 ]; then
 				print_error "Invalid number of jobs: $2. Must be a positive integer."
 				exit 1
 			fi
 			max_jobs="$2"
 			shift 2
 			;;
 		-y | --yes)
 			FORCE_YES=true
 			shift 1
 			;;
 		-h | --help)
 			show_help
 			exit 0
 			;;
 		-v | --version)
 			show_help # Version is at the top of help
 			exit 0
 			;;
 		*)
 			print_error "Unknown option: $1"
 			show_help
 			exit 1
 			;;
 		esac
 	done

 	# Language parameter is mandatory
 	if [ -z "$lang" ]; then
 		print_error "Language not specified. Use -l <LANG>."
 		show_help
 		exit 1
 	fi

 	check_dependencies
 	check_tesseract_lang "$lang"

 	# Setup temp dir and cleanup trap. The signal traps exit explicitly: a
 	# handler that merely returns would let the script carry on after Ctrl-C.
 	# The exit then fires the EXIT trap, which does the cleanup.
 	mkdir -p "$TEMP_DIR"
 	trap cleanup EXIT
 	trap 'exit 130' INT
 	trap 'exit 143' TERM

 	# Initialize log file (overwrites the log of any previous run)
 	echo "OCR Processing Log - $(date)" >"$LOG_FILE"
 	print_status "Full output is being logged to $LOG_FILE"

 	# Collect input PDFs of the current directory, skipping earlier OCR results
 	local pdf_files=()
 	mapfile -t pdf_files < <(find . -maxdepth 1 -name "*.pdf" -type f ! -name "*_OCR.pdf" | sort)

 	if [ ${#pdf_files[@]} -eq 0 ]; then
 		print_warning "No PDF files found to process."
 		exit 0
 	fi

 	local files_to_process=()
 	local pdf
 	if [ "$FORCE_YES" = true ]; then
 		files_to_process=("${pdf_files[@]}")
 		print_status "Found ${#files_to_process[@]} PDF(s) to process."
 	else
 		print_status "Found ${#pdf_files[@]} PDF file(s):"
 		for pdf in "${pdf_files[@]}"; do
 			if has_text "$pdf"; then
 				echo "  - $(basename "$pdf") (Warning: Already has text)"
 			else
 				echo "  - $(basename "$pdf")"
 			fi
 		done
 		echo
 		# Anything other than n/N (including plain Enter) counts as "yes"
 		read -p "Process all ${#pdf_files[@]} file(s)? (Y/n): " -n 1 -r
 		echo
 		if [[ ! $REPLY =~ ^[Nn]$ ]]; then
 			files_to_process=("${pdf_files[@]}")
 		else
 			print_status "Processing cancelled by user."
 			exit 0
 		fi
 	fi

 	if [ ${#files_to_process[@]} -eq 0 ]; then
 		print_status "No files selected for processing."
 		exit 0
 	fi

 	local total_start_time
 	total_start_time=$(date +%s)

 	# The child shells started by xargs need the function and the log path
 	export -f process_pdf
 	export LOG_FILE

 	# Run jobs in parallel and log their output. File name and language travel
 	# as positional parameters, never spliced into the command string, so quotes
 	# or shell metacharacters in a file name cannot break or inject commands.
 	printf "%s\0" "${files_to_process[@]}" |
 		xargs -0 -P "$max_jobs" -I {} bash -c 'process_pdf "$1" "$2"' _ {} "$lang" >>"$LOG_FILE" 2>&1 &

 	local xargs_pid=$!
 	print_status "Started parallel processing (PID: $xargs_pid). Monitoring progress..."

 	# Simple progress indicator: one dot every two seconds while xargs is alive
 	while kill -0 "$xargs_pid" 2>/dev/null; do
 		echo -n "."
 		sleep 2
 	done
 	echo

 	# Capture the status instead of letting 'set -e' abort here: xargs exits
 	# non-zero when any job fails, and the summary must still be printed.
 	local xargs_status=0
 	wait "$xargs_pid" || xargs_status=$?

 	local total_end_time
 	local total_duration
 	total_end_time=$(date +%s)
 	total_duration=$((total_end_time - total_start_time))

 	# Grep log file for success/error counts
 	local processed
 	local failed
 	processed=$(grep -c "SUCCESS: OCR for" "$LOG_FILE" || true)
 	failed=$(grep -c "ERROR: OCR failed for" "$LOG_FILE" || true)

 	echo "========================================"
 	print_success "Processing summary:"
 	print_status "  - Successfully processed: ${processed:-0}"
 	if [ "${failed:-0}" -gt 0 ]; then
 		print_warning "  - Failed files: $failed"
 	fi
 	print_status "  - Total time: ${total_duration}s"
 	print_status "  - A detailed log is available at: $LOG_FILE"
 	echo "========================================"

 	if [ "$xargs_status" -ne 0 ] || [ "${failed:-0}" -gt 0 ]; then
 		return 1
 	fi
 	return 0
 }

 # Script entry point: start main only when this file is executed directly,
 # so that sourcing it does not kick off a processing run.
 if [ "${BASH_SOURCE[0]}" = "$0" ]; then
 	main "$@"
 fi
	# File name: pdf_auto_ocr.sh
	# Author: aaronedev | https://github.com/aaronedev
	# Date created: 2025-07-15 10:54:00
	# Date modified: 2025-07-15 10:54:22
	# ------
	# Copyright 2025

	# Strict mode: abort on errors, unset variables and failed pipeline stages.
	set -euo pipefail

	# --- Configuration ---
	# ANSI colour escape sequences used by the print_* helpers
	RED='\033[0;31m'
	GREEN='\033[0;32m'
	YELLOW='\033[1;33m'
	BLUE='\033[0;34m'
	NC='\033[0m' # No Color

	# Default values
	DEFAULT_LANG="eng"
	# Fall back to one job when nproc is unavailable; a bare $(nproc) would abort
	# the script here (set -e) before check_dependencies can report the problem.
	DEFAULT_JOBS=$(nproc 2>/dev/null || echo 1)
	LOG_FILE="ocr_processing.log"
	# "$$" is the shell's PID, so concurrent runs do not share (and delete)
	# each other's scratch directory.
	TEMP_DIR="/tmp/pdf_ocr_$$"
	FORCE_YES=false

	# --- Helper Functions ---

	# Print an informational message on stdout, prefixed with a blue [INFO] tag.
	# Arguments: $1 - message text (backslash escapes are expanded)
	print_status() {
		printf '%b\n' "${BLUE}[INFO]${NC} $1"
	}

	# Print a success message on stdout, prefixed with a green [SUCCESS] tag.
	# Arguments: $1 - message text (backslash escapes are expanded)
	print_success() {
		printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
	}

	# Print a warning on stdout, prefixed with a yellow [WARNING] tag.
	# Arguments: $1 - message text (backslash escapes are expanded)
	print_warning() {
		printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
	}

	# Print an error message on stderr, prefixed with a red [ERROR] tag.
	# Arguments: $1 - message text, printed verbatim
	# Only the colour codes go through %b; the message uses %s so that backslashes
	# in user-supplied text (options, file names) are not expanded as escapes.
	print_error() {
		printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2
	}

	# Print the usage text (banner, options, workflow, requirements) on stdout.
	# Globals: DEFAULT_JOBS (read) - shown as the default for --jobs
	# The text is emitted by one printf call, one argument per output line.
	show_help() {
		printf '%s\n' \
			"PDF OCR Script v2.0" \
			"" \
			"Usage: $0 -l <LANG> [options]" \
			"" \
			"Options:" \
			" -l, --lang <LANG> (Required) Specify the language for OCR (e.g., 'eng', 'deu')." \
			" -j, --jobs <NUM> Set the maximum number of parallel jobs. Defaults to the number of CPU cores ($DEFAULT_JOBS)." \
			" -y, --yes Assume 'yes' to all prompts and run non-interactively." \
			" -h, --help Show this help message." \
			" -v, --version Show version information." \
			"" \
			"This script automatically:" \
			" 1. Finds all PDF files in the current directory (excluding existing '*_OCR.pdf' files)." \
			" 2. Performs OCR with the specified language." \
			" 3. Runs jobs in parallel for efficiency." \
			" 4. Outputs files with an '_OCR' suffix." \
			"" \
			"Requirements:" \
			" - ocrmypdf, tesseract, poppler (pdfinfo, pdftotext)" \
			"" \
			"Install on Arch Linux:" \
			" sudo pacman -S ocrmypdf tesseract-data-eng poppler"
	}

	# Verify that every external tool the script relies on is on PATH.
	# Outputs: one error line (via print_error) naming all missing tools
	# Returns: 0 when everything is present; exits the script with 1 otherwise
	check_dependencies() {
		# tools[i] is the executable to look up, labels[i] the name reported for it.
		local -a tools=(ocrmypdf tesseract pdfinfo pdftotext nproc)
		local -a labels=("ocrmypdf" "tesseract" "poppler (pdfinfo)" "poppler (pdftotext)" "nproc (coreutils)")
		local -a missing_deps=()
		local i

		# A real conditional replaces the escaped '\|\|' sequences, which the shell
		# does not treat as an OR operator.
		for i in "${!tools[@]}"; do
			if ! command -v "${tools[i]}" >/dev/null 2>&1; then
				missing_deps+=("${labels[i]}")
			fi
		done

		if ((${#missing_deps[@]} > 0)); then
			print_error "Missing dependencies: ${missing_deps[*]}"
			exit 1
		fi
	}

	# Verify that tesseract has data for every requested OCR language.
	# Arguments: $1 - language code, or several joined with '+' (e.g. "eng+deu")
	# Outputs:   error lines (via print_error) for the first language not found
	# Returns:   0 when all languages are installed; exits the script with 1 otherwise
	check_tesseract_lang() {
		local lang_code="$1"
		local available=""
		local -a requested=()
		local one

		# Capture the list first instead of piping into 'grep -q': under pipefail an
		# early grep exit can kill tesseract with SIGPIPE and fake a "not found".
		available=$(tesseract --list-langs 2>/dev/null) || available=""

		IFS='+' read -r -a requested <<<"$lang_code"
		if [ "${#requested[@]}" -eq 0 ]; then
			requested=("")
		fi

		for one in "${requested[@]}"; do
			# -F -x: compare as a literal whole line, so characters such as '.' in
			# the code are not treated as regex operators.
			if [ -z "$one" ] || ! grep -Fxq -- "$one" <<<"$available"; then
				print_error "Tesseract language data for '$one' not found."
				print_error "Please install the appropriate tesseract data package (e.g., 'tesseract-data-$one')."
				exit 1
			fi
		done
	}

	# Tell whether a PDF already carries a text layer.
	# Arguments: $1 - path of the PDF to inspect
	# Returns:   0 when pdftotext extracts more than 100 bytes of non-whitespace
	#            characters, 1 otherwise (including when extraction fails)
	has_text() {
		local pdf_file="$1"
		local char_count

		# Real pipes here: the escaped '\|' form handed "|", "tr" and "wc" to
		# pdftotext as arguments instead of building a pipeline.
		# '|| true': a failing pdftotext must only yield a low count, never abort.
		char_count=$(pdftotext "$pdf_file" - 2>/dev/null | tr -d '[:space:]' | wc -c) || true

		[ "$char_count" -gt 100 ] && return 0
		return 1
	}

	# Run OCR on a single PDF and report the outcome on stdout.
	# Meant to be started in a child shell by the parallel runner; it logs with
	# plain echo and leaves it to the caller to decide where that output goes.
	#
	# Arguments: $1 - path of the input PDF
	#            $2 - language code passed to ocrmypdf --language
	# Outputs:   a start line plus a SUCCESS:/ERROR: line on stdout; the result is
	#            written to "<name>_OCR.pdf" in the current directory
	# Returns:   0 when ocrmypdf succeeded, 1 otherwise
	process_pdf() {
		local pdf_file="$1"
		local lang_code="$2"
		local stem target size_before size_after

		stem=$(basename "$pdf_file" .pdf)
		target="${stem}_OCR.pdf"

		echo "Starting OCR for: $pdf_file with language '$lang_code'"

		# Guard clause: report and leave as soon as the conversion fails.
		if ! ocrmypdf --language "$lang_code" --output-type pdfa --optimize 1 --jpeg-quality 85 --png-quality 85 "$pdf_file" "$target"; then
			echo "ERROR: OCR failed for $pdf_file."
			return 1
		fi

		# Human-readable sizes for the log line. These must be real pipes: the
		# escaped '\|' form passed "|", "cut" and "-f1" to du as file names.
		size_before=$(du -h "$pdf_file" | cut -f1)
		size_after=$(du -h "$target" | cut -f1)
		echo "SUCCESS: OCR for $pdf_file. Size: $size_before -> $size_after. Output: $target"
		return 0
	}

	# Remove the scratch directory named by TEMP_DIR, if it exists.
	# Globals: TEMP_DIR (read)
	cleanup() {
		[ -d "$TEMP_DIR" ] || return 0
		rm -rf "$TEMP_DIR"
	}

	# --- Main Execution ---
	# Parses the command line, verifies tools and language data, collects the PDFs
	# of the current directory, optionally asks for confirmation, then runs
	# process_pdf in parallel and prints a summary.
	#
	# Arguments: "$@" - command-line options (see show_help)
	# Globals:   DEFAULT_JOBS, LOG_FILE, TEMP_DIR (read); FORCE_YES (written)
	# Returns:   0 when every job succeeded, 1 when at least one failed;
	#            exits 1 on usage, dependency or language-data errors
	main() {
		local lang=""
		local max_jobs="$DEFAULT_JOBS"

		# Parse command-line arguments
		while [ $# -gt 0 ]; do
			case "$1" in
			-l | --lang)
				# Check for the value explicitly: with 'set -u' a missing one would
				# otherwise abort with a bare "unbound variable" message.
				if [ $# -lt 2 ] || [ -z "$2" ]; then
					print_error "Option $1 requires a language code."
					exit 1
				fi
				lang="$2"
				shift 2
				;;
			-j | --jobs)
				if [ $# -lt 2 ]; then
					print_error "Option $1 requires a number."
					exit 1
				fi
				if ! [[ "$2" =~ ^[0-9]+$ ]] || [ "$2" -eq 0 ]; then
					print_error "Invalid number of jobs: $2. Must be a positive integer."
					exit 1
				fi
				max_jobs="$2"
				shift 2
				;;
			-y | --yes)
				FORCE_YES=true
				shift 1
				;;
			-h | --help)
				show_help
				exit 0
				;;
			-v | --version)
				show_help # Version is at the top of help
				exit 0
				;;
			*)
				print_error "Unknown option: $1"
				show_help
				exit 1
				;;
			esac
		done

		# Language parameter is mandatory
		if [ -z "$lang" ]; then
			print_error "Language not specified. Use -l <LANG>."
			show_help
			exit 1
		fi

		check_dependencies
		check_tesseract_lang "$lang"

		# Setup temp dir and cleanup trap. The signal traps exit explicitly: a
		# handler that merely returns would let the script carry on after Ctrl-C.
		# The exit then fires the EXIT trap, which does the cleanup.
		mkdir -p "$TEMP_DIR"
		trap cleanup EXIT
		trap 'exit 130' INT
		trap 'exit 143' TERM

		# Initialize log file (overwrites the log of any previous run)
		echo "OCR Processing Log - $(date)" >"$LOG_FILE"
		print_status "Full output is being logged to $LOG_FILE"

		# Collect input PDFs of the current directory, skipping earlier OCR
		# results. The glob stars are required: a bare ".pdf" pattern would only
		# match a file literally named ".pdf".
		local pdf_files=()
		mapfile -t pdf_files < <(find . -maxdepth 1 -name "*.pdf" -type f ! -name "*_OCR.pdf" | sort)

		if [ ${#pdf_files[@]} -eq 0 ]; then
			print_warning "No PDF files found to process."
			exit 0
		fi

		local files_to_process=()
		local pdf
		if [ "$FORCE_YES" = true ]; then
			files_to_process=("${pdf_files[@]}")
			print_status "Found ${#files_to_process[@]} PDF(s) to process."
		else
			print_status "Found ${#pdf_files[@]} PDF file(s):"
			for pdf in "${pdf_files[@]}"; do
				if has_text "$pdf"; then
					echo " - $(basename "$pdf") (Warning: Already has text)"
				else
					echo " - $(basename "$pdf")"
				fi
			done
			echo
			# Anything other than n/N (including plain Enter) counts as "yes"
			read -p "Process all ${#pdf_files[@]} file(s)? (Y/n): " -n 1 -r
			echo
			if [[ ! $REPLY =~ ^[Nn]$ ]]; then
				files_to_process=("${pdf_files[@]}")
			else
				print_status "Processing cancelled by user."
				exit 0
			fi
		fi

		if [ ${#files_to_process[@]} -eq 0 ]; then
			print_status "No files selected for processing."
			exit 0
		fi

		local total_start_time
		total_start_time=$(date +%s)

		# The child shells started by xargs need the function and the log path
		export -f process_pdf
		export LOG_FILE

		# Run jobs in parallel and log their output. File name and language travel
		# as positional parameters, never spliced into the command string, so quotes
		# or shell metacharacters in a file name cannot break or inject commands.
		printf "%s\0" "${files_to_process[@]}" |
			xargs -0 -P "$max_jobs" -I {} bash -c 'process_pdf "$1" "$2"' _ {} "$lang" >>"$LOG_FILE" 2>&1 &

		local xargs_pid=$!
		print_status "Started parallel processing (PID: $xargs_pid). Monitoring progress..."

		# Simple progress indicator: one dot every two seconds while xargs is alive
		while kill -0 "$xargs_pid" 2>/dev/null; do
			echo -n "."
			sleep 2
		done
		echo

		# Capture the status instead of letting 'set -e' abort here: xargs exits
		# non-zero when any job fails, and the summary must still be printed.
		local xargs_status=0
		wait "$xargs_pid" || xargs_status=$?

		local total_end_time
		local total_duration
		total_end_time=$(date +%s)
		total_duration=$((total_end_time - total_start_time))

		# Grep log file for success/error counts
		local processed
		local failed
		processed=$(grep -c "SUCCESS: OCR for" "$LOG_FILE" || true)
		failed=$(grep -c "ERROR: OCR failed for" "$LOG_FILE" || true)

		echo "========================================"
		print_success "Processing summary:"
		print_status " - Successfully processed: ${processed:-0}"
		if [ "${failed:-0}" -gt 0 ]; then
			print_warning " - Failed files: $failed"
		fi
		print_status " - Total time: ${total_duration}s"
		print_status " - A detailed log is available at: $LOG_FILE"
		echo "========================================"

		if [ "$xargs_status" -ne 0 ] || [ "${failed:-0}" -gt 0 ]; then
			return 1
		fi
		return 0
	}

	# Script entry point: start main only when this file is executed directly,
	# so that sourcing it does not kick off a processing run.
	if [ "${BASH_SOURCE[0]}" = "$0" ]; then
		main "$@"
	fi