Skip to content

Instantly share code, notes, and snippets.

@jflemer
Created January 16, 2021 17:30
Show Gist options
  • Save jflemer/1af81929fc55bfcf1b9875769f5b5fcd to your computer and use it in GitHub Desktop.
Save jflemer/1af81929fc55bfcf1b9875769f5b5fcd to your computer and use it in GitHub Desktop.
OCR for pdf/tiff/png/jpg (wrapper for GraphicsMagick and Tesseract OCR)
#!/bin/bash
##############################################################################
# OCR
#
# Usage:
# ocr.sh file.{pdf,png,jpg,jpeg,tif,tiff} ...
#
# Requires:
# * Tesseract
# * ImageMagick / GraphicsMagick
# * Poppler Utils (pdfimages)
# * apt install -y graphicsmagick poppler-utils tesseract-ocr tesseract-ocr-eng
##############################################################################
# Delete the scratch directory named by the global $tmpdir (only when the
# variable is non-empty and names an existing directory), then clear $tmpdir.
# Installed as the script's exit trap.
cleanup() {
  [ -z "$tmpdir" ] || [ ! -d "$tmpdir" ] || rm -r "$tmpdir"
  tmpdir=""
}
# Copy the given files (or stdin when called without arguments) to stdout.
# Despite the name, nothing is stripped: this is a plain `cat` pass-through.
trim() { cat "$@"; }
# Delete the given paths; all arguments are handed to `rm` untouched and the
# exit status of `rm` is returned.
remove() { rm "$@"; }
# Prepare an image for OCR without resizing it.
#   $1 - source image path
#   $2 - destination image path
# Delegates to image_resize_adjust with no size argument; any further
# arguments are ignored.
image_adjust() {
  local src="$1" dst="$2"
  image_resize_adjust "$src" "$dst"
}
# Convert an image into an 8-bit grayscale, LZW-compressed image for OCR,
# optionally resizing it first.
#   $1 - source image path
#   $2 - destination image path
#   $3 - optional size passed to -resize (e.g. '400%'); omitted or empty
#        means no resize
# Globals: CONVERT (read; set on first use). When empty, it is set to
#          "gm convert" if `gm -version` succeeds, otherwise to "convert".
# Returns: exit status of the converter.
image_resize_adjust() {
  if [ -z "$CONVERT" ]; then
    CONVERT="convert"
    if gm -version >/dev/null 2>&1; then
      CONVERT="gm convert"
    fi
  fi

  local src="$1" dst="$2" scale="$3"

  # Only emit -resize when a size was actually supplied.
  local resize=()
  if [ -n "$scale" ]; then
    resize=(-resize "$scale")
  fi

  # $CONVERT is deliberately unquoted: "gm convert" must split into two words.
  $CONVERT "$src" \
    "${resize[@]}" \
    -white-threshold 75% \
    -colorspace gray \
    -depth 8 \
    -compress lzw \
    "$dst"
}
# Run Tesseract on one TIFF and append the recognized text to an output file.
#   $1 - path of the TIFF image
#   $2 - path of the text file the result is appended to
# Tesseract writes its result to "<image path without extension>.txt"; that
# path is announced on stdout before the run.
# Returns: Tesseract's status if it fails, otherwise the status of the append
#          (0 when no append is needed).
ocr_tiff() {
  local i="$1"
  local out="$2"
  local j="${i%.*}"
  echo "==> $j.txt"
  # Stop on failure so a missing or stale "$j.txt" is never appended.
  tesseract "$i" "$j" nobatch || return
  # When the TIFF is the user's own input, "$j.txt" and "$out" are the same
  # file: Tesseract has already written the final text there, and appending a
  # file to itself either fails or duplicates its content.
  if [ "$j.txt" -ef "$out" ]; then
    return 0
  fi
  trim "$j.txt" >> "$out"
  # The per-image text file is left in place (its removal was disabled):
  #remove "$j.txt"
}
# OCR a single raster image by way of a temporary TIFF.
#   $1 - source image path
#   $2 - text file the recognized text is appended to
# The intermediate TIFF is written next to the source as "<name>-000.tif"
# and deleted afterwards; the return status is that of the deletion.
ocr_png() {
  local src="$1" out="$2"
  local tif="${src%.*}-000.tif"

  image_adjust "$src" "$tif"
  ocr_tiff "$tif" "$out"
  remove "$tif"
}
# OCR a JPEG image. JPEGs take exactly the same path as PNGs, so every
# argument is forwarded to ocr_png unchanged.
ocr_jpeg() { ocr_png "$@"; }
# OCR every image embedded in a PDF.
#   $1 - path of the PDF file
#   $2 - text file the recognized text of each image is appended to
# Globals: tmpdir (read) - existing scratch directory for extracted images
# Images are extracted with `pdfimages -j`; each resulting .pbm/.ppm file is
# upscaled to 400% and converted by image_resize_adjust, OCR'd by ocr_tiff,
# and its temporary TIFF removed.
# Returns: non-zero if the scratch directory cannot be created or pdfimages
#          fails; otherwise the status of the last loop command (0 when the
#          PDF yields no matching images).
ocr_pdf() {
  local i="$1"
  local out="$2"
  # Use only the file name as the extraction prefix: a path with directories
  # ("docs/a.pdf", "/abs/a.pdf") would point below tmpdir at a directory that
  # does not exist.
  local base="${i##*/}"

  if [ -z "$tmpdir" ]; then
    printf 'ocr_pdf: tmpdir is not set\n' >&2
    return 1
  fi
  # Per-PDF scratch directory, so images from an earlier PDF with the same
  # file name are never picked up again by the glob below.
  local work
  work=$(mktemp -d "$tmpdir/pdf-XXXXXXXX") || return

  pdfimages -j "$i" "$work/$base" || return

  # NOTE(review): with -j, JPEG-encoded images are presumably written as .jpg
  # and therefore not matched here - confirm whether skipping them is intended.
  local j k
  for j in "$work/$base"*.p[bp]m; do
    # An unmatched glob stays literal; skip it rather than converting it.
    [ -e "$j" ] || continue
    k="${j%.*}"
    # Earlier experiments (-auto-level, masking regions with white
    # rectangles) were dropped in favour of the shared 400% upscale.
    image_resize_adjust "$j" "$k.tif" '400%'
    ocr_tiff "$k.tif" "$out"
    remove "$k.tif"
  done
}
# OCR every file given on the command line, writing the text of each input
# to "<input without extension>.txt".
# Arguments: input files; the type is chosen by (case-insensitive) extension:
#            pdf, png, jpg/jpeg, tif/tiff.
# Outputs:   a warning on stderr for every file with an unsupported extension.
# Returns:   0 when every file was handled; otherwise 1 for a skipped file or
#            the status of the most recent failing handler.
main() {
  local i out handler rc=0
  for i in "$@"; do
    # Pick the handler BEFORE touching the output file: for an unsupported
    # input such as "notes.txt" the derived output path is the input itself,
    # and truncating it would destroy the user's file.
    case "${i,,}" in
      *.pdf)
        handler=ocr_pdf
        ;;
      *.png)
        handler=ocr_png
        ;;
      *.jpg|*.jpeg)
        handler=ocr_jpeg
        ;;
      *.tif|*.tiff)
        handler=ocr_tiff
        ;;
      *)
        printf '%s: unsupported file type, skipping\n' "$i" >&2
        rc=1
        continue
        ;;
    esac
    out="${i%.*}.txt"
    # Start from an empty output file; quoted so paths with spaces work.
    : > "$out"
    "$handler" "$i" "$out" || rc=$?
  done
  return "$rc"
}

# Scratch directory shared by the handlers; removed by the exit trap.
tmpdir=$(mktemp -p "${TMPDIR:-/tmp}" -d ocr-XXXXXXXX) || exit 1
trap cleanup 0
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment