Created
January 16, 2021 17:30
-
-
Save jflemer/1af81929fc55bfcf1b9875769f5b5fcd to your computer and use it in GitHub Desktop.
OCR for pdf/tiff/png/jpg (wrapper for GraphicsMagick and Tesseract OCR)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
##############################################################################
# OCR
#
# Usage:
#   ocr.sh file.{pdf,png,jpg,jpeg,tif,tiff} ...
#
# For each input file the recognized text is written to a .txt file with the
# same base name, next to the input.
#
# Requires:
#   * Tesseract
#   * ImageMagick / GraphicsMagick
#   * Poppler Utils (pdfimages)
#
# Install with apt:
#   apt install -y graphicsmagick poppler-utils tesseract-ocr tesseract-ocr-eng
##############################################################################
#######################################
# Delete the scratch directory named by $tmpdir (if it exists) and clear the
# variable, so calling this a second time does nothing.
# Globals:   tmpdir (read, then reset to "")
# Arguments: none
# Returns:   0
#######################################
cleanup() {
  if [ -n "$tmpdir" ] && [ -d "$tmpdir" ]; then
    # -f: never prompt on write-protected leftovers.
    # --: a path that begins with '-' must not be parsed as rm options.
    rm -rf -- "$tmpdir"
  fi
  tmpdir=""
}
# Post-processing hook for OCR text. It currently copies the named files
# (or stdin when no file is given) to stdout unchanged; despite the name,
# no whitespace is stripped.
trim() { cat "$@"; }
# Delete intermediate files: every argument is handed to rm as-is, and rm's
# exit status is returned.
remove() { rm "$@"; }
# Convert an image for OCR without rescaling it.
# Arguments: $1 - source image, $2 - destination file
# Only the first two arguments are forwarded, so no resize factor ever
# reaches image_resize_adjust.
image_adjust() {
  image_resize_adjust "$1" "$2"
}
#######################################
# Convert an image into a form suited to OCR: optional resize, white
# threshold at 75%, grayscale, 8-bit depth, LZW compression.
# Globals:
#   CONVERT (read/written) - converter command line. When empty it is set to
#     "gm convert" if `gm -version` succeeds, otherwise to "convert", and the
#     choice is kept for later calls.
# Arguments:
#   $1 - source image
#   $2 - destination file
#   $3 - optional -resize geometry (e.g. '400%'); omitted or empty = no resize
# Returns: exit status of the converter
#######################################
image_resize_adjust() {
  if [ -z "$CONVERT" ]; then
    if gm -version >/dev/null 2>&1; then
      CONVERT="gm convert"
    else
      CONVERT="convert"
    fi
  fi

  local src="$1"
  local dst="$2"
  local size="${3:-}"

  # Split $CONVERT into words explicitly instead of relying on unquoted
  # expansion: this works whatever the caller's IFS is and never globs.
  local -a cmd
  IFS=$' \t\n' read -r -a cmd <<<"$CONVERT"

  # One argument list for both variants; -resize comes first when requested.
  local -a args=("$src")
  if [ -n "$size" ]; then
    args+=(-resize "$size")
  fi
  args+=(
    -white-threshold 75%
    -colorspace gray
    -depth 8
    -compress lzw
    "$dst"
  )

  "${cmd[@]}" "${args[@]}"
}
#######################################
# Run Tesseract on one TIFF and append the recognized text to an output file.
# Arguments:
#   $1 - TIFF image to recognize
#   $2 - text file that receives the result (appended to)
# Outputs:  a "==> <base>.txt" progress line on stdout
# Returns:  Tesseract's status if it fails, otherwise the status of the append
#######################################
ocr_tiff() {
  local i="$1"
  local out="$2"
  local j="${i%.*}"

  printf '%s\n' "==> $j.txt"

  # Tesseract is given the output base name "$j" and the text is read back
  # from "$j.txt"; stop here if recognition failed.
  tesseract "$i" "$j" nobatch || return

  # For a TIFF given directly on the command line, "$j.txt" IS the output
  # file. Appending a file to itself fails in cat ("input file is output
  # file"), and the text is already where it belongs.
  if [ "$j.txt" -ef "$out" ]; then
    return 0
  fi

  trim "$j.txt" >>"$out"
  #remove "$j.txt"
}
# OCR a single raster image.
# Arguments: $1 - image file, $2 - text file the result is appended to
# The image is first converted to "<base>-000.tif" beside the input, that
# TIFF is passed to ocr_tiff, and the TIFF is deleted afterwards.
ocr_png() {
  local src="$1" dest="$2"
  local tif="${src%.*}-000.tif"

  image_adjust "$src" "$tif"
  ocr_tiff "$tif" "$dest"
  remove "$tif"
}
# OCR a JPEG image. JPEGs take exactly the same path as PNGs, so all
# arguments are handed to ocr_png unchanged.
ocr_jpeg() { ocr_png "$@"; }
#######################################
# OCR a PDF: extract its embedded images with pdfimages, upscale each one to
# 400% and OCR it, appending the text to the output file in extraction order.
# Globals:
#   tmpdir (read) - existing scratch directory; a private sub-directory is
#     created in it per call, so it disappears when $tmpdir is removed.
# Arguments:
#   $1 - PDF file
#   $2 - text file the result is appended to
# Returns:
#   non-zero if the scratch directory is unusable or pdfimages fails;
#   otherwise the status of the last page processed
#######################################
ocr_pdf() {
  local i="$1"
  local out="$2"

  if [ -z "$tmpdir" ] || [ ! -d "$tmpdir" ]; then
    printf 'ocr_pdf: scratch directory ($tmpdir) is not available\n' >&2
    return 1
  fi

  # A per-call directory keeps two PDFs with the same file name from picking
  # up each other's leftover page images.
  local workdir
  workdir=$(mktemp -d "$tmpdir/pdf-XXXXXXXX") || return

  # Use only the file name for the image prefix: "$tmpdir/$i" would point
  # into a non-existent directory whenever $i carries a path component.
  local prefix="$workdir/${i##*/}"

  pdfimages -j "$i" "$prefix" || return

  local j k
  for j in "$prefix"-*; do
    # With -j, DCT-encoded images come out as .jpg rather than .ppm/.pbm, so
    # accept those too. Anything else, including the literal pattern left
    # behind when nothing was extracted, is skipped.
    case "$j" in
      *.pbm | *.ppm | *.jpg) ;;
      *) continue ;;
    esac
    k="${j%.*}"
    # Earlier experiments, kept for reference:
    #convert_args="-white-threshold 75% -auto-level -compress lzw"
    #convert_args="-resize 400% -white-threshold 75% -colorspace gray -depth 8 -compress lzw"
    #convert "$j" $convert_args "$k.tif" && rm "$j"
    #convert "$j" -fill white -draw 'rectangle 0,0 425,650' -draw 'rectangle 730,300 850,900' -draw 'rectangle 0,850 850,900' -resize 400% -white-threshold 75% -colorspace gray -depth 8 -compress lzw "$k.tif"
    if ! image_resize_adjust "$j" "$k.tif" '400%'; then
      printf 'ocr_pdf: could not convert %s\n' "$j" >&2
      continue
    fi
    ocr_tiff "$k.tif" "$out"
    remove "$k.tif"
  done
}
#######################################
# OCR each file named in the arguments into "<name without extension>.txt".
# The handler is chosen by file extension, compared case-insensitively.
# Arguments: the input files
# Outputs:   warnings for skipped inputs on stderr
# Returns:   1 if any input was skipped (unsupported type, missing file,
#            output not writable), otherwise 0. Handler failures do not
#            change the status.
#######################################
main() {
  local i out handler
  local status=0

  for i in "$@"; do
    # Choose the handler BEFORE touching the output file: truncating first
    # would empty an unsupported input such as "notes.txt", whose derived
    # output name is the input itself.
    case "${i,,}" in
      *.pdf) handler=ocr_pdf ;;
      *.png) handler=ocr_png ;;
      *.jpg | *.jpeg) handler=ocr_jpeg ;;
      *.tif | *.tiff) handler=ocr_tiff ;;
      *)
        printf 'ocr: skipping unsupported file type: %s\n' "$i" >&2
        status=1
        continue
        ;;
    esac

    if [ ! -f "$i" ]; then
      printf 'ocr: no such file: %s\n' "$i" >&2
      status=1
      continue
    fi

    out="${i%.*}.txt"
    # Start from an empty output file; quoted so names with spaces work.
    if ! : >"$out"; then
      status=1
      continue
    fi

    "$handler" "$i" "$out"
  done

  return "$status"
}

tmpdir=$(mktemp -p "${TMPDIR:-/tmp}" -d ocr-XXXXXXXX) || exit 1
trap cleanup EXIT
main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment