davidpfahler · September 26, 2021 18:28
diff --git a/ocr.sh b/ocr.sh
 #!/bin/sh
 # Create a searchable copy of a PDF by running OCR on every page.
 #
 # Usage: ocr.sh FILE
 #
 # FILE may be absolute or relative to the current directory. Every page is
 # rendered to a 300 dpi JPEG with Ghostscript, passed through tesseract
 # (language "deu", "pdf" output) and the resulting one-page PDFs are merged
 # into ./__searchable__/<FILE without extension>.pdf.
 #
 # A scratch directory named after FILE (without extension) is created in the
 # current directory and removed again when the script exits.
 # Exit status is non-zero if any step fails.

 # Print a message to stderr and abort.
 die() {
   printf '%s\n' "$*" >&2
   exit 1
 }

 if [ "$#" -lt 1 ] || [ -z "$1" ]; then
   die "Usage: $0 FILE"
 fi

 start_dir=$(pwd) || die "Cannot determine the current directory"

 # Absolute paths are used as given; only relative ones are anchored at the
 # current directory (prefixing an absolute path would yield a bogus path).
 case $1 in
   /*) y=$1 ;;
   *) y="$start_dir/$1" ;;
 esac
 [ -f "$y" ] || die "No such file: $y"

 printf 'Will create a searchable PDF for %s\n' "$y"

 x=$(basename "$y")
 name=${x%.*}
 [ -n "$name" ] || die "Cannot derive an output name from $y"

 work_dir="$start_dir/$name"

 mkdir -p __searchable__ || die "Cannot create $start_dir/__searchable__"

 # Plain mkdir (no -p) on purpose: if the path already exists it is not ours,
 # and everything below would process and finally delete foreign files.
 mkdir "$work_dir" || die "Cannot create scratch directory $work_dir"

 # Remove the scratch directory on every exit path, including failures.
 cleanup() {
   cd "$start_dir" 2>/dev/null
   rm -rf -- "$work_dir"
 }
 trap cleanup EXIT
 # Turn signals into a regular exit so the EXIT trap runs in every sh.
 trap 'exit 1' HUP INT TERM

 cd "$work_dir" || die "Cannot enter $work_dir"

 # splitting to individual pages
 gs -dSAFER -dBATCH -dNOPAUSE -sDEVICE=jpeg -r300 -dTextAlphaBits=4 \
   -o out_%04d.jpg -f "$y" || die "Ghostscript failed to render $y"

 # process each page
 pages=0
 for f in *.jpg; do
   # An unmatched glob stays literal; skip it instead of feeding it to tesseract.
   [ -e "$f" ] || continue
   # extract text; writes <page>.pdf next to the image
   tesseract "$f" "${f%.*}" -l deu --psm 3 pdf || die "tesseract failed on $f"
   rm -- "$f"
   pages=$((pages + 1))
 done
 [ "$pages" -gt 0 ] || die "No pages were rendered from $y"

 # combine all pages back to a single file
 gs -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -q -sDEVICE=pdfwrite \
   -sOutputFile="../__searchable__/${name}.pdf" *.pdf \
   || die "Ghostscript failed to write __searchable__/${name}.pdf"

 # The scratch directory is removed by the EXIT trap.
	#!/bin/sh
	# Create a searchable copy of a PDF by running OCR on every page.
	#
	# Usage: ocr.sh FILE
	#
	# FILE may be absolute or relative to the current directory. Every page is
	# rendered to a 300 dpi JPEG with Ghostscript, passed through tesseract
	# (language "deu", "pdf" output) and the resulting one-page PDFs are merged
	# into ./__searchable__/<FILE without extension>.pdf.
	#
	# A scratch directory named after FILE (without extension) is created in the
	# current directory and removed again when the script exits.
	# Exit status is non-zero if any step fails.

	# Print a message to stderr and abort.
	die() {
		printf '%s\n' "$*" >&2
		exit 1
	}

	if [ "$#" -lt 1 ] || [ -z "$1" ]; then
		die "Usage: $0 FILE"
	fi

	start_dir=$(pwd) || die "Cannot determine the current directory"

	# Absolute paths are used as given; only relative ones are anchored at the
	# current directory (prefixing an absolute path would yield a bogus path).
	case $1 in
		/*) y=$1 ;;
		*) y="$start_dir/$1" ;;
	esac
	[ -f "$y" ] || die "No such file: $y"

	printf 'Will create a searchable PDF for %s\n' "$y"

	x=$(basename "$y")
	name=${x%.*}
	[ -n "$name" ] || die "Cannot derive an output name from $y"

	work_dir="$start_dir/$name"

	mkdir -p __searchable__ || die "Cannot create $start_dir/__searchable__"

	# Plain mkdir (no -p) on purpose: if the path already exists it is not ours,
	# and everything below would process and finally delete foreign files.
	mkdir "$work_dir" || die "Cannot create scratch directory $work_dir"

	# Remove the scratch directory on every exit path, including failures.
	cleanup() {
		cd "$start_dir" 2>/dev/null
		rm -rf -- "$work_dir"
	}
	trap cleanup EXIT
	# Turn signals into a regular exit so the EXIT trap runs in every sh.
	trap 'exit 1' HUP INT TERM

	cd "$work_dir" || die "Cannot enter $work_dir"

	# splitting to individual pages
	gs -dSAFER -dBATCH -dNOPAUSE -sDEVICE=jpeg -r300 -dTextAlphaBits=4 \
		-o out_%04d.jpg -f "$y" || die "Ghostscript failed to render $y"

	# process each page
	pages=0
	for f in *.jpg; do
		# An unmatched glob stays literal; skip it instead of feeding it to tesseract.
		[ -e "$f" ] || continue
		# extract text; writes <page>.pdf next to the image
		tesseract "$f" "${f%.*}" -l deu --psm 3 pdf || die "tesseract failed on $f"
		rm -- "$f"
		pages=$((pages + 1))
	done
	[ "$pages" -gt 0 ] || die "No pages were rendered from $y"

	# combine all pages back to a single file
	gs -dCompatibilityLevel=1.4 -dNOPAUSE -dQUIET -dBATCH -q -sDEVICE=pdfwrite \
		-sOutputFile="../__searchable__/${name}.pdf" *.pdf \
		|| die "Ghostscript failed to write __searchable__/${name}.pdf"

	# The scratch directory is removed by the EXIT trap.