Last active
June 12, 2021 01:09
-
-
Save christianroman/8485394 to your computer and use it in GitHub Desktop.
Rompiendo Captcha de CURP usando Python, OpenCV, Tesseract OCR y Tornado
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tornado.ioloop | |
import tornado.web | |
import urllib2 as urllib | |
from PIL import Image | |
from cStringIO import StringIO | |
import numpy as np | |
import tesserwrap | |
import cv2 | |
class MainHandler(tornado.web.RequestHandler):
    """Download a CURP captcha, clean it up with OpenCV and OCR it.

    The pipeline is: fetch image -> paint detected line segments white ->
    grayscale -> blur -> binary threshold -> morphological opening ->
    Tesseract (restricted to lowercase letters and digits).

    NOTE: every step here is blocking (network fetch, OpenCV, OCR), so the
    IOLoop cannot serve other requests while one is in progress.
    """

    # URL the captcha image is downloaded from.
    CAPTCHA_URL = "http://consultas.curp.gob.mx/CurpSP/imagenCatcha"
    # Seconds to wait for the captcha server; without a limit a stalled
    # remote would hang the whole (single-threaded) server.
    FETCH_TIMEOUT = 10
    # Characters Tesseract is allowed to emit.
    OCR_WHITELIST = "0123456789abcdefghijklmnopqrstuvwxyz"
    # Tesseract page segmentation mode 8 ("treat the image as a single word").
    OCR_PAGE_SEG_MODE = 8

    def get(self):
        """Handle GET /: respond with the text recognised in a fresh captcha."""
        cv_img = self._fetch_captcha()
        self._erase_lines(cv_img)
        cleaned = self._binarize(cv_img)
        self.write(self._read_text(cleaned))

    def _fetch_captcha(self):
        """Download the captcha and return it as a writable H x W x 3 array."""
        response = urllib.urlopen(self.CAPTCHA_URL, timeout=self.FETCH_TIMEOUT)
        try:
            payload = response.read()
        finally:
            response.close()
        picture = Image.open(StringIO(payload))
        # Force three channels so palette/grayscale images do not break the
        # colour conversions below; np.array() also yields a writable copy.
        # NOTE(review): the channels stay in PIL's RGB order although the
        # conversions below use COLOR_BGR2GRAY. Kept as-is so the grayscale
        # values (and therefore the tuned thresholds) do not change; confirm
        # whether a real RGB->BGR swap was intended.
        return np.array(picture.convert("RGB"))

    def _erase_lines(self, cv_img):
        """Paint every detected line segment white, in place, on ``cv_img``."""
        gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
        # Canny edge map feeds the probabilistic Hough transform.
        edges = cv2.Canny(gray, 60, 200, apertureSize=3)
        segments = cv2.HoughLinesP(edges, 1, np.pi / 180, 1, None, 0, 0)
        if segments is None:
            # No segment found: nothing to erase.
            return
        # OpenCV 2.x returns shape (1, N, 4) and 3.x+ returns (N, 1, 4);
        # flattening to (N, 4) visits every segment under both layouts.
        for x1, y1, x2, y2 in segments.reshape(-1, 4):
            start = (int(x1), int(y1))
            end = (int(x2), int(y2))
            cv2.line(cv_img, start, end, (255, 255, 255), 2)

    def _binarize(self, cv_img):
        """Return a black/white PIL image of ``cv_img`` ready for OCR."""
        gray = cv2.cvtColor(cv_img, cv2.COLOR_BGR2GRAY)
        # Light blur before thresholding.
        blurred = cv2.GaussianBlur(gray, (3, 3), 0)
        binary = cv2.threshold(blurred, 128, 255, cv2.THRESH_BINARY)[1]
        # Morphological opening with an elliptical 6x6 kernel.
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (6, 6))
        opened = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
        return Image.fromarray(opened)

    def _read_text(self, pil_img):
        """Run Tesseract on ``pil_img`` and return the stripped result."""
        engine = tesserwrap.tesseract()
        engine.set_variable("tessedit_char_whitelist", self.OCR_WHITELIST)
        engine.set_page_seg_mode(self.OCR_PAGE_SEG_MODE)
        return engine.ocr_image(pil_img).strip()
# Route table: the root path is the only endpoint and is served by MainHandler.
application = tornado.web.Application(handlers=[(r"/", MainHandler)])

if __name__ == "__main__":
    # Listen on port 8888, then start the event loop.
    application.listen(8888)
    io_loop = tornado.ioloop.IOLoop.instance()
    io_loop.start()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hola, ¿me podrías ayudar? Estoy tratando de instalar tesserwrap, pero no me deja:
λ python server.py
Traceback (most recent call last):
File "server.py", line 10, in <module>
import tesserwrap
ImportError: No module named tesserwrap
λ pip install tesserwrap
Collecting tesserwrap
Using cached https://files.pythonhosted.org/packages/04/92/4c2134fc465d576c05d4426bc2f1ba7871652d78d3d913bec0bffe0afe8b/tesserwrap-0.1.6.tar.gz
Complete output from command python setup.py egg_info:
"ld" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
"ld" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "c:\users\usuario\appdata\local\temp\pip-install-xcu3ya\tesserwrap\setup.py", line 45, in <module>
extra_lib_paths)
File "c:\users\usuario\appdata\local\temp\pip-install-xcu3ya\tesserwrap\setup.py", line 30, in find_closest_libname
"Cannot find Tesseract via ldconfig, confirm it is installed.")
Exception: Cannot find Tesseract via ldconfig, confirm it is installed.
Command "python setup.py egg_info" failed with error code 1 in c:\users\usuario\appdata\local\temp\pip-install-xcu3ya\tesserwrap\