PBJI · January 21, 2023 15:20
diff --git a/DocDataMiner.ipynb b/DocDataMiner.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1d0cd599",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[X] Directory  images  already exists\n",
      "[X] Directory  text  already exists\n"
     ]
    }
   ],
   "source": [
    "import skimage\n",
    "#import pdf2image (Not in use)\n",
    "from pdf2image import convert_from_path as cpdf_img\n",
    "from docx2pdf import convert as cword_pdf\n",
    "from skimage import feature\n",
    "from skimage import io\n",
    "from skimage.exposure import is_low_contrast\n",
    "import PIL\n",
    "from PIL import Image\n",
    "from pdf2image import pdfinfo_from_path as pfp #Warning: The acronym is too short\n",
    "import docx2pdf\n",
    "from os import remove as removePDF\n",
    "from os import system\n",
    "from os.path import exists\n",
    "from os.path import basename\n",
    "import os\n",
    "from mimetypes import guess_type as guessFile\n",
    "from pathlib import Path  \n",
    "import glob\n",
    "import imutils\n",
    "\n",
    "#from doctyper import *  ## the library that will store all doc classifying functions.\n",
    "\n",
    "import pytesseract  # from pytesseract\n",
    "from PIL import ImageFile  # from Pillow\n",
    "import cv2\n",
    "\n",
    "# import pandas as pd\n",
    "# from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "ImageFile.LOAD_TRUNCATED_IMAGES = True  # Let Pillow load truncated image files instead of raising an error\n",
    "\n",
    "# Working directories: index 0 receives page images, index 1 receives extracted text.\n",
    "DIRNAME = ['images', 'text']\n",
    "\n",
    "# Create each working directory and report whether it was new or already present.\n",
    "for folder in DIRNAME:\n",
    "    try:\n",
    "        os.makedirs(folder)\n",
    "    except FileExistsError:\n",
    "        print(\"[X] Directory \", folder, \" already exists\")\n",
    "    else:\n",
    "        print(\"[!] Directory \", folder, \" Created\")\n",
    "\n",
    "# Snapshot of the entries currently inside the images directory.\n",
    "content = os.listdir(\"images\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "615cd3c9",
   "metadata": {},
   "source": [
    "Import images, PDF files, or Word documents and convert them all to PIL Image objects."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bb8d217c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on function is_low_contrast in module skimage.exposure.exposure:\n",
      "\n",
      "is_low_contrast(image, fraction_threshold=0.05, lower_percentile=1, upper_percentile=99, method='linear')\n",
      "    Determine if an image is low contrast.\n",
      "    \n",
      "    Parameters\n",
      "    ----------\n",
      "    image : array-like\n",
      "        The image under test.\n",
      "    fraction_threshold : float, optional\n",
      "        The low contrast fraction threshold. An image is considered low-\n",
      "        contrast when its range of brightness spans less than this\n",
      "        fraction of its data type's full range. [1]_\n",
      "    lower_percentile : float, optional\n",
      "        Disregard values below this percentile when computing image contrast.\n",
      "    upper_percentile : float, optional\n",
      "        Disregard values above this percentile when computing image contrast.\n",
      "    method : str, optional\n",
      "        The contrast determination method.  Right now the only available\n",
      "        option is \"linear\".\n",
      "    \n",
      "    Returns\n",
      "    -------\n",
      "    out : bool\n",
      "        True when the image is determined to be low contrast.\n",
      "    \n",
      "    Notes\n",
      "    -----\n",
      "    For boolean images, this function returns False only if all values are\n",
      "    the same (the method, threshold, and percentile arguments are ignored).\n",
      "    \n",
      "    References\n",
      "    ----------\n",
      "    .. [1] https://scikit-image.org/docs/dev/user_guide/data_types.html\n",
      "    \n",
      "    Examples\n",
      "    --------\n",
      "    >>> image = np.linspace(0, 0.04, 100)\n",
      "    >>> is_low_contrast(image)\n",
      "    True\n",
      "    >>> image[-1] = 1\n",
      "    >>> is_low_contrast(image)\n",
      "    True\n",
      "    >>> is_low_contrast(image, upper_percentile=100)\n",
      "    False\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Debug aid: print the docstring of skimage's is_low_contrast helper.\n",
    "help(is_low_contrast)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a2045fea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This script requires AbiWord to be installed and\n",
    "# its executable to be on the system PATH.\n",
    "# Step 1:\n",
    "import subprocess\n",
    "from mimetypes import guess_type as guessFile\n",
    "from os import remove as removePDF\n",
    "from os import sep\n",
    "from os import system\n",
    "from os.path import basename\n",
    "from os.path import exists\n",
    "from os.path import splitext\n",
    "\n",
    "from docx2pdf import convert as cword_pdf\n",
    "from pdf2image import convert_from_path as cpdf_img\n",
    "from PIL import Image\n",
    "\n",
    "\n",
    "def fileopen(file:'Image/Pdf/Word path', dpi=300) -> '[PIL image object]':\n",
    "    \"\"\"Load an image, PDF or Word document as PIL Image objects.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    file : str\n",
    "        Path of the document to load. Its type is guessed from the file\n",
    "        name with ``mimetypes.guess_type``.\n",
    "    dpi : int\n",
    "        Resolution used when PDF pages are rendered to images.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list or None\n",
    "        ``[pil_images, basename(file)]`` where ``pil_images`` is a list with\n",
    "        one image per page, or ``None`` when the path does not exist or the\n",
    "        guessed type is not an image, PDF or Word document.\n",
    "    \"\"\"\n",
    "    if not exists(file):\n",
    "        return None\n",
    "\n",
    "    # guess_type returns (None, None) for unknown extensions; fall back to an\n",
    "    # empty string so the membership tests below cannot raise TypeError.\n",
    "    mime = guessFile(file)[0] or \"\"\n",
    "\n",
    "    if \"pdf\" in mime:\n",
    "        return [cpdf_img(file, dpi), basename(file)]\n",
    "\n",
    "    if \"image\" in mime:\n",
    "        return [[Image.open(file)], basename(file)]\n",
    "\n",
    "    if \"word\" in mime:\n",
    "        # Convert to PDF with AbiWord first. An argument list (no shell) keeps\n",
    "        # paths with spaces or shell metacharacters intact.\n",
    "        subprocess.run([\"abiword\", \"--to=PDF\", \"--to-name=temp.pdf\", file], check=False)\n",
    "        try:\n",
    "            pil_images = cpdf_img('temp.pdf', dpi)\n",
    "        finally:\n",
    "            # Always clean up the intermediate PDF, even if rendering fails.\n",
    "            if exists('temp.pdf'):\n",
    "                removePDF('temp.pdf')\n",
    "        return [pil_images, basename(file)]\n",
    "\n",
    "    return None\n",
    "\n",
    "def imageSave(pil_list:'[PIL image object]', dirname=DIRNAME[0], filename=None):\n",
    "    \"\"\"Save every page of a loaded document as a numbered PNG file.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    pil_list : list\n",
    "        ``[images, source_name]``: a list of PIL images followed by the name\n",
    "        of the file they came from.\n",
    "    dirname : str\n",
    "        Folder, relative to the working directory, that receives the PNGs.\n",
    "    filename : str, optional\n",
    "        Base name for the saved files. When omitted, the source name without\n",
    "        its extension is used.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        Paths of the PNG files that were written, in page order.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``pil_list`` is ``None``.\n",
    "    \"\"\"\n",
    "    if pil_list is None:\n",
    "        raise ValueError(\"Nothing to save: no document was loaded\")\n",
    "    if not isinstance(filename, str):\n",
    "        filename, _ext = splitext(pil_list[1])\n",
    "    saved_paths = []\n",
    "    # Number the pages so a multi-page document does not overwrite one file.\n",
    "    for page_index, page in enumerate(pil_list[0]):\n",
    "        imagePath = '.' + sep + dirname + sep + filename + '[' + str(page_index) + '].png'\n",
    "        page.save(imagePath)\n",
    "        saved_paths.append(imagePath)\n",
    "    return saved_paths\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c5ef6b9a",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "27c95a1c",
   "metadata": {},
   "source": [
    "Transform the PIL Image objects into meaningful pre-processed images before feeding them to machine-learning algorithms."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c80ffb31",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'DIRNAME' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mtextExtraction\u001b[39m(file:\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mImage\u001b[39m\u001b[38;5;124m'\u001b[39m, txtdir\u001b[38;5;241m=\u001b[39m\u001b[43mDIRNAME\u001b[49m[\u001b[38;5;241m1\u001b[39m]) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mText\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m      2\u001b[0m     psm \u001b[38;5;241m=\u001b[39m [\u001b[38;5;241m3\u001b[39m,\u001b[38;5;241m4\u001b[39m,\u001b[38;5;241m6\u001b[39m]\n\u001b[1;32m      4\u001b[0m     config  \u001b[38;5;241m=\u001b[39m (\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m-l eng --oem 3 --psm \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;241m11\u001b[39m))\n",
      "\u001b[0;31mNameError\u001b[0m: name 'DIRNAME' is not defined"
     ]
    }
   ],
   "source": [
    "def textExtraction(file:'Image', txtdir=DIRNAME[1]) -> 'Text':\n",
    "    \"\"\"Preview the pre-processing stages of one image in OpenCV windows.\n",
    "\n",
    "    Shows the thresholded, blurred, edge-detected and contoured versions of\n",
    "    the image one after another (press any key to advance), then prints\n",
    "    whether scikit-image considers the image low contrast.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    file : str\n",
    "        Path of the image to load with ``cv2.imread``.\n",
    "    txtdir : str\n",
    "        Currently unused; kept so existing calls keep working while the OCR\n",
    "        and save steps are disabled.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "        The OCR step is disabled, so no text is returned.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    FileNotFoundError\n",
    "        If OpenCV cannot read ``file``.\n",
    "    \"\"\"\n",
    "    img = cv2.imread(file)\n",
    "    if img is None:\n",
    "        # cv2.imread signals failure by returning None instead of raising.\n",
    "        raise FileNotFoundError(f\"Could not read image: {file}\")\n",
    "\n",
    "    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
    "\n",
    "    ret, thresh = cv2.threshold(grayscale, 130, 180, 0)\n",
    "    cv2.imshow(\"Binary Image\", thresh)\n",
    "    cv2.waitKey(0)\n",
    "\n",
    "    blurred = cv2.GaussianBlur(grayscale, (5, 5), 0)\n",
    "    cv2.imshow(\"Blurred Image\", blurred)\n",
    "    cv2.waitKey(0)\n",
    "\n",
    "    edged = cv2.Canny(blurred, 30, 150)\n",
    "    cv2.imshow(\"Edged Image\", edged)\n",
    "    cv2.waitKey(0)\n",
    "\n",
    "    cnts = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
    "    cnts = imutils.grab_contours(cnts)\n",
    "    # cnts is already a sequence of contours, so it is passed as-is. Drawing\n",
    "    # on a copy keeps the green overlay out of the contrast check below.\n",
    "    contoured = img.copy()\n",
    "    cv2.drawContours(contoured, cnts, -1, (0, 255, 0), 2)\n",
    "    cv2.imshow(\"Contoured\", contoured)\n",
    "    cv2.waitKey(0)\n",
    "\n",
    "    print(is_low_contrast(img, 0.35))\n",
    "\n",
    "    cv2.destroyAllWindows()\n",
    "\n",
    "    # OCR and saving stay disabled while the pre-processing is being tuned:\n",
    "    # config = '-l eng --oem 3 --psm ' + str(11)\n",
    "    # text = pytesseract.pytesseract.image_to_string(thresh, lang='eng', config=config)\n",
    "    # return text\n",
    "\n",
    "# imageSave(fileopen('./fileopen-test/database.docx', 400))\n",
    "# imageSave(fileopen('./fileopen-test/candles.pdf', 400))\n",
    "\n",
    "# image_files = glob.glob(\"docImages/aadhar/*\")\n",
    "\n",
    "# for img in image_files:\n",
    "#     imageSave(fileopen(img, 400))\n",
    "    \n",
    "# Preview the first ten files of the images directory. The preview version of\n",
    "# textExtraction prints its own result and returns nothing, so its return\n",
    "# value is not printed. Sorting makes the selection reproducible.\n",
    "text_files = sorted(glob.glob(f\"{DIRNAME[0]}/*\"))\n",
    "\n",
    "for img in text_files[:10]:\n",
    "    textExtraction(img)\n",
    "    \n",
    "# textExtraction(img, DIRNAME[0], DIRNAME[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "048b1f27",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'TfidfVectorizer' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[8], line 25\u001b[0m\n\u001b[1;32m     23\u001b[0m     display(top_df)\n\u001b[1;32m     24\u001b[0m \u001b[38;5;66;03m#     print(feature_names)\u001b[39;00m\n\u001b[0;32m---> 25\u001b[0m \u001b[43muniqueWords\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtext\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "Cell \u001b[0;32mIn[8], line 13\u001b[0m, in \u001b[0;36muniqueWords\u001b[0;34m(txtdir)\u001b[0m\n\u001b[1;32m     10\u001b[0m text_files \u001b[38;5;241m=\u001b[39m glob\u001b[38;5;241m.\u001b[39mglob(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtxtdir\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/*.txt\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     11\u001b[0m text_titles \u001b[38;5;241m=\u001b[39m [Path(text)\u001b[38;5;241m.\u001b[39mstem \u001b[38;5;28;01mfor\u001b[39;00m text \u001b[38;5;129;01min\u001b[39;00m text_files]\n\u001b[0;32m---> 13\u001b[0m vectorizer \u001b[38;5;241m=\u001b[39m \u001b[43mTfidfVectorizer\u001b[49m(\u001b[38;5;28minput\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfilename\u001b[39m\u001b[38;5;124m'\u001b[39m,stop_words\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124menglish\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m     14\u001b[0m vectors \u001b[38;5;241m=\u001b[39m vectorizer\u001b[38;5;241m.\u001b[39mfit_transform(text_files)\n\u001b[1;32m     15\u001b[0m feature_names \u001b[38;5;241m=\u001b[39m vectorizer\u001b[38;5;241m.\u001b[39mget_feature_names_out()\n",
      "\u001b[0;31mNameError\u001b[0m: name 'TfidfVectorizer' is not defined"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import glob\n",
    "from pathlib import Path\n",
    "# Same directory names as the setup cell, so this cell can run on its own\n",
    "# without replacing the configuration with empty strings.\n",
    "DIRNAME = ['images', 'text']\n",
    "\n",
    "def uniqueWords(txtdir=DIRNAME[1]):\n",
    "    \"\"\"Display the ten highest TF-IDF terms of every ``.txt`` file in a folder.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    txtdir : str\n",
    "        Folder that contains the text files to analyse.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "        The result table is rendered with ``display``. When the folder has no\n",
    "        ``.txt`` files a message is printed instead.\n",
    "    \"\"\"\n",
    "    text_files = sorted(glob.glob(f\"{txtdir}/*.txt\"))\n",
    "    if not text_files:\n",
    "        # TfidfVectorizer cannot be fitted on an empty corpus.\n",
    "        print(f\"[X] No .txt files found in '{txtdir}'\")\n",
    "        return None\n",
    "\n",
    "    # Imported here because the notebook-level imports of these two packages\n",
    "    # are commented out.\n",
    "    import pandas as pd\n",
    "    from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "    text_titles = [Path(text).stem for text in text_files]\n",
    "\n",
    "    vectorizer = TfidfVectorizer(input='filename', stop_words='english')\n",
    "    vectors = vectorizer.fit_transform(text_files)\n",
    "    feature_names = vectorizer.get_feature_names_out()\n",
    "    df = pd.DataFrame(vectors.toarray(), index=text_titles, columns=feature_names)\n",
    "\n",
    "    # Reshape to one row per (document, term) pair and keep the ten\n",
    "    # highest-scoring terms of each document.\n",
    "    dummy_df = df.stack().reset_index()\n",
    "    dummy_df = dummy_df.rename(columns={0: 'tfidf', 'level_0': 'document', 'level_1': 'term'})\n",
    "    top_df = dummy_df.sort_values(by=['tfidf'], ascending=[False]).groupby(['document']).head(10)\n",
    "    display(top_df)\n",
    "#     print(feature_names)\n",
    "uniqueWords(\"text\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "348cd986",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Debug aid: print the docstring of PIL.Image.open.\n",
    "help(PIL.Image.open)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "8967efe2",
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "can only concatenate list (not \"str\") to list",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[24], line 6\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m argv\n\u001b[1;32m      4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m remove \u001b[38;5;28;01mas\u001b[39;00m remove\n\u001b[0;32m----> 6\u001b[0m image_list \u001b[38;5;241m=\u001b[39m glob\u001b[38;5;241m.\u001b[39mglob(\u001b[43margv\u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m*\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m)\n",
      "\u001b[0;31mTypeError\u001b[0m: can only concatenate list (not \"str\") to list"
     ]
    }
   ],
   "source": [
    "import glob\n",
    "import cv2\n",
    "from sys import argv\n",
    "from os import remove as remove\n",
    "\n",
    "from os.path import isdir, join\n",
    "\n",
    "# sys.argv is a list, so it cannot be concatenated with a string. Use the\n",
    "# first command-line argument when it names a directory; inside a notebook\n",
    "# kernel argv holds launcher options, so fall back to the images folder.\n",
    "image_dir = argv[1] if len(argv) > 1 and isdir(argv[1]) else \"./images\"\n",
    "image_list = glob.glob(join(image_dir, \"*\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8ccda80",
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import cv2\n",
    "import imutils\n",
    "import numpy as np\n",
    "import imutils\n",
    "import pytesseract\n",
    "from pytesseract import Output\n",
    "\n",
    "files = glob('./docImages/aadhar/*')\n",
    "\n",
    "# Interactive pre-processing playground. With the preview window focused,\n",
    "# each key applies one operation to the current image:\n",
    "#   g gray      b blur      t threshold   e edges    r enlarge   y shrink\n",
    "#   s reset     z sharpen   k erode       j dilate   o opening   p closing\n",
    "#   l lines     i OCR and orientation     q next image   / quit everything\n",
    "for image_path in files:\n",
    "    img = cv2.imread(image_path)\n",
    "    if img is None:\n",
    "        # cv2.imread returns None when the file cannot be decoded.\n",
    "        print(\"[WARN] could not read\", image_path)\n",
    "        continue\n",
    "    temp = img.copy()\n",
    "    temp_state = []  # operations applied so far: \"g\" = gray, \"t\" = threshold\n",
    "    total_quit = False\n",
    "    while True:\n",
    "        cv2.imshow(\"Preview\", temp)\n",
    "        key = cv2.waitKey(0)\n",
    "\n",
    "        # Gray (g)\n",
    "        if key == ord(\"g\") and \"g\" not in temp_state:\n",
    "            temp = cv2.cvtColor(temp, cv2.COLOR_BGR2GRAY)\n",
    "            temp_state.append(\"g\")\n",
    "\n",
    "        # Blur (b)\n",
    "        if key == ord(\"b\"):\n",
    "            temp = cv2.medianBlur(temp, 9)\n",
    "\n",
    "        # Threshold (t): adaptiveThreshold needs a single-channel image,\n",
    "        # so it is only offered after the gray conversion.\n",
    "        if key == ord(\"t\") and \"g\" in temp_state:\n",
    "            temp = cv2.adaptiveThreshold(temp, 255,\n",
    "                                         cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)\n",
    "            temp_state.append(\"t\")\n",
    "\n",
    "        # Edges (e): draw a box around every contour of the Canny edge map\n",
    "        if key == ord(\"e\") and \"t\" not in temp_state and \"g\" in temp_state:\n",
    "            edged = cv2.Canny(temp, 50, 150)\n",
    "            backtorgb = cv2.cvtColor(temp, cv2.COLOR_GRAY2RGB)\n",
    "            cnts = cv2.findContours(edged.copy(), cv2.RETR_LIST,\n",
    "                                    cv2.CHAIN_APPROX_SIMPLE)\n",
    "            # findContours returns 2 or 3 values depending on the OpenCV version.\n",
    "            cnts = cnts[0] if len(cnts) == 2 else cnts[1]\n",
    "            for cnt in cnts:\n",
    "                x, y, w, h = cv2.boundingRect(cnt)\n",
    "                cv2.rectangle(backtorgb, (x, y), (x + w, y + h), (0, 255, 0), 2)\n",
    "            cv2.imshow(\"backtorgb\", backtorgb)\n",
    "            cv2.waitKey(0)\n",
    "            cv2.destroyAllWindows()\n",
    "\n",
    "        # Big (r)\n",
    "        if key == ord(\"r\"):\n",
    "            temp = cv2.resize(temp, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)\n",
    "\n",
    "        # Small (y)\n",
    "        if key == ord(\"y\"):\n",
    "            temp = cv2.resize(temp, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)\n",
    "\n",
    "        # Reset (s)\n",
    "        if key == ord(\"s\"):\n",
    "            temp = img.copy()\n",
    "            temp_state = []\n",
    "\n",
    "        # Sharpening (z)\n",
    "        if key == ord(\"z\"):\n",
    "            kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])\n",
    "            temp = cv2.filter2D(temp, -1, kernel)\n",
    "\n",
    "        # Erosion (k): iterations must be passed by keyword, because the third\n",
    "        # positional parameter of cv2.erode is the output array.\n",
    "        if key == ord(\"k\") and \"g\" in temp_state:\n",
    "            temp = cv2.erode(temp.copy(), None, iterations=1)\n",
    "\n",
    "        # Dilation (j)\n",
    "        if key == ord(\"j\") and \"g\" in temp_state:\n",
    "            temp = cv2.dilate(temp.copy(), None, iterations=1)\n",
    "\n",
    "        # Opening (o)\n",
    "        if key == ord(\"o\") and \"g\" in temp_state:\n",
    "            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))\n",
    "            temp = cv2.morphologyEx(temp, cv2.MORPH_OPEN, kernel)\n",
    "\n",
    "        # Closing (p)\n",
    "        if key == ord(\"p\") and \"g\" in temp_state:\n",
    "            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))\n",
    "            temp = cv2.morphologyEx(temp, cv2.MORPH_CLOSE, kernel)\n",
    "\n",
    "        # Lines (l)\n",
    "        if key == ord(\"l\") and \"t\" not in temp_state and \"g\" in temp_state:\n",
    "            edged = cv2.Canny(temp, 50, 150)\n",
    "            lines = cv2.HoughLinesP(edged, 1, np.pi/180, 200)\n",
    "            # HoughLinesP returns None when no line is detected.\n",
    "            for line in (lines if lines is not None else []):\n",
    "                x1, y1, x2, y2 = line[0]\n",
    "                print(x1, y1, x2, y2)\n",
    "\n",
    "        # OCR and orientation detection (i)\n",
    "        if key == ord(\"i\") and \"t\" not in temp_state and \"g\" in temp_state:\n",
    "            # temp is single-channel here (\"g\" is required), so convert from gray.\n",
    "            rgb = cv2.cvtColor(temp, cv2.COLOR_GRAY2RGB)\n",
    "            config = ('-l eng --oem 3 --psm ' + str(11))\n",
    "\n",
    "            try:\n",
    "                text = pytesseract.image_to_string(rgb, lang='eng', config=config)\n",
    "            except Exception as e:\n",
    "                print(\"[ERROR] OCR failed:\", e)\n",
    "                continue\n",
    "            print(\"[INFO] extracted text:\")\n",
    "            print(text)\n",
    "\n",
    "            try:\n",
    "                results = pytesseract.image_to_osd(rgb, config='--psm 0 -c min_characters_to_try=4', output_type=Output.DICT)\n",
    "            except Exception as e:\n",
    "                print(\"[ERROR] orientation detection failed:\", e)\n",
    "                continue\n",
    "\n",
    "            # display the orientation information\n",
    "            print(\"[INFO] detected orientation: {}\".format(\n",
    "                results[\"orientation\"]))\n",
    "            print(\"[INFO] rotate by {} degrees to correct\".format(\n",
    "                results[\"rotate\"]))\n",
    "            print(\"[INFO] detected script: {}\".format(results[\"script\"]))\n",
    "\n",
    "        # Quit (q): move on to the next image\n",
    "        if key == ord(\"q\"):\n",
    "            break\n",
    "\n",
    "        # Total Quit (/): stop reviewing altogether\n",
    "        if key == ord(\"/\"):\n",
    "            total_quit = True\n",
    "            break\n",
    "\n",
    "    if total_quit:\n",
    "        break\n",
    "cv2.destroyAllWindows()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "8181fd73",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Close every OpenCV window that is still open.\n",
    "cv2.destroyAllWindows()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "4fdf22f9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./images/34[0].png Not to be Deleted\n",
      "./images/7[0].png Not to be Deleted\n",
      "./images/32[0].png Not to be Deleted\n",
      "Quitted\n"
     ]
    }
   ],
   "source": [
    "# Script for filtering images manually. \n",
    "# Usage: python imageF.py \"{path}/\"\n",
    "# After running the script and When Image Window in Focus:\n",
    "# Press q to quit the script\n",
    "# Press y to accept the image\n",
    "# Press n to delete the image\n",
    "\n",
    "import glob\n",
    "import cv2\n",
    "from sys import argv\n",
    "from os import remove as remove\n",
    "from os.path import exists\n",
    "\n",
    "def image_filter(files):\n",
    "    \"\"\"Review images one by one and delete the ones rejected from the keyboard.\n",
    "\n",
    "    Each image is shown in an OpenCV window. With the window focused, press\n",
    "    ``y`` to keep the file, ``n`` to delete it, or ``q`` to stop reviewing.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    files : list, tuple or str\n",
    "        Image paths to review, or a directory / glob pattern to expand.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    TypeError\n",
    "        If ``files`` is neither a sequence of paths nor a string.\n",
    "    FileNotFoundError\n",
    "        If ``files`` is a string that matches nothing on disk.\n",
    "    \"\"\"\n",
    "    from os.path import isdir, join\n",
    "\n",
    "    if isinstance(files, str):\n",
    "        # A directory means \"everything inside it\"; any other string is used\n",
    "        # as a glob pattern.\n",
    "        pattern = join(files, \"*\") if isdir(files) else files\n",
    "        file_list = glob.glob(pattern)\n",
    "        if not file_list and not exists(files):\n",
    "            raise FileNotFoundError(\"No path found\")\n",
    "    elif isinstance(files, (list, tuple)):\n",
    "        file_list = list(files)\n",
    "    else:\n",
    "        raise TypeError(\"Neither list of paths nor string\")\n",
    "\n",
    "    for path in file_list:\n",
    "        img = cv2.imread(path)\n",
    "        if img is None:\n",
    "            # cv2.imread returns None for files it cannot decode.\n",
    "            print(path + \" could not be read as an image, skipped\")\n",
    "            continue\n",
    "        cv2.imshow(\"Preview\", img)\n",
    "\n",
    "        quit_requested = False\n",
    "        while True:\n",
    "            key = cv2.waitKey(0)\n",
    "            if key == ord(\"n\"):\n",
    "                try:\n",
    "                    remove(path)\n",
    "                except OSError as e:\n",
    "                    # Stay on this image so the user can retry or keep it.\n",
    "                    print(e)\n",
    "                    continue\n",
    "                print(path + \" Deleted\")\n",
    "                break\n",
    "            elif key == ord(\"y\"):\n",
    "                print(path + \" Not to be Deleted\")\n",
    "                break\n",
    "            elif key == ord(\"q\"):\n",
    "                quit_requested = True\n",
    "                print(\"Quitted\")\n",
    "                break\n",
    "            else:\n",
    "                print(str(key) + \" is not the right key type either y or n\")\n",
    "        cv2.destroyAllWindows()\n",
    "\n",
    "        if quit_requested:\n",
    "            break\n",
    "            \n",
    "if __name__ == \"__main__\":\n",
    "    from os.path import isdir, join\n",
    "\n",
    "    # sys.argv is a list, so it cannot be concatenated with a string. Use the\n",
    "    # first command-line argument when it names a directory; otherwise (for\n",
    "    # example inside a notebook kernel) fall back to the images folder.\n",
    "    if len(argv) > 1 and isdir(argv[1]):\n",
    "        image_list = glob.glob(join(argv[1], \"*\"))\n",
    "    else:\n",
    "        image_list = glob.glob(\"./images/*\")\n",
    "\n",
    "    image_filter(image_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec2cf6d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import cv2\n",
    "import pytesseract\n",
    "\n",
    "# This cell requires Tesseract to be installed on the system and\n",
    "# its executable to be on the system PATH.\n",
    "def textExtraction(file:'Image', txtdir=DIRNAME[1]) -> 'Text':\n",
    "    \"\"\"Run Tesseract OCR on an image file.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    file : str\n",
    "        Path of the image to read with ``cv2.imread``.\n",
    "    txtdir : str\n",
    "        Unused here; kept so existing calls keep working.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        ``[text, filename]``: the recognised text and the base name of ``file``.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    FileNotFoundError\n",
    "        If OpenCV cannot read ``file``.\n",
    "    \"\"\"\n",
    "    from os.path import basename\n",
    "\n",
    "    # Candidate page-segmentation modes; trying several is not implemented,\n",
    "    # so only the last entry (11) is used.\n",
    "    psm = [3, 4, 6, 11]\n",
    "    config = '-l eng --oem 3 --psm ' + str(psm[-1])\n",
    "    img = cv2.imread(file)\n",
    "    if img is None:\n",
    "        # cv2.imread signals failure by returning None instead of raising.\n",
    "        raise FileNotFoundError(f\"Could not read image: {file}\")\n",
    "    text = pytesseract.pytesseract.image_to_string(img, lang='eng', config=config)\n",
    "    return [text, basename(file)]\n",
    "\n",
    "def textSave(text, filename, txtdir=DIRNAME[1]):\n",
    "    \"\"\"Write extracted text to ``<txtdir>/<name without extension>.txt``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text : str\n",
    "        Text to store.\n",
    "    filename : str\n",
    "        Name of the source file; its directory part and extension are dropped\n",
    "        to build the name of the text file.\n",
    "    txtdir : str\n",
    "        Folder that receives the text file.\n",
    "    \"\"\"\n",
    "    from os.path import basename, splitext\n",
    "\n",
    "    # splitext copes with names that contain no dot or several dots.\n",
    "    stem, _ext = splitext(basename(filename))\n",
    "    # The context manager closes the file so the text is flushed to disk.\n",
    "    with open(f\"{txtdir}/{stem}.txt\", \"w\", encoding=\"utf-8\") as fileTxt:\n",
    "        fileTxt.write(text)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }