Initial commit

nathanaday · Apr 9, 2021 · 66a894d · 66a894d
commit 66a894d
Show file tree

Hide file tree

Showing 13 changed files with 768 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+# Project exclude paths
+/venv/
diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/realtimeTesseract.iml b/.idea/realtimeTesseract.iml
diff --git a/Linguist.py b/Linguist.py
@@ -0,0 +1,70 @@
+import csv
+import os
+
+
+def supported_langs_file():
+    """
+    Locate the text file containing every supported language code with its full name
+
+    :return: "Tesseract_Langs.txt" if that file exists relative to the current working directory,
+             otherwise None (a notice is printed first)
+    """
+    langs_filename = "Tesseract_Langs.txt"
+
+    if not os.path.exists(langs_filename):
+        print("The expected supported languages file is not in the directory. "
+              "Tesseract_Langs.txt is available in the github repo")
+        return None
+
+    return langs_filename
+
+
+def get_language_from_code(code):
+    """
+    Get the full-language name from an ISO 639-2/T code (tesseract-supported language code)
+
+    Text file containing language codes is generated from tesseract's supported languages document:
+    https://github.com/tesseract-ocr/tessdoc/blob/master/Data-Files-in-different-versions.md
+
+    :param code: Language code to look up in the first column of the tab-separated languages file
+    :return: Returns full language name if the code is matched, or returns the code in the case of no-match
+             (including when the languages file cannot be found)
+    """
+    langs_path = supported_langs_file()
+    if langs_path is None:  # No lookup table available, so fall back to the raw code instead of crashing
+        return code
+
+    # newline='' lets the csv module handle line endings itself, as the csv docs recommend
+    with open(langs_path, 'r', newline='') as langs_file:
+        reader = csv.reader(langs_file, delimiter='\t')
+        # Blank or single-column lines carry no code/name pair, so they are skipped
+        l_dict = {row[0]: row[1] for row in reader if len(row) >= 2}
+
+    # If no language name matches the code, just return the code
+    return l_dict.get(code, code)
+
+
+def show_codes():
+    """
+    Print a list of all tesseract-supported language codes next to the full language name
+
+    Prints nothing further when the languages file cannot be found (supported_langs_file() has
+    already printed a notice in that case). Blank or single-column lines in the file are skipped.
+    """
+    langs_path = supported_langs_file()
+    if langs_path is None:  # Nothing to list; avoid calling open() on None
+        return
+
+    row_format = "{:<20s}{:<40s}"
+
+    # newline='' lets the csv module handle line endings itself, as the csv docs recommend
+    with open(langs_path, 'r', newline='') as langs_file:
+        reader = csv.reader(langs_file, delimiter='\t')
+        print(row_format.format("CODE", "LANGUAGE"))
+        for row in reader:
+            if len(row) < 2:  # No code/name pair on this line
+                continue
+            print(row_format.format(row[0], row[1]))
+
+
+def language_string(language):
+    """
+    Build a display string of full language names from the language code(s) used in the ocr process
+
+    :param language: ISO 639-2/T code (tesseract-supported), several codes joined with '+', or None
+    :return: Full language name(s) separated by ', ', or 'English' when language is None
+    """
+    if language is None:  # Tesseract default
+        return 'English'
+
+    # Multiple languages have the format "lang1+lang2+lang3"
+    return ', '.join(get_language_from_code(code) for code in language.split('+'))
diff --git a/Main.py b/Main.py
@@ -0,0 +1,63 @@
+import argparse
+
+import OCR
+import Linguist
+
+
+def main():
+    """
+    Command-line entry point: parses the arguments and then starts the real-time OCR via ocr_stream().
+    The path to the Tesseract cmd root is the only mandatory argument; every other param is optional.
+
+    Example command-line use: python3 Main.py -t /usr/local/Cellar/tesseract/4.1.1/bin/tesseract
+
+    optional arguments:
+      -h, --help         show this help message and exit
+      -c  , --crop       crop OCR area in pixels (two vals required): width height
+      -v , --view_mode   view mode for OCR boxes display (default=1)
+      -sv, --show_views  show the available view modes and descriptions
+      -l , --language    code for tesseract language, use + to add multiple (ex: chi_sim+chi_tra)
+      -sl, --show_langs  show list of tesseract (4.0+) supported langs
+
+    required named arguments:
+      -t , --tess_path   path to the cmd root of tesseract install (see docs for further help)
+    """
+    parser = argparse.ArgumentParser()
+
+    # Required flag, registered in its own named argument group
+    required_group = parser.add_argument_group('required named arguments')
+    required_group.add_argument(
+        '-t', '--tess_path',
+        metavar='',
+        required=True,
+        help="path to the cmd root of tesseract install (see docs for further help)",
+    )
+
+    # Optional flags
+    parser.add_argument(
+        '-c', '--crop',
+        nargs=2,
+        type=int,
+        metavar='',
+        help="crop OCR area in pixels (two vals required): width height",
+    )
+    parser.add_argument(
+        '-v', '--view_mode',
+        default=1,
+        type=int,
+        metavar='',
+        help="view mode for OCR boxes display (default=1)",
+    )
+    parser.add_argument(
+        '-sv', '--show_views',
+        action="store_true",
+        help="show the available view modes and descriptions",
+    )
+    parser.add_argument(
+        "-l", "--language",
+        metavar='',
+        default=None,
+        help="code for tesseract language, use + to add multiple (ex: chi_sim+chi_tra)",
+    )
+    parser.add_argument(
+        "-sl", "--show_langs",
+        action="store_true",
+        help="show list of tesseract (4.0+) supported langs",
+    )
+
+    options = parser.parse_args()
+
+    # Informational flags print their output; the OCR stream is still started afterwards
+    if options.show_langs:
+        Linguist.show_codes()
+    if options.show_views:
+        print(OCR.views.__doc__)
+
+    # This is where OCR is started...
+    OCR.tesseract_location(options.tess_path)
+    OCR.ocr_stream(view_mode=options.view_mode, crop=options.crop, language=options.language)
+
+
+if __name__ == '__main__':
+    main()  # Example --tess_path value: '/usr/local/Cellar/tesseract/4.1.1/bin/tesseract'