Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
nathanaday committed Apr 9, 2021
0 parents commit 66a894d
Show file tree
Hide file tree
Showing 13 changed files with 768 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Project exclude paths
/venv/
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/Project_Default.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions .idea/realtimeTesseract.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

70 changes: 70 additions & 0 deletions Linguist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import csv
import os


def supported_langs_file():
    """
    Locate the text file listing every supported language code with its full name.
    The file is looked up by relative name, i.e. in the current working directory.
    :return: The file name "Tesseract_Langs.txt" if it exists, otherwise None (after printing a notice)
    """
    langs_filename = "Tesseract_Langs.txt"

    if not os.path.exists(langs_filename):
        print("The expected supported languages file is not in the directory. "
              "Tesseract_Langs.txt is available in the github repo")
        return None

    return langs_filename


def get_language_from_code(code):
    """
    Get the full-language name from an ISO 639-2/T code (tesseract-supported language code)
    Text file containing language codes is generated from tesseract's supported languages document:
    https://github.com/tesseract-ocr/tessdoc/blob/master/Data-Files-in-different-versions.md
    :param code: Tesseract language code to look up
    :return: Returns full language name if the code is matched, or returns the code in the case of no-match
             (this includes the case where the supported languages file is not available)
    """
    langs_path = supported_langs_file()
    if langs_path is None:
        # supported_langs_file() has already printed a notice; treat a missing file like a no-match
        # instead of crashing in open(None)
        return code

    # newline='' lets the csv module handle line endings itself, as the csv docs recommend
    with open(langs_path, 'r', newline='') as langs_file:
        reader = csv.reader(langs_file, delimiter='\t')
        # Rows are expected as "<code>\t<name>"; blank or incomplete rows are skipped.
        # On duplicate codes the last row wins.
        names_by_code = {row[0]: row[1] for row in reader if len(row) >= 2}

    # If no language name matches the code, just return the code
    return names_by_code.get(code, code)


def show_codes():
    """
    Print a list of all tesseract-supported language codes next to the full language name
    If the supported languages file is not available, nothing is listed (a notice has already
    been printed by supported_langs_file()).
    """
    langs_path = supported_langs_file()
    if langs_path is None:
        # Avoid open(None) raising TypeError when the file is missing
        return

    row_format = "{:<20s}{:<40s}"  # Two left-aligned columns: code (20 wide), language name (40 wide)

    # newline='' lets the csv module handle line endings itself, as the csv docs recommend
    with open(langs_path, 'r', newline='') as langs_file:
        reader = csv.reader(langs_file, delimiter='\t')
        print(row_format.format("CODE", "LANGUAGE"))
        for row in reader:
            if len(row) < 2:  # Skip blank or incomplete rows instead of raising IndexError
                continue
            print(row_format.format(row[0], row[1]))


def language_string(language):
    """
    Build a comma-separated string of full language names from the code(s) used in the ocr process
    :param language: ISO 639-2/T code (tesseract-supported), several codes joined as "lang1+lang2+lang3",
                     or None when the user has not specified a language
    :return: Full language name(s) joined with ", "; 'English' when language is None
    """
    if language is None:  # Tesseract default
        return 'English'

    # Each '+'-separated code is translated individually, keeping the original order
    return ', '.join(get_language_from_code(code) for code in language.split('+'))
63 changes: 63 additions & 0 deletions Main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import argparse

import OCR
import Linguist


def main():
    """
    Parse the command line arguments and start the real-time OCR through OCR.ocr_stream().

    The path to the Tesseract cmd root (-t) is the only required argument.
    Example command-line use: python3 Main.py -t /usr/local/Cellar/tesseract/4.1.1/bin/tesseract

    optional arguments:
      -h, --help         show this help message and exit
      -c , --crop        crop OCR area in pixels (two vals required): width height
      -v , --view_mode   view mode for OCR boxes display (default=1)
      -sv, --show_views  show the available view modes and descriptions
      -l , --language    code for tesseract language, use + to add multiple (ex: chi_sim+chi_tra)
      -sl, --show_langs  show list of tesseract (4.0+) supported langs

    required named arguments:
      -t , --tess_path   path to the cmd root of tesseract install (see docs for further help)

    Note: -sl and -sv only print their information; the OCR stream is still started afterwards.
    """
    parser = argparse.ArgumentParser()

    # Required arguments live in their own group so they get a separate heading in --help
    required_group = parser.add_argument_group('required named arguments')
    required_group.add_argument(
        '-t', '--tess_path',
        help="path to the cmd root of tesseract install (see docs for further help)",
        metavar='',
        required=True)

    # Optional arguments
    parser.add_argument(
        '-c', '--crop',
        help="crop OCR area in pixels (two vals required): width height",
        nargs=2,
        type=int,
        metavar='')
    parser.add_argument(
        '-v', '--view_mode',
        help="view mode for OCR boxes display (default=1)",
        default=1,
        type=int,
        metavar='')
    parser.add_argument(
        '-sv', '--show_views',
        help="show the available view modes and descriptions",
        action="store_true")
    parser.add_argument(
        "-l", "--language",
        help="code for tesseract language, use + to add multiple (ex: chi_sim+chi_tra)",
        metavar='',
        default=None)
    parser.add_argument(
        "-sl", "--show_langs",
        help="show list of tesseract (4.0+) supported langs",
        action="store_true")

    options = parser.parse_args()

    # Informational flags: print the requested listing before OCR starts
    if options.show_langs:
        Linguist.show_codes()
    if options.show_views:
        print(OCR.views.__doc__)

    # This is where OCR is started...
    OCR.tesseract_location(options.tess_path)
    OCR.ocr_stream(view_mode=options.view_mode, crop=options.crop, language=options.language)


if __name__ == "__main__":
    # Run only when executed as a script. The tesseract path is passed with -t/--tess_path,
    # e.g. '/usr/local/Cellar/tesseract/4.1.1/bin/tesseract'
    main()
Loading

0 comments on commit 66a894d

Please sign in to comment.