-
Notifications
You must be signed in to change notification settings - Fork 36
Commit
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Project exclude paths | ||
/venv/ |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import csv | ||
import os | ||
|
||
|
||
def supported_langs_file():
    """
    Locate the text file containing every supported language code with its full name.

    The current working directory is checked first. If the file is not there, the
    directory containing this module is checked as well, so the lookup does not depend
    on where the script was launched from.

    :return: Path to Tesseract_Langs.txt, or None (after printing a notice) if it is not found
    """
    filename = "Tesseract_Langs.txt"

    if os.path.exists(filename):
        return filename

    # Fall back to a copy located next to this module, if there is one
    module_path = globals().get("__file__")
    if module_path is not None:
        candidate = os.path.join(os.path.dirname(os.path.abspath(module_path)), filename)
        if os.path.exists(candidate):
            return candidate

    print("The expected supported languages file is not in the directory. "
          "Tesseract_Langs.txt is available in the github repo")
    return None
|
||
|
||
def get_language_from_code(code):
    """
    Get the full-language name from an ISO 639-2/T code (tesseract-supported language code)
    Text file containing language codes is generated from tesseract's supported languages document:
    https://github.com/tesseract-ocr/tessdoc/blob/master/Data-Files-in-different-versions.md

    :param code: Language code to look up
    :return: Returns full language name if the code is matched, or returns the code in the case of
             no-match (including when the supported languages file cannot be found)
    """
    path = supported_langs_file()
    if path is None:  # No lookup table available, so the code itself is the best answer
        return code

    names_by_code = {}
    # newline='' lets the csv module handle line endings itself
    with open(path, 'r', newline='') as handle:
        for row in csv.reader(handle, delimiter='\t'):
            if len(row) < 2:  # Blank or malformed line: nothing to map
                continue
            names_by_code[row[0]] = row[1]

    # If no language name matches the code, just return the code
    return names_by_code.get(code, code)
|
||
|
||
def show_codes():
    """
    Print a list of all tesseract-supported language codes next to the full language name

    Nothing is listed when the supported languages file cannot be found;
    supported_langs_file() has already printed a notice in that case.
    """
    path = supported_langs_file()
    if path is None:
        return

    row_format = "{:<20s}{:<40s}"
    # newline='' lets the csv module handle line endings itself
    with open(path, 'r', newline='') as handle:
        print(row_format.format("CODE", "LANGUAGE"))
        for row in csv.reader(handle, delimiter='\t'):
            if len(row) < 2:  # Blank or malformed line: no code/name pair to show
                continue
            print(row_format.format(row[0], row[1]))
|
||
|
||
def language_string(language):
    """
    Build the human-readable language name(s) for the code(s) used in the ocr process

    :param language: ISO 639-2/T code (tesseract-supported), several codes joined with '+', or None
    :return: Full language name(s) separated by ', '; 'English' when no language is given
    """
    if language is None:  # Nothing specified by the user: Tesseract default
        return 'English'

    # Multiple languages have the format "lang1+lang2+lang3"
    full_names = [get_language_from_code(lang_code) for lang_code in language.split('+')]
    return ', '.join(full_names)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import argparse | ||
|
||
import OCR | ||
import Linguist | ||
|
||
|
||
def main():
    """
    Parse the command line arguments, then start the real-time OCR through ocr_stream().
    Only the path to the Tesseract cmd root is mandatory; every other parameter is optional.
    The --show_langs and --show_views flags print their listing and OCR is still started afterwards.

    Example command-line use: python3 Main.py -t /usr/local/Cellar/tesseract/4.1.1/bin/tesseract

    optional arguments:
      -h, --help         show this help message and exit
      -c , --crop        crop OCR area in pixels (two vals required): width height
      -v , --view_mode   view mode for OCR boxes display (default=1)
      -sv, --show_views  show the available view modes and descriptions
      -l , --language    code for tesseract language, use + to add multiple (ex: chi_sim+chi_tra)
      -sl, --show_langs  show list of tesseract (4.0+) supported langs

    required named arguments:
      -t , --tess_path   path to the cmd root of tesseract install (see docs for further help)
    """
    parser = argparse.ArgumentParser()

    # Required:
    required_group = parser.add_argument_group('required named arguments')
    required_group.add_argument(
        '-t', '--tess_path',
        required=True,
        metavar='',
        help="path to the cmd root of tesseract install (see docs for further help)",
    )

    # Optional:
    parser.add_argument(
        '-c', '--crop',
        nargs=2,
        type=int,
        metavar='',
        help="crop OCR area in pixels (two vals required): width height",
    )
    parser.add_argument(
        '-v', '--view_mode',
        default=1,
        type=int,
        metavar='',
        help="view mode for OCR boxes display (default=1)",
    )
    parser.add_argument(
        '-sv', '--show_views',
        action="store_true",
        help="show the available view modes and descriptions",
    )
    parser.add_argument(
        "-l", "--language",
        default=None,
        metavar='',
        help="code for tesseract language, use + to add multiple (ex: chi_sim+chi_tra)",
    )
    parser.add_argument(
        "-sl", "--show_langs",
        action="store_true",
        help="show list of tesseract (4.0+) supported langs",
    )

    options = parser.parse_args()

    # Informational listings requested by the user
    if options.show_langs:
        Linguist.show_codes()
    if options.show_views:
        print(OCR.views.__doc__)

    # This is where OCR is started...
    OCR.tesseract_location(options.tess_path)
    OCR.ocr_stream(
        view_mode=options.view_mode,
        crop=options.crop,
        language=options.language,
    )
|
||
|
||
# Run the command-line interface only when this file is executed as a script
if __name__ == '__main__':
    main()  # Example -t/--tess_path value: '/usr/local/Cellar/tesseract/4.1.1/bin/tesseract'