-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
193 lines (162 loc) · 6.83 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import argparse
import logging
import os
import shutil
import subprocess
import sys
import tempfile
from constants import VALID_IMAGE_EXTENSIONS, WINDOWS_CHECK_COMMAND, DEFAULT_CHECK_COMMAND, TESSERACT_DATA_PATH_VAR
def create_directory(path):
    """
    Create directory at given path if directory does not exist
    :param path: directory to create; missing parent directories are created as well
    :return: None
    """
    try:
        # exist_ok=True replaces the former "check, then create" pair, which
        # could fail if the directory appeared between the two calls.
        os.makedirs(path, exist_ok=True)
    except FileExistsError:
        # Something that is not a directory already lives at `path`. This
        # function has never raised for an existing path, so keep returning
        # quietly, but leave a trace because later writes into it will fail.
        logging.warning("`{}` already exists and is not a directory".format(path))
def check_path(path):
    """
    Tell whether anything exists at the given filesystem path
    :param path: file or directory path to probe
    :return: boolean
    """
    found = os.path.exists(path)
    return bool(found)
def get_command():
    """
    Pick the command used to identify whether tesseract is installed, based on the OS
    :return: WINDOWS_CHECK_COMMAND when sys.platform starts with 'win', DEFAULT_CHECK_COMMAND otherwise
    """
    on_windows = sys.platform.startswith('win')
    return WINDOWS_CHECK_COMMAND if on_windows else DEFAULT_CHECK_COMMAND
def _invoke_tesseract(image_file_name, output_base):
    """Run `tesseract image_file_name output_base` and log a non-zero exit status."""
    result = subprocess.run(['tesseract', image_file_name, output_base],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    if result.returncode != 0:
        # The output is captured, so without this a failed run leaves no trace.
        logging.error("tesseract exited with code {} for `{}`: {}".format(
            result.returncode, image_file_name,
            result.stderr.decode(errors="replace").strip()))
    return result


def run_tesseract(filename, output_path, image_file_name):
    """
    Run the `tesseract` executable on a single image
    :param filename: image file name; its extension is stripped to form the output base name
    :param output_path: directory the `<base name>.txt` result is written to; when falsy,
        the text is produced in a temporary directory and returned instead
    :param image_file_name: path of the image passed to tesseract
    :return: the recognised text when no output path is given, otherwise None
    :raises FileNotFoundError: if the `tesseract` executable cannot be started, or if no
        output path is given and tesseract did not produce its `.txt` file
    """
    filename_without_extension = os.path.splitext(filename)[0]

    if output_path:
        text_file_path = os.path.join(output_path, filename_without_extension)
        _invoke_tesseract(image_file_name, text_file_path)
        return None

    # If no output path is provided, work in a scratch directory. The context
    # manager removes it even when reading the result fails, which the earlier
    # mkdtemp()/rmtree() pair did not.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file = os.path.join(temp_dir, filename_without_extension)
        _invoke_tesseract(image_file_name, temp_file)
        with open('{}.txt'.format(temp_file), 'r', encoding="utf8") as f:
            return f.read()
def check_pre_requisites_tesseract():
    """
    Check if the pre-requisites required for running the tesseract application are satisfied or not

    Tesseract must be locatable with the command returned by get_command(). On
    Windows the environment variable named by TESSERACT_DATA_PATH_VAR must also
    be set to a readable directory.
    :param : NA
    :return: boolean
    """
    check_command = get_command()
    logging.debug("Running `{}` to check if tesseract is installed or not.".format(check_command))
    try:
        result = subprocess.run([check_command, 'tesseract'], stdout=subprocess.PIPE)
        tesseract_found = bool(result.stdout)
    except OSError:
        # The lookup tool itself could not be started; fall back to the
        # standard-library PATH search instead of crashing.
        tesseract_found = shutil.which('tesseract') is not None
    if not tesseract_found:
        logging.error("tesseract-ocr missing, install `tesseract` to resolve. Refer to README for more instructions.")
        return False
    logging.debug("Tesseract correctly installed!\n")

    # The data-path check below is only applied on Windows.
    if not sys.platform.startswith('win'):
        return True

    logging.debug("Checking if the Tesseract Data path is set correctly or not.\n")
    if TESSERACT_DATA_PATH_VAR not in os.environ:
        # Adjacent literals are used instead of a backslash inside the string,
        # so the message text no longer depends on source indentation.
        logging.error("Tesseract Data path Environment variable '{}' needs to be configured to point to "
                      "the tessdata!".format(TESSERACT_DATA_PATH_VAR))
        return False

    path = os.environ[TESSERACT_DATA_PATH_VAR]
    if not path:
        logging.error("Tesseract Data path Environment variable '{}' configured to an empty string!".format(
            TESSERACT_DATA_PATH_VAR))
        return False

    logging.debug("Checking if the path configured for Tesseract Data Environment variable `{}` "
                  "as `{}` is valid or not.".format(TESSERACT_DATA_PATH_VAR, path))
    if os.path.isdir(path) and os.access(path, os.R_OK):
        logging.debug("All set to go!")
        return True

    logging.error("Configured path for Tesseract data is not accessible!")
    return False
def main(input_path, output_path):
    """
    Run OCR on a single image file or on every supported image in a directory
    :param input_path: path of an image file or of a directory containing images
    :param output_path: directory that receives the text files; when falsy the
        recognised text is printed to stdout instead
    :return: None
    """
    # Check if tesseract is installed or not
    if not check_pre_requisites_tesseract():
        return

    # Check if a valid input directory is given or not
    if not check_path(input_path):
        logging.error("Nothing found at `{}`".format(input_path))
        return

    # Create output directory
    if output_path:
        create_directory(output_path)
        logging.debug("Creating Output Path {}".format(output_path))

    # Single file input
    if not os.path.isdir(input_path):
        filename = os.path.basename(input_path)
        logging.debug("The Input Path is a file {}".format(filename))
        text = run_tesseract(filename, output_path, input_path)
        # run_tesseract returns None when it wrote to output_path; printing
        # that would emit the literal word "None".
        if text is not None:
            print(text)
        return

    logging.debug("The Input Path is a directory.")

    # List the directory once so the count and the iteration agree
    filenames = os.listdir(input_path)
    if not filenames:
        logging.error("No files found at your input location")
        return

    # Iterate over all images in the input directory
    # and get text from each image
    other_files = 0
    successful_files = 0
    logging.info("Found total {} file(s)\n".format(len(filenames)))
    for filename in filenames:
        logging.debug("Parsing {}".format(filename))
        extension = os.path.splitext(filename)[1]
        if extension.lower() not in VALID_IMAGE_EXTENSIONS:
            other_files += 1
            continue
        image_file_name = os.path.join(input_path, filename)
        text = run_tesseract(filename, output_path, image_file_name)
        if text is not None:
            print(text)
        successful_files += 1

    logging.info("Parsing Completed!\n")
    if successful_files == 0:
        logging.error("No valid image file found.")
        logging.error("Supported formats: [{}]".format(
            ", ".join(VALID_IMAGE_EXTENSIONS)))
    else:
        logging.info(
            "Successfully parsed images: {}".format(successful_files))
        logging.info(
            "Files with unsupported file extensions: {}".format(other_files))
if __name__ == '__main__':
    # Check Python version before doing anything else, and exit with a
    # non-zero status so callers can detect the failure. sys.exit is used
    # because the bare exit() helper is only installed by the site module.
    if sys.version_info[0] < 3:
        logging.error("You are using Python {0}.{1}. Please use Python>=3".format(
            sys.version_info[0], sys.version_info[1]))
        sys.exit(1)

    parser = argparse.ArgumentParser()
    # NOTE(review): _action_groups is a private argparse attribute. Popping it
    # removes the parser's last built-in argument group, presumably so that the
    # two custom groups below are the ones shown in the help output.
    parser._action_groups.pop()
    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')
    required.add_argument('-i', '--input', help="Single image file path or images directory path", required=True)
    optional.add_argument('-o', '--output', help="(Optional) Output directory for converted text")
    optional.add_argument('-d', '--debug', action='store_true', help="Enable verbose DEBUG logging")
    args = parser.parse_args()

    input_path = os.path.abspath(args.input)
    # No output directory means the recognised text is printed instead
    output_path = os.path.abspath(args.output) if args.output else None

    logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
    logging.debug("Input Path is {}".format(input_path))

    main(input_path, output_path)