-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecognize.py
130 lines (98 loc) · 4.13 KB
/
recognize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import pytesseract
import glob
import cv2
import random
import re
import os
from random import randint
import s3_bucket
# Regex alternation of everything stripped from OCR output before a word is
# returned: guillemet, angle brackets, parentheses, digit runs, common
# punctuation (? - . , ! / ; : _), em dash, newline and the pipe character.
extra_symbols = r"\»|\<|\>|\)|\(|\d+|\?|\-|\.|\,|\!|\/|\;|\:|\:-|\_|\—|\n|\|"
def get_cropped_word(postfix, points, orig):
    """Cut a padded window for one word out of the source image.

    Args:
        postfix: 2-tuple interpolated into the output file name
            ``'cropped %s (%s).png'``.
        points: ``(xmin, xmax, ymin)`` describing the word's horizontal
            extent and the reference row it was detected on.
        orig: image array indexed as ``[row, column]``.

    Returns:
        ``(path, file_name, cropped)`` where ``path`` is ``file_name``
        prefixed with ``'./cropped/'`` and ``cropped`` is the slice of
        ``orig`` covering the padded window.
    """
    left, right, base_row = points
    file_name = 'cropped %s (%s).png' % postfix

    # Pad 7 columns to the left and 10 to the right; never go below index 0.
    col_from = max(left - 7, 0)
    col_to = right + 10
    # Take 32 rows above the reference row and 7 below it.
    row_from = max(base_row - 32, 0)
    row_to = base_row + 7

    window = orig[row_from:row_to, col_from:col_to]
    print('cropping by: ymin: %s, ymax: %s, xmin: %s, xmax: %s' % (row_from, row_to, col_from, col_to))
    return ('./cropped/' + file_name, file_name, window)
def crop_words_on_line(y_line_df, line_idx, orig, max_gap=100):
    """Split the points of one text line into words and crop each word.

    Points are walked in ascending ``x1`` order. Consecutive points whose
    ``x1`` values are less than ``max_gap`` apart are merged into one span;
    a larger jump closes the current span and starts a new one. Every span
    is then handed to ``get_cropped_word``.

    Args:
        y_line_df: DataFrame with at least ``x1`` and ``y1`` columns, one
            row per detected segment on this line.
        line_idx: index of the line; becomes the first element of the
            postfix passed to ``get_cropped_word``.
        orig: source image passed through to ``get_cropped_word``.
        max_gap: smallest ``x1`` jump that separates two words.

    Returns:
        A list with one ``get_cropped_word`` result per span, in
        left-to-right order. Empty when ``y_line_df`` has no rows.
    """
    ordered = y_line_df.sort_values(by=['x1'])
    spans = []
    # None (not 0) marks "no span open": 0 is a legitimate x coordinate.
    span_start = span_end = span_y = None
    for x, y in zip(ordered['x1'], ordered['y1']):
        if span_start is None:
            span_start, span_end, span_y = x, x, y
        elif x > span_end:
            if x - span_end < max_gap:
                span_end = x
            else:
                spans.append((span_start, span_end, span_y))
                # The new span keeps the y of its first (leftmost) point.
                span_start, span_end, span_y = x, x, y
    if span_start is not None:
        spans.append((span_start, span_end, span_y))
    return [
        get_cropped_word((line_idx, idx), span, orig)
        for idx, span in enumerate(spans)
    ]
def get_words_on_photo(path, file_id):
    """Find marked words on a photo, crop them and run OCR on each crop.

    Steps:
      1. Threshold the image in HSV with the range the code calls "blue"
         and write the mask to ``mask.jpg``.
      2. Detect line segments on the mask's Canny edges with the
         probabilistic Hough transform.
      3. Bucket segments into 40-row bands (from row 60 up to 3500) by
         ``y1``; bands with fewer than 30 segments are ignored.
      4. For each remaining band, split it into words with
         ``crop_words_on_line``, write each crop to disk, upload it and
         pass it to ``pytesseract``.
      5. Write the annotated debug image to ``houghlines5.jpg`` and upload
         the source, mask and debug images.

    Args:
        path: path of the image to read.
        file_id: string used to build the names of uploaded objects.

    Returns:
        List of recognised strings with everything matched by
        ``extra_symbols`` removed and surrounding whitespace stripped.
        Entries may be empty strings. Empty list when no segments are found.

    Raises:
        cv2.error: if the image at ``path`` cannot be read.
    """
    orig = cv2.imread(path)
    if orig is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise cv2.error('could not read image: %s' % path)
    # Separate canvas for debug drawing so the crops taken from `orig`
    # never contain the overlay lines.
    debug_canvas = orig.copy()

    # Alternative, narrower bounds left in the original source:
    # [100, 10, 10] .. [150, 255, 255]
    lower_blue = np.array([90, 0, 0])
    upper_blue = np.array([170, 255, 255])
    hsv = cv2.cvtColor(orig, cv2.COLOR_BGR2HSV)
    mask = cv2.inRange(hsv, lower_blue, upper_blue)
    cv2.imwrite('mask.jpg', mask)
    print('mask of the photo created')

    blue_hough_lines = cv2.HoughLinesP(cv2.Canny(mask, 50, 150), 1, np.pi/180, 0, maxLineGap=150)

    words = []
    # HoughLinesP yields None (not an empty array) when nothing is detected.
    if blue_hough_lines is not None:
        # collect hough lines under word to df
        df = pd.DataFrame(
            blue_hough_lines.reshape(-1, 4),
            columns=['x1', 'y1', 'x2', 'y2'],
        )

        # group lines by height
        y_arrange_values = np.arange(60, 3500, 40)
        sorted_df = df.sort_values(by=['y1', 'y2'])
        # Group by the bare grouper (not a one-element list) so keys are
        # Interval objects on every pandas version; observed=False keeps
        # empty bands in the iteration so `idx` stays tied to band position.
        grouped_df = sorted_df.groupby(
            pd.cut(sorted_df['y1'], y_arrange_values), observed=False
        )

        for idx, (key, item) in enumerate(grouped_df):
            cv2.line(debug_canvas, (0, int(key.left)), (3000, int(key.right)), (0, 0, 255), 3, cv2.LINE_AA)
            if len(item) < 30:
                continue

            y_line_df = item.reset_index(drop=True)
            color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
            # underline word with different colors
            for x1, y1, x2, y2 in y_line_df.itertuples(index=False):
                cv2.line(debug_canvas, (int(x1), int(y1)), (int(x2), int(y2)), color, 3, cv2.LINE_AA)

            # find words in y line by continuous x values
            cropped_files = crop_words_on_line(y_line_df, idx, orig)

            # write cropped words in files
            for croppedFilePath, file_name, cropped in cropped_files:
                # imwrite does not create missing directories; without this
                # the os.stat below fails on a fresh checkout.
                out_dir = os.path.dirname(croppedFilePath)
                if out_dir:
                    os.makedirs(out_dir, exist_ok=True)
                cv2.imwrite(croppedFilePath, cropped)
                stat_info = os.stat(croppedFilePath)
                # recognize cropped words
                if stat_info.st_size > 0:
                    # debug cropped files
                    # presumably (local path, remote name) -- confirm against s3_bucket
                    s3_bucket.upload_file(croppedFilePath, file_id + file_name)
                    word = pytesseract.image_to_string(cropped)
                    print("pytesseract result for: ", croppedFilePath, " - ", word)
                    words.append(re.sub(extra_symbols, "", word).strip())

    cv2.imwrite('houghlines5.jpg', debug_canvas)
    # debug source file
    s3_bucket.upload_file(path, 'file %s.png' % (file_id))
    s3_bucket.upload_file('./mask.jpg', 'mask %s.png' % (file_id))
    s3_bucket.upload_file('./houghlines5.jpg', 'houghlines5 %s.png' % (file_id))
    return words