-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
188 lines (134 loc) · 5.13 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import re
import os
import PyPDF2
from PIL import Image
import pytesseract
# Pattern for an IBAN as it may appear in extracted text: two uppercase
# letters, two digits, five groups of four digits and a final group of one to
# four digits. One optional whitespace character is accepted after the
# letters, after the two digits and after each four-digit group. Only digits
# are accepted after the two leading letters.
IBAN_REGEX = r'([A-Z]{2}\s?\d{2}\s?(?:\d{4}\s?){5}\d{1,4})'
# Same layout as IBAN_REGEX with no whitespace allowed; used to validate an
# IBAN once its whitespace has been removed.
IBAN_CLEANED_REGEX = r'([A-Z]{2}\d{2}(?:\d{4}){5}\d{1,4})'
# IBAN searched for when this module is run as a script.
TARGET_IBAN = 'FR7611111111111111111111111'
# EXTRACT TEXT FROM FILES
def extract_text_pdf(pdf_path: str) -> str | None:
    '''Return the text of every page of a PDF file, concatenated in page order.

    The file at ``pdf_path`` is opened in binary mode and read with
    ``PyPDF2.PdfFileReader``; pages are visited from index 0 to
    ``numPages - 1``.
    '''
    with open(pdf_path, 'rb') as stream:
        reader = PyPDF2.PdfFileReader(stream)
        # The generator is consumed inside the ``with`` block because the
        # reader needs the underlying file to still be open.
        page_texts = (
            reader.getPage(index).extractText()
            for index in range(reader.numPages)
        )
        return ''.join(page_texts)
def extract_text_image(image_path: str) -> str | None:
    '''Return the text recognised by Tesseract OCR in an image file.

    The image is opened with ``PIL.Image.open`` and passed to
    ``pytesseract.image_to_string``.
    '''
    # Use the image as a context manager so its file handle is released
    # deterministically instead of being left to garbage collection.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)
def search_regex(text: str | None, regex_pattern: str) -> list[str] | None:
'''Search regex_pattern in a given text'''
if text is None:
return None
matches = re.findall(regex_pattern, text)
return matches
def get_file_type(file_name: str) -> str:
    '''Return the lower-cased extension of ``file_name``, leading dot included.

    An empty string is returned when the name has no extension.
    '''
    suffix = os.path.splitext(file_name)[1]
    return suffix.lower()
# IBAN RELATED FUNCTIONS
def format_iban(iban_str: str, expected_format_regex: str) -> str:
    '''Return ``iban_str`` with all whitespace removed, in upper case.

    Raises:
        AssertionError: if the whitespace-free string does not match
            ``expected_format_regex`` at its start.
    '''
    compact = re.sub(r'\s+', '', iban_str)
    if not re.match(expected_format_regex, compact):
        raise AssertionError('Iban bad format')
    return compact.upper()
def detect_target_iban(text: str | None,
                       target_iban: str,
                       iban_raw_regex: str = IBAN_REGEX,
                       iban_cleaned_regex: str = IBAN_CLEANED_REGEX
                       ) -> bool:
    '''Tell whether ``target_iban`` appears among the IBANs found in ``text``.

    Candidates are located with ``iban_raw_regex``, normalised with
    ``format_iban`` (whitespace removed, validated against
    ``iban_cleaned_regex``) and compared to ``target_iban``.

    Returns:
        ``True`` if one normalised candidate equals ``target_iban``;
        ``False`` otherwise, including when ``text`` is ``None`` or contains
        no candidate.

    Raises:
        AssertionError: propagated from ``format_iban`` when a candidate does
            not match ``iban_cleaned_regex``.
    '''
    matches = search_regex(text, iban_raw_regex)
    # ``search_regex`` returns None for a None text and [] when nothing
    # matches; both mean the target cannot be present.
    if not matches:
        return False
    # Every candidate is normalised (and therefore validated) before the
    # membership test, so a malformed candidate is always reported.
    formatted_matches = {
        format_iban(match, iban_cleaned_regex) for match in matches
    }
    return target_iban in formatted_matches
# DATE EXTRACTION
def detect_date(text: str | None) -> list[str] | None:
    '''Return the distinct dates found in ``text``.

    A date is one or two digits, ``/``, one or two digits, ``/`` and two to
    four digits. Each match goes through ``format_date``.

    Returns:
        The distinct formatted dates in order of first appearance (an empty
        list when none is found), or ``None`` when ``text`` is ``None``.
    '''
    regex = r'\d{1,2}\/\d{1,2}\/\d{2,4}'
    matches = search_regex(text, regex)
    if matches is None:
        # No text could be extracted from the document.
        return None
    # dict.fromkeys removes duplicates while keeping first-seen order, which
    # makes the result reproducible from one run to the next.
    return list(dict.fromkeys(format_date(match) for match in matches))
def format_date(date_str: str) -> str:
    '''Return the date string in its output form.

    Currently a pass-through: ``date_str`` is returned unchanged.
    '''
    return date_str
# AMOUNT EXTRACTION
def detect_amount(text: str | None) -> list[float] | None:
    '''Return the distinct euro amounts found in ``text``.

    An amount is a run of digits (optionally grouped by three with a single
    whitespace character), an optional comma followed by one or two decimals,
    an optional whitespace character and the ``€`` sign. Each match is
    converted with ``format_amount``.

    Returns:
        The distinct amounts in order of first appearance (an empty list when
        none is found), or ``None`` when ``text`` is ``None``.
    '''
    regex = r'\d{1,3}(?:\s?\d{3})*(?:,\d{1,2})?\s?€'
    matches = search_regex(text, regex)
    # TODO: also handle amounts written as "8 EUR" and "€8".
    if matches is None:
        # No text could be extracted from the document.
        return None
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(format_amount(match) for match in matches))
def format_amount(amount_str: str) -> float:
    '''Extract the float value of the currency amount in ``amount_str``.

    The first number found is used. It may group thousands with a single
    whitespace character and use a comma as decimal separator, e.g.
    ``"1 234,56 €"`` gives ``1234.56``.

    Raises:
        AssertionError: if ``amount_str`` contains no number.
    '''
    extraction_regex = r'(\d+(?:\s?\d{3})*(?:,\d{1,2})?)'
    # Only the first number matters, so a single search is enough.
    found = re.search(extraction_regex, amount_str)
    if found is None:
        raise AssertionError('Impossible to extract amount')
    # The pattern accepts whitespace between thousands groups, but float()
    # rejects it: strip it before converting.
    digits = re.sub(r'\s+', '', found.group(1))
    return float(digits.replace(',', '.'))
# MAIN FUNCTION
def extract_gather_information(filename: str,
                               iban_target: str,
                               date_needed: bool = True,
                               amount_needed: bool= True
                               ) -> dict:
    '''Extract the text of a document and gather the information found in it.

    Accepted extensions:
    - `pdf`
    - images: `png`, `jpg`, `jpeg`

    Returns:
        A dict with the key ``"target_iban_present"`` and, when requested
        through ``date_needed`` / ``amount_needed``, the keys
        ``"detected_dates"`` and ``"detected_amounts"``.

    Raises:
        TypeError: if the file extension is not one of the accepted ones.
    '''
    # Map each accepted extension to the function that reads that format.
    extractors = {
        '.pdf': extract_text_pdf,
        '.png': extract_text_image,
        '.jpg': extract_text_image,
        '.jpeg': extract_text_image,
    }
    extractor = extractors.get(get_file_type(filename))
    if extractor is None:
        raise TypeError('Unknown file format')
    text = extractor(filename)

    # The IBAN check always runs; dates and amounts are optional.
    report = {"target_iban_present": detect_target_iban(text, iban_target)}
    if date_needed:
        report["detected_dates"] = detect_date(text)
    if amount_needed:
        report["detected_amounts"] = detect_amount(text)
    return report
# Sample documents processed when this module is run as a script.
FILENAMES = [
'files/fichier1.pdf',
'files/fichier2.jpeg',
'files/fichier3.pdf',
'files/fichier4.jpg',
'files/fichier5.png',
'files/fichier6.png',
'files/test.jpeg'
]
# Script entry point: print the information gathered from each sample file.
if __name__ == "__main__":
    for path in FILENAMES:
        gathered = extract_gather_information(path, TARGET_IBAN)
        print(gathered)