forked from Vanuan/bocharov_edr_loader
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parseuo.py
executable file
·179 lines (165 loc) · 5.43 KB
/
parseuo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/python3
import re
import xmltodict
import csv
# import from another module
import sys
import os
from pathlib import Path
# Pre-compiled patterns shared by the helper functions and the record loop.
# They are written as raw strings so that backslash sequences such as \w or \d
# reach the regex engine untouched; in a normal string literal they are
# invalid escape sequences that newer Python versions warn about.

# an entity reference such as &QUOT; (captured whole)
pre_entity = re.compile(r'(&\w+;)')
# a string made of exactly five digits
re_pindex = re.compile(r'(^\d{5}$)')
# any run of whitespace
multy_spaces = re.compile(r'\s+')
# a word consisting only of capital A-Z letters and digits
english_word = re.compile(r'^[A-Z\d]+$')
# a non-empty string of digits only
digits_re = re.compile(r'^\d+$')
# a non-empty string of zeros only
zeros_re = re.compile(r'^0+$')
# the zeros at the very start of a string
reduce_leading_zeros = re.compile(r'^0+')
# a "dd.dd" code (two digits, a dot, two digits), captured
kved_re = re.compile(r'(\d\d\.\d\d)')
# a single non-word character, usable as a word separator
split_words_re = re.compile(r'\W')
# a run of one or more non-digit characters
non_digit_re = re.compile(r'\D+')
def fix_pindex(p):
    """Normalise a raw postal index to a five-digit string.

    The value is stripped of surrounding whitespace. Anything containing a
    non-digit character is rejected. A run of leading zeros is collapsed to a
    single zero (e.g. '00012' -> '012') before the length is checked:

    * five digits  -> returned as is;
    * four digits  -> padded to five: a '0' is appended when the value
      already starts with '0', otherwise a '0' is put in front;
    * anything else -> rejected.

    Returns the five-digit string, or None when the value is rejected.
    """
    candidate = p.strip()
    if non_digit_re.search(candidate) is not None:
        # at least one non-digit character: not a usable index
        return None

    candidate = reduce_leading_zeros.sub('0', candidate)
    size = len(candidate)
    if size == 5:
        return candidate
    if size != 4:
        return None

    # exactly four digits left: restore the missing fifth one
    if candidate.startswith('0'):
        return candidate + '0'
    return '0' + candidate
def fix_edrpou(edrpou):
    """Normalise a raw EDRPOU code to an eight-digit string.

    The value is stripped of surrounding whitespace and rejected when it is
    longer than eight characters, is not made of digits only (this includes
    the empty string), or consists solely of zeros.

    Returns the code left-padded with zeros to eight digits, or None when
    the value is rejected.
    """
    code = edrpou.strip()
    if len(code) > 8:
        return None
    if digits_re.match(code) is None:
        return None
    if zeros_re.match(code) is not None:
        return None
    # only digits remain here, so zfill is a plain left zero-pad
    return code.zfill(8)
# Replacement table: each key is a single capital letter and each value is
# the letter it should be replaced with.
# NOTE(review): judging by the name, the keys are look-alike letters from
# other alphabets (Latin capitals plus 'Ў') and the values are the matching
# Ukrainian Cyrillic letters. Key and value glyphs are visually identical, so
# verify the actual code points (e.g. with ord()) before editing an entry.
letters_to_ua={
'Ў':"І",
'A': 'А',
'E': 'Е',
'I': 'І',
'K': 'К',
'M': 'М',
'H': 'Н',
'O': 'О',
'P': 'Р',
'C': 'С',
'Y': 'У',
'X': 'Х',
'B': 'В',
'T': 'Т'
}
def _read_record(src):
    """Return the raw text of the next <RECORD> element read from *src*.

    String fields (names etc.) may contain stray newlines, so lines are
    accumulated until one ends with '</RECORD>'. Returns None when the
    closing '</DATA>' tag or the end of the file is reached.
    """
    chunk = src.readline()
    # readline() returns '' at EOF; the closing tag may or may not be
    # followed by a newline, so compare the stripped text.
    if not chunk or chunk.strip() == '</DATA>':
        return None
    while not chunk.endswith('</RECORD>\n'):
        more = src.readline()
        if not more:
            # EOF in the middle of a record: hand back what was read instead
            # of looping forever on empty reads.
            break
        chunk += more
    return chunk


def _clean_record(raw):
    """Return *raw* escaped, whitespace-normalised and upper-cased.

    Newlines and tabs are removed, backslashes are doubled, curly single
    quotes become plain apostrophes, whitespace runs collapse to one space
    and the text is upper-cased. Entity references (e.g. &QUOT;) are then
    put back to lower case so the XML parser can recognise them.
    """
    text = (raw.strip()
            .replace('\n', '')
            .replace('\t', '')
            .replace('\\', '\\\\')
            .replace('‘', "'")
            .replace('’', "'"))
    text = multy_spaces.sub(' ', text)
    # upper-case everything to unify spelling
    text = text.upper()
    # upper() also upper-cased the entity references: restore them
    return pre_entity.sub(lambda found: found.group(0).lower(), text)


# NOTE(review): an earlier pass that replaced look-alike letters through
# letters_to_ua built its result in a variable that was never read, so it had
# no effect on the output and has been dropped. Reinstate it deliberately
# (operating on the parsed field values) if that clean-up is wanted.

uo_file_name = 'data/uo.csv'
multy_edrpou_founders = {}
line_processed = 0
no_founders_count = 0
orgs = {}
edrpou_founders = {}

# All files are opened as UTF-8 explicitly (the locale default is not
# reliable) and inside one `with` so they are closed even when a record
# raises. The csv file uses newline='' as the csv module requires.
with open("data/UO.xml.utf8", encoding='utf8') as f, \
        open(uo_file_name, 'w', encoding='utf8', newline='') as uo_file, \
        open("data/edrpou_founder.csv", "w", encoding='utf8') as ef, \
        open("data/multy_edrpou_founder.csv", "w", encoding='utf8') as mef:
    # the first two lines of the input are echoed and otherwise skipped
    print(f.readline())
    print(f.readline())
    uo_writer = csv.writer(uo_file, delimiter=';', quotechar='"', lineterminator='\n', quoting=csv.QUOTE_MINIMAL)

    while True:
        line = _read_record(f)
        if line is None:
            break
        line = _clean_record(line)

        # xml to dict
        try:
            data = xmltodict.parse(line)
        except Exception as e:
            print('\nError! {}'.format(str(e)))
            # The last token of the message is used as the error position
            # (presumably the column reported by the XML parser); skip the
            # context dump when it is not a number.
            try:
                e_n = int(str(e).split()[-1])
            except (IndexError, ValueError):
                e_n = None
            if e_n is not None:
                # clamp the start so a small offset does not wrap around
                print(1, line[max(e_n - 10, 0):e_n + 10])
            break

        record = data['RECORD']
        # empty elements come back as None, hence the `or ''` fallbacks
        edrpou = fix_edrpou(record['EDRPOU'] or '') or ''
        address = record['ADDRESS'] or ''
        name = record['NAME'] or ''
        sname = record['SHORT_NAME'] or ''
        boss = record['BOSS'] or ''
        kved_full = record['KVED'] or ''
        kved_found = kved_re.search(kved_full)
        kved = kved_found.group(1) if kved_found else ''

        # pindex: taken from the first five characters of the address; the
        # first seven characters are then cut off the address
        pindex = fix_pindex(address[0:5]) or ''
        address = address[7:]

        # name,short_name,edrpou,address,pindex,boss_name,kved_full,status,kved
        # NOTE(review): records are keyed by edrpou, so records sharing a code
        # (including every record whose code was rejected and became '')
        # overwrite each other - confirm this is intended.
        orgs[edrpou] = [name, sname, edrpou, address, pindex, boss, kved_full, record['STAN'], kved]

        # an absent or empty FOUNDERS element both mean "no founders"
        founders_node = record.get('FOUNDERS')
        if founders_node:
            founders = founders_node['FOUNDER']
            if isinstance(founders, list):
                multy_edrpou_founders[edrpou] = founders
                # empty FOUNDER elements are None and cannot be joined
                founder_names = [founder or '' for founder in founders]
                # the separator must not occur inside any founder text
                if any('&sem;' in founder for founder in founder_names):
                    print(founders)
                    raise Exception('Error! &sem; is bad separator for founders!')
                edrpou_founders[edrpou] = '&sem;'.join(founder_names)
            else:
                edrpou_founders[edrpou] = founders
        else:
            no_founders_count += 1

        line_processed += 1
        if line_processed % 10**4 == 0:
            print('Lines processed count: {}'.format(line_processed))
            # TODO: delete this
            # (original note; unclear whether it refers to the progress
            # print above or to the export below)

    uo_writer.writerows(orgs.values())
    for edrpou, founder in edrpou_founders.items():
        ef.write("'{}'\t'{}'\n".format(edrpou, founder))
    print('edrpous that have more then one founder count: {}. Writen to multy_edrpou_founder.csv file'.format(len(multy_edrpou_founders)))
    print("edrpous that don't have founders count: {}".format(no_founders_count))
    for edrpou, founders in multy_edrpou_founders.items():
        mef.write("'{}'\t'{}'\n".format(edrpou, founders))