-
Notifications
You must be signed in to change notification settings - Fork 4
/
paragrahvid3.py
executable file
·199 lines (161 loc) · 4.42 KB
/
paragrahvid3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re, sys, string
from pyvabamorf import analyze
from pprint import pprint
from urllib.request import urlopen
from urllib.parse import unquote
from urllib.error import URLError
from difflib import SequenceMatcher
import csv, html, codecs
#import collections
# Word-class codes and lemmas meant to be ignored; neither list is referenced
# anywhere else in this file.
IGN_TYYP = ['J', 'Z', 'D', 'P'] # conjunction, punctuation mark, operator, question word
IGN_WORD = ["olema"]
# Width, in characters, of the context window taken before and after a matched span.
BF = 25
# Minimum SequenceMatcher ratio for a candidate to count as a known law name.
SLV = 0.8
# Stricter minimum ratio applied to single-word / short law names.
SLV_YS = 0.95
# Word endings that identify a law name ("seadus" = act, "seadustik" = code).
LAWNAMES = ("seadus", "seadustik")
# Lower-cased law name ("nimi") -> abbreviation ("lyhend"); filled from
# seadused.csv by the script section at the bottom of the file.
seadused = {}
def get_para_at(blob, pos):
    """Return the number that ends immediately before *pos* in *blob*.

    The text ``blob[:pos]`` is scanned backwards, collecting consecutive
    decimal digits until the first non-digit character is met.

    NOTE(review): the previous body iterated over an undefined name
    ``latest`` and therefore always raised NameError. Using the text
    preceding *pos* is the reading suggested by the signature -- confirm
    this matches the intended behaviour.

    Args:
        blob: The text to inspect.
        pos: Index in *blob* just past the expected number.

    Returns:
        The number as a string without leading zeros (e.g. ``"12"`` for
        ``"0012"``), or None when no digit directly precedes *pos*.
    """
    digits = []
    for ch in reversed(blob[:pos]):
        # isdecimal() accepts only characters int() can parse; isnumeric()
        # would also admit e.g. superscripts and fractions.
        if not ch.isdecimal():
            break
        digits.append(ch)
    if not digits:
        return None
    # Digits were collected right-to-left; restore reading order and
    # normalise away leading zeros through int().
    return str(int("".join(reversed(digits))))
def in_dict(nimi):
    """Look up the known law whose name is most similar to *nimi*.

    Every key of the module-level ``seadused`` mapping is compared with
    *nimi* using ``difflib.SequenceMatcher``; the first key with the
    highest ratio wins.

    Returns:
        ``(ratio, value)`` for the best match when its ratio is above
        ``SLV``; otherwise ``(0, None)``.
    """
    best_ratio, best_value = 0, None
    for known_name, value in seadused.items():
        ratio = SequenceMatcher(None, known_name, nimi).ratio()
        if ratio > best_ratio:
            best_ratio, best_value = ratio, value
    if best_ratio <= SLV:
        return 0, None
    return best_ratio, best_value
def lyhike_nimi(cur, add_len):
    """Tell whether *cur* is a "short" law name.

    A name is short when it ends with one of ``LAWNAMES`` and is at most
    *add_len* characters longer than that ending.

    Returns:
        True if any ending in ``LAWNAMES`` satisfies both conditions,
        False otherwise.
    """
    return any(
        cur.endswith(suffix) and len(cur) - len(suffix) <= add_len
        for suffix in LAWNAMES
    )
def ins_or_add(start, end, data, paras):
    """Identify the law mentioned in ``data[start:end]`` and count it.

    The span is first shortened from the right until its last word ends
    with one of ``LAWNAMES`` (this drops trailing case endings). Then
    progressively shorter word suffixes of the span are matched against
    the known law names via ``in_dict`` and the best-scoring one is kept.

    Args:
        start: Start index of the candidate span in *data*.
        end: End index (exclusive) of the candidate span in *data*.
        data: The full document text.
        paras: Dict mapping law abbreviation -> occurrence count; updated
            in place when a sufficiently similar law name is found.

    Returns:
        None. If the span contains no word ending in a law-name stem the
        function returns without touching *paras* (previously this case
        raised IndexError).
    """
    # Trim one character at a time until the last word ends with a
    # law-name stem.
    while True:
        words = data[start:end].split()
        if not words:
            # Nothing left to trim: the span holds no law-name word.
            return
        if words[-1].endswith(LAWNAMES):
            break
        end -= 1

    jupid = data[start:end].split()
    best_score, best_tag = 0, None
    for i in range(len(jupid)):
        cur = " ".join(jupid[i:])
        if cur in LAWNAMES:
            # Only the bare generic word is left; nothing more to match.
            break
        score, tag = in_dict(cur)
        if score <= best_score:
            continue
        # Single-word / short law names must clear the stricter threshold;
        # multi-word names are accepted on any improvement.
        # NOTE(review): the original comment calls this the "last word"
        # case, yet the test is ``i == 1`` rather than the last index --
        # confirm which is intended.
        if (i == 1 or lyhike_nimi(cur, 4)) and score <= SLV_YS:
            continue
        best_score, best_tag = score, tag
        if best_score == 1:
            # Exact match; no shorter suffix can do better.
            break

    if best_score > SLV:
        paras[best_tag] = paras.get(best_tag, 0) + 1
def get_paras(data):
    """Count the laws referenced in *data*.

    Each regex hit ending in a word that starts with "seadus" is checked
    morphologically: the hit is kept only when the lemma of the first
    analysis pyvabamorf reports for its last word ends with one of
    ``LAWNAMES``. Kept hits are passed to ``ins_or_add``, which tallies
    them.

    Returns:
        A list of ``(abbreviation, count)`` pairs sorted by descending
        count.
    """
    counts = {}
    law_mention = re.compile(r'[^;.§]+seadus\w*')
    for match in law_mention.finditer(data):
        last_word = match.group(0).split()[-1]
        analysis = analyze(last_word)
        lemma = analysis[0]['analysis'][0]['lemma']
        if not lemma.endswith(LAWNAMES):
            continue
        ins_or_add(match.start(), match.end(), data, counts)
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)
def dl_doc(url, retries=5, delay=1.0):
    """Download *url* and return its body decoded to text.

    Args:
        url: The address to fetch.
        retries: Maximum number of attempts (at least one is always made).
        delay: Seconds to wait between failed attempts.

    Returns:
        The response body as a string, decoded with the charset declared
        in the response headers, or UTF-8 when none is declared.

    Raises:
        URLError: When every attempt failed; the last error is re-raised.
            (The previous implementation retried forever instead.)
    """
    import time  # local import: the module does not import time at file level

    attempts = max(1, retries)
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            # The context manager closes the connection once the body is read.
            with urlopen(url) as resource:
                # get_content_charset() returns None when the server sends
                # no charset; decode(None) would raise TypeError.
                charset = resource.headers.get_content_charset() or "utf-8"
                return resource.read().decode(charset)
        except URLError as e:
            print(e)
            last_error = e
            if attempt < attempts:
                time.sleep(delay)
    raise last_error
def decode_html(data):
    """Undo URL percent-encoding in *data*, then resolve HTML character references."""
    return html.unescape(unquote(data))
def strip_html(data):
    """Remove markup tags from *data*, keeping the text between them.

    Everything from a ``<`` up to the nearest following ``>`` is dropped.
    ``re.DOTALL`` lets a tag span several lines; without it, tags broken
    across lines were left in the text.

    Args:
        data: Text possibly containing HTML/XML tags.

    Returns:
        *data* with all tags removed.
    """
    return re.sub(r'<.*?>', '', data, flags=re.DOTALL)
def remove_newlines(data):
    """Flatten *data* to one line: line feeds become spaces, carriage returns are dropped."""
    line_break_map = {ord('\n'): ' ', ord('\r'): None}
    return data.translate(line_break_map)
def get_content(url):
    """Fetch *url* and return its plain text on a single line.

    The downloaded document is passed through ``decode_html``,
    ``strip_html`` and ``remove_newlines``, in that order.
    """
    raw = dl_doc(url)
    decoded = decode_html(raw)
    text_only = strip_html(decoded)
    return remove_newlines(text_only)
# Script entry: for every *.csv file named on the command line, download each
# listed document, detect the laws it references and write one line per
# document ("url" <TAB> "abbreviations") to paragrahvid.csv.
if len(sys.argv) >= 2:
    # Load the known laws: lower-cased name -> abbreviation.
    with open("seadused.csv", 'r') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t', quotechar='"')
        for row in reader:
            seadused[row["nimi"].lower()] = row["lyhend"]

    # The context manager closes the output file even if processing stops
    # part-way through.
    with codecs.open("paragrahvid.csv", 'w', 'utf-8') as fw:
        for arg in sys.argv[1:]:
            if not arg.endswith(".csv"):
                continue
            with open(arg) as f:
                data_file = f.readlines()
            # These two inputs carry the URL in the first whitespace-separated
            # column; all others carry it in the second.
            pos = 0 if arg in ("suur_eelnou.csv", "otsus.csv") else 1
            for line1 in data_file:
                fields = line1.split()
                if len(fields) <= pos:
                    # Blank or short line: there is no URL column to read.
                    continue
                url = fields[pos].strip('"')
                if len(url) <= 30:
                    # Too short to be a document address.
                    continue
                try:
                    data = get_content(url)
                except URLError as e:
                    # Skip documents that cannot be downloaded.
                    print(e)
                    continue
                parad = get_paras(data)
                if not parad:
                    continue
                tags = [tag for tag, _count in parad]
                line = "\"" + url + "\"\t\"" + " ".join(tags) + "\""
                # Progress output: the part of the line after the last "/".
                print("\"" + line.split("/")[-1])
                fw.write(line + "\n")
#pprint(words)