-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils_patterns.py
118 lines (102 loc) · 3.1 KB
/
utils_patterns.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import re
from collections import Counter
import logging
from logging.config import fileConfig
from sqlite3.dbapi2 import paramstyle
logger = logging.getLogger(__name__)
def get_regexes():
    """Return the ordered mapping of profile symbols to compiled patterns.

    The insertion order is significant: ``profile`` walks the mapping in
    order and masks each match before the next pattern runs, so the more
    specific uppercase patterns ("^" and "|") must come before "U".

    :rtype: dict
    """
    symbol_patterns = (
        # Two or more consecutive uppercase letters.
        ("^", "[A-Z]{2,}"),
        # One uppercase letter followed by lowercase letters.
        ("|", "[A-Z][a-z]+"),
        # Any remaining run of uppercase letters.
        ("U", "[A-Z]+"),
        # Run of lowercase letters.
        ("L", "[a-z]+"),
        # Run of characters in the U+4E00..U+9FFF range.
        ("*", r"[\u4e00-\u9fff]+"),
        # Run of digits.
        ("N", r"\d+"),
        # Run of brackets, braces, parentheses or single quotes.
        ("B", r"[()\[\]{}']+"),
        # Run of the listed punctuation characters.
        ("P", r"[,!@#$%&+=?]+"),
        # Run of separators: dot, space, underscore or hyphen.
        (".", r"[\. _\-]+"),
    )
    return {symbol: re.compile(pattern) for symbol, pattern in symbol_patterns}
def profile(input, regexes):
    """Describe ``input`` as a sequence of typed character runs.

    Each pattern in ``regexes`` is applied in mapping order. Every match is
    overwritten, character for character, with the pattern's symbol so that
    later patterns cannot match the same text again.

    :param input: the string to profile
    :param regexes: mapping of one-character symbol -> compiled pattern
    :return: dict with three keys:
        ``parts``   - list of ``{"start", "end", "type", "value"}`` dicts,
                      ordered by start offset;
        ``profile`` - run-length form, e.g. ``"U2N.N5"`` (a symbol followed
                      by its length when the run is longer than one);
        ``expand``  - the masked string, one symbol per matched character.
        In ``type``, ``profile`` and ``expand`` the internal symbols "^"
        and "|" are reported as "U" and "W".
    """
    def _public(symbol_text):
        # "^" and "|" are internal placeholders; callers see "U" and "W".
        return symbol_text.replace("^", "U").replace("|", "W")

    masked = input
    tokens_by_start = {}
    parts_by_start = {}

    for symbol, pattern in regexes.items():
        # finditer scans the string it was given when this pass began;
        # rebinding ``masked`` below does not disturb the running iterator,
        # and offsets stay valid because masking preserves length.
        for match in pattern.finditer(masked):
            begin, finish = match.span()
            text = match.group()
            width = len(text)

            # Keep the first description recorded for a given start offset.
            parts_by_start.setdefault(begin, {
                "start": begin,
                "end": finish,
                "type": _public(symbol),
                "value": text
            })

            masked = masked[:begin] + symbol * width + masked[finish:]
            tokens_by_start[begin] = symbol if width == 1 else symbol + str(width)

    ordered_starts = sorted(tokens_by_start)
    return {
        "parts": [parts_by_start.get(start) for start in ordered_starts],
        "profile": _public("".join(tokens_by_start[start] for start in ordered_starts)),
        "expand": _public(masked)
    }
def expand_profile(s):
    """Expand a run-length profile string into one symbol per character.

    A profile is a sequence of symbols, each optionally followed by a
    decimal repeat count, e.g. ``"W10.N2"`` -> ``"WWWWWWWWWW.NN"``. A symbol
    without a count stands for a single occurrence. Digits that appear
    before any symbol are ignored.

    :param s: run-length profile string
    :type s: str
    :return: the expanded profile
    :rtype: str
    """
    pieces = []
    symbol = None   # symbol currently being read, if any
    count = ""      # digits collected for that symbol so far

    def _flush():
        # Emit the pending symbol, repeated ``count`` times (once if no count).
        if symbol is not None:
            pieces.append(symbol * int(count) if count else symbol)

    for c in s:
        if c.isdigit():
            # Accumulate every digit so multi-digit counts such as "10" work.
            if symbol is not None:
                count += c
        else:
            _flush()
            symbol, count = c, ""

    # The last symbol has no following character to trigger its emission.
    _flush()
    return "".join(pieces)
def consolidate_profile(s):
    """Collapse an expanded profile into a compact signature.

    Runs of two or more "." become "-", and any remaining single "." is
    dropped. Runs of two or more "W", "N", "U" or "P" become the matching
    lowercase letter. Finally every "J" is replaced by "-".

    :param s: expanded profile string
    :type s: str
    :rtype: str
    """
    # Separator handling comes first: long runs turn into "-", lone dots vanish.
    collapsed = re.sub(r"\.\.+", "-", s)
    collapsed = collapsed.replace('.', '')

    # Repeated symbols shrink to a single lowercase marker, in this order.
    for run_pattern, marker in (("WW+", "w"), ("NN+", "n"), ("UU+", "u"), ("PP+", "p")):
        collapsed = re.sub(run_pattern, marker, collapsed)

    return collapsed.replace('J', '-')
def frequency(s):
    """Count how many times each character of ``s`` occurs.

    :type s: str
    :rtype: collections.Counter
    """
    return Counter(s)
if __name__ == "__main__":
    # Demo run: profile a few sample names and log every intermediate form.
    logging.basicConfig(level=logging.INFO)

    strings = [
        '1.4 Protection of Privacy - Information Incidents (Privacy Breaches)',
        'Accept Amendment to Disclosure Statement_02112019_0328',
        'DS1 29560',
        'RBC-20080401-ABCPmarket',
        'Accept Form V_05222018_1043',
        'Accept Disclosure_06202018_0915',
        'Deficiency (Discl)_06072018_0908',
        '29563 - Undertaking - Hillcrest Place',
        'BCFSA Application for Consent Application Section #2 - Sub-Section #2 - November 30 2019 MERGER AGREEMENTS',
        'Bulkley Valley - X020317'
    ]

    regexes = get_regexes()
    for sample in strings:
        # Compute everything first, then log: sample, profile fields, summary.
        result = profile(sample, regexes=regexes)
        expanded = expand_profile(result.get("profile"))
        consolidated = consolidate_profile(expanded)
        char_counts = frequency(sample)

        logger.info(sample)
        for field, value in result.items():
            logger.info("\t:{}: {}".format(field, value))
        # Character counts, expanded profile and consolidated profile,
        # one per tab-indented line.
        logger.info("\t{}\n\t{}\n\t{}".format(char_counts, expanded, consolidated))