-
Notifications
You must be signed in to change notification settings - Fork 0
/
regularEx.py
89 lines (72 loc) · 3.17 KB
/
regularEx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Using RegEx as a tool to understand languages.
# * gives us the ability to match like baa, baaa, baaa with ba*
# ? has the same ability as * but for single char
# . returns single characters
# ^ checks for starts with
# $ checks for ends with
def search(pattern, text):
"Match pattern anywhere in text; return longest earliest match or None."
for i in range(len(text)):
m = match(pattern, text[i:])
if (m is not None):
return m
def match(pattern, text):
"Match pattern against start of text; return longest match found or None."
remainders = matchset(pattern, text)
if remainders:
shortest = min(remainders, key=len)
return text[:(len(text) - len(shortest))]
#Grammar of a language is on which regEx works
#UI is used by the user while APIs are used by programmers
#Writing different language processing functions now
def matchset(pattern, text):
"Match pattern at start of text; return a set of remainders of text."
op, x, y = components(pattern)
if ('lit' == op):
return set([text[len(x):]]) if text.startswith(x) else null
elif ('seq' == op):
return set(t2 for t1 in matchset(x, text) for t2 in matchset(y, t1))
elif ('alt' == op):
return matchset(x, text) | matchset(y, text)
elif ('dot' == op):
return set([text[1:]]) if text else null
elif ('oneof' == op):
return set([text[1:]]) if text.startswith(x) else null
elif ('eol' == op):
return set(['']) if text == '' else null
elif ('star' == op):
return (set([text]) |
set(t2 for t1 in matchset(x, text)
for t2 in matchset(pattern, t1) if t1 != text))
else:
raise ValueError('unknown pattern: %s' % pattern)
null = frozenset()
def components(pattern):
"Return the op, x, and y arguments; x and y are None if missing."
x = pattern[1] if len(pattern) > 1 else None
y = pattern[2] if len(pattern) > 2 else None
return pattern[0], x, y
def test():
assert matchset(('lit', 'abc'), 'abcdef') == set(['def'])
assert matchset(('seq', ('lit', 'hi '),
('lit', 'there ')),
'hi there nice to meet you') == set(['nice to meet you'])
assert matchset(('alt', ('lit', 'dog'),
('lit', 'cat')), 'dog and cat') == set([' and cat'])
assert matchset(('dot',), 'am i missing something?') == set(['m i missing something?'])
assert matchset(('oneof', 'a'), 'aabc123') == set(['abc123'])
assert matchset(('eol',),'') == set([''])
assert matchset(('eol',),'not end of line') == frozenset([])
assert matchset(('star', ('lit', 'hey')), 'heyhey!') == set(['!', 'heyhey!', 'hey!'])
return 'tests pass'
print (test())
#filling out the APIs
def lit(string): return ('lit', string)
def seq(x, y): return ('seq', x, y)
def alt(x, y): return ('alt', x, y)
def star(x): return ('star', x)
def plus(x): return seq(x, star(x))
def opt(x): return alt(lit(''), x) #opt(x) means that x is optional
def oneof(chars): return ('oneof', tuple(chars))
dot = ('dot',)
eol = ('eol',)