-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwl.py
73 lines (70 loc) · 2.71 KB
/
wl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# import io
# import json
# from collections import defaultdict
# stemwords = defaultdict(dict)
# with io.open("word_list.txt", encoding='utf-8') as txt:
# index = 0
# for line in txt:
# line = line.strip()
# if len(line) > 0:
# index += 1
# wordEndIndex = line.find(">")
# word = line[2:wordEndIndex]
# line = line[wordEndIndex + 1:]
# baseEndIndex = line.find("]")
# base = line[1:baseEndIndex].strip()
# line = line[baseEndIndex + 1:]
# stem = None
# if len(base) >= 0:
# stemEndIndex = base.find('-')
# if stemEndIndex > 0:
# stem = base[:stemEndIndex]
# line = line[line.find("{") + 1: line.find("}")].strip()
# related = list()
# if len(line) > 0:
# split = line.split(",")
# for s in split:
# related.append(s[:s.find("|")])
# if stem == None and len(related) > 0:
# stem = related[0]
# if stem != None:
# stemwords[word] = defaultdict(dict)
# stemwords[word]["stem"] = stem
# stemwords[word]["related"] = related
# with io.open("process_word_list_marathi.txt", "w", encoding="utf-8") as tf:
# tf.write(repr(stemwords))
import io
import json
from collections import defaultdict
def process_word_list(filename):
    """Parse a word-list file into a ``word -> {"stem", "related"}`` mapping.

    The file is read as UTF-8, one entry per non-blank line.  Each line is
    parsed positionally, in this order:

    * the word is the text between the first two characters of the line
      (which are skipped) and the first ``>``;
    * the base is the text up to the next ``]``, minus its first character
      (presumably the opening ``[`` -- confirm against the data file);
    * the stem is the part of the base before its first ``-``, provided the
      dash is not the very first character;
    * the related words are the comma-separated items between ``{`` and
      ``}``, each truncated at its first ``|``.

    When the base yields no stem, the first related word is used as the stem.
    Entries that end up with no stem at all are omitted from the result.

    Args:
        filename: Path of the word-list file to read.

    Returns:
        A ``defaultdict(dict)`` keyed by word.  Every stored value is itself
        a ``defaultdict(dict)`` holding ``"stem"`` (a string) and
        ``"related"`` (a list of strings).  If a word occurs on several
        lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    stemwords = defaultdict(dict)
    with io.open(filename, encoding='utf-8') as txt:
        for raw_line in txt:
            line = raw_line.strip()
            if not line:
                continue

            # Without the closing ">" the word cannot be delimited; slicing
            # with find()'s -1 would only produce a garbage key, so skip.
            head, closer, rest = line.partition(">")
            if not closer:
                continue
            word = head[2:]

            # The base field is optional: when "]" is missing, treat the base
            # as empty and leave the remainder intact for the "{...}" lookup.
            base = ""
            base_field, closer, after_base = rest.partition("]")
            if closer:
                base = base_field[1:].strip()
                rest = after_base

            # A dash at index 0 means there is nothing before it to use.
            stem = None
            dash = base.find("-")
            if dash > 0:
                stem = base[:dash]

            related = []
            brace = rest.find("{")
            if brace != -1:
                # partition() keeps the whole tail when "}" is missing,
                # instead of dropping the last character.
                body = rest[brace + 1:].partition("}")[0].strip()
                if body:
                    # partition() keeps the whole item when it has no "|",
                    # instead of dropping its last character.
                    related = [
                        item.partition("|")[0] for item in body.split(",")
                    ]

            if stem is None and related:
                stem = related[0]
            if stem is None:
                continue

            entry = defaultdict(dict)
            entry["stem"] = stem
            entry["related"] = related
            stemwords[word] = entry
    return stemwords