-
Notifications
You must be signed in to change notification settings - Fork 8
/
data-stats.py
28 lines (26 loc) · 1.01 KB
/
data-stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import os
import re
import gzip
from collections import Counter
# Captures the "<digits>-<letter>" tail of tokens such as "eng-30-00001740-n"
# (presumably WordNet 3.0 synset ids -- TODO confirm). Compiled once because
# it is applied to every line of the corpus.
WN_OFFSET_RE = re.compile(r'eng-30-(\d+-\w)')


def iter_corpus_paths(directory):
    """Yield the path of every file under *directory* named ``*.txt.gz``.

    The whole tree is walked recursively with ``os.walk``. Only the file
    name suffix is checked; the file contents are not inspected here.
    """
    for root, _subdirs, fnames in os.walk(directory):
        for fname in fnames:
            # Suffix test instead of a loose regex, so that names such as
            # "a.txtxgz" or "a.txt.gz.bak" are not picked up by accident.
            if fname.endswith('.txt.gz'):
                yield os.path.join(root, fname)


def count_synsets(directory, progress_every=1000000):
    """Count ``eng-30-...`` ids in all gzipped text files under *directory*.

    Args:
        directory: Root directory that is searched recursively.
        progress_every: Print the running number of lines read each time it
            reaches a multiple of this value. ``0`` or ``None`` disables
            the progress output.

    Returns:
        A ``collections.Counter`` mapping each captured id (for example
        ``'00001740-n'``) to its number of occurrences. Keys appear in the
        order in which they were first seen.
    """
    counts = Counter()
    lines_seen = 0
    for path in iter_corpus_paths(directory):
        # NOTE(review): text mode uses the platform default encoding; if the
        # corpus is known to be UTF-8, passing encoding='utf-8' explicitly
        # would make this independent of the locale -- confirm.
        with gzip.open(path, 'rt') as f:
            for line in f:
                counts.update(WN_OFFSET_RE.findall(line))
                lines_seen += 1
                if progress_every and lines_seen % progress_every == 0:
                    print(lines_seen)
    return counts


def write_counts(counts, out_path):
    """Write the values of *counts* to *out_path*, one integer per line.

    Only the counts are written (not the ids), in the iteration order of
    *counts*. The parent directory of *out_path* is created if it does not
    exist, so a long counting run is not lost to a missing output folder.
    """
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, 'wt') as f:
        f.writelines('%d\n' % n for n in counts.values())


if __name__ == '__main__':
    disambiguated_wikipedia_dirs = (
        'data/disambiguated-wikipedia-wordnet/hdn',
        'data/disambiguated-wikipedia-wordnet/synset',
    )
    for corpus_dir in disambiguated_wikipedia_dirs:
        counts = count_synsets(corpus_dir)
        # One output file per input directory, named after its last
        # path component ("hdn", "synset").
        write_counts(
            counts,
            'output/synset-count-%s.csv' % os.path.basename(corpus_dir))
        print(counts.most_common(100))