-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
60 lines (48 loc) · 1.38 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import string
import re
def get_bn2wn_mapping(path):
    """
    Returns a dictionary with a mapping between
    BabelNet synsets and WordNet synsets

    Every non-blank line of the file at `path` is split on whitespace:
    the first field is used as the key (BabelNet synset) and the second
    field as the value (WordNet synset). Any further fields on the line
    are ignored. Blank lines are skipped. If the same key appears more
    than once, the last occurrence wins.

    Raises ValueError if a non-blank line has fewer than two fields.
    """
    bn2wn = {}
    with open(path, encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # blank line (typically a trailing newline at end of file)
                continue
            if len(fields) < 2:
                raise ValueError(
                    "{}:{}: expected at least 2 fields, got {!r}".format(
                        path, line_number, line.strip()))
            # TODO: check the line with 3 entries
            # (only the first two fields are kept for now)
            bn, wn = fields[:2]
            bn2wn[bn] = wn
    return bn2wn
def process_text(s):
    """
    Normalizes a string: drops every character listed in
    `string.punctuation`, squeezes runs of spaces into a
    single space and lower-cases the result
    """
    # deletion table: each punctuation character maps to nothing
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = s.translate(punctuation_table)
    # only the space character is collapsed; tabs/newlines are untouched
    single_spaced = re.sub(' +', ' ', without_punctuation)
    return single_spaced.lower()
def get_longest_lemma_from_anchor(lemm_anchor, lemmas):
    """
    Picks, among the lemmas that contain `lemm_anchor` as a
    substring, the one with the most characters (the high precision
    specification of Eurosense). On a tie the earliest lemma wins.

    Raises ValueError when no lemma contains the anchor.
    """
    candidates = [lemma for lemma in lemmas if lemm_anchor in lemma]
    # max() keeps the first maximal element and raises on an empty list
    return max(candidates, key=len)
def filter_sense_embeddings(path):
    """
    Removes word embeddings from a word2vec
    formatted embeddings file

    The file at `path` is rewritten in place. A line is kept only when
    its key (the text before the first space) contains an underscore;
    every other line, including the original "count dimensions" header,
    is dropped. A fresh header is written with the number of kept lines
    and the vector size taken from the first kept line ("0 0" when no
    line is kept).
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Strip the line terminator here so each kept line is written
        # back with exactly one newline, even if the last line of the
        # input had none.
        senses = [
            line.rstrip('\n')
            for line in f
            if '_' in line.split(' ', 1)[0]
        ]

    # Fields after the key are the vector components; split() without an
    # argument ignores trailing whitespace that would inflate the count.
    dims = len(senses[0].split()) - 1 if senses else 0

    # All kept lines are already in memory, so truncating the file is safe.
    with open(path, 'w', encoding='utf-8') as f:
        f.write("{} {}\n".format(len(senses), dims))
        f.writelines(sense + '\n' for sense in senses)