-
Notifications
You must be signed in to change notification settings - Fork 3
/
labeler.py
68 lines (61 loc) · 2.44 KB
/
labeler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from config import *
import numpy as np
class Labeler:
    """Write pseudo-labels for sentences based on per-class scores.

    Reads ``<root_path>/scores.txt`` and writes ``<root_path>/label.txt``.
    The scores file is consumed as alternating lines: a sentence line
    followed by a line of whitespace-separated ``<name><sep> <score>`` token
    pairs (the last character of each name token is dropped; presumably a
    colon -- confirm against the score writer).
    """

    def __init__(self):
        """Resolve the active domain and its data directory from config."""
        self.domain = config['domain']
        self.root_path = path_mapper[self.domain]

    @staticmethod
    def _parse_scores(line):
        """Return the ``(class_name, score)`` pairs found on one score line."""
        tokens = line.strip().split()
        # Even tokens are names (with a trailing separator character), odd
        # tokens are scores; a dangling name without a score is ignored.
        return [
            (name[:-1], float(score))
            for name, score in zip(tokens[::2], tokens[1::2])
        ]

    def _iter_records(self):
        """Yield ``(sentence_line, score_pairs)`` for each record in scores.txt."""
        with open(f'{self.root_path}/scores.txt', 'r') as f:
            sentence = None
            for idx, line in enumerate(f):
                if idx % 2 == 1:
                    yield sentence, self._parse_scores(line)
                else:
                    sentence = line

    def __call__(self):
        """Label every sentence that has exactly one aspect and one sentiment.

        A class is selected for a sentence when its score, normalised by the
        mean and standard deviation of that class over the whole file, is at
        least ``lambda_threshold``. Sentences with zero or several selected
        aspects (or sentiments) are skipped. Prints the count of each
        ``aspect-sentiment`` combination written.

        Raises:
            KeyError: if the scores file names a class that is neither an
                aspect category nor a sentiment polarity of the domain.
        """
        categories = aspect_category_mapper[self.domain]
        polarities = sentiment_category_mapper[self.domain]

        # First pass: gather the score distribution of every class.
        dist = {name: [] for name in (*categories, *polarities)}
        for _, pairs in self._iter_records():
            for name, score in pairs:
                dist[name].append(score)

        # Mean and sigma per class. Classes that never occur in the file are
        # skipped: they are never looked up below, and np.mean/np.std on an
        # empty list would only emit warnings and produce NaN.
        means = {}
        sigma = {}
        for name, scores in dist.items():
            if not scores:
                continue
            means[name] = np.mean(scores)
            sigma[name] = np.std(scores)

        # Second pass: normalise, threshold and write the labels. The context
        # manager guarantees label.txt is closed even if a record is malformed.
        cnt = {}
        with open(f'{self.root_path}/label.txt', 'w') as nf:
            for sentence, pairs in self._iter_records():
                aspect = []
                sentiment = []
                for name, score in pairs:
                    if sigma[name] == 0:
                        # Every score of this class is identical, so the
                        # normalised score is undefined (0/0); it can never
                        # reach the threshold.
                        continue
                    dev = (score - means[name]) / sigma[name]
                    if dev >= lambda_threshold:
                        if name in categories:
                            aspect.append(name)
                        else:
                            sentiment.append(name)
                # No conflict (avoid multi-class sentences)
                if len(aspect) == 1 and len(sentiment) == 1:
                    nf.write(sentence)
                    nf.write(f'{aspect[0]} {sentiment[0]}\n')
                    keyword = f'{aspect[0]}-{sentiment[0]}'
                    cnt[keyword] = cnt.get(keyword, 0) + 1

        # Labeled data statistics
        print('Labeled data statistics:')
        print(cnt)