-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_detox_tsv.py
40 lines (32 loc) · 1.15 KB
/
clean_detox_tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
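"""
Cleans up the original Detox TSV files.

Reads tab-separated rows (with a header line) from stdin, discards the
`diff` and `clean_diff` columns, converts the remaining fields to typed
values and prints one JSON document per row to stdout. Rows that cannot be
converted are reported on stderr together with a traceback.
"""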
import csv
import json
import sys
import traceback
def _row_to_doc(row):
    """Convert one raw TSV row (a dict of strings) into a typed document.

    Raises KeyError when a required column is absent and ValueError when a
    field cannot be parsed as a number.
    """
    doc = {
        'rev_id': int(row['rev_id']),
        'ns_name': row['ns'],
        'src': row['src'],
        'sample': row['sample'],
        'worker_id': int(row['_worker_id']),
        # user_id may be empty; an empty value is stored as 0.
        'user_id': int(float(row['user_id'] or "0")),
    }
    # The flag columns are parsed as floats first and then collapsed to
    # booleans (any non-zero value is True).
    for flag in ('other', 'third_party', 'recipient', 'quoting',
                 'attack', 'aggression'):
        doc[flag] = bool(float(row[flag]))
    return doc


def main(argv=None):
    """Read Detox TSV rows from stdin and print one JSON document per row.

    The `diff` and `clean_diff` columns are dropped when present; the
    remaining fields are converted by `_row_to_doc`. A row that fails to
    convert is not printed: the row and the traceback are written to stderr
    and processing continues with the next row.

    :param argv: accepted for call compatibility; not used by this function.
    :returns: None
    """
    rows = csv.DictReader(sys.stdin, delimiter='\t', quotechar="\"")
    for row in rows:
        try:
            # Remove the diff columns up front so they do not appear in the
            # error dump below. pop() tolerates input where these columns
            # have already been stripped.
            row.pop('diff', None)
            row.pop('clean_diff', None)
            print(json.dumps(_row_to_doc(row)))
        except Exception:
            # Best-effort cleaning: report the bad row and keep going.
            # The trailing newline keeps the traceback on its own lines.
            sys.stderr.write("Error while processing row: {0}\n".format(row))
            sys.stderr.write(traceback.format_exc())
# Run the cleaner only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()