-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathprepare_hatespeech.py
executable file
·46 lines (37 loc) · 1.49 KB
/
prepare_hatespeech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env python3
import json
ROOT = "/projects/tir1/users/pmichel1/hatespeech"

# Column order shared by the TSV header row and every data row.
_COLUMNS = ("label", "dialect", "text")

# Datasets processed when the file is run as a script, in the original order.
_DATASETS = ("founta", "davidson")


def _fold_to_split(sample):
    """Return "train", "valid" or "test" for one decoded sample.

    The decision is based on ``sample["folds"]["0"]``: the value ``"test"``
    selects the test split, fold zero selects the validation split and
    anything else is training data.
    """
    fold = sample["folds"]["0"]
    if fold == "test":
        return "test"
    # The FOUNTA section of the original script compared against the integer
    # 0 while the DAVIDSON section compared against the string "0". Accept
    # both spellings so validation samples are never silently sent to
    # "train" because of how the fold id happens to be encoded in the JSON.
    if fold == 0 or fold == "0":
        return "valid"
    return "train"


def prepare_dataset(name, root=ROOT):
    """Split one dataset's ``all.jsonlist`` into train/valid/test TSV files.

    Reads ``{root}/data/{name}/exp/label/random/all.jsonlist`` (one JSON
    object per line) and writes ``{root}/data/{name}/{split}.tsv`` for each
    of the three splits. Every TSV starts with a ``label / dialect / text``
    header row, followed by one row per sample in input order.

    Field values are joined verbatim, with no quoting or escaping: a tab or
    newline inside a value will break the row layout, and a non-string value
    raises ``TypeError``.

    Args:
        name: Dataset directory name under ``{root}/data``.
        root: Base directory; defaults to the module-level ``ROOT``.

    Returns:
        A dict mapping each split name to the number of samples written.

    Raises:
        OSError: If the input cannot be read or an output cannot be written.
        json.JSONDecodeError: If a line of the input is not valid JSON.
        KeyError: If a sample lacks ``folds["0"]`` or one of the columns.
    """
    data = {"train": [], "valid": [], "test": []}
    source = f"{root}/data/{name}/exp/label/random/all.jsonlist"
    with open(source, "r", encoding="utf-8") as f:
        for line in f:
            sample = json.loads(line)
            data[_fold_to_split(sample)].append(sample)

    for split, samples in data.items():
        with open(f"{root}/data/{name}/{split}.tsv", "w", encoding="utf-8") as f:
            print("\t".join(_COLUMNS), file=f)
            for sample in samples:
                print("\t".join([sample[column] for column in _COLUMNS]), file=f)

    return {split: len(samples) for split, samples in data.items()}


def main():
    """Prepare every dataset listed in ``_DATASETS`` under ``ROOT``."""
    for name in _DATASETS:
        prepare_dataset(name)


if __name__ == "__main__":
    main()