-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpythonify.py
65 lines (49 loc) · 1.48 KB
/
pythonify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import json
import numpy as np
import os
from collections import Counter
import pprint
# Module-level pretty-printer configured with a 4-space indent. Nothing in the
# visible code calls it; presumably kept for ad-hoc debugging output.
pp = pprint.PrettyPrinter(indent=4)
def read_files(jsondir='data/json'):
    """Load and concatenate the records stored in every file of *jsondir*.

    Every directory entry is opened and parsed as JSON, and its elements are
    appended to one flat list (``list.extend`` is used, so each file is
    presumably a JSON array -- confirm against the data set).

    Args:
        jsondir: Directory to read from. Defaults to ``'data/json'``, the
            location that used to be hard-coded.

    Returns:
        A single list with the elements of all files, in file-name order.

    Raises:
        FileNotFoundError: If *jsondir* does not exist.
        json.JSONDecodeError: If an entry does not contain valid JSON.
    """
    all_data = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # record order (and everything derived from it) reproducible.
    for filename in sorted(os.listdir(jsondir)):
        # JSON text is UTF-8 by specification, so do not depend on the
        # platform's default encoding.
        with open(os.path.join(jsondir, filename), encoding='utf-8') as file:
            all_data.extend(json.load(file))
    return all_data
# Build the train/test/all article arrays for the six most frequent topics
# and store them as .npy files under data/bin.
raw_data = read_files()

train_articles = []
train_labels = []
test_articles = []
test_labels = []
all_articles = []
all_labels = []

# Number of records per topic. Counter is a dict subclass, so label_counts
# can still be used like the plain dict it replaces.
label_counts = Counter(item['topic'] for item in raw_data)

# (topic, count) pairs of the six most frequent topics, most frequent first.
top_label_counts = label_counts.most_common(6)
top_labels = [label for label, _ in top_label_counts]
top_label_set = set(top_labels)  # constant-time membership test in the loop

for item in raw_data:
    # Records whose topic is not among the top six are dropped entirely.
    if item['topic'] not in top_label_set:
        continue
    # A truthy 'testing' value routes the record to the test split,
    # anything else to the training split.
    if item['testing']:
        test_articles.append(item['body'])
        test_labels.append(item['topic'])
    else:
        train_articles.append(item['body'])
        train_labels.append(item['topic'])
    # NOTE(review): indentation was lost in the reviewed copy of this file.
    # The all_* lists are assumed to cover the same top-topic subset as the
    # train and test splits combined -- confirm against the original.
    all_articles.append(item['body'])
    all_labels.append(item['topic'])

out_dir = 'data/bin'
# np.save() does not create missing directories, so make sure the target
# exists before writing.
os.makedirs(out_dir, exist_ok=True)

# np.save() appends the '.npy' suffix to each file name.
for name, values in (
    ('test_articles', test_articles),
    ('test_labels', test_labels),
    ('train_articles', train_articles),
    ('train_labels', train_labels),
    ('all_articles', all_articles),
    ('all_labels', all_labels),
):
    np.save(os.path.join(out_dir, name), values)