-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract.py
77 lines (60 loc) · 2.48 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import json
import pandas as pd
class Extractor:
    """Namespace for helpers that load examples from JSON Lines files."""

    @staticmethod
    def extract_examples(filename,
                         data_fields,
                         target_field,
                         metadata_filename=None,
                         metadata_fields=None,
                         drop_duplicates=False,
                         return_X_y=False):
        """Extract examples from a JSON Lines file.

        Each non-blank line of the file is parsed as one JSON record. Only
        the requested fields are kept, and any record that is missing a
        value for one of them is dropped.

        Args:
            filename (str): File path of the JSON Lines file.
            data_fields (Sequence[str]): Fields that correspond to the
                feature variables.
            target_field (str): Field that corresponds to the target
                variable.
            metadata_filename (str, optional): File path of a JSON Lines
                metadata file. Not supported yet.
            metadata_fields (Sequence[str], optional): Fields that
                correspond to additional feature variables. Not supported
                yet.
            drop_duplicates (bool): Whether to drop duplicate examples.
                Defaults to False.
            return_X_y (bool): Whether to return data and target as
                separate objects. Defaults to False.

        Returns:
            If ``return_X_y`` is False, a pandas DataFrame whose columns
            are ``data_fields`` followed by ``target_field``.

            If ``return_X_y`` is True, a tuple ``(X, y)`` where ``y`` is
            the target column as a pandas Series and ``X`` is a Series
            when ``len(data_fields) == 1`` and a DataFrame otherwise.

        Raises:
            NotImplementedError: If ``metadata_filename`` or
                ``metadata_fields`` is given.
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a requested field does not appear in any record.
        """
        if metadata_filename or metadata_fields:
            raise NotImplementedError(
                ("Adding metadata to extracted examples is currently "
                 "not supported."))

        # JSON text is UTF-8 by specification, so do not rely on the
        # platform's default encoding when decoding the file.
        with open(filename, encoding="utf-8") as f:
            # Blank lines (e.g. extra trailing newlines) carry no record
            # and would make json.loads raise, so they are skipped.
            records = [json.loads(line) for line in f if line.strip()]

        # Normalize to a list so any sequence of field names (tuple, ...)
        # can be concatenated and used as a column selector.
        feature_columns = list(data_fields)

        df = pd.DataFrame.from_records(records)
        df = df[feature_columns + [target_field]].dropna()
        if drop_duplicates:
            df = df.drop_duplicates()

        if not return_X_y:
            return df

        # A single feature is returned as a Series rather than as a
        # one-column DataFrame.
        if len(feature_columns) == 1:
            X = df[feature_columns[0]]
        else:
            X = df[feature_columns]
        y = df[target_field]
        return X, y
if __name__ == "__main__":
    # Example use: load "reviewText" as the single feature and "overall"
    # as the target, dropping duplicate rows and splitting into X and y.
    filename, data_fields, target_field = (
        "data/AMAZON_FASHION_5.json",
        ["reviewText"],
        "overall",
    )
    X, y = Extractor.extract_examples(
        filename=filename,
        data_fields=data_fields,
        target_field=target_field,
        drop_duplicates=True,
        return_X_y=True,
    )