-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset.py
63 lines (48 loc) · 2.26 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import download
import extract
class DatasetBuilder:
    """Download a zipped JSON List file and extract examples from it.

    Thin coordinator around two collaborators: a ``download.Downloader``
    that fetches and unzips the file, and an ``extract.Extractor`` that
    turns the unzipped file into examples.
    """

    def __init__(self, dest_dir="data/"):
        """Create the downloader and extractor used by ``get_data``.

        Args:
            dest_dir (str): Forwarded to ``download.Downloader`` as its
                ``dest_dir`` argument. Defaults to ``"data/"``.
        """
        self._downloader = download.Downloader(dest_dir=dest_dir)
        self._extractor = extract.Extractor()

    def get_data(self,
                 url,
                 data_fields,
                 target_field,
                 drop_duplicates=False,
                 return_X_y=False):
        """Download the file at ``url``, unzip it, and extract examples.

        Args:
            url (str): URL of the file.
            data_fields (List[str]): Fields that correspond to the feature
                variables.
            target_field (str): Field that corresponds to the target
                variable.
            drop_duplicates (bool): Whether to drop duplicate examples.
                Defaults to False.
            return_X_y (bool): Whether to return data and target as
                separate objects. Defaults to False.

        Returns:
            Whatever the extractor's ``extract_examples`` returns for the
            given flags. NOTE(review): expected to be a
            pandas.core.frame.DataFrame by default, and the data and target
            as separate objects when ``return_X_y`` is True -- confirm
            against ``extract.Extractor``.
        """
        fetcher = self._downloader
        fetcher.get_unzip(url)

        # The extractor reads the file named like the download with its
        # final suffix removed (e.g. "x.json.gz" -> "x.json").
        # NOTE(review): relies on the downloader's non-public
        # ``_url_to_filename`` returning a path-like object.
        archive_path = fetcher._url_to_filename(url)
        unzipped_path = archive_path.with_suffix("")

        # No metadata file is used by this builder, so both metadata
        # options are passed explicitly as None.
        extraction_options = {
            "filename": unzipped_path,
            "data_fields": data_fields,
            "target_field": target_field,
            "metadata_filename": None,
            "metadata_fields": None,
            "return_X_y": return_X_y,
            "drop_duplicates": drop_duplicates,
        }
        return self._extractor.extract_examples(**extraction_options)
if __name__ == "__main__":
    # Example use: download the Amazon Fashion review file and request the
    # "reviewText" field as the data (X) and the "overall" field as the
    # target (y), dropping duplicate examples.
    dataset_builder = DatasetBuilder()

    url = "http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/AMAZON_FASHION_5.json.gz"
    data_fields = ["reviewText"]
    target_field = "overall"

    X, y = dataset_builder.get_data(
        url=url,
        data_fields=data_fields,
        target_field=target_field,
        drop_duplicates=True,
        return_X_y=True,
    )