-
Notifications
You must be signed in to change notification settings - Fork 0
/
historical.py
113 lines (91 loc) · 3.49 KB
/
historical.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import pandas as pd
from datetime import datetime
import os
import requests
import urllib3
from zipfile import ZipFile, BadZipfile
import io
from citibike import Station
# Default local directory for the cached monthly trip CSV files. It is built
# from the process working directory at import time, not from this file's
# location.
TRIP_DATA_DIR = os.path.join(os.getcwd(), "data", "trip_data")
# Default base URL to which a monthly archive file name is appended when
# downloading.
TRIPDATA_URL = "https://s3.amazonaws.com/tripdata"
def read_zip_file(url, verify=False, timeout=60):
    """Download a zip archive and parse its first member as CSV.

    Args:
        url: Address of the zip archive to fetch.
        verify: Forwarded to ``requests.get``. The default ``False`` keeps the
            historical behaviour of skipping TLS certificate verification.
        timeout: Seconds to wait for the connection and for each chunk of the
            response before giving up. This is not a cap on the total
            download time, so large archives are unaffected.

    Returns:
        A ``pandas.DataFrame`` read from the first entry listed in the
        archive.

    Raises:
        zipfile.BadZipFile: If the response body is not a zip archive (for
            example an error page returned for a missing file).
        requests.exceptions.RequestException: On connection failures or
            timeouts.
    """
    # NOTE(security): certificate verification is off by default for backward
    # compatibility; pass verify=True when the data source must be trusted.
    # The body is read in full right away, so streaming mode is not needed,
    # and the context manager releases the connection deterministically.
    with requests.get(url, verify=verify, timeout=timeout) as response:
        payload = response.content

    with ZipFile(io.BytesIO(payload)) as archive:
        first_member = archive.namelist()[0]
        with archive.open(first_member) as member:
            return pd.read_csv(member)
class TripData:
    """One calendar month of trip data, backed by a CSV file on disk.

    The data, the list of station names and the ``Station`` objects are all
    loaded lazily on first access and cached on the instance.
    """

    def __init__(
        self,
        month: str,
        data_dir=TRIP_DATA_DIR,
        trip_data_url=TRIPDATA_URL,
        snapshot=None
    ):
        """Set up paths and the time window for ``month``.

        Args:
            month: Year and month as ``YYYYMM`` (e.g. ``"202101"``). Values
                that are not strings are converted with ``str`` first.
            data_dir: Directory holding (or receiving) the monthly CSV file.
            trip_data_url: Base URL the archive file name is appended to.
            snapshot: Opaque value handed unchanged to every ``Station``.
        """
        # Normalise once so that slicing below also works for e.g. an int.
        month = str(month)
        self.file_name = f"{month}-citibike-tripdata.csv"
        self.file_path = os.path.join(data_dir, self.file_name)
        # Window is [first day of the month, first day of the next month).
        self.start = datetime(int(month[:4]), int(month[4:]), 1)
        self.end = self.start + pd.offsets.DateOffset(months=1)
        self.trip_data_url = trip_data_url
        self.snapshot = snapshot
        # Lazily populated caches behind the properties below.
        self._data = None
        self._station_names = None
        self._stations = None

    def download(self, save=True):
        """Fetch the month's zip archive and keep its contents in memory.

        Args:
            save: When true, also write the downloaded frame to
                ``self.file_path`` as CSV.

        Raises:
            zipfile.BadZipFile: If none of the candidate URLs yields a zip
                archive.
        """
        # Silence the warning urllib3 emits for unverified HTTPS requests.
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        # Some archives are published under misspelled names, so the expected
        # URL is followed by the known typo variants.
        expected = f"{self.trip_data_url}/{self.file_name}.zip"
        typo = expected.replace(
            "-citibike-tripdata.csv", "-citbike-tripdata.csv"
        )
        typo_no_ext = typo.replace("-citbike-tripdata.csv", "-citbike-tripdata")

        for candidate in (expected, typo):
            try:
                self._data = read_zip_file(candidate)
                break
            except BadZipfile:
                # Not a zip archive at this URL; move on to the next spelling.
                continue
        else:
            # Last resort: let any failure here propagate to the caller.
            self._data = read_zip_file(typo_no_ext)

        if save:
            parent = os.path.dirname(self.file_path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            # NOTE(review): the frame's index is written as the first CSV
            # column, so reading the file back yields an extra unnamed
            # column. Confirm nothing relies on it before changing this.
            self._data.to_csv(self.file_path)
        print("Finished.")

    def get_data(self, download=True):
        """Load the month's CSV into ``self._data``.

        Args:
            download: When the CSV is missing, download and save it first
                instead of failing.

        Raises:
            FileNotFoundError: If the CSV is missing and ``download`` is
                false.
        """
        try:
            frame = pd.read_csv(self.file_path, low_memory=False)
        except FileNotFoundError:
            if not download:
                # Re-raise the original error so the missing path is kept.
                raise
            self.download(save=True)
            frame = pd.read_csv(self.file_path, low_memory=False)

        # Assign the columns directly: ``frame.loc[:, col] = ...`` writes in
        # place on recent pandas and would leave the column as object dtype.
        for column in ('started_at', 'ended_at'):
            frame[column] = pd.to_datetime(frame[column])
        self._data = frame

    def get_station_names(self):
        """Cache the distinct ``start_station_name`` values, in first-seen order."""
        names = self.data['start_station_name'].drop_duplicates()
        self._station_names = list(names)

    @property
    def data(self):
        """The month's trips; loaded (and downloaded if needed) on first use."""
        if self._data is None:
            self.get_data(download=True)
        return self._data

    @property
    def station_names(self):
        """Distinct start-station names; computed on first use."""
        if self._station_names is None:
            self.get_station_names()
        return self._station_names

    @property
    def stations(self):
        """One ``Station`` per entry of ``station_names``; built on first use."""
        if self._stations is None:
            self._stations = [
                Station(trip_data=self, name=name, snapshot=self.snapshot)
                for name in self.station_names
            ]
        return self._stations

    def find_station(self, station_name):
        """Return the first station whose ``name`` equals ``station_name``.

        Raises:
            IndexError: If no station has that name.
        """
        match = next(
            (station for station in self.stations
             if station.name == station_name),
            None,
        )
        if match is None:
            raise IndexError(f"no station named {station_name!r}")
        return match

    def time_range(self, freq='1min'):
        """Return timestamps from ``self.start`` up to, but excluding, ``self.end``.

        Args:
            freq: Step between consecutive timestamps, as accepted by
                ``pandas.date_range``.
        """
        stamps = pd.date_range(self.start, self.end, freq=freq)
        # Drop the last point only when it is the excluded end bound; for a
        # step that does not land on ``self.end`` every point is in range.
        # Slicing (rather than masking) keeps the index's ``freq``.
        if len(stamps) and stamps[-1] == self.end:
            stamps = stamps[:-1]
        return stamps