-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcyclistic_analysis.sql
303 lines (255 loc) · 7.45 KB
/
cyclistic_analysis.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
--Upload the files to BigQuery and merge using
INSERT INTO `vaulted-quarter-375910.Cyclistic.202201-divvy-tripdata`
SELECT *
FROM `vaulted-quarter-375910.Cyclistic.202202-divvy-tripdata`
--Query above copies data from table 202202-divvy-tripdata to 202201-divvytripdata. Copy the rest of the tables too
--After copying confirm total using
SELECT COUNT (*)
FROM `vaulted-quarter-375910.Cyclistic.202201-divvy-tripdata`
--Confirm if there are any nulls using
SELECT *
FROM `vaulted-quarter-375910.Cyclistic.202201-divvy-tripdata`
WHERE ride_id IS NULL
--Repeat the same for all other columns
--833,064 mulls in the start_station_name & start_station_id columns
--892,742 nulls in end_station_name & end_station_id columns
--5,858 nulls in end_lng & end_lat columns
--We'll proceed since we can still answer the business problem with the rest of the information
--To clean data(with all 5,667,717 observations) use
SELECT
ride_id,
rideable_type,
member_casual AS membership,
start_station_name,
end_station_name,
EXTRACT(date FROM started_at) AS start_date,
EXTRACT(time FROM started_at) AS start_time,
EXTRACT(date FROM ended_at) AS end_date,
EXTRACT(time FROM ended_at) AS end_time,
date_diff(ended_at,started_at, MINUTE) AS ride_length,
FROM
`vaulted-quarter-375910.Cyclistic.202201-divvy-tripdata`
--Saved results in new table as clean-data
--Clean data (without nulls)
SELECT
ride_id,
rideable_type,
member_casual AS membership,
start_station_name,
end_station_name,
EXTRACT(date FROM started_at) AS start_date,
EXTRACT(time FROM started_at) AS start_time,
EXTRACT(date FROM ended_at) AS end_date,
EXTRACT(time FROM ended_at) AS end_time,
date_diff(ended_at,started_at, MINUTE) AS ride_length,
FROM
`vaulted-quarter-375910.Cyclistic.202201-divvy-tripdata`
WHERE
start_station_name IS NOT NULL AND
end_station_name IS NOT NULL;
--Explore the clean data and save results in tables for visualization(I exported to Excel and created pivot charts and tables)
--Number of rides per start_hour
SELECT
membership,
EXTRACT(hour FROM start_time) AS start_hour,
COUNT(*) AS rides_per_hour
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
GROUP BY
EXTRACT(hour FROM start_time),membership
--peak hours for annual members are 7-9 am & 4-6 pm. This could mean they mostly use the rides for commuting to work.As for casual me
mbers, the number steadily increases throughout the day and peaks at 3-6 pm which could mean they use the bikes to for short runs e.g to run errands.
--Number of rides per end_hour
SELECT
membership,
EXTRACT(hour FROM end_time) AS end_hour,
COUNT(*) AS rides_per_hour
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
GROUP BY
EXTRACT(hour FROM end_time),membership
--Interpretation same as rides per start_hour
--Number of rides according to weekday
SELECT
EXTRACT(dayofweek FROM start_date) AS start_day,
membership,
COUNT(*)
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
GROUP BY
EXTRACT(dayofweek FROM start_date),membership
--number of rides per weekday for members is almost constant throughtout the week but tends to reduce from day 6 to 1. In BigQuery 1=Monday, 2=Tuesday, etc. Meaning they use
the bikes to commute to work. For casual riders it's a downward slope from the beginning of the week. It increases on Saturday & Sunday meaning they use the bikes for leisure, to exercise, run errands, etc.
--Number of rides according to month
SELECT
EXTRACT(month FROM start_date) AS start_month,
membership,
COUNT(*) as num_of_rides
FROM
`vaulted-quarter-375910.Cyclistic.double_clean`
GROUP BY
EXTRACT(month FROM start_date),membership
--Weather is a factor in bike usage. There's an increase during summer months.
--Popularity of bikes according to type
SELECT
membership,
rideable_type,
COUNT(*)
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
GROUP BY
rideable_type, membership
--Electric bikes are more preferred by the casual riders. Docked bikes are only used by casual members because of their nature. Preference is almost the same for annual members.
--Total ride length per day
SELECT
EXTRACT(dayofweek FROM start_date) AS weekday,
sum(ride_length) AS total_ride_length,
membership
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
GROUP BY
EXTRACT(dayofweek FROM start_date),membership
--Average ride length per day
SELECT
EXTRACT(dayofweek FROM start_date) AS weekday,
avg(ride_length) AS average_ride_length,
membership
FROM
`vaulted-quarter-375910.Cyclistic.double_clean`
GROUP BY
EXTRACT(dayofweek FROM start_date),membership;
--On average, casual riders ride the bikes longer than annual members.It’s consistent for annual members maybe because their schedules are more consistent after commuting to work mostly between day 1 and 5 i.e Monday to Friday
--Descriptive statistics
SELECT
membership,
max(ride_length) AS max,
min(ride_length) AS min,
avg(ride_length) AS average,
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
GROUP BY
Membership
--A closer look at observations with negative ride_length
SELECT
start_date,
start_time,
end_date,
end_time,
ride_length
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
WHERE
ride_length = -138 OR ride_length = -10353;
--Min ride_length for casual riders and annual members were -138 & -10353 respectively
--Established that started_at & ended_at had been reversed
SELECT
start_date,
start_time,
end_date,
end_time,
ride_length
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
WHERE
ride_length <0
ORDER BY
ride_length;
--Revealed that 77 observations had start and end times reversed
SELECT
start_date,
start_time,
end_date,
end_time,
ride_length
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
WHERE
ride_length =0
--95,957 results.Cleaned the data again and omitted ride_length <=0 and repeated all above steps for analysis
--Rides per time intervals.1 - 60mins
SELECT
COUNT(*),
membership
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
WHERE
ride_length <60 AND ride_length !=0
GROUP BY
membership;
--1 to 3 hrs
SELECT
COUNT(*),
membership
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
WHERE
ride_length BETWEEN 61 AND 180
GROUP BY
membership;
--3 to 6 hrs
SELECT
COUNT(*),
membership
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
WHERE
ride_length BETWEEN 181 AND 360
GROUP BY
membership;
--6 to 12 hrs
SELECT
COUNT(*),
membership
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
WHERE
ride_length BETWEEN 360 AND 720
GROUP BY
membership;
--12 to 24 hrs
SELECT
COUNT(*),
membership
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
WHERE
ride_length BETWEEN 721 AND 1440
GROUP BY
membership;
--24 to 72 hrs
SELECT
COUNT(*),
membership
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
WHERE
ride_length BETWEEN 1441 AND 4320
GROUP BY
membership;
--Above 72hrs
SELECT
COUNT(*),
membership
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
WHERE
ride_length >4320
GROUP BY
membership;
--These results would be for pricing flexibility and discounted prices (according to time intervals)
--Top 20 start locations
SELECT
DISTINCT start_station_name,
membership,
count (*) AS count,
FROM
`vaulted-quarter-375910.Cyclistic.clean-data`
WHERE
start_station_name IS NOT NULL
GROUP BY
start_station_name, membership
ORDER BY
count DESC
LIMIT 20;
--Out of the top 20 start_stations where most rides were started 13/20 had casual riders. 4/5 of the first 5 stations had casual riders
--Location is a factor and marketing may consider targeting the environs near the first 5 start_stations where most casual riders started their rides