Analyze & Share

kk 2022-05-19

Analyze and share

## read merged file from summary csv created in last step
getwd()

## [1] "/Users/kat/Dev/Google-Data-Analytics-R/bike-sharing-capstone"

library(readr)
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.6     ✔ dplyr   1.0.9
## ✔ tibble  3.1.7     ✔ stringr 1.4.0
## ✔ tidyr   1.2.0     ✔ forcats 0.5.1
## ✔ purrr   0.3.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

df <- read_csv("output-data/allbikedata-clean_onlyvalidridelength.csv")

## Rows: 5757322 Columns: 18

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): column_label, ride_id, rideable_type, start_station_name, start_s...
## dbl   (6): start_lat, start_lng, end_lat, end_lng, ride_length, ride_length_num
## dttm  (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

###Some descriptive statistics:

## some descriptive stats
library(skimr)
skim(df)


Name	df
Number of rows	5757322
Number of columns	18
_______________________
Column type frequency:
character	10
numeric	6
POSIXct	2
________________________
Group variables	None

Data summary

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
column_label	0	1.00	8	8	12
ride_id	0	1.00	16	16	5757322
rideable_type	0	1.00	11	13	3
start_station_name	790168	0.86	3	53	865
start_station_id	790165	0.86	3	44	856
end_station_name	843319	0.85	10	53	864
end_station_id	843319	0.85	3	44	856
member_casual	0	1.00	6	6	2
ride_length_period	0	1.00	2	15	25268
day_of_week	0	1.00	6	9	7

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
start_lat	0	1	41.90	0.05	41.64	41.88	41.90	41.93	45.64	▇▁▁▁▁
start_lng	0	1	-87.65	0.03	-87.84	-87.66	-87.64	-87.63	-73.80	▇▁▁▁▁
end_lat	4765	1	41.90	0.05	41.39	41.88	41.90	41.93	42.17	▁▁▁▇▁
end_lng	4765	1	-87.65	0.03	-88.97	-87.66	-87.64	-87.63	-87.49	▁▁▁▁▇
ride_length	0	1	1268.49	10323.09	0.00	387.00	691.00	1257.00	3356649.00	▇▁▁▁▁
ride_length_num	0	1	1268.49	10323.09	0.00	387.00	691.00	1257.00	3356649.00	▇▁▁▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-05-01 00:00:11	2022-04-30 23:59:54	2021-08-31 17:15:58	4820908
ended_at	0	1	2021-05-01 00:03:26	2022-05-02 00:35:01	2021-08-31 17:32:32	4815672

glimpse(df)

## Rows: 5,757,322
## Columns: 18
## $ column_label       <chr> "df_05_21", "df_05_21", "df_05_21", "df_05_21", "df…
## $ ride_id            <chr> "C809ED75D6160B2A", "DD59FDCE0ACACAF3", "0AB83CB88C…
## $ rideable_type      <chr> "electric_bike", "electric_bike", "electric_bike", …
## $ started_at         <dttm> 2021-05-30 11:58:15, 2021-05-30 11:29:14, 2021-05-…
## $ ended_at           <dttm> 2021-05-30 12:10:39, 2021-05-30 12:14:09, 2021-05-…
## $ start_station_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ start_station_id   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ end_station_name   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ end_station_id     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ start_lat          <dbl> 41.90000, 41.88000, 41.92000, 41.92000, 41.94000, 4…
## $ start_lng          <dbl> -87.63000, -87.62000, -87.70000, -87.70000, -87.690…
## $ end_lat            <dbl> 41.89000, 41.79000, 41.92000, 41.94000, 41.94000, 4…
## $ end_lng            <dbl> -87.61000, -87.58000, -87.70000, -87.69000, -87.700…
## $ member_casual      <chr> "casual", "casual", "casual", "casual", "casual", "…
## $ ride_length        <dbl> 744, 2695, 72, 913, 413, 1416, 883, 1075, 157, 1581…
## $ ride_length_period <chr> "12M 24S", "44M 55S", "1M 12S", "15M 13S", "6M 53S"…
## $ day_of_week        <chr> "Sunday", "Sunday", "Sunday", "Sunday", "Sunday", "…
## $ ride_length_num    <dbl> 744, 2695, 72, 913, 413, 1416, 883, 1075, 157, 1581…

Average ride length and ride count for members and casual riders:

# calculate average ride length for members and casual riders
df %>% group_by(member_casual) %>% summarise(mean_length=mean(ride_length_num), 
                                             sd_length=sd(ride_length_num))

## # A tibble: 2 × 3
##   member_casual mean_length sd_length
##   <chr>               <dbl>     <dbl>
## 1 casual              1878.    15423.
## 2 member               789.     1631.

print(mean(df$ride_length))

## [1] 1268.488

tapply(df$ride_length, df$member_casual, summary)

## $casual
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0     530     934    1878    1709 3356649 
## 
## $member
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   320.0   551.0   788.8   956.0 93595.0

colnames(df)

##  [1] "column_label"       "ride_id"            "rideable_type"     
##  [4] "started_at"         "ended_at"           "start_station_name"
##  [7] "start_station_id"   "end_station_name"   "end_station_id"    
## [10] "start_lat"          "start_lng"          "end_lat"           
## [13] "end_lng"            "member_casual"      "ride_length"       
## [16] "ride_length_period" "day_of_week"        "ride_length_num"

# count number of casual and member rides
df %>% group_by(member_casual) %>% summarise(count = n())

## # A tibble: 2 × 2
##   member_casual   count
##   <chr>           <int>
## 1 casual        2536267
## 2 member        3221055

Bike type and weekdays for members and casual riders:

# count bike type by members and casual riders
df %>% group_by(member_casual, rideable_type) %>% summarise(count = n())

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

## # A tibble: 5 × 3
## # Groups:   member_casual [2]
##   member_casual rideable_type   count
##   <chr>         <chr>           <int>
## 1 casual        classic_bike  1234372
## 2 casual        docked_bike    291382
## 3 casual        electric_bike 1010513
## 4 member        classic_bike  1968298
## 5 member        electric_bike 1252757

# count weekdays by members and casual riders
df %>% group_by(member_casual, day_of_week) %>% summarise(count = n())

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

## # A tibble: 14 × 3
## # Groups:   member_casual [2]
##    member_casual day_of_week  count
##    <chr>         <chr>        <int>
##  1 casual        Friday      358198
##  2 casual        Monday      289026
##  3 casual        Saturday    558610
##  4 casual        Sunday      476962
##  5 casual        Thursday    298061
##  6 casual        Tuesday     270544
##  7 casual        Wednesday   284866
##  8 member        Friday      453278
##  9 member        Monday      445628
## 10 member        Saturday    442736
## 11 member        Sunday      387960
## 12 member        Thursday    485839
## 13 member        Tuesday     498679
## 14 member        Wednesday   506935

Visualizations

library(ggplot2)
# bar plot of member and causal count
p <- ggplot(data=df, aes(x=member_casual)) +
  geom_bar(fill = "#00abff") +
  geom_text(stat="count", aes(label=scales::comma(..count..)), vjust = -0.5)
## format labels to show actual numbers and not abbreviated
require(scales)

## Loading required package: scales

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

p + scale_y_continuous(labels = comma)

Interpretation: There are 800,000 more rides from people in the ‘annual member’ category than in the ‘casual rides’ category. We don’t know here if there are more users in one or the other category in total.

# pie charts for bike type by member group
## make dfs with only member or casual type each
df_m <- subset(df, member_casual == "member")
df_c <- subset(df, member_casual == "casual")

## pie chart for members
### make summary df with count as value because the pie chart does not work for me with the actual count
df_m_sum <- df_m %>% group_by(rideable_type) %>% summarise(n = n())
print(df_m_sum)

## # A tibble: 2 × 2
##   rideable_type       n
##   <chr>           <int>
## 1 classic_bike  1968298
## 2 electric_bike 1252757

ggplot(df_m_sum, aes(x="",y=n, fill=rideable_type)) +
  geom_bar(stat="identity", width = 1, color="white") +
  coord_polar("y", start=0) +
  theme_void() +
  theme(legend.position = "none", plot.title = element_text(hjust =0.5)) +
  geom_text(aes(y= n, label=scales::comma(n)), color = "white", size=4, vjust=-6) +
  geom_text(aes(y= n, label=rideable_type), color = "white", size=4, vjust=-8) +
  ggtitle("Rideable types of annual members")

## pie chart for casual riders
df_c_sum <- df_c %>% group_by(rideable_type) %>% summarise(n = n())
print(df_c_sum)

## # A tibble: 3 × 2
##   rideable_type       n
##   <chr>           <int>
## 1 classic_bike  1234372
## 2 docked_bike    291382
## 3 electric_bike 1010513

ggplot(df_c_sum, aes(x="",y=n, fill=rideable_type)) +
  geom_bar(stat="identity", width = 1, color="white") +
  coord_polar("y", start=0) +
  theme_void() +
  theme(legend.position = "none", plot.title = element_text(hjust =0.5)) +
  geom_text(aes(y= n, label=scales::comma(n)), position = position_stack(vjust = 0.5), color = "white", size=4) +
  geom_text(aes(y= n, label=rideable_type), position = position_stack(vjust = 0.5), color = "white", size=4, vjust=2.5) +
  ggtitle("Rideable types of casual riders")

Interpretation: Annual members only use two categories of bikes, casual members use all three. Maybe docked bikes are only available for casual members, or all annual members are automatically provided with a regular or electric bike. Classic bikes are by far the most popular version for both member types. Docked bikes are the least used version.

# histogram for ride lengths
## for members
## does not look like a helpful plot. 
#ggplot(df_m, aes(x=ride_length_num)) +
#  geom_histogram(bins = 6) +
#  scale_x_continuous(labels = comma) +
#  scale_y_continuous(labels = comma)
print(mean(df_m$ride_length_num))

## [1] 788.7578

## cut off everything under 10th and over 90th quantile
quantile(df_m$ride_length_num, prob=c(.10,.5,.90))

##  10%  50%  90% 
##  196  551 1555

df_m_small <- subset(df_m, (ride_length_num > 196 & ride_length_num < 1555))
mean(df_m_small$ride_length_num)

## [1] 636.0195

median(df_m_small$ride_length_num)

## [1] 551

#skim(df_m_small)
df_m_small$ride_length_min <- df_m_small$ride_length_num/60
#skim(df_m_small)

ggplot(df_m_small, aes(ride_length_min)) +
  geom_histogram(binwidth = 0.5, col = "black", fill = "cornflowerblue") +
  labs(title="Frequency of ride lengths for annual members",
       subtitle="From 10th to 90th quantile",
       x="ride length in minutes")

## for casual riders
#ggplot(df_c, aes(x=ride_length_num)) +
#  geom_histogram(bins = 6) +
#  scale_x_continuous(labels = comma) +
#  scale_y_continuous(labels = comma)
print(mean(df_c$ride_length_num))

## [1] 1877.744

quantile(df_c$ride_length_num, prob=c(.10,.5,.90))

##  10%  50%  90% 
##  317  934 3217

df_c_small <- subset(df_c, (ride_length_num > 317 & ride_length_num < 3217))
mean(df_c_small$ride_length_num)

## [1] 1127.975

median(df_c_small$ride_length_num)

## [1] 935

df_c_small$ride_length_min <- df_c_small$ride_length_num/60
skim(df_c_small)


Name	df_c_small
Number of rows	2027327
Number of columns	19
_______________________
Column type frequency:
character	10
numeric	7
POSIXct	2
________________________
Group variables	None

Data summary

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
column_label	0	1.00	8	8	12
ride_id	0	1.00	16	16	2027327
rideable_type	0	1.00	11	13	3
start_station_name	273767	0.86	3	53	859
start_station_id	273765	0.86	3	37	850
end_station_name	304764	0.85	10	53	856
end_station_id	304764	0.85	3	37	848
member_casual	0	1.00	6	6	1
ride_length_period	0	1.00	5	7	2899
day_of_week	0	1.00	6	9	7

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
start_lat	0	1	41.90	0.04	41.65	41.88	41.90	41.93	42.07	▁▁▇▇▁
start_lng	0	1	-87.65	0.03	-87.84	-87.66	-87.64	-87.63	-87.52	▁▁▆▇▁
end_lat	340	1	41.90	0.04	41.57	41.88	41.90	41.93	42.12	▁▁▇▇▁
end_lng	340	1	-87.65	0.03	-87.87	-87.66	-87.64	-87.63	-87.50	▁▁▆▇▁
ride_length	0	1	1127.98	672.58	318.00	600.00	935.00	1495.00	3216.00	▇▅▂▁▁
ride_length_num	0	1	1127.98	672.58	318.00	600.00	935.00	1495.00	3216.00	▇▅▂▁▁
ride_length_min	0	1	18.80	11.21	5.30	10.00	15.58	24.92	53.60	▇▅▂▁▁

Variable type: POSIXct

skim_variable	n_missing	complete_rate	min	max	median	n_unique
started_at	0	1	2021-05-01 00:00:31	2022-04-30 23:59:52	2021-08-15 12:18:37	1858664
ended_at	0	1	2021-05-01 00:09:28	2022-05-01 00:33:05	2021-08-15 12:39:45	1853508

ggplot(df_c_small, aes(ride_length_min)) +
  geom_histogram(binwidth = 0.5, col = "black", fill = "cornflowerblue") +
  labs(title="Frequency of ride lengths for casual riders",
       subtitle="From 10th to 90th quantile",
       x="ride length in minutes")

Interpretation: I cut off extreme values on both ends (10% each), so we only see the main distributions here. Annual members seem to make shorter rides than casual riders, most under 10 minutes. Casual riders’ ride length peaks round 10 min, and there are also many rides that go up to 50 min. For annual members, this is rare. Casual riders maybe have a higher barrier to use the bikes, so they might rather use them for rides when it is ‘worth it’.

# bar plot for ride lengths per weekday (ordered), member groups next to each other
## make summary df for ggplot
df_weekday_member <- aggregate(df, by=list(df$day_of_week, df$member_casual), FUN=length)
df_weekday_member <- df_weekday_member %>% select("Group.1", "Group.2", "ride_id")
df_weekday_member <- df_weekday_member %>% rename(weekday = Group.1, member_casual = Group.2, count = ride_id)
#View(df_weekday_member)
df_weekday_member$weekday <- as.factor(df_weekday_member$weekday)
df_weekday_member$weekday <- ordered(df_weekday_member$weekday,
                                     levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday","Sunday"))

glimpse(df_weekday_member)

## Rows: 14
## Columns: 3
## $ weekday       <ord> Friday, Monday, Saturday, Sunday, Thursday, Tuesday, Wed…
## $ member_casual <chr> "casual", "casual", "casual", "casual", "casual", "casua…
## $ count         <int> 358198, 289026, 558610, 476962, 298061, 270544, 284866, …

df_weekday_member$weekday

##  [1] Friday    Monday    Saturday  Sunday    Thursday  Tuesday   Wednesday
##  [8] Friday    Monday    Saturday  Sunday    Thursday  Tuesday   Wednesday
## 7 Levels: Monday < Tuesday < Wednesday < Thursday < Friday < ... < Sunday

## make bar chart
ggplot(df_weekday_member, aes(x=weekday,y=count,fill=member_casual)) +
  geom_bar(stat="identity", position="dodge") +
  scale_fill_discrete(name="Membership type",
                      breaks=c("casual","member"),
                      labels=c("Casual", "Member")) +
  xlab("Weekday") + ylab("Count") +
  scale_y_continuous(labels = comma) +
  labs(title="Rides per weekday and membership type")

# casual riders tend to ride more on weekends, members more during weekdays

Interpretation: Casual riders tend to ride more during weekends, annual members more during weekdays. This could mean that casual riders use the bikes in their freetime, annual members for work.

Ideas to improve data / analysis

There is no member id in the data (probably privacy reasons), but so we don’t know how many rides members and casual riders make. If the company could provide this data safely, it could give more interesting insights.
I could have spent way more time reporting the numbers, most of my interpretations rely only on the visualizations. This was a time-issue, and I might come back to this analysis later to give a more complete analysis and interpretation.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

analyze-share.md

analyze-share.md

Analyze & Share

Analyze and share

Visualizations

Ideas to improve data / analysis

Files

analyze-share.md

Latest commit

History

analyze-share.md

File metadata and controls

Analyze & Share

Analyze and share

Visualizations

Ideas to improve data / analysis