-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataWrangling_1.R
54 lines (43 loc) · 1.38 KB
/
DataWrangling_1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
## Load the bike-share training data from the working directory.
bikes <- read.csv("train_DCbikes.csv")

## Cut every datetime entry at the space. as.character() is applied first so
## the split also works if the column was read in as a factor.
datetime <- bikes$datetime
process_datetime <- strsplit(as.character(datetime), " ")

## Stack the list of pieces into a data frame (one row per list element) and
## label the two resulting columns.
library("plyr")
df <- ldply(process_datetime)
names(df) <- c("Date", "Time")
## Split each Date string on "/" and pull out its three components.
## The pieces are taken by position: 1st = Month, 2nd = Day, 3rd = Year
## (i.e. the code treats Date as month/day/year).
Date <- strsplit(df$Date, "/")

## vapply() replaces the three index loops. It returns character(0) for an
## empty input (where `1:length(Date)` would iterate over c(1, 0) and fail on
## Date[[1]]), and it produces character vectors directly instead of relying
## on a numeric placeholder being coerced. A missing component comes back as
## NA, as it did before.
Month <- vapply(Date, `[`, character(1), 1L, USE.NAMES = FALSE)
Day <- vapply(Date, `[`, character(1), 2L, USE.NAMES = FALSE)
Year <- vapply(Date, `[`, character(1), 3L, USE.NAMES = FALSE)

## New bikes data: the date parts first, then Date/Time, then the original
## columns.
bikes <- data.frame(Month, Day, Year, df, bikes)
str(bikes)
## Preliminary data exploration.
## Step one: replace the season codes with readable names.
library(plyr)
season_names <- c(
  "1" = "spring",
  "2" = "summer",
  "3" = "fall",
  "4" = "winter"
)
x <- factor(bikes$season)
levels(x)
season2 <- revalue(x, season_names)
## An equivalent alternative:
##mapvalues(x, from = c("1", "2","3","4"), to = c("spring", "summer", "fall", "winter"))

## Prepend the relabelled season and plot humidity by season.
bikes <- data.frame(season2, bikes)
boxplot(humidity ~ season2, data = bikes)
## Rename Month
## Month holds the month number as text, so calling factor() on it directly
## sorts the levels alphabetically ("1", "10", "11", "12", "2", ...) and the
## boxplot shows the months out of calendar order. Converting to integer
## first makes factor() sort the levels numerically; revalue() then only
## relabels the levels and keeps that order.
## NOTE: a Month value that is not a number becomes NA here (with a warning).
y <- factor(as.integer(as.character(bikes$Month)))
levels(y)
month_names <- c(
  "1" = "Jan", "2" = "Feb", "3" = "March", "4" = "April",
  "5" = "May", "6" = "June", "7" = "July", "8" = "Aug",
  "9" = "Sep", "10" = "Oct", "11" = "Nov", "12" = "Dec"
)
Month2 <- revalue(y, month_names)
bikes <- data.frame(Month2, bikes)

## `order` is not a boxplot() argument, so it has been dropped; the boxes are
## drawn in the level order of Month2, which is now calendar order.
boxplot(humidity ~ Month2, data = bikes)