-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_preprocess.Rmd
119 lines (90 loc) · 2.41 KB
/
data_preprocess.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
---
title: "Data Preprocessing"
output:
  html_document:
    code_folding: hide
    toc_float: true
---
## Load Packages
```{r, message=FALSE}
library(tidyverse)
library(dplyr)
library(patchwork)
```
## Load Raw Data
```{r}
library("nycflights13")

# 2013 tables come straight from the nycflights13 package. The namespace is
# spelled out so that re-running this chunk in a live session can never pick up
# same-named objects left behind by an earlier load().
flights_2013 =
  nycflights13::flights |>
  janitor::clean_names()
weather_2013 =
  nycflights13::weather |>
  janitor::clean_names()

# 2017 tables are restored from the .rda files into a scratch environment
# rather than the global workspace. The code below reads objects called
# `flights` and `weather` from it; if either is absent we stop with a clear
# message instead of silently falling back to the 2013 package data.
raw_2017 = new.env()
load("data/flights_2017.rda", envir = raw_2017)
load("data/weather_2017.rda", envir = raw_2017)
stopifnot(
  "expected an object named `flights` in the 2017 .rda files" =
    exists("flights", envir = raw_2017, inherits = FALSE),
  "expected an object named `weather` in the 2017 .rda files" =
    exists("weather", envir = raw_2017, inherits = FALSE)
)

flights_2017 =
  get("flights", envir = raw_2017, inherits = FALSE) |>
  janitor::clean_names()
weather_2017 =
  get("weather", envir = raw_2017, inherits = FALSE) |>
  janitor::clean_names()
```
## Examine Missingness: identify columns with a large proportion of missing values for removal
```{r}
# Per-column missingness of the 2017 weather table. `missing_percentage` is the
# NA count divided by the number of rows, i.e. a fraction in [0, 1] rather than
# a value scaled to 100.
missing_values = local({
  na_counts = colSums(is.na(weather_2017))
  tibble(
    column_names = colnames(weather_2017),
    missing_percentage = na_counts / nrow(weather_2017)
  )
})
print(missing_values)
```
## Data Cleaning: 2013 dataset
```{r}
# Weather, 2013: discard temp, dewp and humid, keep only rows that are complete
# in the remaining columns, then drop time_hour.
weather_2013_clean =
  weather_2013 |>
  select(-temp, -dewp, -humid) |>
  drop_na() |>
  select(-time_hour)

# Flights, 2013: keep complete, de-duplicated rows. Completeness and uniqueness
# are judged on the full set of columns, before the timing columns are removed.
flights_2013_clean =
  flights_2013 |>
  drop_na() |>
  unique() |>
  select(
    -c(dep_time, sched_dep_time, dep_delay, arr_time, sched_arr_time, time_hour)
  )

# Attach the weather observations to each flight by origin airport and
# scheduled date/hour. base::merge() with default settings keeps only rows
# whose key is present in both tables.
merge_data_2013 =
  flights_2013_clean |>
  merge(
    weather_2013_clean,
    by = c("origin", "year", "month", "day", "hour")
  )
```
## Data Cleaning: 2017 dataset
```{r}
# Weather, 2017: discard temp, dewp and humid, keep only rows that are complete
# in the remaining columns, then drop time_hour.
weather_2017_clean =
  weather_2017 |>
  select(-temp, -dewp, -humid) |>
  drop_na() |>
  select(-time_hour)

# Flights, 2017: keep complete, de-duplicated rows. Completeness and uniqueness
# are judged on the full set of columns, before the timing columns are removed.
flights_2017_clean =
  flights_2017 |>
  drop_na() |>
  unique() |>
  select(
    -c(dep_time, sched_dep_time, dep_delay, arr_time, sched_arr_time, time_hour)
  )

# Attach the weather observations to each flight by origin airport and
# scheduled date/hour. base::merge() with default settings keeps only rows
# whose key is present in both tables.
merge_data_2017 =
  flights_2017_clean |>
  merge(
    weather_2017_clean,
    by = c("origin", "year", "month", "day", "hour")
  )
```
## Export to csv
```{r}
# Write both merged tables to CSV files under data/, without a row-name column.
write.csv(
  x = merge_data_2013,
  file = "data/merge_data_2013.csv",
  row.names = FALSE
)
write.csv(
  x = merge_data_2017,
  file = "data/merge_data_2017.csv",
  row.names = FALSE
)
```