-
Notifications
You must be signed in to change notification settings - Fork 0
/
example_script.R
executable file
·211 lines (139 loc) · 6.38 KB
/
example_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# Read in my_data (.csv) #####
## .csv and .txt files can be read with base R alone -- no add-on packages
## are needed.
## Calling read.csv() WITHOUT assigning the result PRINTS the file contents
## to the console.
read.csv(file = "example_data.csv")
## Call it again, this time ASSIGNING the result to an OBJECT so the data can
## be reused further down the script.
my_data_csv <- read.csv(file = "example_data.csv")
## Open the object in a spreadsheet-style viewer
View(my_data_csv)
## To read in different file types, we will need to download additional R
## packages.
# Install packages ####
## R packages are add-ons that expand the functionality of base R. It's helpful
## to think of R as a series of books. "Base R" is the starter set of "books"
## you get when you first install R. install.packages() downloads the new "book"
## and adds it to your collection. This step needs to be done only once, or
## periodically to update.
## Note: quotation marks ("") ARE REQUIRED to specify a package name.
##
## openxlsx provides functions to read and write Microsoft Excel files.
## haven provides functions to read from various stats packages, including SPSS,
## Stata, and SAS. Notably, haven is able to read in values and value labels.
## ggplot2 streamlines the creation of plots to visualize your data.
## gtsummary streamlines the creation of tables to summarize your data.
required_pkgs <- c(
  "openxlsx",
  "haven",
  "ggplot2",  ## for plotting
  "gtsummary" ## for tables
)
## Because installation only needs to happen once, check which packages are
## already available and install ONLY the ones that are missing. This keeps
## the script re-runnable from top to bottom without re-downloading everything.
## requireNamespace() returns TRUE/FALSE instead of stopping with an error.
is_installed <- vapply(
  required_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
## To UPDATE a package that is already installed, call install.packages() on
## it directly, e.g. install.packages("ggplot2").
# Load Packages ####
## In order to load the package, we use library(). This is like taking the book
## off your shelf and opening it to read/use. This step needs to be done each
## time you open R/RStudio, so it's helpful to have a section at the top of the
## script that loads all packages needed.
## Note: quotation marks ("") are NOT required for the library function.
library(haven)
library(openxlsx)
library(ggplot2)
library(gtsummary)
# Read in my_data (.xlsx) ####
## Excel is handy for managing data, but -- just like a .csv file -- a single
## cell cannot hold both a numeric code and its text label. To interpret coded
## values you will need to consult a separate metadata file.
my_data_xlsx <- read.xlsx(xlsxFile = "example_data.xlsx")
## Open the imported data in the viewer
View(my_data_xlsx)
#### Summarize (Frequency Tables)
## One frequency table per coded column; [["name"]] extracts a single column.
table(my_data_xlsx[["SEX"]])
table(my_data_xlsx[["EDATTAIN"]])
table(my_data_xlsx[["EMPSTAT"]])
# Read in my_data (with labels) ####
## haven reads and writes the formatted data files used by Stata, SPSS, and
## SAS. Besides getting the data into R, it also supports LABELED VALUES.
my_data <- read_dta(file = "example_data.dta")
View(my_data) # It's still numeric!
## Look at the Environment Panel: haven's labelled columns are complex
## objects. Converting them to factors -- R's native format for categorical
## data -- keeps the labels and works more smoothly with other R functions.
## as_factor() from the haven package performs that conversion, one column
## at a time.
my_data[["SEX"]] <- as_factor(my_data[["SEX"]])
my_data[["EDATTAIN"]] <- as_factor(my_data[["EDATTAIN"]])
my_data[["EMPSTAT"]] <- as_factor(my_data[["EMPSTAT"]])
View(my_data) # Labels present!
#### Summarize (Frequency Tables)
table(my_data[["SEX"]])
table(my_data[["EDATTAIN"]])
table(my_data[["EMPSTAT"]])
#### Summarize (Numeric)
summary(my_data[["AGE"]])
#### Summarize (Two-Way Tables)
## First argument becomes the rows, second argument becomes the columns.
table(my_data[["EDATTAIN"]], my_data[["SEX"]])
# Basic Data Manipulation ####
#### Subset observations
## subset() takes a DATA.FRAME first, then a condition that evaluates to a
## LOGICAL (TRUE or FALSE) value for each row; rows where it is TRUE are kept.
sub_data <- subset(my_data, EDATTAIN != "Unknown")
table(sub_data[["EDATTAIN"]]) ## empty levels (Unknown) present
#### Drop unused factor levels
sub_data <- droplevels(sub_data) ## drop removed labels
# Store the two-way table in an object this time so it can be reused later
crosstab <- table(sub_data[["EDATTAIN"]], sub_data[["SEX"]])
## Typing the name of an R object on its own prints that object's contents
crosstab
## Large objects may get truncated/summarized
my_data
# Write Out R objects #####
## R objects can be written out to several file types. Compare the calls
## below: the first argument is the R object to be written, and the second
## names the destination file.
## NOTE(review): the file names say "emp_by_sex", but crosstab is built from
## EDATTAIN and SEX -- confirm whether the names should be changed.
write.csv(x = crosstab, file = "xtab_emp_by_sex.csv")
write.xlsx(x = crosstab, file = "xtab_emp_by_sex.xlsx")
## write_dta() is given a data.frame version of the table, and its
## destination argument is called `path` instead of `file`.
write_dta(data = as.data.frame(crosstab), path = "xtab_emp_by_sex.dta")
# Base R vs Packages - Tables #####
#### Fancier Tables (gtsummary) ####
## Base R route: print the counts, convert them to proportions, then round.
crosstab ## print frequencies
ppn_crosstab <- prop.table(x = crosstab) ## get proportions
ppn_crosstab
ppn_crosstab <- round(x = ppn_crosstab, digits = 2) ## 2 decimal places
ppn_crosstab
## Package route: tbl_summary() from gtsummary summarizes sub_data, split by
## SEX, in a single call.
tbl_summary(data = sub_data, by = "SEX")
# Base R vs Packages - Graphs #####
#### Basic Visualization (Base R) ####
## Base R provides MANY built in functions, each with many options to customize
## your plots.
barplot(crosstab)
## The flexibility/customization comes at the trade-off of user input. Adding a
## legend takes 2 extra steps in Base R.
## First, we use layout() to tell R to treat the plot window as a matrix(),
## consisting of 1 row and 4 columns: 1,1,2,2. The first plot will occupy the
## first two columns (as a merged column), while the second plot will occupy the
## last two columns.
layout(matrix(c(1, 1, 2, 2), ncol = 4))
## Pick one colour per education level. Deriving the number of colours from
## the data (instead of hard-coding it) keeps the bars and the legend in sync
## if the number of levels ever changes.
edu_levels <- levels(sub_data$EDATTAIN)
edu_cols <- rainbow(length(edu_levels))
## barplot as normal
barplot(crosstab, col = edu_cols, main = "Education by Sex")
## Make an empty plot (of a single point) to use as the base for the legend.
## Setting an argument to "n" switches that element of the plot off.
plot(
  1,
  type = "n", ## do not plot the points
  bty = "n",  ## do not plot a bounding box
  xaxt = "n", ## do not plot an X axis
  xlab = "",  ## label the X axis as "", in other words, as blank
  yaxt = "n", ## do not plot a Y axis
  ylab = ""   ## label the Y axis as "", in other words, as blank
)
legend(
  "center",
  legend = edu_levels,
  pch = 22,
  pt.bg = edu_cols,
  inset = c(-0.2, 0),
  title = "Ed Attainment"
)
## Return the plot window to a single panel so that later base R plots are
## not squeezed into the split layout set up above.
layout(1)
#### Basic Visualization (ggplot2) ####
# ggplot2 is EXTREMELY popular for quickly generating polished looking plots. It
# does use a slightly different syntax than Base R, but there is ample
# user-support.
# ggplot() names the data and maps columns to plot features with aes(); layers
# such as geom_bar() are then added with `+`. (The older qplot() shortcut is
# deprecated in current versions of ggplot2, so ggplot() is used here.)
ggplot(sub_data, aes(x = SEX, fill = EDATTAIN)) +
  geom_bar()