-
Notifications
You must be signed in to change notification settings - Fork 0
/
01-data_prep_cleaning.R
221 lines (152 loc) · 7.55 KB
/
01-data_prep_cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#############################################
########### Data Prep and Cleaning ##########
#############################################
#####
# Date: TODAY'S DATE
# By: *INSERT NAME HERE*
# Description: PROJECT DESCRIPTION
# Version of R used: CURRENT VERSION OF R
#####################################
### Import data & load libraries ####
#####################################

## Install the required script packages if not yet installed
# Install the pacman package if necessary. requireNamespace() only checks
# whether pacman can be loaded, which avoids building the full table of
# installed packages that installed.packages() produces.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# Load every package used in this script through pacman
pacman::p_load(parallel, rio, psych, sjmisc, lubridate, tidyverse, janitor)

## Load data
# RData files work the best in R.
# Try to only open RData files to avoid any issues.
# CSV works the next best in R.
# If you cannot load RData files, try to only save and open CSV files.
# SPSS files can be buggy to import, especially factors and labels.
# The following command opens a dialog box that lets you select the file you
# wish to load; the imported data is then passed through clean_names().
dat <- import(file.choose()) %>%
  clean_names()

# Check that you loaded the correct dataset
View(dat)

# List the variables in the dataset
glimpse(dat)

# NB: Alt + Shift + K --> will bring up keyboard shortcuts
################################
###### Recoding Variables ######
################################

### Remove non-reconsenting participants, if needed
# Count the participants whose reconsent response is coded 2 (did not
# provide reconsent). Missing responses are ignored by this count.
sum(dat$reconsent == 2, na.rm = TRUE)
frq(dat$reconsent)

# Keep only rows where reconsent is not 2. Rows with a missing reconsent
# value are dropped too, because filter() discards rows whose condition
# evaluates to NA.
dat <- dat %>%
  filter(reconsent != 2)
frq(dat$reconsent)
#### Recode IVs
# Note: this suggested recoding scheme is based on Qualtrics data
# where all levels of the random assignment conditions are coded as 1

## Recode IV1 levels....high = 2, low = 1
# The "high" indicator is doubled and added to the "low" indicator, with
# missing indicators ignored. A row total of 0 means neither indicator
# contributed a value, so it is recoded to NA.
dat$iv1 <- with(dat, rowSums(cbind(iv1_high * 2, iv1_low), na.rm = TRUE))
dat$iv1[dat$iv1 == 0] <- NA
dat$iv1

## Recode IV2 levels....high = 2, low = 1
# Same scheme as IV1.
dat$iv2 <- with(dat, rowSums(cbind(iv2_high * 2, iv2_low), na.rm = TRUE))
dat$iv2[dat$iv2 == 0] <- NA
dat$iv2

# Reverse code any items required:
# example syntax is for a 1 to 9 scale (10 - x)
dat$dvitem_r <- 10 - dat$dvitem
dat$dvitem_r

# Check number of valid (non-missing) responses per condition
sum(!is.na(dat$iv1_high))
sum(!is.na(dat$iv1_low))
sum(!is.na(dat$iv2_high))
sum(!is.na(dat$iv2_low))
sum(!is.na(dat$iv1))
sum(!is.na(dat$iv2))

# Descriptives for the raw condition indicators and the recoded IVs
describe(select(dat, iv1_high, iv1_low, iv1, iv2_high, iv2_low, iv2))
##################################################################
####### Adding labels and levels to categorical variables ########
##################################################################
# Set the variables to factors with labels, if experimental manipulations.
# If there are no conditions (e.g., a survey), then create labels and factor
# levels as needed for ethnicity, year in school, etc.
# NOTE: string literals must use straight quotes ("); typographic quotes
# copied from a word processor are not valid R string delimiters.
dat$iv1 <- factor(dat$iv1, levels = c(1, 2), labels = c("Low", "High"),
                  exclude = NA)
dat$iv2 <- factor(dat$iv2, levels = c(1, 2), labels = c("Low", "High"),
                  exclude = NA)
dat$gender <- factor(dat$gender, levels = c(1, 2),
                     labels = c("Male", "Female"), exclude = NA)
dat$year <- factor(
  dat$year,
  levels = c(1, 2, 3, 4, 5),
  labels = c("First", "Second", "Third", "Fourth", "GradStudent"),
  exclude = NA
)
dat$ethnicity <- factor(
  dat$ethnicity,
  levels = 1:10,
  labels = c(
    "Indigenous/First Nations", "African", "East Asian", "South Asian",
    "European", "Hispanic/Latin-American", "Middle Eastern",
    "Euro-North American", "Pacific Islander", "Other"
  ),
  exclude = NA
)

## Check IVs to ensure recoding worked correctly for manipulations
describe(select(dat, iv1, iv2))
#########################################
###### Recode manipulation timings ######
#########################################
# Again based on Qualtrics data; assumes page timings were recorded.

## Combine the timing columns for each manipulation, following the default
## Qualtrics variable naming scheme. Each row's timings are summed with
## missing values ignored.
dat$iv1_timing <- rowSums(select(dat, iv1_1_3, iv1_2_3), na.rm = TRUE)
dat$iv2_timing <- rowSums(select(dat, iv2_1_3, iv2_2_3), na.rm = TRUE)

# Summary of page timings
describe(select(dat, iv1_timing, iv2_timing))
##########################
###### Creating DVs ######
##########################

## Build each DV as the row-wise mean of its items, ignoring missing items
dat$avg_dv1 <- rowMeans(select(dat, dv1_1, dv1_2, dv1_3, dv1_4), na.rm = TRUE)
dat$avg_dv2 <- rowMeans(select(dat, dv2_1, dv2_2, dv2_3, dv2_4), na.rm = TRUE)

# NB: when the item names share a prefix, the selection can be shortened:
# dat$avg_dv1 <- dat %>% select(starts_with("dv1_")) %>% rowMeans(na.rm = TRUE)

# If a difference score is needed (DV1 average minus DV2 average):
dat$diff <- dat$avg_dv1 - dat$avg_dv2
#######################################
########### Initial Analyses ##########
#######################################

##### Example demographic summaries
# Number of males and females in sample
table(dat$gender)

# Percentage of males and females in sample
# prop.table() turns the counts into proportions; * 100 gives percentages.
prop.table(table(dat$gender)) * 100

# Mean of sample age (missing ages ignored)
mean(dat$age, na.rm = TRUE)

# Number from each ethnicity in sample
table(dat$ethnicity)

# Table of gender by ethnicity
table(dat$gender, dat$ethnicity)

# Descriptives of IVs and DVs
describe(select(dat, iv1, iv2, avg_dv1, avg_dv2))
########################################################################
############### Remove unwanted variables from dataset #################
########################################################################
# Get the list of variables in the dataset
glimpse(dat)

# List any unwanted variables inside any_of(c(...)).
# Example: the raw condition indicators, which are redundant once iv1 and
# iv2 have been created. any_of() skips names that are not present, so this
# step does not fail when a column is absent or the script is re-run.
dat <- select(dat, -any_of(c("iv1_high", "iv1_low", "iv2_high", "iv2_low")))
glimpse(dat)
########################################################################
###### Create Project Folder System and Saving Data and Workspace ######
########################################################################
## The following commands create a coherent system for automating the
## thoughtless parts of a data analysis project.
## It provides a log of your work in a clear file/folder hierarchy.
## Each folder is created only if it is missing. dir.exists() is used because
## exists() looks up R objects by name rather than paths on disk.

# Replace PROJECT_NAME with your desired project name
# This is where we will save our entire project
if (!dir.exists("./PROJECT_NAME")) dir.create("./PROJECT_NAME")
# setwd("./PROJECT_NAME/") # this will set the working directory to your new project

# Do not modify these lines
if (!dir.exists("./r")) dir.create("./r") # R scripts
if (!dir.exists("./data")) dir.create("./data") # datasets
# raw_data sits inside ./data, so ./data must be created first
if (!dir.exists("./data/raw_data")) dir.create("./data/raw_data") # raw data
if (!dir.exists("./doc")) dir.create("./doc") # manuscripts and reports
if (!dir.exists("./figures")) dir.create("./figures") # figures
if (!dir.exists("./tables")) dir.create("./tables") # tables
if (!dir.exists("./manuscript")) dir.create("./manuscript") # manuscript files

# Download the recommended .gitignore file into the working directory
download.file(
  url = "https://raw.githubusercontent.com/rastlab/R_project_template/master/.gitignore",
  destfile = ".gitignore"
)
# Write the cleaned dataset (the `dat` object only) to the data folder in
# three formats.

# R data file
export(x = dat, file = "./data/00_data_cleaned.RData")

# CSV file
export(x = dat, file = "./data/00_data_cleaned.csv")

# SPSS SAV file
export(x = dat, file = "./data/00_data_cleaned.sav")