From 70743ad730c43db64dc91cc3919b5104d7a2227d Mon Sep 17 00:00:00 2001 From: Elizabeth Humphries Date: Thu, 6 Jun 2024 21:30:43 -0400 Subject: [PATCH] update datasets --- modules/Functions/Functions.Rmd | 71 +++++++++++---------- modules/Functions/clean_data.csv | 33 ---------- modules/Functions/lab/Functions_Lab.Rmd | 36 ++++++++--- modules/Functions/lab/Functions_Lab_Key.Rmd | 61 +++++++++--------- 4 files changed, 94 insertions(+), 107 deletions(-) delete mode 100644 modules/Functions/clean_data.csv diff --git a/modules/Functions/Functions.Rmd b/modules/Functions/Functions.Rmd index 28e9df57..a0229339 100644 --- a/modules/Functions/Functions.Rmd +++ b/modules/Functions/Functions.Rmd @@ -8,7 +8,7 @@ output: ```{r, echo = FALSE, message = FALSE} library(dplyr) -library(jhur) +library(dasehr) library(knitr) library(stringr) library(tidyr) @@ -189,13 +189,13 @@ We can use `filter(row_number() == n)` to extract a row of a tibble: ```{r message=FALSE} get_row <- function(dat, row) dat %>% filter(row_number() == row) -cars <- read_kaggle() -cars_1_8 <- cars %>% select(1:8) +ces <- calenviroscreen +ces_1_8 <- ces %>% select(1:8) ``` ```{r} -get_row(dat = cars, row = 10) -get_row(dat = iris, row = 4) +get_row(dat = ces, row = 10) +get_row(dat = ces, row = 4) ``` @@ -210,7 +210,7 @@ get_index <- function(dat, row, col) { select(all_of(col)) } -get_index(dat = cars, row = 10, col = 8) +get_index(dat = ces, row = 10, col = 7) ``` @@ -225,7 +225,7 @@ get_top <- function(dat, row = 1, col = 1) { select(all_of(col)) } -get_top(dat = cars) +get_top(dat = ces) ``` ## Functions for tibbles @@ -239,7 +239,7 @@ clean_dataset <- function(dataset, col_name) { return(my_data_out) } -clean_dataset(dataset = mtcars, col_name = "cyl") +clean_dataset(dataset = ces, col_name = "CES4.0Score") ``` ## Summary @@ -278,17 +278,19 @@ sapply(, some_function) You can also pipe into your function. 
```{r comment=""} -head(iris, n = 2) -sapply(iris, class) -iris %>% sapply(class) +er_visits <- CO_heat_ER + +head(er_visits, n = 2) +sapply(er_visits, class) +er_visits %>% sapply(class) ``` ## Using your custom functions: `sapply()` ```{r} -select(cars, VehYear:VehicleAge) %>% head() -select(cars, VehYear:VehicleAge) %>% +select(er_visits, rate:upper95cl) %>% head() +select(er_visits, rate:upper95cl) %>% sapply(times_2) %>% head() ``` @@ -297,7 +299,7 @@ select(cars, VehYear:VehicleAge) %>% ## Using your custom functions "on the fly" to iterate ```{r comment=""} -select(cars, VehYear:VehicleAge) %>% +select(er_visits, rate:upper95cl) %>% sapply(function(x) x / 1000) %>% head() ``` @@ -310,10 +312,10 @@ select(cars, VehYear:VehicleAge) %>% Already know how to use functions to modify columns using `mutate()` or calculate summary statistics using `summarize()`. ```{r} -cars %>% - mutate(VehOdo_round = round(VehOdo, -3)) %>% - summarize(max_Odo_round = max(VehOdo_round), - max_Odo = max(VehOdo)) +er_visits %>% + mutate(rate_round = round(rate, 2)) %>% + summarize(max_rate_round = max(rate_round, na.rm = TRUE), + max_rate = max(rate, na.rm = TRUE)) ``` @@ -339,21 +341,20 @@ mutate(across(.cols = , .fns = function)) Combining with `summarize()` ```{r warning=FALSE} -cars_dbl <- cars %>% select(Make, starts_with("Veh")) +ces_dbl <- ces %>% select(CaliforniaCounty, CES4.0Score, CES4.0Percentile) -cars_dbl %>% - summarize(across(.cols = everything(), .fns = mean)) +ces_dbl %>% + summarize(across(.cols = everything(), .fns = function(x) mean(x, na.rm = TRUE))) ``` - ## Applying functions with `across` from `dplyr` Can use with other tidyverse functions like `group_by`! ```{r} -cars_dbl %>% - group_by(Make) %>% - summarize(across(.cols = everything(), .fns = mean)) +ces_dbl %>% + group_by(CaliforniaCounty) %>% + summarize(across(.cols = everything(), .fns = function(x) mean(x, na.rm = TRUE))) ``` @@ -362,8 +363,8 @@ cars_dbl %>% To add arguments to functions, may need to use anonymous function.
In this syntax, the shorthand `\(x)` is equivalent to `function(x)`. ```{r warning=FALSE} -cars_dbl %>% - group_by(Make) %>% +ces_dbl %>% + group_by(CaliforniaCounty) %>% summarize(across(.cols = everything(), .fns = \(x) mean(x, na.rm = TRUE))) ``` @@ -373,9 +374,9 @@ cars_dbl %>% Using different `tidyselect()` options (e.g., `starts_with()`, `ends_with()`, `contains()`) ```{r warning=FALSE} -cars_dbl %>% - group_by(Make) %>% - summarize(across(.cols = starts_with("Veh"), .fns = mean)) +ces_dbl %>% + group_by(CaliforniaCounty) %>% + summarize(across(.cols = contains("Perc"), .fns = mean)) ``` @@ -384,11 +385,11 @@ cars_dbl %>% Combining with `mutate()`: rounding to the nearest power of 10 (with negative digits value) ```{r} -cars_dbl %>% +ces_dbl %>% mutate(across( - .cols = starts_with("Veh"), + .cols = starts_with("CES"), .fns = round, - digits = -3 + digits = -1 )) ``` @@ -403,9 +404,9 @@ or ```{r warning=FALSE, message=FALSE} # Child mortality data -mort <- read_mortality() %>% rename(country = `...1`) +co2 <- yearly_co2_emissions -mort %>% +co2 %>% select(country, starts_with("194")) %>% mutate(across( .cols = c(`1943`, `1944`, `1945`), diff --git a/modules/Functions/clean_data.csv b/modules/Functions/clean_data.csv deleted file mode 100644 index 49929669..00000000 --- a/modules/Functions/clean_data.csv +++ /dev/null @@ -1,33 +0,0 @@ -cyl -6 -6 -4 -6 -8 -6 -8 -4 -4 -6 -6 -8 -8 -8 -8 -8 -8 -4 -4 -4 -4 -8 -8 -8 -8 -4 -4 -4 -8 -6 -8 -4 diff --git a/modules/Functions/lab/Functions_Lab.Rmd b/modules/Functions/lab/Functions_Lab.Rmd index 0c274d6c..2c98e38a 100644 --- a/modules/Functions/lab/Functions_Lab.Rmd +++ b/modules/Functions/lab/Functions_Lab.Rmd @@ -38,7 +38,16 @@ return(result) ``` ```{r 1.1response} +nums <- c(2, 7, 21, 30, 90) +sum_squared <- function(x) sum(x)^2 +sum_squared(x = nums) + +sum_squared <- function(x) { + out <- sum(x)^2 + return(out) +} +sum_squared(x = nums) ``` ### 1.2 @@ -46,7 +55,11 @@ return(result) Create a function that takes two
arguments, (1) a vector and (2) a numeric value. This function tests whether the number (2) is contained within the vector (1). **Hint**: use `%in%`. Call it `has_n`. Test your function on the vector `c(2,7,21,30,90)` and number `21` - you should get the answer TRUE. ```{r 1.2response} +nums <- c(2, 7, 21, 30, 90) +a_num <- 21 +has_n <- function(x, n) n %in% x +has_n(x = nums, n = a_num) ``` ### 1.3 @@ -54,7 +67,11 @@ Create a function that takes two arguments, (1) a vector and (2) a numeric value Amend the function `has_n` from question 1.2 so that it takes a default value of `21` for the numeric argument. ```{r 1.3response} +nums <- c(2, 7, 21, 30, 90) +a_num <- 21 +has_n <- function(x, n = 21) n %in% x +has_n(x = nums) ``` ### 1.4 @@ -62,7 +79,8 @@ Amend the function `has_n` from question 1.2 so that it takes a default value of Create a new number `b_num` that is not contained with `nums`. Use your updated `has_n` function with the default value and add `b_num` as the `n` argument when calling the function. What is the outcome? ```{r 1.4response} - +b_num <- 11 +has_n(x = nums, n = b_num) ``` @@ -70,7 +88,7 @@ Create a new number `b_num` that is not contained with `nums`. Use your updated ### 2.1 -Read in the SARS-CoV-2 Vaccination data from https://daseh.org/data/USA_covid19_vaccinations.csv. Assign the data the name "vacc". +Read in the CalEnviroScreen data from https://daseh.org/data/CalEnvironmentalScreen_data.csv. Assign the data the name "ces". ```{r message = FALSE, label = '2.1response'} @@ -78,7 +96,7 @@ Read in the SARS-CoV-2 Vaccination data from https://daseh.org/data/USA_covid19_ ### 2.2 -We want to get some summary statistics on the Moderna vaccines. Use `across` inside `summarize` to get the sum total number vaccine doses for any variable containing the word "Moderna" AND starting with "Total". **Hint**: use `contains()` AND `starts_with()` to select the right columns inside `across`. 
Keep in mind that this includes the United States as a whole and so it is not totally accurate! Remember that `NA` values can influence calculations. +We want to get some summary statistics on water contamination. Use `across` inside `summarize` to get the sum for any variable containing the string "water" AND ending with "Pctl". **Hint**: use `contains()` AND `ends_with()` to select the right columns inside `across`. Remember that `NA` values can influence calculations. ``` # General format @@ -96,7 +114,7 @@ data %>% ### 2.3 -Use `across` and `mutate` to convert all columns containing the word "Percent" into proportions (i.e., divide that value by 100). **Hint**: use `contains()` to select the right columns within `across()`. Use a "function on the fly" to divide by 100. It will also be easier to check your work if you `select()` columns that match "Percent". +Use `across` and `mutate` to convert all columns containing the word "Pctl" into proportions (i.e., divide that value by 100). **Hint**: use `contains()` to select the right columns within `across()`. Use a "function on the fly" to divide by 100. It will also be easier to check your work if you `select()` columns that match "Pctl". ```{r 2.3response} @@ -104,7 +122,7 @@ Use `across` and `mutate` to convert all columns containing the word "Percent" i ### 2.4 -Use `across` and `mutate` to convert all columns starting with the word "Total" into a binary variable: TRUE if the value is greater than 10,000,000 and FALSE if less than or equal to 10,000,000. **Hint**: use `starts_with()` to select the columns starting with "Total". Use a "function on the fly" to do a logical test if the value is greater than 10,000,000. +Use `across` and `mutate` to convert all columns starting with the string "PM" into a binary variable: TRUE if the value is greater than 10 and FALSE if less than or equal to 10. **Hint**: use `starts_with()` to select the columns that start with "PM".
Use a "function on the fly" to do a logical test if the value is greater than 10. ```{r 2.4response} @@ -115,11 +133,11 @@ Use `across` and `mutate` to convert all columns starting with the word "Total" ### P.1 -Take your code from question 2.4 and assign it to the variable `vacc_dat`. +Take your code from question 2.4 and assign it to the variable `ces_dat`. -- use `filter()` to drop any rows where "United States" appears in `State/Territory/Federal Entity`. Make sure to reassign this to `vacc_dat`. -- Create a ggplot boxplot (`geom_boxplot()`) where (1) the x-axis is `Total Doses Delivered` and (2) the y-axis is `Percent of fully vaccinated people with booster doses`. -- You change the `labs()` layer so that the x-axis is "Total Doses Delivered: Greater than 10,000,000" +- use `filter()` to drop any rows where "Oakland" appears in `ApproxLocation`. Make sure to reassign this to `ces_dat`. +- Create a ggplot boxplot (`geom_boxplot()`) where (1) the x-axis is `PM2.5` and (2) the y-axis is `Asthma`. +- You change the `labs()` layer so that the x-axis is "ER Visits for Asthma: PM2.5 greater than 10" ```{r P.1response} diff --git a/modules/Functions/lab/Functions_Lab_Key.Rmd b/modules/Functions/lab/Functions_Lab_Key.Rmd index 4b79130a..44b8edb5 100644 --- a/modules/Functions/lab/Functions_Lab_Key.Rmd +++ b/modules/Functions/lab/Functions_Lab_Key.Rmd @@ -88,17 +88,18 @@ has_n(x = nums, n = b_num) ### 2.1 -Read in the SARS-CoV-2 Vaccination data from https://daseh.org/data/USA_covid19_vaccinations.csv. Assign the data the name "vacc". +Read in the CalEnviroScreen from https://daseh.org/data/CalEnvironmentalScreen_data.csv. Assign the data the name "ces". 
```{r message = FALSE, label = '2.1response'} -vacc <- read_csv("https://daseh.org/data/USA_covid19_vaccinations.csv") +ces <- + read_csv("https://daseh.org/data/CalEnvironmentalScreen_data.csv") # If downloaded -# vacc <- read_csv("USA_covid19_vaccinations.csv") +# ces <- read_csv("CalEnvironmentalScreen_data.csv") ``` ### 2.2 -We want to get some summary statistics on the Moderna vaccines. Use `across` inside `summarize` to get the sum total number vaccine doses for any variable containing the word "Moderna" AND starting with "Total". **Hint**: use `contains()` AND `starts_with()` to select the right columns inside `across`. Keep in mind that this includes the United States as a whole and so it is not totally accurate! Remember that `NA` values can influence calculations. +We want to get some summary statistics on water contamination. Use `across` inside `summarize` to get the sum for any variable containing the string "water" AND ending with "Pctl". **Hint**: use `contains()` AND `ends_with()` to select the right columns inside `across`. Remember that `NA` values can influence calculations. ``` # General format @@ -111,14 +112,14 @@ data %>% ``` ```{r 2.2response} -vacc %>% +ces %>% summarize(across( - .cols = contains("Moderna") & starts_with("Total"), + .cols = contains("Water") & ends_with("Pctl"), .fns = sum )) -vacc %>% +ces %>% summarize(across( - .cols = contains("Moderna") & starts_with("Total"), + .cols = contains("Water") & ends_with("Pctl"), .fns = sum, na.rm = TRUE )) @@ -126,26 +127,26 @@ vacc %>% ### 2.3 -Use `across` and `mutate` to convert all columns containing the word "Percent" into proportions (i.e., divide that value by 100). **Hint**: use `contains()` to select the right columns within `across()`. Use a "function on the fly" to divide by 100. It will also be easier to check your work if you `select()` columns that match "Percent".
+Use `across` and `mutate` to convert all columns containing the word "Pctl" into proportions (i.e., divide that value by 100). **Hint**: use `contains()` to select the right columns within `across()`. Use a "function on the fly" to divide by 100. It will also be easier to check your work if you `select()` columns that match "Pctl". ```{r 2.3response} -vacc %>% +ces %>% mutate(across( - .cols = contains("Percent"), + .cols = contains("Pctl"), .fns = function(x) x / 100 )) %>% - select(contains("Percent")) + select(contains("Pctl")) ``` ### 2.4 -Use `across` and `mutate` to convert all columns starting with the word "Total" into a binary variable: TRUE if the value is greater than 10,000,000 and FALSE if less than or equal to 10,000,000. **Hint**: use `starts_with()` to select the columns starting with "Total". Use a "function on the fly" to do a logical test if the value is greater than 10,000,000. +Use `across` and `mutate` to convert all columns starting with the string "PM" into a binary variable: TRUE if the value is greater than 10 and FALSE if less than or equal to 10. **Hint**: use `starts_with()` to select the columns that start with "PM". Use a "function on the fly" to do a logical test if the value is greater than 10. ```{r 2.4response} -vacc %>% +ces %>% mutate(across( - .cols = starts_with("Total"), - .fns = function(x) x > 10000000 + .cols = starts_with("PM"), + .fns = function(x) x > 10 )) ``` @@ -154,28 +155,28 @@ vacc %>% ### P.1 -Take your code from question 2.4 and assign it to the variable `vacc_dat`. +Take your code from question 2.4 and assign it to the variable `ces_dat`. -- use `filter()` to drop any rows where "United States" appears in `State/Territory/Federal Entity`. Make sure to reassign this to `vacc_dat`. -- Create a ggplot boxplot (`geom_boxplot()`) where (1) the x-axis is `Total Doses Delivered` and (2) the y-axis is `Percent of fully vaccinated people with booster doses`. 
-- You change the `labs()` layer so that the x-axis is "Total Doses Delivered: Greater than 10,000,000" +- use `filter()` to drop any rows where "Oakland" appears in `ApproxLocation`. Make sure to reassign this to `ces_dat`. +- Create a ggplot boxplot (`geom_boxplot()`) where (1) the x-axis is `PM2.5` and (2) the y-axis is `Asthma`. +- You change the `labs()` layer so that the x-axis is "ER Visits for Asthma: PM2.5 greater than 10" ```{r P.1response} -vacc_dat <- - vacc %>% +ces_dat <- + ces %>% mutate(across( - .cols = starts_with("Total"), - .fns = function(x) x > 10000000 + .cols = starts_with("PM"), + .fns = function(x) x > 10 )) %>% - filter(`State/Territory/Federal Entity` != "United States") + filter(`ApproxLocation` != "Oakland") -vacc_boxplot <- function(df) { +ces_boxplot <- function(df) { ggplot(df) + geom_boxplot(aes( - x = `Total Doses Delivered`, - y = `Percent of fully vaccinated people with booster doses` + x = `PM2.5`, + y = `Asthma` )) + - labs(x = "Total Doses Delivered: Greater than 10,000,000") + labs(x = "ER Visits for Asthma: PM2.5 greater than 10") } -vacc_boxplot(vacc_dat) +ces_boxplot(ces_dat) ```