From 70743ad730c43db64dc91cc3919b5104d7a2227d Mon Sep 17 00:00:00 2001
From: Elizabeth Humphries <emarellahumphries@gmail.com>
Date: Thu, 6 Jun 2024 21:30:43 -0400
Subject: [PATCH] update datasets

---
 modules/Functions/Functions.Rmd             | 71 +++++++++++----------
 modules/Functions/clean_data.csv            | 33 ----------
 modules/Functions/lab/Functions_Lab.Rmd     | 36 ++++++++---
 modules/Functions/lab/Functions_Lab_Key.Rmd | 61 +++++++++---------
 4 files changed, 94 insertions(+), 107 deletions(-)
 delete mode 100644 modules/Functions/clean_data.csv
diff --git a/modules/Functions/Functions.Rmd b/modules/Functions/Functions.Rmd
index 28e9df57..a0229339 100644
--- a/modules/Functions/Functions.Rmd
+++ b/modules/Functions/Functions.Rmd
@@ -8,7 +8,7 @@ output:
 
 ```{r, echo = FALSE, message = FALSE}
 library(dplyr)
-library(jhur)
+library(dasehr)
 library(knitr)
 library(stringr)
 library(tidyr)
@@ -189,13 +189,13 @@ We can use `filter(row_number() == n)` to extract a row of a tibble:
 ```{r message=FALSE}
 get_row <- function(dat, row) dat %>% filter(row_number() == row)
 
-cars <- read_kaggle()
-cars_1_8 <- cars %>% select(1:8)
+ces <- calenviroscreen
+ces_1_8 <- ces %>% select(1:8)
 ```
 
 ```{r}
-get_row(dat = cars, row = 10)
-get_row(dat = iris, row = 4)
+get_row(dat = ces, row = 10)
+get_row(dat = ces, row = 4)
 ```
 
 
@@ -210,7 +210,7 @@ get_index <- function(dat, row, col) {
     select(all_of(col))
 }
 
-get_index(dat = cars, row = 10, col = 8)
+get_index(dat = ces, row = 10, col = 7)
 ```
 
 
@@ -225,7 +225,7 @@ get_top <- function(dat, row = 1, col = 1) {
     select(all_of(col))
 }
 
-get_top(dat = cars)
+get_top(dat = ces)
 ```
 
 ## Functions for tibbles
@@ -239,7 +239,7 @@ clean_dataset <- function(dataset, col_name) {
   return(my_data_out)
 }
 
-clean_dataset(dataset = mtcars, col_name = "cyl")
+clean_dataset(dataset = ces, col_name = "CES4.0Score")
 ```
 
 ## Summary
@@ -278,17 +278,19 @@ sapply(<a vector, list, data frame>, some_function)
 You can also pipe into your function.
 
 ```{r comment=""}
-head(iris, n = 2)
-sapply(iris, class)
-iris %>% sapply(class)
+er_visits <- CO_heat_ER
+
+head(er_visits, n = 2)
+sapply(er_visits, class)
+er_visits %>% sapply(class)
 ```
 
 
 ## Using your custom functions: `sapply()`
 
 ```{r}
-select(cars, VehYear:VehicleAge) %>% head()
-select(cars, VehYear:VehicleAge) %>%
+select(er_visits, rate:upper95cl) %>% head()
+select(er_visits, rate:upper95cl) %>%
   sapply(times_2) %>%
   head()
 ```
@@ -297,7 +299,7 @@ select(cars, VehYear:VehicleAge) %>%
 ## Using your custom functions "on the fly" to iterate
 
 ```{r comment=""}
-select(cars, VehYear:VehicleAge) %>%
+select(er_visits, rate:upper95cl) %>%
   sapply(function(x) x / 1000) %>%
   head()
 ```
@@ -310,10 +312,10 @@ select(cars, VehYear:VehicleAge) %>%
 Already know how to use functions to modify columns using `mutate()` or calculate summary statistics using `summarize()`.
 
 ```{r}
-cars %>%
-  mutate(VehOdo_round = round(VehOdo, -3)) %>%
-  summarize(max_Odo_round = max(VehOdo_round),
-            max_Odo = max(VehOdo))
+er_visits %>%
+  mutate(rate_round = round(rate, 2)) %>%
+  summarize(max_rate_round = max(rate_round, na.rm = TRUE),
+            max_rate = max(rate, na.rm = TRUE))
 ```
 
 
@@ -339,21 +341,20 @@ mutate(across(.cols = <columns>, .fns = function))
 Combining with `summarize()`
 
 ```{r warning=FALSE}
-cars_dbl <- cars %>% select(Make, starts_with("Veh"))
+ces_dbl <- ces %>% select(CaliforniaCounty, CES4.0Score, CES4.0Percentile)
 
-cars_dbl %>%
-  summarize(across(.cols = everything(), .fns = mean))
+ces_dbl %>%
+  summarize(across(.cols = everything(), .fns = mean, na.rm = TRUE))
 ```
 
-
 ## Applying functions with `across` from `dplyr`
 
 Can use with other tidyverse functions like `group_by`!
 
 ```{r}
-cars_dbl %>%
-  group_by(Make) %>%
-  summarize(across(.cols = everything(), .fns = mean))
+ces_dbl %>%
+  group_by(CaliforniaCounty) %>%
+  summarize(across(.cols = everything(), .fns = mean, na.rm = TRUE))
 ```
 
 
@@ -362,8 +363,8 @@ cars_dbl %>%
 To add arguments to functions, may need to use anonymous function. In this syntax, the shorthand `\(x)` is equivalent to `function(x)`.
 
 ```{r warning=FALSE}
-cars_dbl %>%
-  group_by(Make) %>%
+ces_dbl %>%
+  group_by(CaliforniaCounty) %>%
   summarize(across(.cols = everything(), .fns = \(x) mean(x, na.rm = TRUE)))
 ```
 
@@ -373,9 +374,9 @@ cars_dbl %>%
 Using different `tidyselect()` options (e.g., `starts_with()`, `ends_with()`, `contains()`)
 
 ```{r warning=FALSE}
-cars_dbl %>%
-  group_by(Make) %>%
-  summarize(across(.cols = starts_with("Veh"), .fns = mean))
+ces_dbl %>%
+  group_by(CaliforniaCounty) %>%
+  summarize(across(.cols = contains("Perc"), .fns = mean))
 ```
 
 
@@ -384,11 +385,11 @@ cars_dbl %>%
 Combining with `mutate()`: rounding to the nearest power of 10 (with negative digits value)
 
 ```{r}
-cars_dbl %>%
+ces_dbl %>%
   mutate(across(
-    .cols = starts_with("Veh"),
+    .cols = starts_with("CES"),
     .fns = round,
-    digits = -3
+    digits = 3
   ))
 ```
 
@@ -403,9 +404,9 @@ or
 
 ```{r warning=FALSE, message=FALSE}
 # Child mortality data
-mort <- read_mortality() %>% rename(country = `...1`)
+co2 <- yearly_co2_emissions # required: otherwise `co2` is base R's `datasets::co2` time series
 
-mort %>%
+co2 %>%
   select(country, starts_with("194")) %>%
   mutate(across(
     .cols = c(`1943`, `1944`, `1945`),
diff --git a/modules/Functions/clean_data.csv b/modules/Functions/clean_data.csv
deleted file mode 100644
index 49929669..00000000
--- a/modules/Functions/clean_data.csv
+++ /dev/null
@@ -1,33 +0,0 @@
-cyl
-6
-6
-4
-6
-8
-6
-8
-4
-4
-6
-6
-8
-8
-8
-8
-8
-8
-4
-4
-4
-4
-8
-8
-8
-8
-4
-4
-4
-8
-6
-8
-4
diff --git a/modules/Functions/lab/Functions_Lab.Rmd b/modules/Functions/lab/Functions_Lab.Rmd
index 0c274d6c..2c98e38a 100644
--- a/modules/Functions/lab/Functions_Lab.Rmd
+++ b/modules/Functions/lab/Functions_Lab.Rmd
@@ -38,7 +38,16 @@ return(result)
 ```
 
 ```{r 1.1response}
+nums <- c(2, 7, 21, 30, 90)
 
+sum_squared <- function(x) sum(x)^2
+sum_squared(x = nums)
+
+sum_squared <- function(x) {
+  out <- sum(x)^2
+  return(out)
+}
+sum_squared(x = nums)
 ```
 
 ### 1.2
@@ -46,7 +55,11 @@ return(result)
 Create a function that takes two arguments, (1) a vector and (2) a numeric value. This function tests whether the number (2) is contained within the vector (1). **Hint**: use `%in%`. Call it `has_n`. Test your function on the vector `c(2,7,21,30,90)` and number `21` - you should get the answer TRUE.
 
 ```{r 1.2response}
+nums <- c(2, 7, 21, 30, 90)
+a_num <- 21
 
+has_n <- function(x, n) n %in% x
+has_n(x = nums, n = a_num)
 ```
 
 ### 1.3
@@ -54,7 +67,11 @@ Create a function that takes two arguments, (1) a vector and (2) a numeric value
 Amend the function `has_n` from question 1.2 so that it takes a default value of `21` for the numeric argument.
 
 ```{r 1.3response}
+nums <- c(2, 7, 21, 30, 90)
+a_num <- 21
 
+has_n <- function(x, n = 21) n %in% x
+has_n(x = nums)
 ```
 
 ### 1.4
@@ -62,7 +79,8 @@ Amend the function `has_n` from question 1.2 so that it takes a default value of
 Create a new number `b_num` that is not contained with `nums`. Use your updated `has_n` function with the default value and add `b_num` as the `n` argument when calling the function. What is the outcome?
 
 ```{r 1.4response}
-
+b_num <- 11
+has_n(x = nums, n = b_num)
 ```
 
 
@@ -70,7 +88,7 @@ Create a new number `b_num` that is not contained with `nums`. Use your updated
 
 ### 2.1
 
-Read in the SARS-CoV-2 Vaccination data from https://daseh.org/data/USA_covid19_vaccinations.csv. Assign the data the name "vacc".
+Read in the CalEnviroScreen data from https://daseh.org/data/CalEnvironmentalScreen_data.csv. Assign the data the name "ces".
 
 ```{r message = FALSE, label = '2.1response'}
 
@@ -78,7 +96,7 @@ Read in the SARS-CoV-2 Vaccination data from https://daseh.org/data/USA_covid19_
 
 ### 2.2
 
-We want to get some summary statistics on the Moderna vaccines. Use `across` inside `summarize` to get the sum total number vaccine doses for any variable containing the word "Moderna" AND starting with "Total". **Hint**: use `contains()` AND `starts_with()` to select the right columns inside `across`. Keep in mind that this includes the United States as a whole and so it is not totally accurate! Remember that `NA` values can influence calculations.
+We want to get some summary statistics on water contamination. Use `across` inside `summarize` to get the sum of the values for any variable containing the string "water" AND ending with "Pctl". **Hint**: use `contains()` AND `ends_with()` to select the right columns inside `across`. Remember that `NA` values can influence calculations.
 
 ```
 # General format
@@ -96,7 +114,7 @@ data %>%
 
 ### 2.3
 
-Use `across` and `mutate` to convert all columns containing the word "Percent" into proportions (i.e., divide that value by 100). **Hint**: use `contains()` to select the right columns within `across()`. Use a "function on the fly" to divide by 100. It will also be easier to check your work if you `select()` columns that match "Percent".
+Use `across` and `mutate` to convert all columns containing the word "Pctl" into proportions (i.e., divide that value by 100). **Hint**: use `contains()` to select the right columns within `across()`. Use a "function on the fly" to divide by 100. It will also be easier to check your work if you `select()` columns that match "Pctl".
 
 ```{r 2.3response}
 
@@ -104,7 +122,7 @@ Use `across` and `mutate` to convert all columns containing the word "Percent" i
 
 ### 2.4
 
-Use `across` and `mutate` to convert all columns starting with the word "Total" into a binary variable: TRUE if the value is greater than 10,000,000 and FALSE if less than or equal to 10,000,000. **Hint**: use `starts_with()` to select the columns starting with "Total". Use a "function on the fly" to do a logical test if the value is greater than 10,000,000.
+Use `across` and `mutate` to convert all columns starting with the string "PM" into a binary variable: TRUE if the value is greater than 10 and FALSE if less than or equal to 10. **Hint**: use `starts_with()` to select the columns that start with "PM". Use a "function on the fly" to do a logical test if the value is greater than 10.
 
 ```{r 2.4response}
 
@@ -115,11 +133,11 @@ Use `across` and `mutate` to convert all columns starting with the word "Total"
 
 ### P.1
 
-Take your code from question 2.4 and assign it to the variable `vacc_dat`. 
+Take your code from question 2.4 and assign it to the variable `ces_dat`. 
 
-- use `filter()` to drop any rows where "United States" appears in `State/Territory/Federal Entity`. Make sure to reassign this to `vacc_dat`.
-- Create a ggplot boxplot (`geom_boxplot()`) where (1) the x-axis is `Total Doses Delivered` and (2) the y-axis is `Percent of fully vaccinated people with booster doses`.
-- You change the `labs()` layer so that the x-axis is "Total Doses Delivered: Greater than 10,000,000"
+- use `filter()` to drop any rows where "Oakland" appears in `ApproxLocation`. Make sure to reassign this to `ces_dat`.
+- Create a ggplot boxplot (`geom_boxplot()`) where (1) the x-axis is `PM2.5` and (2) the y-axis is `Asthma`.
+- Change the `labs()` layer so that the x-axis label is "ER Visits for Asthma: PM2.5 greater than 10"
 
 ```{r P.1response}
 
diff --git a/modules/Functions/lab/Functions_Lab_Key.Rmd b/modules/Functions/lab/Functions_Lab_Key.Rmd
index 4b79130a..44b8edb5 100644
--- a/modules/Functions/lab/Functions_Lab_Key.Rmd
+++ b/modules/Functions/lab/Functions_Lab_Key.Rmd
@@ -88,17 +88,18 @@ has_n(x = nums, n = b_num)
 
 ### 2.1
 
-Read in the SARS-CoV-2 Vaccination data from https://daseh.org/data/USA_covid19_vaccinations.csv. Assign the data the name "vacc".
+Read in the CalEnviroScreen data from https://daseh.org/data/CalEnvironmentalScreen_data.csv. Assign the data the name "ces".
 
 ```{r message = FALSE, label = '2.1response'}
-vacc <- read_csv("https://daseh.org/data/USA_covid19_vaccinations.csv")
+# Keep the URL on one line: readr treats a string containing "\n" as literal data
+ces <- read_csv("https://daseh.org/data/CalEnvironmentalScreen_data.csv")
 # If downloaded
-# vacc <- read_csv("USA_covid19_vaccinations.csv")
+# ces <- read_csv("CalEnvironmentalScreen_data.csv")
 ```
 
 ### 2.2
 
-We want to get some summary statistics on the Moderna vaccines. Use `across` inside `summarize` to get the sum total number vaccine doses for any variable containing the word "Moderna" AND starting with "Total". **Hint**: use `contains()` AND `starts_with()` to select the right columns inside `across`. Keep in mind that this includes the United States as a whole and so it is not totally accurate! Remember that `NA` values can influence calculations.
+We want to get some summary statistics on water contamination. Use `across` inside `summarize` to get the sum of the values for any variable containing the string "water" AND ending with "Pctl". **Hint**: use `contains()` AND `ends_with()` to select the right columns inside `across`. Remember that `NA` values can influence calculations.
 
 ```
 # General format
@@ -111,14 +112,14 @@ data %>%
 ```
 
 ```{r 2.2response}
-vacc %>%
+ces %>%
   summarize(across(
-    .cols = contains("Moderna") & starts_with("Total"),
+    .cols = contains("Water") & ends_with("Pctl"),
     .fns = sum
   ))
-vacc %>%
+ces %>%
   summarize(across(
-    .cols = contains("Moderna") & starts_with("Total"),
+    .cols = contains("Water") & ends_with("Pctl"),
     .fns = sum,
     na.rm = TRUE
   ))
@@ -126,26 +127,26 @@ vacc %>%
 
 ### 2.3
 
-Use `across` and `mutate` to convert all columns containing the word "Percent" into proportions (i.e., divide that value by 100). **Hint**: use `contains()` to select the right columns within `across()`. Use a "function on the fly" to divide by 100. It will also be easier to check your work if you `select()` columns that match "Percent".
+Use `across` and `mutate` to convert all columns containing the word "Pctl" into proportions (i.e., divide that value by 100). **Hint**: use `contains()` to select the right columns within `across()`. Use a "function on the fly" to divide by 100. It will also be easier to check your work if you `select()` columns that match "Pctl".
 
 ```{r 2.3response}
-vacc %>%
+ces %>%
   mutate(across(
-    .cols = contains("Percent"),
+    .cols = contains("Pctl"),
     .fns = function(x) x / 100
   )) %>%
-  select(contains("Percent"))
+  select(contains("Pctl"))
 ```
 
 ### 2.4
 
-Use `across` and `mutate` to convert all columns starting with the word "Total" into a binary variable: TRUE if the value is greater than 10,000,000 and FALSE if less than or equal to 10,000,000. **Hint**: use `starts_with()` to select the columns starting with "Total". Use a "function on the fly" to do a logical test if the value is greater than 10,000,000.
+Use `across` and `mutate` to convert all columns starting with the string "PM" into a binary variable: TRUE if the value is greater than 10 and FALSE if less than or equal to 10. **Hint**: use `starts_with()` to select the columns that start with "PM". Use a "function on the fly" to do a logical test if the value is greater than 10.
 
 ```{r 2.4response}
-vacc %>%
+ces %>%
   mutate(across(
-    .cols = starts_with("Total"),
-    .fns = function(x) x > 10000000
+    .cols = starts_with("PM"),
+    .fns = function(x) x > 10
   ))
 ```
 
@@ -154,28 +155,28 @@ vacc %>%
 
 ### P.1
 
-Take your code from question 2.4 and assign it to the variable `vacc_dat`. 
+Take your code from question 2.4 and assign it to the variable `ces_dat`. 
 
-- use `filter()` to drop any rows where "United States" appears in `State/Territory/Federal Entity`. Make sure to reassign this to `vacc_dat`.
-- Create a ggplot boxplot (`geom_boxplot()`) where (1) the x-axis is `Total Doses Delivered` and (2) the y-axis is `Percent of fully vaccinated people with booster doses`.
-- You change the `labs()` layer so that the x-axis is "Total Doses Delivered: Greater than 10,000,000"
+- use `filter()` to drop any rows where "Oakland" appears in `ApproxLocation`. Make sure to reassign this to `ces_dat`.
+- Create a ggplot boxplot (`geom_boxplot()`) where (1) the x-axis is `PM2.5` and (2) the y-axis is `Asthma`.
+- Change the `labs()` layer so that the x-axis label is "ER Visits for Asthma: PM2.5 greater than 10"
 
 ```{r P.1response}
-vacc_dat <-
-  vacc %>%
+ces_dat <-
+  ces %>%
   mutate(across(
-    .cols = starts_with("Total"),
-    .fns = function(x) x > 10000000
+    .cols = starts_with("PM"),
+    .fns = function(x) x > 10
   )) %>%
-  filter(`State/Territory/Federal Entity` != "United States")
+  filter(`ApproxLocation` != "Oakland")
 
-vacc_boxplot <- function(df) {
+ces_boxplot <- function(df) {
   ggplot(df) +
     geom_boxplot(aes(
-      x = `Total Doses Delivered`,
-      y = `Percent of fully vaccinated people with booster doses`
+      x = `PM2.5`, # logical (TRUE/FALSE) after the `x > 10` recode
+      y = `Asthma`
     )) +
-    labs(x = "Total Doses Delivered: Greater than 10,000,000")
+    labs(x = "ER Visits for Asthma: PM2.5 greater than 10")
 }
-vacc_boxplot(vacc_dat)
+ces_boxplot(ces_dat)
 ```