diff --git a/modules/Data_Output/lab/Data_Output_Lab.Rmd b/modules/Data_Output/lab/Data_Output_Lab.Rmd index 706b36aa..3dbe3138 100644 --- a/modules/Data_Output/lab/Data_Output_Lab.Rmd +++ b/modules/Data_Output/lab/Data_Output_Lab.Rmd @@ -29,7 +29,7 @@ library(readr) ### 1.2 -Filter the dataset so that the "reporting_jurisdiction" column is equal to "Maryland" (aka the entire USA). Store the modified dataset as `covid_filtered`. +Filter the dataset so that the "reporting_jurisdiction" column is equal to "Maryland". Store the modified dataset as `covid_filtered`. ``` # General format diff --git a/modules/Data_Output/lab/Data_Output_Lab_Key.Rmd b/modules/Data_Output/lab/Data_Output_Lab_Key.Rmd index 573016d5..2dbb328e 100644 --- a/modules/Data_Output/lab/Data_Output_Lab_Key.Rmd +++ b/modules/Data_Output/lab/Data_Output_Lab_Key.Rmd @@ -30,7 +30,7 @@ covid <- read_csv(file = "https://daseh.org/data/SARS-CoV-2_Wastewater_Data.csv" ### 1.2 -Filter the dataset so that the "reporting_jurisdiction" column is equal to "Maryland" (aka the entire USA). Store the modified dataset as `covid_filtered`. +Filter the dataset so that the "reporting_jurisdiction" column is equal to "Maryland". Store the modified dataset as `covid_filtered`. ``` # General format diff --git a/modules/Data_Output/lab/Data_Output_Lab_Key.html b/modules/Data_Output/lab/Data_Output_Lab_Key.html index 14043ac4..05cd371b 100644 --- a/modules/Data_Output/lab/Data_Output_Lab_Key.html +++ b/modules/Data_Output/lab/Data_Output_Lab_Key.html @@ -366,8 +366,18 @@

1.1

# General format
 library(readr)
 # OBJECT <- read_csv(FILE)
-
library(tidyverse)
-covid <- read_csv(file = "https://daseh.org/data/SARS-CoV-2_Wastewater_Data.csv")
+
library(tidyverse)
+
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
+## ✔ dplyr     1.1.4     ✔ readr     2.1.5
+## ✔ forcats   1.0.0     ✔ stringr   1.5.1
+## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
+## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
+## ✔ purrr     1.0.2     
+## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
+## ✖ dplyr::filter() masks stats::filter()
+## ✖ dplyr::lag()    masks stats::lag()
+## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
+
covid <- read_csv(file = "https://daseh.org/data/SARS-CoV-2_Wastewater_Data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
 ## e.g.:
 ##   dat <- vroom(...)
@@ -383,7 +393,7 @@ 

1.1

1.2

Filter the dataset so that the “reporting_jurisdiction” column is -equal to “Maryland” (aka the entire USA). Store the modified dataset as +equal to “Maryland”. Store the modified dataset as covid_filtered.

# General format
 NEW_OBJECT <- OBJECT %>% filter(COLUMNNAME == CRITERIA)
diff --git a/modules/Statistics/Statistics.Rmd b/modules/Statistics/Statistics.Rmd index c121883b..3b079657 100644 --- a/modules/Statistics/Statistics.Rmd +++ b/modules/Statistics/Statistics.Rmd @@ -20,7 +20,7 @@ library(dplyr) options(scipen = 999) library(readr) library(ggplot2) -library(emo) +library(tidyverse) ``` ## Summary @@ -59,10 +59,10 @@ knitr::include_graphics("https://c.tenor.com/O3x8ywLm380AAAAd/chevy-chase.gif") ## Overview ::: {style="color: red;"} -`r emo::ji("rotating_light")` We will focus on how to use R software to do these. We will be glossing over the statistical **theory** and "formulas" for these tests. Moreover, we do not claim the data we use for demonstration meet **assumptions** of the methods. `r emo::ji("rotating_light")` +We will focus on how to use R software to do these. We will be glossing over the statistical **theory** and "formulas" for these tests. Moreover, we do not claim the data we use for demonstration meet **assumptions** of the methods. ::: -There are plenty of resources online for learning more about these methods, as well as dedicated Biostatistics series (at different advancement levels) at the JHU School of Public Health. +There are plenty of resources online for learning more about these methods. Check out [www.opencasestudies.org](https://www.opencasestudies.org/) for deeper dives on some of the concepts covered here and the [resource page](https://daseh.org/resources.html) for more resources. @@ -129,24 +129,27 @@ Like other functions, if there are `NA`s, you get `NA` as the result. But if yo ```{r} # x and y must be numeric vectors -x <- pull(yearly_co2_emissions, `1989`) -y <- pull(yearly_co2_emissions, `2014`) +co2long <- yearly_co2_emissions %>% pivot_longer(-country) +co2wide <-co2long %>% pivot_wider(names_from = country, values_from = value) + +US <- co2wide %>% pull(`United States`) +UK <- co2wide %>% pull(`United Kingdom`) ``` ```{r,fig.width=3, fig.height=3} # have to specify which data on each axis # can accomodate missing data -plot(x, y) +plot(US, UK) ``` ## Correlation coefficient calculation and test ```{r} library(broom) -cor(x, y) -cor(x, y, use = "complete.obs") -cor.test(x, y) +cor(US, UK) +cor(US, UK, use = "complete.obs") +cor.test(US, UK) ``` ## Broom package @@ -154,7 +157,7 @@ cor.test(x, y) The `broom` package helps make stats results look tidy ```{r} -cor_result <- tidy(cor.test(x, y)) +cor_result <- tidy(cor.test(US, UK)) glimpse(cor_result) ``` @@ -162,11 +165,11 @@ glimpse(cor_result) In plot form... `geom_smooth()` and `annotate()` can help. -```{r} +```{r, warning = F} corr_value <- pull(cor_result, estimate) %>% round(digits = 4) cor_label <- paste0("R = ", corr_value) -yearly_co2_emissions %>% - ggplot(aes(x = `1989`, y = `2014`)) + +co2wide %>% + ggplot(aes(x = US, y = UK)) + geom_point(size = 0.3) + geom_smooth() + annotate("text", x = 2000, y = 7500, label = cor_label) @@ -195,7 +198,8 @@ We can compute correlation for all pairs of columns of a data frame / matrix. Th Columns must be all numeric! ```{r} -co2_subset <- yearly_co2_emissions %>% select(c(`1909`, `1929`, `1949`, `1969`, `1989`, `2009`)) +co2_subset <- co2wide %>% select(c(`United States`, `United Kingdom`, Canada)) + head(co2_subset) ``` @@ -253,14 +257,13 @@ It tests the mean of a variable in one group. By default (i.e., without us expli - returns result assuming confidence level 0.95 (`conf.level = 0.95`) - omits `NA` values in data -Let's look at the dataset of haloacetic acid levels in public water sources in Washington, saved as `haa5` in the `dasehr` package. +Let's look at the UK and US CO2 emissions data again. ```{r} -x <- haa5 %>% pull(perc_pop_exposed_to_exceedances) -sum(is.na(x)) # count NAs in x +sum(is.na(US)) # count NAs in x -t.test(x) +t.test(US) ``` ## Running two-sample t-test {.small} @@ -279,13 +282,10 @@ Check out this this [case study](https://www.opencasestudies.org/ocs-bp-rural-an ## Running two-sample t-test in R ```{r} -x <- haa5 %>% pull(pop_on_sampled_PWS) -y <- haa5 %>% pull(pop_exposed_to_exceedances) +sum(is.na(US)) +sum(is.na(UK)) # count NAs in x and y -sum(is.na(x)) -sum(is.na(y)) # count NAs in x and y - -t.test(x, y) +t.test(US, UK) ``` ## T-test: retrieving information from the result with `broom` package @@ -293,14 +293,14 @@ t.test(x, y) The `broom` package has a `tidy()` function that can organize results into a data frame so that they are easily manipulated (or nicely printed) ```{r broom, comment=""} -result <- t.test(x, y) +result <- t.test(US, UK) result_tidy <- tidy(result) glimpse(result_tidy) ``` ## P-value adjustment {.smaller} -`r emo::ji("rotating_light")` You run an increased risk of Type I errors (a "false positive") when multiple hypotheses are tested simultaneously. `r emo::ji("rotating_light")` +You run an increased risk of Type I errors (a "false positive") when multiple hypotheses are tested simultaneously. Use the `p.adjust()` function on a vector of p values. Use `method = ` to specify the adjustment method: @@ -467,22 +467,17 @@ For example, if we want to fit a regression model where outcome is `income` and ## Linear regression -We will use our dataset about nitrate levels by quarter in public water sources in Washington. We'll load a slightly different version of this dataset, which can be found at -https://daseh.org/data/Nitrate_Exposure_for_WA_Public_Water_Systems_byquarter_v2_data.csv. - - -```{r} -nitrate <- read_csv(file = "https://daseh.org/data/Nitrate_Exposure_for_WA_Public_Water_Systems_byquarter_v2_data.csv", show_col_types = F) +We will use our the calenviroscreen dataset from the `dasehr` package to examine how traffic estimates predict diesel particulate emissions. -head(nitrate) -``` ## Linear regression: model fitting -The upper limit of acceptable nitrates in water is 10 ug/L. We fit linear regression model with the number of people on public water systems with more than the limit of nitrates (`pop_exposed_to_exceedances`) as an outcome and the number of people exposed to 3-5 ug/L of nitrates (`pop_>3-5ug/L`) as a predictor. In other words, we are evaluating if the number of people exposed to low levels of nitrates in their water is predictive of the number of people exposed to excess nitrates in their water. +For this model, we will use two variables: +**DieselPM** - estimated diesel particulate emissions from on-road and non-road sources +**TrafficPctl** - percentile ranking of traffic density ```{r} -fit <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L`, data = nitrate) +fit <- glm(DieselPM ~ TrafficPctl, data = calenviroscreen) fit ``` @@ -496,7 +491,7 @@ summary(fit) ## tidy results The broom package can help us here too! -The estimate is the coefficient or slope - for one change in hours worked (1 hour increase), we see 1.58 more visits. The error for this estimate is relatively small at 0.167. This relationship appears to be significant with a small p value <0.001. +The estimate is the coefficient or slope - for one change in the traffic percentile, we see 0.003637 more Diesel particulate emissions. The error for this estimate is relatively small at 0.00009. This relationship appears to be significant with a small p value < 2e-16. ```{r} tidy(fit) %>% glimpse() @@ -504,10 +499,10 @@ tidy(fit) %>% glimpse() ## Linear regression: multiple predictors {.smaller} -Let's try adding another explanatory variable to our model, year (`year`). The tidy function will not work with this unfortunately. The meaning of coefficients is more complicated here. +Let's try adding another explanatory variable to our model, amount of daily Ozone concentration (`Ozone`). Ozone is usually inversely related to particulate measures. The tidy function will not work with this unfortunately. The meaning of coefficients is more complicated here. ```{r} -fit2 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year, data = nitrate) +fit2 <- glm(DieselPM ~ TrafficPctl + Ozone, data = calenviroscreen) summary(fit2) ``` @@ -525,10 +520,15 @@ fit2 %>% Factors get special treatment in regression models - lowest level of the factor is the comparison group, and all other factors are **relative** to its values. -`quarter` takes values Q1, Q2, Q3, and Q4 to indicate the quarter of the year for each observation. +Let's create a variable that tells us whether a census tract has a high, middle, or low percentage of the population below the poverty line. ```{r} -nitrate %>% count(quarter) +calenviroscreen <- calenviroscreen %>% mutate(PovertyPctl_level = case_when( + PovertyPctl > 0.75 ~ "high", + PovertyPctl >0.25 & PovertyPctl<=0.75 ~ "middle", + PovertyPctl <=0.25 ~ "low", + TRUE ~ "unknown" +)) ``` @@ -539,7 +539,7 @@ The comparison group that is not listed is treated as intercept. All other estim ```{r regressbaseline, comment="", fig.height=4,fig.width=8} -fit3 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + factor(quarter), data = nitrate) +fit3 <- glm(DieselPM ~ TrafficPctl + Ozone + factor(PovertyPctl_level), data = calenviroscreen) summary(fit3) ``` @@ -548,13 +548,13 @@ summary(fit3) Relative to the level is not listed. ```{r} -nitrate <- nitrate %>% mutate(quarter = factor(quarter, +calenviroscreen <- calenviroscreen %>% mutate(PovertyPctl_level = factor(PovertyPctl_level, levels = c( - "Q1", "Q2", "Q3", "Q4" + "low", "middle", "high", "unknown" ) )) -fit4 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + quarter, data = nitrate) +fit4 <- glm(DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level, data = calenviroscreen) summary(fit4) ``` @@ -563,7 +563,7 @@ summary(fit4) You can view estimates for the comparison group by removing the intercept in the GLM formula `y ~ x - 1`. *Caveat* is that the p-values change. ```{r regress9, comment="", fig.height=4, fig.width=8} -fit5 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + quarter - 1, data = nitrate) +fit5 <- glm(DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level - 1, data = calenviroscreen) summary(fit5) ``` @@ -572,7 +572,7 @@ summary(fit5) You can also specify interactions between variables in a formula `y ~ x1 + x2 + x1 * x2`. This allows for not only the intercepts between factors to differ, but also the slopes with regard to the interacting variable. ```{r fig.height=4, fig.width=8} -fit6 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + quarter + year * quarter, data = nitrate) +fit6 <- glm(DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level + TrafficPctl*PovertyPctl_level, data = calenviroscreen) tidy(fit6) ``` @@ -581,11 +581,13 @@ tidy(fit6) By default, `ggplot` with a factor added as a color will look include the interaction term. Notice the different intercept and slope of the lines. ```{r fig.height=3.5, fig.width=7, warning=FALSE} -ggplot(nitrate, aes(x = `pop_>3-5ug/L`, y = pop_exposed_to_exceedances, color = quarter)) + - geom_point(size = 1, alpha = 0.8) + +ggplot(calenviroscreen, aes(x = DieselPM, y = TrafficPctl, color = PovertyPctl_level)) + + geom_point(size = 1, alpha = 0.1) + geom_smooth(method = "glm", se = FALSE) + scale_color_manual(values = c("black", "grey45", "grey65", "grey85")) + - theme_classic() + theme_classic() + + ylim(0,100) + + xlim(0, 3) ``` ## Generalized linear models (GLMs) @@ -606,23 +608,23 @@ See `?family` documentation for details of family functions. ## Logistic regression {.smaller} -Let's look at a logistic regression example. We'll use the `nitrate` dataset again, but first we'll need to create a binary variable that tells us whether the percentage of the population exposed to exceedances `perc_pop_exposed_to_exceedances` is greater than 0.1%. +Let's look at a logistic regression example. We'll use the `calenviroscreen` dataset again. We will create a new binary variable based on the DieselPM percentile variable, so we can tell whether a census tract has high or low DieselPM emissions compared to the others. ```{r} -nitrate <- - nitrate %>% +calenviroscreen <- + calenviroscreen %>% mutate( - PercExpMore0.1 = case_when - (perc_pop_exposed_to_exceedances > 0.1 ~ 1, - perc_pop_exposed_to_exceedances <= 0.1 ~ 0)) + DieselPM_level = case_when + (DieselPMPctl > 0.75 ~ 1, + DieselPMPctl <= 0.75 ~ 0)) ``` ## Logistic regression {.smaller} -Now that we've created the `PercExpMore0.1` variable (where a `1` indicates the percentage of the population exposed to excessive nitrates is greater than 0.1%), we can run a logistic regression. +Now that we've created the `DieselPM_level` variable (where a `1` indicates the census tract is one of the top 75% when it comes to dieselPM emissions), we can run a logistic regression. -Let's explore how `quarter` and `year` might predict `PercExpMore0.1`. +Let's explore how `PovertyPctl_level` might predict `DieselPM_level`. ``` # General format @@ -630,7 +632,7 @@ glm(y ~ x, data = DATASET_NAME, family = binomial(link = "logit")) ``` ```{r regress7, comment="", fig.height=4,fig.width=8} -binom_fit <- glm(PercExpMore0.1 ~ year + quarter, data = nitrate, family = binomial(link = "logit")) +binom_fit <- glm(DieselPM_level ~ PovertyPctl_level, data = calenviroscreen, family = binomial(link = "logit")) summary(binom_fit) ``` @@ -645,12 +647,20 @@ Check out [this paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2938757/). Use `oddsratio(x, y)` from the `epitools()` package to calculate odds ratios. -In this case, we're calculating the odds ratio for whether the the first 6 months or last 6 months predicts whether the percentage of population exposed to excess nitrates is more than 0.1% +In this case, we're calculating the odds ratio for whether living in a high traffic area predicts high levels of DieselPM. ```{r} library(epitools) -response <- nitrate %>% pull(PercExpMore0.1) -predictor <- nitrate %>% pull(half_of_year) + +calenviroscreen <- + calenviroscreen %>% + mutate( + Traffic_level = case_when + (TrafficPctl > 0.75 ~ 1, + TrafficPctl <= 0.75 ~ 0)) + +response <- calenviroscreen %>% pull(DieselPM_level) +predictor <- calenviroscreen %>% pull(Traffic_level) oddsratio(predictor, response) ``` diff --git a/modules/Statistics/Statistics.html b/modules/Statistics/Statistics.html index 8672109a..b5f3c1b2 100644 --- a/modules/Statistics/Statistics.html +++ b/modules/Statistics/Statistics.html @@ -3181,68 +3181,7 @@ code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } - + @@ -3299,9 +3238,9 @@

Overview

-

🚨 We will focus on how to use R software to do these. We will be glossing over the statistical theory and “formulas” for these tests. Moreover, we do not claim the data we use for demonstration meet assumptions of the methods. 🚨

+

We will focus on how to use R software to do these. We will be glossing over the statistical theory and “formulas” for these tests. Moreover, we do not claim the data we use for demonstration meet assumptions of the methods.

-

There are plenty of resources online for learning more about these methods, as well as dedicated Biostatistics series (at different advancement levels) at the JHU School of Public Health.

+

There are plenty of resources online for learning more about these methods.

Check out www.opencasestudies.org for deeper dives on some of the concepts covered here and the resource page for more resources.

@@ -3385,56 +3324,59 @@

Like other functions, if there are NAs, you get NA as the result. But if you specify use only the complete observations, then it will give you correlation using the non-missing data.

# x and y must be numeric vectors
-x <- pull(yearly_co2_emissions, `1989`)
 
-y <- pull(yearly_co2_emissions, `2014`)
+co2long <- yearly_co2_emissions %>% pivot_longer(-country) +co2wide <-co2long %>% pivot_wider(names_from = country, values_from = value) + +US <- co2wide %>% pull(`United States`) +UK <- co2wide %>% pull(`United Kingdom`)
# have to specify which data on each axis
 # can accomodate missing data
-plot(x, y)
+plot(US, UK) -

+

Correlation coefficient calculation and test

library(broom)
-cor(x, y)
+cor(US, UK)
[1] NA
-
cor(x, y, use = "complete.obs")
+
cor(US, UK, use = "complete.obs")
-
[1] 0.7644918
+
[1] 0.8104108
-
cor.test(x, y)
+
cor.test(US, UK)
     Pearson's product-moment correlation
 
-data:  x and y
-t = 15.463, df = 170, p-value < 0.00000000000000022
+data:  US and UK
+t = 20.188, df = 213, p-value < 0.00000000000000022
 alternative hypothesis: true correlation is not equal to 0
 95 percent confidence interval:
- 0.6942789 0.8202897
+ 0.7588991 0.8518440
 sample estimates:
       cor 
-0.7644918 
+0.8104108

Broom package

The broom package helps make stats results look tidy

-
cor_result <- tidy(cor.test(x, y))
+
cor_result <- tidy(cor.test(US, UK))
 glimpse(cor_result)
Rows: 1
 Columns: 8
-$ estimate    <dbl> 0.7644918
-$ statistic   <dbl> 15.46267
-$ p.value     <dbl> 0.00000000000000000000000000000000030487
-$ parameter   <int> 170
-$ conf.low    <dbl> 0.6942789
-$ conf.high   <dbl> 0.8202897
+$ estimate    <dbl> 0.8104108
+$ statistic   <dbl> 20.1883
+$ p.value     <dbl> 0.000000000000000000000000000000000000000000000000002332853
+$ parameter   <int> 213
+$ conf.low    <dbl> 0.7588991
+$ conf.high   <dbl> 0.851844
 $ method      <chr> "Pearson's product-moment correlation"
 $ alternative <chr> "two.sided"
@@ -3444,13 +3386,13 @@

corr_value <- pull(cor_result, estimate) %>% round(digits = 4)
 cor_label <- paste0("R = ", corr_value)
-yearly_co2_emissions %>%
-  ggplot(aes(x = `1989`, y = `2014`)) +
+co2wide %>%
+  ggplot(aes(x = US, y = UK)) +
   geom_point(size = 0.3) +
   geom_smooth() +
   annotate("text", x = 2000, y = 7500, label = cor_label)
-

+

@@ -3480,18 +3422,19 @@

Columns must be all numeric!

-
co2_subset <- yearly_co2_emissions %>% select(c(`1909`, `1929`, `1949`, `1969`, `1989`, `2009`))
+
co2_subset <- co2wide %>% select(c(`United States`, `United Kingdom`, Canada))
+
 head(co2_subset)
-
# A tibble: 6 × 6
-  `1909` `1929` `1949` `1969` `1989` `2009`
-   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
-1     NA   NA     14.7    942   2780   6770
-2     NA   NA   1020     3250   8980   4380
-3     NA   80.7  909    11300  80000 121000
-4     NA   NA     NA       NA     NA    517
-5     NA   NA     NA     2790   5010  27800
-6     NA   NA     NA     1260    286    510
+
# A tibble: 6 × 3
+  `United States` `United Kingdom` Canada
+            <dbl>            <dbl>  <dbl>
+1              NA             9360     NA
+2              NA             9360     NA
+3              NA             9360     NA
+4              NA             9370     NA
+5              NA             9370     NA
+6              NA            10000     NA

Correlation for data frame columns

@@ -3500,13 +3443,10 @@

cor_mat <- cor(co2_subset, use = "complete.obs")
 cor_mat
-
          1909      1929      1949      1969      1989      2009
-1909 1.0000000 0.9843659 0.9654205 0.9134400 0.7770616 0.4752203
-1929 0.9843659 1.0000000 0.9870474 0.9401504 0.8115660 0.5085648
-1949 0.9654205 0.9870474 1.0000000 0.9720538 0.8659073 0.5327061
-1969 0.9134400 0.9401504 0.9720538 1.0000000 0.9451710 0.6215263
-1989 0.7770616 0.8115660 0.8659073 0.9451710 1.0000000 0.8110514
-2009 0.4752203 0.5085648 0.5327061 0.6215263 0.8110514 1.0000000
+
               United States United Kingdom    Canada
+United States      1.0000000      0.8104108 0.9832734
+United Kingdom     0.8104108      1.0000000 0.7119204
+Canada             0.9832734      0.7119204 1.0000000

Correlation for data frame columns with plot

@@ -3515,7 +3455,7 @@

library(corrplot)
 corrplot(cor_mat)
-

+

Correlation does not imply causation

@@ -3552,26 +3492,25 @@

  • omits NA values in data
  • -

    Let’s look at the dataset of haloacetic acid levels in public water sources in Washington, saved as haa5 in the dasehr package.

    +

    Let’s look at the UK and US CO2 emissions data again.

    -
    x <- haa5 %>% pull(perc_pop_exposed_to_exceedances)
    -sum(is.na(x)) # count NAs in x
    +
    sum(is.na(US)) # count NAs in x
    -
    [1] 11
    +
    [1] 49
    -
    t.test(x)
    +
    t.test(US)
         One Sample t-test
     
    -data:  x
    -t = 3.7753, df = 21, p-value = 0.00111
    +data:  US
    +t = 13.134, df = 214, p-value < 0.00000000000000022
     alternative hypothesis: true mean is not equal to 0
     95 percent confidence interval:
    - 0.02654098 0.09164084
    + 1485982 2010769
     sample estimates:
    - mean of x 
    -0.05909091 
    +mean of x + 1748376

    Running two-sample t-test

    @@ -3590,55 +3529,52 @@

    Running two-sample t-test in R

    -
    x <- haa5 %>% pull(pop_on_sampled_PWS)
    -y <- haa5 %>% pull(pop_exposed_to_exceedances)
    -
    -sum(is.na(x))
    +
    sum(is.na(US))
    -
    [1] 11
    +
    [1] 49
    -
    sum(is.na(y)) # count NAs in x and y
    +
    sum(is.na(UK)) # count NAs in x and y
    -
    [1] 11
    +
    [1] 0
    -
    t.test(x, y)
    +
    t.test(US, UK)
         Welch Two Sample t-test
     
    -data:  x and y
    -t = 12.23, df = 21, p-value = 0.00000000005122
    +data:  US and UK
    +t = 10.936, df = 218.58, p-value < 0.00000000000000022
     alternative hypothesis: true difference in means is not equal to 0
     95 percent confidence interval:
    - 3501529 4936249
    + 1199859 1727382
     sample estimates:
    -  mean of x   mean of y 
    -4221499.045    2610.364 
    +mean of x mean of y +1748375.7 284755.4

    T-test: retrieving information from the result with broom package

    The broom package has a tidy() function that can organize results into a data frame so that they are easily manipulated (or nicely printed)

    -
    result <- t.test(x, y)
    +
    result <- t.test(US, UK)
     result_tidy <- tidy(result)
     glimpse(result_tidy)
    Rows: 1
     Columns: 10
    -$ estimate    <dbl> 4218889
    -$ estimate1   <dbl> 4221499
    -$ estimate2   <dbl> 2610.364
    -$ statistic   <dbl> 12.23048
    -$ p.value     <dbl> 0.00000000005122359
    -$ parameter   <dbl> 21.00014
    -$ conf.low    <dbl> 3501529
    -$ conf.high   <dbl> 4936249
    +$ estimate    <dbl> 1463620
    +$ estimate1   <dbl> 1748376
    +$ estimate2   <dbl> 284755.4
    +$ statistic   <dbl> 10.93646
    +$ p.value     <dbl> 0.0000000000000000000001731289
    +$ parameter   <dbl> 218.5808
    +$ conf.low    <dbl> 1199859
    +$ conf.high   <dbl> 1727382
     $ method      <chr> "Welch Two Sample t-test"
     $ alternative <chr> "two.sided"

    P-value adjustment

    -

    🚨 You run an increased risk of Type I errors (a “false positive”) when multiple hypotheses are tested simultaneously. 🚨

    +

    You run an increased risk of Type I errors (a “false positive”) when multiple hypotheses are tested simultaneously.

    Use the p.adjust() function on a vector of p values. Use method = to specify the adjustment method:

    @@ -3711,7 +3647,7 @@

    Linear regression

    -

    +

    Linear regression

    @@ -3804,42 +3740,26 @@

    Linear regression

    -

    We will use our dataset about nitrate levels by quarter in public water sources in Washington. We’ll load a slightly different version of this dataset, which can be found at https://daseh.org/data/Nitrate_Exposure_for_WA_Public_Water_Systems_byquarter_v2_data.csv.

    - -
    nitrate <- read_csv(file = "https://daseh.org/data/Nitrate_Exposure_for_WA_Public_Water_Systems_byquarter_v2_data.csv", show_col_types = F)
    -
    -head(nitrate)
    - -
    # A tibble: 6 × 12
    -   year quarter half_of_year pop_on_sampled_PWS `pop_0-3ug/L` `pop_>3-5ug/L`
    -  <dbl> <chr>   <chr>                     <dbl>         <dbl>          <dbl>
    -1  1999 Q1      first                    106720         67775              0
    -2  2000 Q1      first                     34793          5904              0
    -3  2001 Q1      first                     90054         49552            150
    -4  2002 Q1      first                    293486        223488           1474
    -5  2003 Q1      first                   1473586        743676         323767
    -6  2004 Q1      first                   1272283        486491         529354
    -# ℹ 6 more variables: `pop_>5-10ug/L` <dbl>, `pop_>10-20ug/L` <dbl>,
    -#   `pop_>20ug/L` <dbl>, `pop_on_PWS_with_non-detect` <dbl>,
    -#   pop_exposed_to_exceedances <dbl>, perc_pop_exposed_to_exceedances <dbl>
    +

    We will use our the calenviroscreen dataset from the dasehr package to examine how traffic estimates predict diesel particulate emissions.

    Linear regression: model fitting

    -

    The upper limit of acceptable nitrates in water is 10 ug/L. We fit linear regression model with the number of people on public water systems with more than the limit of nitrates (pop_exposed_to_exceedances) as an outcome and the number of people exposed to 3-5 ug/L of nitrates (pop_>3-5ug/L) as a predictor. In other words, we are evaluating if the number of people exposed to low levels of nitrates in their water is predictive of the number of people exposed to excess nitrates in their water.

    +

    For this model, we will use two variables: DieselPM - estimated diesel particulate emissions from on-road and non-road sources TrafficPctl - percentile ranking of traffic density

    -
    fit <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L`, data = nitrate)
    +
    fit <- glm(DieselPM ~ TrafficPctl, data = calenviroscreen)
     fit
    -Call:  glm(formula = pop_exposed_to_exceedances ~ `pop_>3-5ug/L`, data = nitrate)
    +Call:  glm(formula = DieselPM ~ TrafficPctl, data = calenviroscreen)
     
     Coefficients:
    -   (Intercept)  `pop_>3-5ug/L`  
    -    837.633357        0.002052  
    +(Intercept)  TrafficPctl  
    +   0.042452     0.003637  
     
    -Degrees of Freedom: 87 Total (i.e. Null);  86 Residual
    -Null Deviance:      166900000 
    -Residual Deviance: 149100000    AIC: 1518
    +Degrees of Freedom: 7999 Total (i.e. Null); 7998 Residual + (35 observations deleted due to missingness) +Null Deviance: 537.2 +Residual Deviance: 449.1 AIC: -330.9

    Linear regression: model summary

    @@ -3849,62 +3769,63 @@

     Call:
    -glm(formula = pop_exposed_to_exceedances ~ `pop_>3-5ug/L`, data = nitrate)
    +glm(formula = DieselPM ~ TrafficPctl, data = calenviroscreen)
     
     Coefficients:
    -                  Estimate  Std. Error t value Pr(>|t|)   
    -(Intercept)    837.6333569 268.5458637   3.119  0.00247 **
    -`pop_>3-5ug/L`   0.0020516   0.0006391   3.210  0.00186 **
    +              Estimate Std. Error t value             Pr(>|t|)    
    +(Intercept) 0.04245151 0.00529915   8.011   0.0000000000000013 ***
    +TrafficPctl 0.00363651 0.00009177  39.625 < 0.0000000000000002 ***
     ---
     Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
     
    -(Dispersion parameter for gaussian family taken to be 1733185)
    +(Dispersion parameter for gaussian family taken to be 0.05614936)
     
    -    Null deviance: 166917415  on 87  degrees of freedom
    -Residual deviance: 149053912  on 86  degrees of freedom
    -AIC: 1517.9
    +    Null deviance: 537.24  on 7999  degrees of freedom
    +Residual deviance: 449.08  on 7998  degrees of freedom
    +  (35 observations deleted due to missingness)
    +AIC: -330.9
     
     Number of Fisher Scoring iterations: 2

    tidy results

    -

    The broom package can help us here too! The estimate is the coefficient or slope - for one change in hours worked (1 hour increase), we see 1.58 more visits. The error for this estimate is relatively small at 0.167. This relationship appears to be significant with a small p value <0.001.

    +

    The broom package can help us here too! The estimate is the coefficient or slope - for one change in the traffic percentile, we see 0.003637 more Diesel particulate emissions. The error for this estimate is relatively small at 0.00009. This relationship appears to be significant with a small p value < 2e-16.

    tidy(fit) %>% glimpse()
    Rows: 2
     Columns: 5
    -$ term      <chr> "(Intercept)", "`pop_>3-5ug/L`"
    -$ estimate  <dbl> 837.633356922, 0.002051615
    -$ std.error <dbl> 268.5458637101, 0.0006390501
    -$ statistic <dbl> 3.119145, 3.210413
    -$ p.value   <dbl> 0.002468135, 0.001864329
    +$ term <chr> "(Intercept)", "TrafficPctl" +$ estimate <dbl> 0.042451513, 0.003636512 +$ std.error <dbl> 0.00529915058, 0.00009177366 +$ statistic <dbl> 8.011003, 39.624789 +$ p.value <dbl> 0.0000000000000012983396857240023084846536338687883471720851…

    Linear regression: multiple predictors

    -

    Let’s try adding another explanatory variable to our model, year (year). The tidy function will not work with this unfortunately. The meaning of coefficients is more complicated here.

    +

    Let’s try adding another explanatory variable to our model, amount of daily Ozone concentration (Ozone). Ozone is usually inversely related to particulate measures. The tidy function will not work with this unfortunately. The meaning of coefficients is more complicated here.

    -
    fit2 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year, data = nitrate)
    +
    fit2 <- glm(DieselPM ~ TrafficPctl + Ozone, data = calenviroscreen)
     summary(fit2)
     Call:
    -glm(formula = pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year, 
    -    data = nitrate)
    +glm(formula = DieselPM ~ TrafficPctl + Ozone, data = calenviroscreen)
     
     Coefficients:
    -                     Estimate     Std. Error t value   Pr(>|t|)    
    -(Intercept)    193320.5254972  46164.1477089   4.188 0.00006852 ***
    -`pop_>3-5ug/L`      0.0033674      0.0006653   5.062 0.00000238 ***
    -year              -96.0210515     23.0288882  -4.170 0.00007319 ***
    +               Estimate  Std. Error t value            Pr(>|t|)    
    +(Intercept)  0.23025068  0.01347754   17.08 <0.0000000000000002 ***
    +TrafficPctl  0.00355094  0.00009067   39.16 <0.0000000000000002 ***
    +Ozone       -3.77418894  0.24967138  -15.12 <0.0000000000000002 ***
     ---
     Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
     
    -(Dispersion parameter for gaussian family taken to be 1455811)
    +(Dispersion parameter for gaussian family taken to be 0.0545963)
     
    -    Null deviance: 166917415  on 87  degrees of freedom
    -Residual deviance: 123743924  on 85  degrees of freedom
    -AIC: 1503.5
    +    Null deviance: 537.24  on 7999  degrees of freedom
    +Residual deviance: 436.61  on 7997  degrees of freedom
    +  (35 observations deleted due to missingness)
    +AIC: -554.3
     
     Number of Fisher Scoring iterations: 2
    @@ -3918,56 +3839,61 @@

    Rows: 3
     Columns: 5
    -$ term      <chr> "(Intercept)", "`pop_>3-5ug/L`", "year"
    -$ estimate  <dbl> 193320.525497167, 0.003367383, -96.021051499
    -$ std.error <dbl> 46164.147708904, 0.000665288, 23.028888212
    -$ statistic <dbl> 4.187677, 5.061541, -4.169591
    -$ p.value   <dbl> 0.000068518905, 0.000002375999, 0.000073186127
    +$ term <chr> "(Intercept)", "TrafficPctl", "Ozone" +$ estimate <dbl> 0.23025068, 0.00355094, -3.77418894 +$ std.error <dbl> 0.01347753532, 0.00009067243, 0.24967137612 +$ statistic <dbl> 17.08403, 39.16229, -15.11663 +$ p.value <dbl> 0.0000000000000000000000000000000000000000000000000000000000…

    Linear regression: factors

    Factors get special treatment in regression models - lowest level of the factor is the comparison group, and all other factors are relative to its values.

    -

    quarter takes values Q1, Q2, Q3, and Q4 to indicate the quarter of the year for each observation.

    - -
    nitrate %>% count(quarter)
    +

    Let’s create a variable that tells us whether a census tract has a high, middle, or low percentage of the population below the poverty line.

    -
    # A tibble: 4 × 2
    -  quarter     n
    -  <chr>   <int>
    -1 Q1         22
    -2 Q2         22
    -3 Q3         22
    -4 Q4         22
    +
    calenviroscreen <- calenviroscreen %>% mutate(PovertyPctl_level = case_when(
    +                    PovertyPctl > 0.75 ~ "high",
    +                    PovertyPctl >0.25 & PovertyPctl<=0.75 ~ "middle",
    +                    PovertyPctl <=0.25 ~ "low",
    +                    TRUE ~ "unknown"
    +))

    Linear regression: factors

    The comparison group that is not listed is treated as intercept. All other estimates are relative to the intercept.

    -
    fit3 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + factor(quarter), data = nitrate)
    +
    fit3 <- glm(DieselPM ~ TrafficPctl + Ozone + factor(PovertyPctl_level), data = calenviroscreen)
     summary(fit3)
     Call:
    -glm(formula = pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + 
    -    factor(quarter), data = nitrate)
    +glm(formula = DieselPM ~ TrafficPctl + Ozone + factor(PovertyPctl_level), 
    +    data = calenviroscreen)
     
     Coefficients:
    -                        Estimate     Std. Error t value   Pr(>|t|)    
    -(Intercept)       208033.8242583  47597.8780270   4.371 0.00003603 ***
    -`pop_>3-5ug/L`         0.0038251      0.0007477   5.115 0.00000202 ***
    -year                -103.5354110     23.7644678  -4.357 0.00003794 ***
    -factor(quarter)Q2     12.4362229    366.7866716   0.034      0.973    
    -factor(quarter)Q3    473.5397889    390.3813390   1.213      0.229    
    -factor(quarter)Q4    405.3578933    368.5826835   1.100      0.275    
    +                                    Estimate  Std. Error t value
    +(Intercept)                       0.23288777  0.01348689  17.268
    +TrafficPctl                       0.00354476  0.00009056  39.143
    +Ozone                            -3.82361539  0.24960861 -15.318
    +factor(PovertyPctl_level)low     -0.09738847  0.05507632  -1.768
    +factor(PovertyPctl_level)middle  -0.11377196  0.03702058  -3.073
    +factor(PovertyPctl_level)unknown  0.10462654  0.02884150   3.628
    +                                             Pr(>|t|)    
    +(Intercept)                      < 0.0000000000000002 ***
    +TrafficPctl                      < 0.0000000000000002 ***
    +Ozone                            < 0.0000000000000002 ***
    +factor(PovertyPctl_level)low                 0.077058 .  
    +factor(PovertyPctl_level)middle              0.002125 ** 
    +factor(PovertyPctl_level)unknown             0.000288 ***
     ---
     Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
     
    -(Dispersion parameter for gaussian family taken to be 1466474)
    +(Dispersion parameter for gaussian family taken to be 0.05444058)
     
    -    Null deviance: 166917415  on 87  degrees of freedom
    -Residual deviance: 120250886  on 82  degrees of freedom
    -AIC: 1507
    +    Null deviance: 537.24  on 7999  degrees of freedom
    +Residual deviance: 435.20  on 7994  degrees of freedom
    +  (35 observations deleted due to missingness)
    +AIC: -574.15
     
     Number of Fisher Scoring iterations: 2
    @@ -3975,36 +3901,44 @@

    Relative to the level is not listed.

    -
    nitrate <- nitrate %>% mutate(quarter = factor(quarter,
    +
    calenviroscreen <- calenviroscreen %>% mutate(PovertyPctl_level = factor(PovertyPctl_level,
       levels =
         c(
    -      "Q1", "Q2", "Q3", "Q4"
    +      "low", "middle", "high", "unknown"
         )
     ))
    -fit4 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + quarter, data = nitrate)
    +fit4 <- glm(DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level, data = calenviroscreen)
     summary(fit4)
     Call:
    -glm(formula = pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + 
    -    quarter, data = nitrate)
    +glm(formula = DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level, 
    +    data = calenviroscreen)
     
     Coefficients:
    -                     Estimate     Std. Error t value   Pr(>|t|)    
    -(Intercept)    208033.8242583  47597.8780270   4.371 0.00003603 ***
    -`pop_>3-5ug/L`      0.0038251      0.0007477   5.115 0.00000202 ***
    -year             -103.5354110     23.7644678  -4.357 0.00003794 ***
    -quarterQ2          12.4362229    366.7866716   0.034      0.973    
    -quarterQ3         473.5397889    390.3813390   1.213      0.229    
    -quarterQ4         405.3578933    368.5826835   1.100      0.275    
    +                            Estimate  Std. Error t value             Pr(>|t|)
    +(Intercept)               0.13549930  0.05629309   2.407              0.01611
    +TrafficPctl               0.00354476  0.00009056  39.143 < 0.0000000000000002
    +Ozone                    -3.82361539  0.24960861 -15.318 < 0.0000000000000002
    +PovertyPctl_levelmiddle  -0.01638349  0.06622817  -0.247              0.80462
    +PovertyPctl_levelhigh     0.09738847  0.05507632   1.768              0.07706
    +PovertyPctl_levelunknown  0.20201501  0.06206413   3.255              0.00114
    +                            
    +(Intercept)              *  
    +TrafficPctl              ***
    +Ozone                    ***
    +PovertyPctl_levelmiddle     
    +PovertyPctl_levelhigh    .  
    +PovertyPctl_levelunknown ** 
     ---
     Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
     
    -(Dispersion parameter for gaussian family taken to be 1466474)
    +(Dispersion parameter for gaussian family taken to be 0.05444058)
     
    -    Null deviance: 166917415  on 87  degrees of freedom
    -Residual deviance: 120250886  on 82  degrees of freedom
    -AIC: 1507
    +    Null deviance: 537.24  on 7999  degrees of freedom
    +Residual deviance: 435.20  on 7994  degrees of freedom
    +  (35 observations deleted due to missingness)
    +AIC: -574.15
     
     Number of Fisher Scoring iterations: 2
    @@ -4012,30 +3946,38 @@

    You can view estimates for the comparison group by removing the intercept in the GLM formula y ~ x - 1. Caveat is that the p-values change.

    -
    fit5 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + quarter - 1, data = nitrate)
    +
    fit5 <- glm(DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level - 1, data = calenviroscreen)
     summary(fit5)
     Call:
    -glm(formula = pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + 
    -    quarter - 1, data = nitrate)
    +glm(formula = DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level - 
    +    1, data = calenviroscreen)
     
     Coefficients:
    -                     Estimate     Std. Error t value   Pr(>|t|)    
    -`pop_>3-5ug/L`      0.0038251      0.0007477   5.115 0.00000202 ***
    -year             -103.5354110     23.7644678  -4.357 0.00003794 ***
    -quarterQ1      208033.8242583  47597.8780270   4.371 0.00003603 ***
    -quarterQ2      208046.2604813  47580.0330230   4.373 0.00003578 ***
    -quarterQ3      208507.3640472  47668.7351298   4.374 0.00003558 ***
    -quarterQ4      208439.1821516  47623.6821872   4.377 0.00003522 ***
    +                            Estimate  Std. Error t value             Pr(>|t|)
    +TrafficPctl               0.00354476  0.00009056  39.143 < 0.0000000000000002
    +Ozone                    -3.82361539  0.24960861 -15.318 < 0.0000000000000002
    +PovertyPctl_levellow      0.13549930  0.05629309   2.407              0.01611
    +PovertyPctl_levelmiddle   0.11911581  0.03867776   3.080              0.00208
    +PovertyPctl_levelhigh     0.23288777  0.01348689  17.268 < 0.0000000000000002
    +PovertyPctl_levelunknown  0.33751432  0.03172424  10.639 < 0.0000000000000002
    +                            
    +TrafficPctl              ***
    +Ozone                    ***
    +PovertyPctl_levellow     *  
    +PovertyPctl_levelmiddle  ** 
    +PovertyPctl_levelhigh    ***
    +PovertyPctl_levelunknown ***
     ---
     Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
     
    -(Dispersion parameter for gaussian family taken to be 1466474)
    +(Dispersion parameter for gaussian family taken to be 0.05444058)
     
    -    Null deviance: 384570288  on 88  degrees of freedom
    -Residual deviance: 120250886  on 82  degrees of freedom
    -AIC: 1507
    +    Null deviance: 939.74  on 8000  degrees of freedom
    +Residual deviance: 435.20  on 7994  degrees of freedom
    +  (35 observations deleted due to missingness)
    +AIC: -574.15
     
     Number of Fisher Scoring iterations: 2
    @@ -4043,33 +3985,35 @@

    You can also specify interactions between variables in a formula y ~ x1 + x2 + x1 * x2. This allows for not only the intercepts between factors to differ, but also the slopes with regard to the interacting variable.

    -
    fit6 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + quarter + year * quarter, data = nitrate)
    +
    fit6 <- glm(DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level + TrafficPctl*PovertyPctl_level, data = calenviroscreen) 
     tidy(fit6)
    # A tibble: 9 × 5
    -  term               estimate     std.error statistic    p.value
    -  <chr>                 <dbl>         <dbl>     <dbl>      <dbl>
    -1 (Intercept)    173920.       89077.           1.95  0.0544    
    -2 `pop_>3-5ug/L`      0.00387      0.000761     5.09  0.00000237
    -3 year              -86.6         44.4         -1.95  0.0547    
    -4 quarterQ2      163851.      115853.           1.41  0.161     
    -5 quarterQ3       16037.      115668.           0.139 0.890     
    -6 quarterQ4      -36066.      117691.          -0.306 0.760     
    -7 year:quarterQ2    -81.5         57.7         -1.41  0.161     
    -8 year:quarterQ3     -7.74        57.6         -0.134 0.893     
    -9 year:quarterQ4     18.2         58.6          0.310 0.757     
    + term estimate std.error statistic p.value + <chr> <dbl> <dbl> <dbl> <dbl> +1 (Intercept) 0.203 0.114 1.79 7.38e- 2 +2 TrafficPctl 0.00225 0.00187 1.20 2.31e- 1 +3 Ozone -3.80 0.250 -15.2 1.33e-51 +4 PovertyPctl_levelmiddle 0.00318 0.132 0.0241 9.81e- 1 +5 PovertyPctl_levelhigh 0.0288 0.113 0.255 7.99e- 1 +6 PovertyPctl_levelunknown 0.0836 0.125 0.671 5.02e- 1 +7 TrafficPctl:PovertyPctl_levelmiddle -0.000717 0.00229 -0.313 7.54e- 1 +8 TrafficPctl:PovertyPctl_levelhigh 0.00130 0.00188 0.692 4.89e- 1 +9 TrafficPctl:PovertyPctl_levelunknown 0.00227 0.00206 1.10 2.70e- 1

    Linear regression: interactions

    By default, ggplot with a factor added as a color will look include the interaction term. Notice the different intercept and slope of the lines.

    -
    ggplot(nitrate, aes(x = `pop_>3-5ug/L`, y = pop_exposed_to_exceedances, color = quarter)) +
    -  geom_point(size = 1, alpha = 0.8) +
    +
    ggplot(calenviroscreen, aes(x = DieselPM, y = TrafficPctl, color = PovertyPctl_level)) +
    +  geom_point(size = 1, alpha = 0.1) +
       geom_smooth(method = "glm", se = FALSE) +
       scale_color_manual(values = c("black", "grey45", "grey65", "grey85")) +
    -  theme_classic()
    + theme_classic() + + ylim(0,100) + + xlim(0, 3)
    -

    +

    Generalized linear models (GLMs)

    @@ -4091,49 +4035,46 @@

    Logistic regression

    -

    Let’s look at a logistic regression example. We’ll use the nitrate dataset again, but first we’ll need to create a binary variable that tells us whether the percentage of the population exposed to exceedances perc_pop_exposed_to_exceedances is greater than 0.1%.

    +

    Let’s look at a logistic regression example. We’ll use the calenviroscreen dataset again. We will create a new binary variable based on the DieselPM percentile variable, so we can tell whether a census tract has high or low DieselPM emissions compared to the others.

    -
    nitrate <-
    -  nitrate %>%
    +
    calenviroscreen <-
    +  calenviroscreen %>%
         mutate(
    -      PercExpMore0.1 = case_when
    -      (perc_pop_exposed_to_exceedances > 0.1 ~ 1,
    -        perc_pop_exposed_to_exceedances <= 0.1 ~ 0))
    + DieselPM_level = case_when + (DieselPMPctl > 0.75 ~ 1, + DieselPMPctl <= 0.75 ~ 0))

    Logistic regression

    -

    Now that we’ve created the PercExpMore0.1 variable (where a 1 indicates the percentage of the population exposed to excessive nitrates is greater than 0.1%), we can run a logistic regression.

    +

    Now that we’ve created the DieselPM_level variable (where a 1 indicates the census tract is one of the top 75% when it comes to dieselPM emissions), we can run a logistic regression.

    -

    Let’s explore how quarter and year might predict PercExpMore0.1.

    +

    Let’s explore how PovertyPctl_level might predict DieselPM_level.

    # General format
     glm(y ~ x, data = DATASET_NAME, family = binomial(link = "logit"))
    -
    binom_fit <- glm(PercExpMore0.1 ~ year + quarter, data = nitrate, family = binomial(link = "logit"))
    +
    binom_fit <- glm(DieselPM_level ~ PovertyPctl_level, data = calenviroscreen, family = binomial(link = "logit"))
     summary(binom_fit)
     Call:
    -glm(formula = PercExpMore0.1 ~ year + quarter, family = binomial(link = "logit"), 
    -    data = nitrate)
    +glm(formula = DieselPM_level ~ PovertyPctl_level, family = binomial(link = "logit"), 
    +    data = calenviroscreen)
     
     Coefficients:
    -              Estimate Std. Error z value Pr(>|z|)  
    -(Intercept)  166.29994  107.61737   1.545   0.1223  
    -year          -0.08285    0.05356  -1.547   0.1219  
    -quarterQ2     -2.21841    0.87930  -2.523   0.0116 *
    -quarterQ3    -19.46232 2243.38013  -0.009   0.9931  
    -quarterQ4     -1.74955    0.77826  -2.248   0.0246 *
    ----
    -Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
    +                                 Estimate       Std. Error z value Pr(>|z|)
    +(Intercept)                17.56606841091  932.48064081790   0.019    0.985
    +PovertyPctl_levelmiddle     0.00000007592 1118.59764951226   0.000    1.000
    +PovertyPctl_levelhigh     -12.62378843648  932.48065046127  -0.014    0.989
    +PovertyPctl_levelunknown  -14.68968289500  932.48078242135  -0.016    0.987
     
     (Dispersion parameter for binomial family taken to be 1)
     
    -    Null deviance: 80.363  on 87  degrees of freedom
    -Residual deviance: 58.693  on 83  degrees of freedom
    -AIC: 68.693
    +    Null deviance: 707.22  on 8034  degrees of freedom
    +Residual deviance: 697.17  on 8031  degrees of freedom
    +AIC: 705.17
     
    -Number of Fisher Scoring iterations: 18
    +Number of Fisher Scoring iterations: 16

    Odds ratios

    @@ -4147,31 +4088,43 @@

    Use oddsratio(x, y) from the epitools() package to calculate odds ratios.

    -

    In this case, we’re calculating the odds ratio for whether the the first 6 months or last 6 months predicts whether the percentage of population exposed to excess nitrates is more than 0.1%

    +

    In this case, we’re calculating the odds ratio for whether living in a high traffic area predicts high levels of DieselPM.

    library(epitools)
    -response <- nitrate %>% pull(PercExpMore0.1)
    -predictor <- nitrate %>% pull(half_of_year)
    +
    +calenviroscreen <-
    +  calenviroscreen %>%
    +    mutate(
    +      Traffic_level = case_when
    +      (TrafficPctl > 0.75 ~ 1,
    +        TrafficPctl <= 0.75 ~ 0))
    +
    +response <- calenviroscreen %>% pull(DieselPM_level)
    +predictor <- calenviroscreen %>% pull(Traffic_level)
     oddsratio(predictor, response)
    $data
              Outcome
    -Predictor  0  1 Total
    -   first  32 12    44
    -   second 41  3    44
    -   Total  73 15    88
    +Predictor  0    1 Total
    +    0     23   37    60
    +    1     35 7905  7940
    +    Total 58 7942  8000
     
     $measure
              odds ratio with 95% C.I.
    -Predictor  estimate      lower     upper
    -   first  1.0000000         NA        NA
    -   second 0.2053372 0.04182295 0.7234182
    +Predictor estimate    lower    upper
    +        0   1.0000       NA       NA
    +        1 139.3968 74.58837 260.5596
     
     $p.value
              two-sided
    -Predictor midp.exact fisher.exact chi.square
    -   first          NA           NA         NA
    -   second  0.0122317   0.02103458 0.01072943
    +Predictor midp.exact                                 fisher.exact
    +        0         NA                                           NA
    +        1          0 0.000000000000000000000000000000000007956334
    +         two-sided
    +Predictor                                                                                                                                                                                                                                                                   chi.square
    +        0                                                                                                                                                                                                                                                                           NA
    +        1 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002491579
     
     $correction
     [1] FALSE
    diff --git a/modules/Statistics/Statistics.pdf b/modules/Statistics/Statistics.pdf
    index 0a95fe32..f09fe066 100644
    Binary files a/modules/Statistics/Statistics.pdf and b/modules/Statistics/Statistics.pdf differ