diff --git a/modules/Data_Output/lab/Data_Output_Lab.Rmd b/modules/Data_Output/lab/Data_Output_Lab.Rmd
index 706b36aa..3dbe3138 100644
--- a/modules/Data_Output/lab/Data_Output_Lab.Rmd
+++ b/modules/Data_Output/lab/Data_Output_Lab.Rmd
@@ -29,7 +29,7 @@ library(readr)
### 1.2
-Filter the dataset so that the "reporting_jurisdiction" column is equal to "Maryland" (aka the entire USA). Store the modified dataset as `covid_filtered`.
+Filter the dataset so that the "reporting_jurisdiction" column is equal to "Maryland". Store the modified dataset as `covid_filtered`.
```
# General format
diff --git a/modules/Data_Output/lab/Data_Output_Lab_Key.Rmd b/modules/Data_Output/lab/Data_Output_Lab_Key.Rmd
index 573016d5..2dbb328e 100644
--- a/modules/Data_Output/lab/Data_Output_Lab_Key.Rmd
+++ b/modules/Data_Output/lab/Data_Output_Lab_Key.Rmd
@@ -30,7 +30,7 @@ covid <- read_csv(file = "https://daseh.org/data/SARS-CoV-2_Wastewater_Data.csv"
### 1.2
-Filter the dataset so that the "reporting_jurisdiction" column is equal to "Maryland" (aka the entire USA). Store the modified dataset as `covid_filtered`.
+Filter the dataset so that the "reporting_jurisdiction" column is equal to "Maryland". Store the modified dataset as `covid_filtered`.
```
# General format
diff --git a/modules/Data_Output/lab/Data_Output_Lab_Key.html b/modules/Data_Output/lab/Data_Output_Lab_Key.html
index 14043ac4..05cd371b 100644
--- a/modules/Data_Output/lab/Data_Output_Lab_Key.html
+++ b/modules/Data_Output/lab/Data_Output_Lab_Key.html
@@ -366,8 +366,18 @@
1.1
# General format
library(readr)
# OBJECT <- read_csv(FILE)
-library(tidyverse)
-covid <- read_csv(file = "https://daseh.org/data/SARS-CoV-2_Wastewater_Data.csv")
+library(tidyverse)
+## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
+## ✔ dplyr 1.1.4 ✔ readr 2.1.5
+## ✔ forcats 1.0.0 ✔ stringr 1.5.1
+## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
+## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
+## ✔ purrr 1.0.2
+## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
+## ✖ dplyr::filter() masks stats::filter()
+## ✖ dplyr::lag() masks stats::lag()
+## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
+covid <- read_csv(file = "https://daseh.org/data/SARS-CoV-2_Wastewater_Data.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
@@ -383,7 +393,7 @@ 1.1
1.2
Filter the dataset so that the “reporting_jurisdiction” column is
-equal to “Maryland” (aka the entire USA). Store the modified dataset as
+equal to “Maryland”. Store the modified dataset as
covid_filtered
.
# General format
NEW_OBJECT <- OBJECT %>% filter(COLUMNNAME == CRITERIA)
diff --git a/modules/Statistics/Statistics.Rmd b/modules/Statistics/Statistics.Rmd
index c121883b..3b079657 100644
--- a/modules/Statistics/Statistics.Rmd
+++ b/modules/Statistics/Statistics.Rmd
@@ -20,7 +20,7 @@ library(dplyr)
options(scipen = 999)
library(readr)
library(ggplot2)
-library(emo)
+library(tidyverse)
```
## Summary
@@ -59,10 +59,10 @@ knitr::include_graphics("https://c.tenor.com/O3x8ywLm380AAAAd/chevy-chase.gif")
## Overview
::: {style="color: red;"}
-`r emo::ji("rotating_light")` We will focus on how to use R software to do these. We will be glossing over the statistical **theory** and "formulas" for these tests. Moreover, we do not claim the data we use for demonstration meet **assumptions** of the methods. `r emo::ji("rotating_light")`
+We will focus on how to use R software to do these. We will be glossing over the statistical **theory** and "formulas" for these tests. Moreover, we do not claim the data we use for demonstration meet **assumptions** of the methods.
:::
-There are plenty of resources online for learning more about these methods, as well as dedicated Biostatistics series (at different advancement levels) at the JHU School of Public Health.
+There are plenty of resources online for learning more about these methods.
Check out [www.opencasestudies.org](https://www.opencasestudies.org/) for deeper dives on some of the concepts covered here and the [resource page](https://daseh.org/resources.html) for more resources.
@@ -129,24 +129,27 @@ Like other functions, if there are `NA`s, you get `NA` as the result. But if yo
```{r}
# x and y must be numeric vectors
-x <- pull(yearly_co2_emissions, `1989`)
-y <- pull(yearly_co2_emissions, `2014`)
+# reshape: one row per year, one column per country
+co2long <- yearly_co2_emissions %>% pivot_longer(-country)
+co2wide <- co2long %>% pivot_wider(names_from = country, values_from = value)
+
+US <- co2wide %>% pull(`United States`)
+UK <- co2wide %>% pull(`United Kingdom`)
```
```{r,fig.width=3, fig.height=3}
# have to specify which data on each axis
# can accommodate missing data
-plot(x, y)
+plot(US, UK)
```
## Correlation coefficient calculation and test
```{r}
library(broom)
-cor(x, y)
-cor(x, y, use = "complete.obs")
-cor.test(x, y)
+cor(US, UK)
+cor(US, UK, use = "complete.obs")
+cor.test(US, UK)
```
## Broom package
@@ -154,7 +157,7 @@ cor.test(x, y)
The `broom` package helps make stats results look tidy
```{r}
-cor_result <- tidy(cor.test(x, y))
+cor_result <- tidy(cor.test(US, UK))
glimpse(cor_result)
```
@@ -162,11 +165,11 @@ glimpse(cor_result)
In plot form... `geom_smooth()` and `annotate()` can help.
-```{r}
+```{r, warning = FALSE}
corr_value <- pull(cor_result, estimate) %>% round(digits = 4)
cor_label <- paste0("R = ", corr_value)
-yearly_co2_emissions %>%
- ggplot(aes(x = `1989`, y = `2014`)) +
+co2wide %>%
+ ggplot(aes(x = US, y = UK)) +
geom_point(size = 0.3) +
geom_smooth() +
annotate("text", x = 2000, y = 7500, label = cor_label)
@@ -195,7 +198,8 @@ We can compute correlation for all pairs of columns of a data frame / matrix. Th
Columns must be all numeric!
```{r}
-co2_subset <- yearly_co2_emissions %>% select(c(`1909`, `1929`, `1949`, `1969`, `1989`, `2009`))
+co2_subset <- co2wide %>% select(c(`United States`, `United Kingdom`, Canada))
+
head(co2_subset)
```
@@ -253,14 +257,13 @@ It tests the mean of a variable in one group. By default (i.e., without us expli
- returns result assuming confidence level 0.95 (`conf.level = 0.95`)
- omits `NA` values in data
-Let's look at the dataset of haloacetic acid levels in public water sources in Washington, saved as `haa5` in the `dasehr` package.
+Let's look at the UK and US CO2 emissions data again.
```{r}
-x <- haa5 %>% pull(perc_pop_exposed_to_exceedances)
-sum(is.na(x)) # count NAs in x
+sum(is.na(US)) # count NAs in US
-t.test(x)
+t.test(US)
```
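+The defaults can be changed. A minimal sketch, where the null mean of 1500000 is an arbitrary value chosen only for illustration:
+
+```{r}
+# test against a nonzero null mean, with a 90% confidence level
+t.test(US, mu = 1500000, conf.level = 0.90)
+```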
## Running two-sample t-test {.small}
@@ -279,13 +282,10 @@ Check out this this [case study](https://www.opencasestudies.org/ocs-bp-rural-an
## Running two-sample t-test in R
```{r}
-x <- haa5 %>% pull(pop_on_sampled_PWS)
-y <- haa5 %>% pull(pop_exposed_to_exceedances)
+sum(is.na(US))
+sum(is.na(UK)) # count NAs in US and UK
-sum(is.na(x))
-sum(is.na(y)) # count NAs in x and y
-
-t.test(x, y)
+t.test(US, UK)
```
## T-test: retrieving information from the result with `broom` package
@@ -293,14 +293,14 @@ t.test(x, y)
The `broom` package has a `tidy()` function that can organize results into a data frame so that they are easily manipulated (or nicely printed)
```{r broom, comment=""}
-result <- t.test(x, y)
+result <- t.test(US, UK)
result_tidy <- tidy(result)
glimpse(result_tidy)
```
## P-value adjustment {.smaller}
-`r emo::ji("rotating_light")` You run an increased risk of Type I errors (a "false positive") when multiple hypotheses are tested simultaneously. `r emo::ji("rotating_light")`
+You run an increased risk of Type I errors (a "false positive") when multiple hypotheses are tested simultaneously.
Use the `p.adjust()` function on a vector of p values. Use `method = ` to specify the adjustment method:
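+For example, a minimal sketch with made-up p-values (the available `method` options are listed next):
+
+```{r}
+pvals <- c(0.001, 0.02, 0.04, 0.08, 0.50)
+p.adjust(pvals, method = "bonferroni") # multiplies each p by the number of tests (capped at 1)
+p.adjust(pvals, method = "BH") # Benjamini-Hochberg false discovery rate
+```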
@@ -467,22 +467,17 @@ For example, if we want to fit a regression model where outcome is `income` and
## Linear regression
-We will use our dataset about nitrate levels by quarter in public water sources in Washington. We'll load a slightly different version of this dataset, which can be found at
-https://daseh.org/data/Nitrate_Exposure_for_WA_Public_Water_Systems_byquarter_v2_data.csv.
-
-
-```{r}
-nitrate <- read_csv(file = "https://daseh.org/data/Nitrate_Exposure_for_WA_Public_Water_Systems_byquarter_v2_data.csv", show_col_types = F)
+We will use the `calenviroscreen` dataset from the `dasehr` package to examine how traffic estimates predict diesel particulate emissions.
-head(nitrate)
-```
## Linear regression: model fitting
-The upper limit of acceptable nitrates in water is 10 ug/L. We fit linear regression model with the number of people on public water systems with more than the limit of nitrates (`pop_exposed_to_exceedances`) as an outcome and the number of people exposed to 3-5 ug/L of nitrates (`pop_>3-5ug/L`) as a predictor. In other words, we are evaluating if the number of people exposed to low levels of nitrates in their water is predictive of the number of people exposed to excess nitrates in their water.
+For this model, we will use two variables:
+
+- **DieselPM** - estimated diesel particulate emissions from on-road and non-road sources
+- **TrafficPctl** - percentile ranking of traffic density
```{r}
-fit <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L`, data = nitrate)
+fit <- glm(DieselPM ~ TrafficPctl, data = calenviroscreen)
fit
```
@@ -496,7 +491,7 @@ summary(fit)
## tidy results
The broom package can help us here too!
-The estimate is the coefficient or slope - for one change in hours worked (1 hour increase), we see 1.58 more visits. The error for this estimate is relatively small at 0.167. This relationship appears to be significant with a small p value <0.001.
+The estimate is the coefficient or slope - for a one-unit increase in the traffic percentile, we see a 0.003637 increase in diesel particulate emissions. The error for this estimate is relatively small at 0.00009. This relationship appears to be significant, with a small p value < 2e-16.
```{r}
tidy(fit) %>% glimpse()
@@ -504,10 +499,10 @@ tidy(fit) %>% glimpse()
## Linear regression: multiple predictors {.smaller}
-Let's try adding another explanatory variable to our model, year (`year`). The tidy function will not work with this unfortunately. The meaning of coefficients is more complicated here.
+Let's try adding another explanatory variable to our model, the daily ozone concentration (`Ozone`). Ozone is usually inversely related to particulate measures. The meaning of the coefficients is more complicated here.
```{r}
-fit2 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year, data = nitrate)
+fit2 <- glm(DieselPM ~ TrafficPctl + Ozone, data = calenviroscreen)
summary(fit2)
```
@@ -525,10 +520,15 @@ fit2 %>%
Factors get special treatment in regression models - lowest level of the factor is the comparison group, and all other factors are **relative** to its values.
-`quarter` takes values Q1, Q2, Q3, and Q4 to indicate the quarter of the year for each observation.
+Let's create a variable that tells us whether a census tract has a high, middle, or low percentage of the population below the poverty line.
```{r}
-nitrate %>% count(quarter)
+calenviroscreen <- calenviroscreen %>% mutate(PovertyPctl_level = case_when(
+  PovertyPctl > 0.75 ~ "high",
+  PovertyPctl > 0.25 & PovertyPctl <= 0.75 ~ "middle",
+  PovertyPctl <= 0.25 ~ "low",
+  TRUE ~ "unknown"
+))
```
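+A quick tabulation (mirroring the `count(quarter)` check from the earlier version of this slide) shows how the tracts fall into each level:
+
+```{r}
+calenviroscreen %>% count(PovertyPctl_level)
+```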
@@ -539,7 +539,7 @@ The comparison group that is not listed is treated as intercept. All other estim
```{r regressbaseline, comment="", fig.height=4,fig.width=8}
-fit3 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + factor(quarter), data = nitrate)
+fit3 <- glm(DieselPM ~ TrafficPctl + Ozone + factor(PovertyPctl_level), data = calenviroscreen)
summary(fit3)
```
@@ -548,13 +548,13 @@ summary(fit3)
Estimates are relative to the level that is not listed.
```{r}
-nitrate <- nitrate %>% mutate(quarter = factor(quarter,
+calenviroscreen <- calenviroscreen %>% mutate(PovertyPctl_level = factor(PovertyPctl_level,
levels =
c(
- "Q1", "Q2", "Q3", "Q4"
+ "low", "middle", "high", "unknown"
)
))
-fit4 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + quarter, data = nitrate)
+fit4 <- glm(DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level, data = calenviroscreen)
summary(fit4)
```
@@ -563,7 +563,7 @@ summary(fit4)
You can view estimates for the comparison group by removing the intercept in the GLM formula `y ~ x - 1`. *Caveat* is that the p-values change.
```{r regress9, comment="", fig.height=4, fig.width=8}
-fit5 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + quarter - 1, data = nitrate)
+fit5 <- glm(DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level - 1, data = calenviroscreen)
summary(fit5)
```
@@ -572,7 +572,7 @@ summary(fit5)
You can also specify interactions between variables in a formula `y ~ x1 + x2 + x1 * x2`. This allows for not only the intercepts between factors to differ, but also the slopes with regard to the interacting variable.
```{r fig.height=4, fig.width=8}
-fit6 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + quarter + year * quarter, data = nitrate)
+fit6 <- glm(DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level + TrafficPctl * PovertyPctl_level, data = calenviroscreen)
tidy(fit6)
```
@@ -581,11 +581,13 @@ tidy(fit6)
By default, `ggplot` with a factor added as a color will include the interaction term. Notice the different intercept and slope of the lines.
```{r fig.height=3.5, fig.width=7, warning=FALSE}
-ggplot(nitrate, aes(x = `pop_>3-5ug/L`, y = pop_exposed_to_exceedances, color = quarter)) +
- geom_point(size = 1, alpha = 0.8) +
+ggplot(calenviroscreen, aes(x = DieselPM, y = TrafficPctl, color = PovertyPctl_level)) +
+ geom_point(size = 1, alpha = 0.1) +
geom_smooth(method = "glm", se = FALSE) +
scale_color_manual(values = c("black", "grey45", "grey65", "grey85")) +
- theme_classic()
+ theme_classic() +
+ ylim(0, 100) +
+ xlim(0, 3)
```
## Generalized linear models (GLMs)
@@ -606,23 +608,23 @@ See `?family` documentation for details of family functions.
## Logistic regression {.smaller}
-Let's look at a logistic regression example. We'll use the `nitrate` dataset again, but first we'll need to create a binary variable that tells us whether the percentage of the population exposed to exceedances `perc_pop_exposed_to_exceedances` is greater than 0.1%.
+Let's look at a logistic regression example. We'll use the `calenviroscreen` dataset again. We will create a new binary variable based on the DieselPM percentile variable, so we can tell whether a census tract has high or low DieselPM emissions compared to the others.
```{r}
-nitrate <-
- nitrate %>%
+calenviroscreen <-
+ calenviroscreen %>%
mutate(
- PercExpMore0.1 = case_when
- (perc_pop_exposed_to_exceedances > 0.1 ~ 1,
- perc_pop_exposed_to_exceedances <= 0.1 ~ 0))
+    DieselPM_level = case_when(
+      DieselPMPctl > 0.75 ~ 1,
+      DieselPMPctl <= 0.75 ~ 0))
```
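+As a quick sanity check, we can tabulate the new binary variable before modeling:
+
+```{r}
+calenviroscreen %>% count(DieselPM_level)
+```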
## Logistic regression {.smaller}
-Now that we've created the `PercExpMore0.1` variable (where a `1` indicates the percentage of the population exposed to excessive nitrates is greater than 0.1%), we can run a logistic regression.
+Now that we've created the `DieselPM_level` variable (where a `1` indicates the census tract's DieselPM percentile, `DieselPMPctl`, is greater than 0.75), we can run a logistic regression.
-Let's explore how `quarter` and `year` might predict `PercExpMore0.1`.
+Let's explore how `PovertyPctl_level` might predict `DieselPM_level`.
```
# General format
@@ -630,7 +632,7 @@ glm(y ~ x, data = DATASET_NAME, family = binomial(link = "logit"))
```
```{r regress7, comment="", fig.height=4,fig.width=8}
-binom_fit <- glm(PercExpMore0.1 ~ year + quarter, data = nitrate, family = binomial(link = "logit"))
+binom_fit <- glm(DieselPM_level ~ PovertyPctl_level, data = calenviroscreen, family = binomial(link = "logit"))
summary(binom_fit)
```
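+As with the earlier models, `tidy()` can organize these results. Setting `exponentiate = TRUE` converts the log-odds coefficients to odds ratios, previewing the `epitools` calculation below:
+
+```{r}
+tidy(binom_fit, exponentiate = TRUE)
+```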
@@ -645,12 +647,20 @@ Check out [this paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2938757/).
Use `oddsratio(x, y)` from the `epitools` package to calculate odds ratios.
-In this case, we're calculating the odds ratio for whether the the first 6 months or last 6 months predicts whether the percentage of population exposed to excess nitrates is more than 0.1%
+In this case, we're calculating the odds ratio for whether living in a high traffic area predicts high levels of DieselPM.
```{r}
library(epitools)
-response <- nitrate %>% pull(PercExpMore0.1)
-predictor <- nitrate %>% pull(half_of_year)
+
+calenviroscreen <-
+ calenviroscreen %>%
+ mutate(
+    Traffic_level = case_when(
+      TrafficPctl > 0.75 ~ 1,
+      TrafficPctl <= 0.75 ~ 0))
+
+response <- calenviroscreen %>% pull(DieselPM_level)
+predictor <- calenviroscreen %>% pull(Traffic_level)
oddsratio(predictor, response)
```
diff --git a/modules/Statistics/Statistics.html b/modules/Statistics/Statistics.html
index 8672109a..b5f3c1b2 100644
--- a/modules/Statistics/Statistics.html
+++ b/modules/Statistics/Statistics.html
@@ -3181,68 +3181,7 @@
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
-
+
@@ -3299,9 +3238,9 @@
Overview
-
🚨 We will focus on how to use R software to do these. We will be glossing over the statistical theory and “formulas” for these tests. Moreover, we do not claim the data we use for demonstration meet assumptions of the methods. 🚨
+We will focus on how to use R software to do these. We will be glossing over the statistical theory and “formulas” for these tests. Moreover, we do not claim the data we use for demonstration meet assumptions of the methods.
-There are plenty of resources online for learning more about these methods, as well as dedicated Biostatistics series (at different advancement levels) at the JHU School of Public Health.
+There are plenty of resources online for learning more about these methods.
Check out www.opencasestudies.org for deeper dives on some of the concepts covered here and the resource page for more resources.
@@ -3385,56 +3324,59 @@
Like other functions, if there are NA
s, you get NA
as the result. But if you specify use only the complete observations, then it will give you correlation using the non-missing data.
# x and y must be numeric vectors
-x <- pull(yearly_co2_emissions, `1989`)
-y <- pull(yearly_co2_emissions, `2014`)
+co2long <- yearly_co2_emissions %>% pivot_longer(-country)
+co2wide <- co2long %>% pivot_wider(names_from = country, values_from = value)
+
+US <- co2wide %>% pull(`United States`)
+UK <- co2wide %>% pull(`United Kingdom`)
# have to specify which data on each axis
# can accommodate missing data
-plot(x, y)
+plot(US, UK)
-
+
Correlation coefficient calculation and test
library(broom)
-cor(x, y)
+cor(US, UK)
[1] NA
-cor(x, y, use = "complete.obs")
+cor(US, UK, use = "complete.obs")
-[1] 0.7644918
+[1] 0.8104108
-cor.test(x, y)
+cor.test(US, UK)
Pearson's product-moment correlation
-data: x and y
-t = 15.463, df = 170, p-value < 0.00000000000000022
+data: US and UK
+t = 20.188, df = 213, p-value < 0.00000000000000022
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
- 0.6942789 0.8202897
+ 0.7588991 0.8518440
sample estimates:
cor
-0.7644918
+0.8104108
Broom package
The broom
package helps make stats results look tidy
-cor_result <- tidy(cor.test(x, y))
+cor_result <- tidy(cor.test(US, UK))
glimpse(cor_result)
Rows: 1
Columns: 8
-$ estimate <dbl> 0.7644918
-$ statistic <dbl> 15.46267
-$ p.value <dbl> 0.00000000000000000000000000000000030487
-$ parameter <int> 170
-$ conf.low <dbl> 0.6942789
-$ conf.high <dbl> 0.8202897
+$ estimate <dbl> 0.8104108
+$ statistic <dbl> 20.1883
+$ p.value <dbl> 0.000000000000000000000000000000000000000000000000002332853
+$ parameter <int> 213
+$ conf.low <dbl> 0.7588991
+$ conf.high <dbl> 0.851844
$ method <chr> "Pearson's product-moment correlation"
$ alternative <chr> "two.sided"
@@ -3444,13 +3386,13 @@
corr_value <- pull(cor_result, estimate) %>% round(digits = 4)
cor_label <- paste0("R = ", corr_value)
-yearly_co2_emissions %>%
- ggplot(aes(x = `1989`, y = `2014`)) +
+co2wide %>%
+ ggplot(aes(x = US, y = UK)) +
geom_point(size = 0.3) +
geom_smooth() +
annotate("text", x = 2000, y = 7500, label = cor_label)
-
+
@@ -3480,18 +3422,19 @@
Columns must be all numeric!
-co2_subset <- yearly_co2_emissions %>% select(c(`1909`, `1929`, `1949`, `1969`, `1989`, `2009`))
+co2_subset <- co2wide %>% select(c(`United States`, `United Kingdom`, Canada))
+
head(co2_subset)
-# A tibble: 6 × 6
- `1909` `1929` `1949` `1969` `1989` `2009`
- <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-1 NA NA 14.7 942 2780 6770
-2 NA NA 1020 3250 8980 4380
-3 NA 80.7 909 11300 80000 121000
-4 NA NA NA NA NA 517
-5 NA NA NA 2790 5010 27800
-6 NA NA NA 1260 286 510
+# A tibble: 6 × 3
+ `United States` `United Kingdom` Canada
+ <dbl> <dbl> <dbl>
+1 NA 9360 NA
+2 NA 9360 NA
+3 NA 9360 NA
+4 NA 9370 NA
+5 NA 9370 NA
+6 NA 10000 NA
Correlation for data frame columns
@@ -3500,13 +3443,10 @@
cor_mat <- cor(co2_subset, use = "complete.obs")
cor_mat
- 1909 1929 1949 1969 1989 2009
-1909 1.0000000 0.9843659 0.9654205 0.9134400 0.7770616 0.4752203
-1929 0.9843659 1.0000000 0.9870474 0.9401504 0.8115660 0.5085648
-1949 0.9654205 0.9870474 1.0000000 0.9720538 0.8659073 0.5327061
-1969 0.9134400 0.9401504 0.9720538 1.0000000 0.9451710 0.6215263
-1989 0.7770616 0.8115660 0.8659073 0.9451710 1.0000000 0.8110514
-2009 0.4752203 0.5085648 0.5327061 0.6215263 0.8110514 1.0000000
+ United States United Kingdom Canada
+United States 1.0000000 0.8104108 0.9832734
+United Kingdom 0.8104108 1.0000000 0.7119204
+Canada 0.9832734 0.7119204 1.0000000
Correlation for data frame columns with plot
@@ -3515,7 +3455,7 @@
library(corrplot)
corrplot(cor_mat)
-
+
Correlation does not imply causation
@@ -3552,26 +3492,25 @@
omits NA
values in data
-Let’s look at the dataset of haloacetic acid levels in public water sources in Washington, saved as haa5
in the dasehr
package.
+Let’s look at the UK and US CO2 emissions data again.
-x <- haa5 %>% pull(perc_pop_exposed_to_exceedances)
-sum(is.na(x)) # count NAs in x
+sum(is.na(US)) # count NAs in US
-[1] 11
+[1] 49
-t.test(x)
+t.test(US)
One Sample t-test
-data: x
-t = 3.7753, df = 21, p-value = 0.00111
+data: US
+t = 13.134, df = 214, p-value < 0.00000000000000022
alternative hypothesis: true mean is not equal to 0
95 percent confidence interval:
- 0.02654098 0.09164084
+ 1485982 2010769
sample estimates:
- mean of x
-0.05909091
+mean of x
+ 1748376
Running two-sample t-test
@@ -3590,55 +3529,52 @@
Running two-sample t-test in R
-x <- haa5 %>% pull(pop_on_sampled_PWS)
-y <- haa5 %>% pull(pop_exposed_to_exceedances)
-
-sum(is.na(x))
+sum(is.na(US))
-[1] 11
+[1] 49
-sum(is.na(y)) # count NAs in x and y
+sum(is.na(UK)) # count NAs in US and UK
-[1] 11
+[1] 0
-t.test(x, y)
+t.test(US, UK)
Welch Two Sample t-test
-data: x and y
-t = 12.23, df = 21, p-value = 0.00000000005122
+data: US and UK
+t = 10.936, df = 218.58, p-value < 0.00000000000000022
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
- 3501529 4936249
+ 1199859 1727382
sample estimates:
- mean of x mean of y
-4221499.045 2610.364
+mean of x mean of y
+1748375.7 284755.4
T-test: retrieving information from the result with broom
package
The broom
package has a tidy()
function that can organize results into a data frame so that they are easily manipulated (or nicely printed)
-result <- t.test(x, y)
+result <- t.test(US, UK)
result_tidy <- tidy(result)
glimpse(result_tidy)
Rows: 1
Columns: 10
-$ estimate <dbl> 4218889
-$ estimate1 <dbl> 4221499
-$ estimate2 <dbl> 2610.364
-$ statistic <dbl> 12.23048
-$ p.value <dbl> 0.00000000005122359
-$ parameter <dbl> 21.00014
-$ conf.low <dbl> 3501529
-$ conf.high <dbl> 4936249
+$ estimate <dbl> 1463620
+$ estimate1 <dbl> 1748376
+$ estimate2 <dbl> 284755.4
+$ statistic <dbl> 10.93646
+$ p.value <dbl> 0.0000000000000000000001731289
+$ parameter <dbl> 218.5808
+$ conf.low <dbl> 1199859
+$ conf.high <dbl> 1727382
$ method <chr> "Welch Two Sample t-test"
$ alternative <chr> "two.sided"
P-value adjustment
-🚨 You run an increased risk of Type I errors (a “false positive”) when multiple hypotheses are tested simultaneously. 🚨
+You run an increased risk of Type I errors (a “false positive”) when multiple hypotheses are tested simultaneously.
Use the p.adjust()
function on a vector of p values. Use method =
to specify the adjustment method:
@@ -3711,7 +3647,7 @@
Linear regression
-
+
Linear regression
@@ -3804,42 +3740,26 @@
Linear regression
-We will use our dataset about nitrate levels by quarter in public water sources in Washington. We’ll load a slightly different version of this dataset, which can be found at https://daseh.org/data/Nitrate_Exposure_for_WA_Public_Water_Systems_byquarter_v2_data.csv.
-
-nitrate <- read_csv(file = "https://daseh.org/data/Nitrate_Exposure_for_WA_Public_Water_Systems_byquarter_v2_data.csv", show_col_types = F)
-
-head(nitrate)
-
-# A tibble: 6 × 12
- year quarter half_of_year pop_on_sampled_PWS `pop_0-3ug/L` `pop_>3-5ug/L`
- <dbl> <chr> <chr> <dbl> <dbl> <dbl>
-1 1999 Q1 first 106720 67775 0
-2 2000 Q1 first 34793 5904 0
-3 2001 Q1 first 90054 49552 150
-4 2002 Q1 first 293486 223488 1474
-5 2003 Q1 first 1473586 743676 323767
-6 2004 Q1 first 1272283 486491 529354
-# ℹ 6 more variables: `pop_>5-10ug/L` <dbl>, `pop_>10-20ug/L` <dbl>,
-# `pop_>20ug/L` <dbl>, `pop_on_PWS_with_non-detect` <dbl>,
-# pop_exposed_to_exceedances <dbl>, perc_pop_exposed_to_exceedances <dbl>
+We will use the calenviroscreen dataset from the dasehr
package to examine how traffic estimates predict diesel particulate emissions.
Linear regression: model fitting
-The upper limit of acceptable nitrates in water is 10 ug/L. We fit linear regression model with the number of people on public water systems with more than the limit of nitrates (pop_exposed_to_exceedances
) as an outcome and the number of people exposed to 3-5 ug/L of nitrates (pop_>3-5ug/L
) as a predictor. In other words, we are evaluating if the number of people exposed to low levels of nitrates in their water is predictive of the number of people exposed to excess nitrates in their water.
+For this model, we will use two variables: DieselPM - estimated diesel particulate emissions from on-road and non-road sources; TrafficPctl - percentile ranking of traffic density
-fit <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L`, data = nitrate)
+fit <- glm(DieselPM ~ TrafficPctl, data = calenviroscreen)
fit
-Call: glm(formula = pop_exposed_to_exceedances ~ `pop_>3-5ug/L`, data = nitrate)
+Call: glm(formula = DieselPM ~ TrafficPctl, data = calenviroscreen)
Coefficients:
- (Intercept) `pop_>3-5ug/L`
- 837.633357 0.002052
+(Intercept) TrafficPctl
+ 0.042452 0.003637
-Degrees of Freedom: 87 Total (i.e. Null); 86 Residual
-Null Deviance: 166900000
-Residual Deviance: 149100000 AIC: 1518
+Degrees of Freedom: 7999 Total (i.e. Null); 7998 Residual
+ (35 observations deleted due to missingness)
+Null Deviance: 537.2
+Residual Deviance: 449.1 AIC: -330.9
Linear regression: model summary
@@ -3849,62 +3769,63 @@
Call:
-glm(formula = pop_exposed_to_exceedances ~ `pop_>3-5ug/L`, data = nitrate)
+glm(formula = DieselPM ~ TrafficPctl, data = calenviroscreen)
Coefficients:
- Estimate Std. Error t value Pr(>|t|)
-(Intercept) 837.6333569 268.5458637 3.119 0.00247 **
-`pop_>3-5ug/L` 0.0020516 0.0006391 3.210 0.00186 **
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) 0.04245151 0.00529915 8.011 0.0000000000000013 ***
+TrafficPctl 0.00363651 0.00009177 39.625 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-(Dispersion parameter for gaussian family taken to be 1733185)
+(Dispersion parameter for gaussian family taken to be 0.05614936)
- Null deviance: 166917415 on 87 degrees of freedom
-Residual deviance: 149053912 on 86 degrees of freedom
-AIC: 1517.9
+ Null deviance: 537.24 on 7999 degrees of freedom
+Residual deviance: 449.08 on 7998 degrees of freedom
+ (35 observations deleted due to missingness)
+AIC: -330.9
Number of Fisher Scoring iterations: 2
tidy results
-The broom package can help us here too! The estimate is the coefficient or slope - for one change in hours worked (1 hour increase), we see 1.58 more visits. The error for this estimate is relatively small at 0.167. This relationship appears to be significant with a small p value <0.001.
+The broom package can help us here too! The estimate is the coefficient or slope - for a one-unit increase in the traffic percentile, we see a 0.003637 increase in diesel particulate emissions. The error for this estimate is relatively small at 0.00009. This relationship appears to be significant, with a small p value < 2e-16.
tidy(fit) %>% glimpse()
Rows: 2
Columns: 5
-$ term <chr> "(Intercept)", "`pop_>3-5ug/L`"
-$ estimate <dbl> 837.633356922, 0.002051615
-$ std.error <dbl> 268.5458637101, 0.0006390501
-$ statistic <dbl> 3.119145, 3.210413
-$ p.value <dbl> 0.002468135, 0.001864329
+$ term <chr> "(Intercept)", "TrafficPctl"
+$ estimate <dbl> 0.042451513, 0.003636512
+$ std.error <dbl> 0.00529915058, 0.00009177366
+$ statistic <dbl> 8.011003, 39.624789
+$ p.value <dbl> 0.0000000000000012983396857240023084846536338687883471720851…
Linear regression: multiple predictors
-Let’s try adding another explanatory variable to our model, year (year
). The tidy function will not work with this unfortunately. The meaning of coefficients is more complicated here.
+Let’s try adding another explanatory variable to our model, amount of daily Ozone concentration (Ozone
). Ozone is usually inversely related to particulate measures. The tidy function will not work with this unfortunately. The meaning of coefficients is more complicated here.
-fit2 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year, data = nitrate)
+fit2 <- glm(DieselPM ~ TrafficPctl + Ozone, data = calenviroscreen)
summary(fit2)
Call:
-glm(formula = pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year,
- data = nitrate)
+glm(formula = DieselPM ~ TrafficPctl + Ozone, data = calenviroscreen)
Coefficients:
- Estimate Std. Error t value Pr(>|t|)
-(Intercept) 193320.5254972 46164.1477089 4.188 0.00006852 ***
-`pop_>3-5ug/L` 0.0033674 0.0006653 5.062 0.00000238 ***
-year -96.0210515 23.0288882 -4.170 0.00007319 ***
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) 0.23025068 0.01347754 17.08 <0.0000000000000002 ***
+TrafficPctl 0.00355094 0.00009067 39.16 <0.0000000000000002 ***
+Ozone -3.77418894 0.24967138 -15.12 <0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-(Dispersion parameter for gaussian family taken to be 1455811)
+(Dispersion parameter for gaussian family taken to be 0.0545963)
- Null deviance: 166917415 on 87 degrees of freedom
-Residual deviance: 123743924 on 85 degrees of freedom
-AIC: 1503.5
+ Null deviance: 537.24 on 7999 degrees of freedom
+Residual deviance: 436.61 on 7997 degrees of freedom
+ (35 observations deleted due to missingness)
+AIC: -554.3
Number of Fisher Scoring iterations: 2
@@ -3918,56 +3839,61 @@
Rows: 3
Columns: 5
-$ term <chr> "(Intercept)", "`pop_>3-5ug/L`", "year"
-$ estimate <dbl> 193320.525497167, 0.003367383, -96.021051499
-$ std.error <dbl> 46164.147708904, 0.000665288, 23.028888212
-$ statistic <dbl> 4.187677, 5.061541, -4.169591
-$ p.value <dbl> 0.000068518905, 0.000002375999, 0.000073186127
+$ term <chr> "(Intercept)", "TrafficPctl", "Ozone"
+$ estimate <dbl> 0.23025068, 0.00355094, -3.77418894
+$ std.error <dbl> 0.01347753532, 0.00009067243, 0.24967137612
+$ statistic <dbl> 17.08403, 39.16229, -15.11663
+$ p.value <dbl> 0.0000000000000000000000000000000000000000000000000000000000…
Linear regression: factors
Factors get special treatment in regression models - lowest level of the factor is the comparison group, and all other factors are relative to its values.
-quarter
takes values Q1, Q2, Q3, and Q4 to indicate the quarter of the year for each observation.
-
-nitrate %>% count(quarter)
+Let’s create a variable that tells us whether a census tract has a high, middle, or low percentage of the population below the poverty line.
-# A tibble: 4 × 2
- quarter n
- <chr> <int>
-1 Q1 22
-2 Q2 22
-3 Q3 22
-4 Q4 22
+calenviroscreen <- calenviroscreen %>% mutate(PovertyPctl_level = case_when(
+  PovertyPctl > 0.75 ~ "high",
+  PovertyPctl > 0.25 & PovertyPctl <= 0.75 ~ "middle",
+  PovertyPctl <= 0.25 ~ "low",
+  TRUE ~ "unknown"
+))
Linear regression: factors
The comparison group that is not listed is treated as intercept. All other estimates are relative to the intercept.
-fit3 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + factor(quarter), data = nitrate)
+fit3 <- glm(DieselPM ~ TrafficPctl + Ozone + factor(PovertyPctl_level), data = calenviroscreen)
summary(fit3)
Call:
-glm(formula = pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year +
- factor(quarter), data = nitrate)
+glm(formula = DieselPM ~ TrafficPctl + Ozone + factor(PovertyPctl_level),
+ data = calenviroscreen)
Coefficients:
- Estimate Std. Error t value Pr(>|t|)
-(Intercept) 208033.8242583 47597.8780270 4.371 0.00003603 ***
-`pop_>3-5ug/L` 0.0038251 0.0007477 5.115 0.00000202 ***
-year -103.5354110 23.7644678 -4.357 0.00003794 ***
-factor(quarter)Q2 12.4362229 366.7866716 0.034 0.973
-factor(quarter)Q3 473.5397889 390.3813390 1.213 0.229
-factor(quarter)Q4 405.3578933 368.5826835 1.100 0.275
+ Estimate Std. Error t value
+(Intercept) 0.23288777 0.01348689 17.268
+TrafficPctl 0.00354476 0.00009056 39.143
+Ozone -3.82361539 0.24960861 -15.318
+factor(PovertyPctl_level)low -0.09738847 0.05507632 -1.768
+factor(PovertyPctl_level)middle -0.11377196 0.03702058 -3.073
+factor(PovertyPctl_level)unknown 0.10462654 0.02884150 3.628
+ Pr(>|t|)
+(Intercept) < 0.0000000000000002 ***
+TrafficPctl < 0.0000000000000002 ***
+Ozone < 0.0000000000000002 ***
+factor(PovertyPctl_level)low 0.077058 .
+factor(PovertyPctl_level)middle 0.002125 **
+factor(PovertyPctl_level)unknown 0.000288 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-(Dispersion parameter for gaussian family taken to be 1466474)
+(Dispersion parameter for gaussian family taken to be 0.05444058)
- Null deviance: 166917415 on 87 degrees of freedom
-Residual deviance: 120250886 on 82 degrees of freedom
-AIC: 1507
+ Null deviance: 537.24 on 7999 degrees of freedom
+Residual deviance: 435.20 on 7994 degrees of freedom
+ (35 observations deleted due to missingness)
+AIC: -574.15
Number of Fisher Scoring iterations: 2
@@ -3975,36 +3901,44 @@
Estimates are relative to the level that is not listed.
-nitrate <- nitrate %>% mutate(quarter = factor(quarter,
+calenviroscreen <- calenviroscreen %>% mutate(PovertyPctl_level = factor(PovertyPctl_level,
levels =
c(
- "Q1", "Q2", "Q3", "Q4"
+ "low", "middle", "high", "unknown"
)
))
-fit4 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + quarter, data = nitrate)
+fit4 <- glm(DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level, data = calenviroscreen)
summary(fit4)
Call:
-glm(formula = pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year +
- quarter, data = nitrate)
+glm(formula = DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level,
+ data = calenviroscreen)
Coefficients:
- Estimate Std. Error t value Pr(>|t|)
-(Intercept) 208033.8242583 47597.8780270 4.371 0.00003603 ***
-`pop_>3-5ug/L` 0.0038251 0.0007477 5.115 0.00000202 ***
-year -103.5354110 23.7644678 -4.357 0.00003794 ***
-quarterQ2 12.4362229 366.7866716 0.034 0.973
-quarterQ3 473.5397889 390.3813390 1.213 0.229
-quarterQ4 405.3578933 368.5826835 1.100 0.275
+ Estimate Std. Error t value Pr(>|t|)
+(Intercept) 0.13549930 0.05629309 2.407 0.01611
+TrafficPctl 0.00354476 0.00009056 39.143 < 0.0000000000000002
+Ozone -3.82361539 0.24960861 -15.318 < 0.0000000000000002
+PovertyPctl_levelmiddle -0.01638349 0.06622817 -0.247 0.80462
+PovertyPctl_levelhigh 0.09738847 0.05507632 1.768 0.07706
+PovertyPctl_levelunknown 0.20201501 0.06206413 3.255 0.00114
+
+(Intercept) *
+TrafficPctl ***
+Ozone ***
+PovertyPctl_levelmiddle
+PovertyPctl_levelhigh .
+PovertyPctl_levelunknown **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-(Dispersion parameter for gaussian family taken to be 1466474)
+(Dispersion parameter for gaussian family taken to be 0.05444058)
- Null deviance: 166917415 on 87 degrees of freedom
-Residual deviance: 120250886 on 82 degrees of freedom
-AIC: 1507
+ Null deviance: 537.24 on 7999 degrees of freedom
+Residual deviance: 435.20 on 7994 degrees of freedom
+ (35 observations deleted due to missingness)
+AIC: -574.15
Number of Fisher Scoring iterations: 2
@@ -4012,30 +3946,38 @@
You can view estimates for the comparison group by removing the intercept in the GLM formula y ~ x - 1
. Caveat is that the p-values change.
-fit5 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + quarter - 1, data = nitrate)
+fit5 <- glm(DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level - 1, data = calenviroscreen)
summary(fit5)
Call:
-glm(formula = pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year +
- quarter - 1, data = nitrate)
+glm(formula = DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level -
+ 1, data = calenviroscreen)
Coefficients:
- Estimate Std. Error t value Pr(>|t|)
-`pop_>3-5ug/L` 0.0038251 0.0007477 5.115 0.00000202 ***
-year -103.5354110 23.7644678 -4.357 0.00003794 ***
-quarterQ1 208033.8242583 47597.8780270 4.371 0.00003603 ***
-quarterQ2 208046.2604813 47580.0330230 4.373 0.00003578 ***
-quarterQ3 208507.3640472 47668.7351298 4.374 0.00003558 ***
-quarterQ4 208439.1821516 47623.6821872 4.377 0.00003522 ***
+ Estimate Std. Error t value Pr(>|t|)
+TrafficPctl 0.00354476 0.00009056 39.143 < 0.0000000000000002
+Ozone -3.82361539 0.24960861 -15.318 < 0.0000000000000002
+PovertyPctl_levellow 0.13549930 0.05629309 2.407 0.01611
+PovertyPctl_levelmiddle 0.11911581 0.03867776 3.080 0.00208
+PovertyPctl_levelhigh 0.23288777 0.01348689 17.268 < 0.0000000000000002
+PovertyPctl_levelunknown 0.33751432 0.03172424 10.639 < 0.0000000000000002
+
+TrafficPctl ***
+Ozone ***
+PovertyPctl_levellow *
+PovertyPctl_levelmiddle **
+PovertyPctl_levelhigh ***
+PovertyPctl_levelunknown ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-(Dispersion parameter for gaussian family taken to be 1466474)
+(Dispersion parameter for gaussian family taken to be 0.05444058)
- Null deviance: 384570288 on 88 degrees of freedom
-Residual deviance: 120250886 on 82 degrees of freedom
-AIC: 1507
+ Null deviance: 939.74 on 8000 degrees of freedom
+Residual deviance: 435.20 on 7994 degrees of freedom
+ (35 observations deleted due to missingness)
+AIC: -574.15
Number of Fisher Scoring iterations: 2
@@ -4043,33 +3985,35 @@
You can also specify interactions between variables in a formula y ~ x1 + x2 + x1 * x2
. This allows for not only the intercepts between factors to differ, but also the slopes with regard to the interacting variable.
-fit6 <- glm(pop_exposed_to_exceedances ~ `pop_>3-5ug/L` + year + quarter + year * quarter, data = nitrate)
+fit6 <- glm(DieselPM ~ TrafficPctl + Ozone + PovertyPctl_level + TrafficPctl * PovertyPctl_level, data = calenviroscreen)
tidy(fit6)
# A tibble: 9 × 5
- term estimate std.error statistic p.value
- <chr> <dbl> <dbl> <dbl> <dbl>
-1 (Intercept) 173920. 89077. 1.95 0.0544
-2 `pop_>3-5ug/L` 0.00387 0.000761 5.09 0.00000237
-3 year -86.6 44.4 -1.95 0.0547
-4 quarterQ2 163851. 115853. 1.41 0.161
-5 quarterQ3 16037. 115668. 0.139 0.890
-6 quarterQ4 -36066. 117691. -0.306 0.760
-7 year:quarterQ2 -81.5 57.7 -1.41 0.161
-8 year:quarterQ3 -7.74 57.6 -0.134 0.893
-9 year:quarterQ4 18.2 58.6 0.310 0.757
+ term estimate std.error statistic p.value
+ <chr> <dbl> <dbl> <dbl> <dbl>
+1 (Intercept) 0.203 0.114 1.79 7.38e- 2
+2 TrafficPctl 0.00225 0.00187 1.20 2.31e- 1
+3 Ozone -3.80 0.250 -15.2 1.33e-51
+4 PovertyPctl_levelmiddle 0.00318 0.132 0.0241 9.81e- 1
+5 PovertyPctl_levelhigh 0.0288 0.113 0.255 7.99e- 1
+6 PovertyPctl_levelunknown 0.0836 0.125 0.671 5.02e- 1
+7 TrafficPctl:PovertyPctl_levelmiddle -0.000717 0.00229 -0.313 7.54e- 1
+8 TrafficPctl:PovertyPctl_levelhigh 0.00130 0.00188 0.692 4.89e- 1
+9 TrafficPctl:PovertyPctl_levelunknown 0.00227 0.00206 1.10 2.70e- 1
Linear regression: interactions
By default, ggplot
with a factor added as a color will include the interaction term. Notice the different intercept and slope of the lines.
-ggplot(nitrate, aes(x = `pop_>3-5ug/L`, y = pop_exposed_to_exceedances, color = quarter)) +
- geom_point(size = 1, alpha = 0.8) +
+ggplot(calenviroscreen, aes(x = DieselPM, y = TrafficPctl, color = PovertyPctl_level)) +
+ geom_point(size = 1, alpha = 0.1) +
geom_smooth(method = "glm", se = FALSE) +
scale_color_manual(values = c("black", "grey45", "grey65", "grey85")) +
- theme_classic()
+ theme_classic() +
+ ylim(0, 100) +
+ xlim(0, 3)
-
+
Generalized linear models (GLMs)
@@ -4091,49 +4035,46 @@
Logistic regression
-Let’s look at a logistic regression example. We’ll use the nitrate
dataset again, but first we’ll need to create a binary variable that tells us whether the percentage of the population exposed to exceedances perc_pop_exposed_to_exceedances
is greater than 0.1%.
+Let’s look at a logistic regression example. We’ll use the calenviroscreen
dataset again. We will create a new binary variable based on the DieselPM percentile variable, so we can tell whether a census tract has high or low DieselPM emissions compared to the others.
-nitrate <-
- nitrate %>%
+calenviroscreen <-
+ calenviroscreen %>%
mutate(
- PercExpMore0.1 = case_when
- (perc_pop_exposed_to_exceedances > 0.1 ~ 1,
- perc_pop_exposed_to_exceedances <= 0.1 ~ 0))
+    DieselPM_level = case_when(
+      DieselPMPctl > 0.75 ~ 1,
+      DieselPMPctl <= 0.75 ~ 0))
Logistic regression
-Now that we’ve created the PercExpMore0.1
variable (where a 1
indicates the percentage of the population exposed to excessive nitrates is greater than 0.1%), we can run a logistic regression.
+Now that we’ve created the DieselPM_level
variable (where a 1
indicates the census tract's DieselPM percentile is greater than 0.75), we can run a logistic regression.
-Let’s explore how quarter
and year
might predict PercExpMore0.1
.
+Let’s explore how PovertyPctl_level
might predict DieselPM_level
.
# General format
glm(y ~ x, data = DATASET_NAME, family = binomial(link = "logit"))
-binom_fit <- glm(PercExpMore0.1 ~ year + quarter, data = nitrate, family = binomial(link = "logit"))
+binom_fit <- glm(DieselPM_level ~ PovertyPctl_level, data = calenviroscreen, family = binomial(link = "logit"))
summary(binom_fit)
Call:
-glm(formula = PercExpMore0.1 ~ year + quarter, family = binomial(link = "logit"),
- data = nitrate)
+glm(formula = DieselPM_level ~ PovertyPctl_level, family = binomial(link = "logit"),
+ data = calenviroscreen)
Coefficients:
- Estimate Std. Error z value Pr(>|z|)
-(Intercept) 166.29994 107.61737 1.545 0.1223
-year -0.08285 0.05356 -1.547 0.1219
-quarterQ2 -2.21841 0.87930 -2.523 0.0116 *
-quarterQ3 -19.46232 2243.38013 -0.009 0.9931
-quarterQ4 -1.74955 0.77826 -2.248 0.0246 *
----
-Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+ Estimate Std. Error z value Pr(>|z|)
+(Intercept) 17.56606841091 932.48064081790 0.019 0.985
+PovertyPctl_levelmiddle 0.00000007592 1118.59764951226 0.000 1.000
+PovertyPctl_levelhigh -12.62378843648 932.48065046127 -0.014 0.989
+PovertyPctl_levelunknown -14.68968289500 932.48078242135 -0.016 0.987
(Dispersion parameter for binomial family taken to be 1)
- Null deviance: 80.363 on 87 degrees of freedom
-Residual deviance: 58.693 on 83 degrees of freedom
-AIC: 68.693
+ Null deviance: 707.22 on 8034 degrees of freedom
+Residual deviance: 697.17 on 8031 degrees of freedom
+AIC: 705.17
-Number of Fisher Scoring iterations: 18
+Number of Fisher Scoring iterations: 16
Odds ratios
@@ -4147,31 +4088,43 @@
Use oddsratio(x, y)
from the epitools
package to calculate odds ratios.
-In this case, we’re calculating the odds ratio for whether the the first 6 months or last 6 months predicts whether the percentage of population exposed to excess nitrates is more than 0.1%
+In this case, we’re calculating the odds ratio for whether living in a high traffic area predicts high levels of DieselPM.
library(epitools)
-response <- nitrate %>% pull(PercExpMore0.1)
-predictor <- nitrate %>% pull(half_of_year)
+
+calenviroscreen <-
+ calenviroscreen %>%
+ mutate(
+    Traffic_level = case_when(
+      TrafficPctl > 0.75 ~ 1,
+      TrafficPctl <= 0.75 ~ 0))
+
+response <- calenviroscreen %>% pull(DieselPM_level)
+predictor <- calenviroscreen %>% pull(Traffic_level)
oddsratio(predictor, response)
$data
Outcome
-Predictor 0 1 Total
- first 32 12 44
- second 41 3 44
- Total 73 15 88
+Predictor 0 1 Total
+ 0 23 37 60
+ 1 35 7905 7940
+ Total 58 7942 8000
$measure
odds ratio with 95% C.I.
-Predictor estimate lower upper
- first 1.0000000 NA NA
- second 0.2053372 0.04182295 0.7234182
+Predictor estimate lower upper
+ 0 1.0000 NA NA
+ 1 139.3968 74.58837 260.5596
$p.value
two-sided
-Predictor midp.exact fisher.exact chi.square
- first NA NA NA
- second 0.0122317 0.02103458 0.01072943
+Predictor midp.exact fisher.exact
+ 0 NA NA
+ 1 0 0.000000000000000000000000000000000007956334
+ two-sided
+Predictor chi.square
+ 0 NA
+ 1 0.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002491579
$correction
[1] FALSE
diff --git a/modules/Statistics/Statistics.pdf b/modules/Statistics/Statistics.pdf
index 0a95fe32..f09fe066 100644
Binary files a/modules/Statistics/Statistics.pdf and b/modules/Statistics/Statistics.pdf differ