diff --git a/heart_disease_case_study_20240312.html b/heart_disease_case_study_20240312.html new file mode 100644 index 0000000..b9266ed --- /dev/null +++ b/heart_disease_case_study_20240312.html @@ -0,0 +1,1203 @@ + + + + +
+ + + + + + + + + + +library(gridExtra)
+# Load the raw CSV export and normalise the column names with clean_names().
+df_raw <- read_csv("C:/Users/Clint/OneDrive/Documents/Data/cdc_heart_disease/nvss_hd_2010_2020_raw.csv")
+clean_df <- clean_names(df_raw)
+# Drop rows that have no reported data_value.
+clean_df1 <- clean_df[!is.na(clean_df$data_value), ]
+# Keep only the rows used by the analysis:
+#   * rows whose data_value_type_id is "AgeStdz"
+#   * every year except 2020 (year_start is numeric, so compare to a number)
+#   * every break-out category except "Overall"
+#   * strictly positive data_value_alt values
+# Conditions passed to a single filter() are combined with AND, which is
+# equivalent to chaining one filter() per condition.
+df_age1.1 <- clean_df1 %>%
+  filter(
+    data_value_type_id == "AgeStdz",
+    year_start != 2020,
+    break_out_category != "Overall",
+    data_value_alt > 0
+  )
+## # A tibble: 20,468 × 16
+## row_id year_start location_abbr topic data_value_type data_value_unit
+## <chr> <dbl> <chr> <chr> <chr> <chr>
+## 1 NVSS~2010~1~N… 2010 AL Majo… Age-Standardiz… Rate per 100,0…
+## 2 NVSS~2010~1~N… 2010 AL Majo… Age-Standardiz… Rate per 100,0…
+## 3 NVSS~2010~1~N… 2010 AL Majo… Age-Standardiz… Rate per 100,0…
+## 4 NVSS~2010~1~N… 2010 AL Majo… Age-Standardiz… Rate per 100,0…
+## 5 NVSS~2010~1~N… 2010 AL Majo… Age-Standardiz… Rate per 100,0…
+## 6 NVSS~2010~1~N… 2010 AL Majo… Age-Standardiz… Rate per 100,0…
+## 7 NVSS~2010~2~N… 2010 AK Majo… Age-Standardiz… Rate per 100,0…
+## 8 NVSS~2010~2~N… 2010 AK Majo… Age-Standardiz… Rate per 100,0…
+## 9 NVSS~2010~2~N… 2010 AK Majo… Age-Standardiz… Rate per 100,0…
+## 10 NVSS~2010~2~N… 2010 AK Majo… Age-Standardiz… Rate per 100,0…
+## # ℹ 20,458 more rows
+## # ℹ 10 more variables: data_value <dbl>, data_value_alt <dbl>,
+## # break_out_category <chr>, break_out <chr>, class_id <chr>, topic_id <chr>,
+## # data_value_type_id <chr>, break_out_category_id <chr>, break_out_id <chr>,
+## # location_id <dbl>
+# Keep only the columns needed for the analysis. Selecting by name rather than
+# by position keeps the selection correct if the column order of the export
+# ever changes.
+df_age1 <- df_age1.1[, c(
+  "row_id", "year_start", "location_abbr", "topic",
+  "data_value", "break_out_category", "break_out"
+)]
+## # A tibble: 20,468 × 7
+## row_id year_start location_abbr topic data_value break_out_category break_out
+## <chr> <dbl> <chr> <chr> <dbl> <chr> <chr>
+## 1 NVSS~… 2010 AL Majo… 416. Gender Male
+## 2 NVSS~… 2010 AL Majo… 352. Gender Female
+## 3 NVSS~… 2010 AL Majo… 370. Race Non-Hisp…
+## 4 NVSS~… 2010 AL Majo… 453. Race Non-Hisp…
+## 5 NVSS~… 2010 AL Majo… 102. Race Hispanic
+## 6 NVSS~… 2010 AL Majo… 133. Race Other
+## 7 NVSS~… 2010 AK Majo… 236. Gender Male
+## 8 NVSS~… 2010 AK Majo… 177 Gender Female
+## 9 NVSS~… 2010 AK Majo… 196. Race Non-Hisp…
+## 10 NVSS~… 2010 AK Majo… 268. Race Other
+## # ℹ 20,458 more rows
+# Give columns 2 through 7 short analysis-friendly names (row_id is kept).
+new_names <- c(
+  "year", "state", "type",
+  "rate", "group", "sub_group"
+)
+names(df_age1)[2:7] <- new_names
+## # A tibble: 20,468 × 7
+## row_id year state type rate group sub_group
+## <chr> <dbl> <chr> <chr> <dbl> <chr> <chr>
+## 1 NVSS~2010~1~NV001~GEN01~Age-Standard… 2010 AL Majo… 416. Gend… Male
+## 2 NVSS~2010~1~NV001~GEN02~Age-Standard… 2010 AL Majo… 352. Gend… Female
+## 3 NVSS~2010~1~NV001~RAC01~Age-Standard… 2010 AL Majo… 370. Race Non-Hisp…
+## 4 NVSS~2010~1~NV001~RAC02~Age-Standard… 2010 AL Majo… 453. Race Non-Hisp…
+## 5 NVSS~2010~1~NV001~RAC04~Age-Standard… 2010 AL Majo… 102. Race Hispanic
+## 6 NVSS~2010~1~NV001~RAC07~Age-Standard… 2010 AL Majo… 133. Race Other
+## 7 NVSS~2010~2~NV001~GEN01~Age-Standard… 2010 AK Majo… 236. Gend… Male
+## 8 NVSS~2010~2~NV001~GEN02~Age-Standard… 2010 AK Majo… 177 Gend… Female
+## 9 NVSS~2010~2~NV001~RAC01~Age-Standard… 2010 AK Majo… 196. Race Non-Hisp…
+## 10 NVSS~2010~2~NV001~RAC07~Age-Standard… 2010 AK Majo… 268. Race Other
+## # ℹ 20,458 more rows
+# Shorten the type labels: values starting with "Major " or "Diseases of"
+# become "Type Unspecified", and values starting with "Acute" become
+# "Heart Attack". All other values are left unchanged.
+df_age1$type <- replace(
+  df_age1$type,
+  grepl("^Major |^Diseases of", df_age1$type),
+  "Type Unspecified"
+)
+df_age1$type <- replace(
+  df_age1$type,
+  grepl("^Acute", df_age1$type),
+  "Heart Attack"
+)
+## [1] "Type Unspecified" "Heart Attack" "Coronary Heart Disease"
+## [4] "Heart Failure" "Stroke"
+# Shorten the sub-group labels by prefix match:
+#   "Non-Hispanic Wh..." -> "White"
+#   "Non-Hispanic Bl..." -> "Black"
+#   "Oth..."             -> "Unknown/Other"
+df_age1$sub_group <- replace(
+  df_age1$sub_group,
+  grepl("^Non-Hispanic Wh", df_age1$sub_group),
+  "White"
+)
+df_age1$sub_group <- replace(
+  df_age1$sub_group,
+  grepl("^Non-Hispanic Bl", df_age1$sub_group),
+  "Black"
+)
+df_age1$sub_group <- replace(
+  df_age1$sub_group,
+  grepl("^Oth", df_age1$sub_group),
+  "Unknown/Other"
+)
+## [1] "Male" "Female" "White" "Black"
+## [5] "Hispanic" "Unknown/Other"
+# By gender: for each year and gender sub-group, compute the sum, mean,
+# minimum and maximum of rate.
+plot1.1 <- df_age1 %>%
+  filter(group == "Gender") %>%
+  group_by(year, sub_group) %>%
+  summarise(
+    rate_sum = sum(rate),
+    avg_rate = mean(rate),
+    min_rate = min(rate),
+    max_rate = max(rate)
+  )
+# Same table with the mean rounded to one decimal place.
+plot1 <- mutate(plot1.1, avg_rate = round(avg_rate, digits = 1))
+## # A tibble: 20 × 6
+## # Groups: year [10]
+## year sub_group rate_sum avg_rate min_rate max_rate
+## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 2010 Female 39430. 94.8 8.5 375.
+## 2 2010 Male 48248. 116. 6.3 444.
+## 3 2011 Female 37971. 91.5 7.5 358.
+## 4 2011 Male 47104. 114. 6.5 425.
+## 5 2012 Female 37085. 89.1 8.1 345.
+## 6 2012 Male 46328 112. 7.1 421.
+## 7 2013 Female 36189. 87 7.3 343.
+## 8 2013 Male 46144. 111. 5.7 441.
+## 9 2014 Female 35441. 85.2 6.1 332.
+## 10 2014 Male 45709. 110. 7.7 433.
+## 11 2015 Female 35491. 85.3 6.5 347.
+## 12 2015 Male 45637. 110 5.2 448
+## 13 2016 Female 34480. 82.9 7.9 332.
+## 14 2016 Male 45285. 109. 7.2 442
+## 15 2017 Female 33973. 81.7 6.2 338.
+## 16 2017 Male 45039. 109. 6.3 444.
+## 17 2018 Female 33188. 79.8 5.9 321.
+## 18 2018 Male 44740. 108. 6.7 433.
+## 19 2019 Female 32510. 78.1 6.6 318.
+## 20 2019 Male 44228. 106. 5.5 444.
+# By race: for each year and race sub-group, compute the sum, mean, minimum
+# and maximum of rate.
+plot2.1 <- df_age1 %>%
+  filter(group == "Race") %>%
+  group_by(year, sub_group) %>%
+  summarise(
+    rate_sum = sum(rate),
+    avg_rate = mean(rate),
+    min_rate = min(rate),
+    max_rate = max(rate)
+  )
+# Same table with the mean rounded to one decimal place, then printed.
+plot2 <- mutate(plot2.1, avg_rate = round(avg_rate, digits = 1))
+print(plot2)
+## # A tibble: 40 × 6
+## # Groups: year [10]
+## year sub_group rate_sum avg_rate min_rate max_rate
+## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
+## 1 2010 Black 38890 131. 8.9 491.
+## 2 2010 Hispanic 15554. 72 4.8 274.
+## 3 2010 Unknown/Other 18408. 77.7 2.7 386.
+## 4 2010 White 43930. 106. 8 391.
+## 5 2011 Black 37619. 128 8.3 443.
+## 6 2011 Hispanic 15636. 71.1 4.9 270.
+## 7 2011 Unknown/Other 18693 76.6 2.7 309.
+## 8 2011 White 42830. 104. 5.6 384.
+## 9 2012 Black 37086. 127. 9.5 468.
+## 10 2012 Hispanic 15820. 68.2 5.2 284.
+## # ℹ 30 more rows
+# By type and gender: mean rate for each year, type and gender sub-group,
+# leaving out the generic "Type Unspecified" rows.
+plot3.1 <- df_age1 %>%
+  filter(group == "Gender", type != "Type Unspecified") %>%
+  group_by(year, type, sub_group) %>%
+  summarise(avg_rate = mean(rate))
+# Same table with the mean rounded to one decimal place, then printed.
+plot3 <- mutate(plot3.1, avg_rate = round(avg_rate, digits = 1))
+print(plot3)
+## # A tibble: 80 × 4
+## # Groups: year, type [40]
+## year type sub_group avg_rate
+## <dbl> <chr> <chr> <dbl>
+## 1 2010 Coronary Heart Disease Female 115.
+## 2 2010 Coronary Heart Disease Male 174.
+## 3 2010 Heart Attack Female 37.8
+## 4 2010 Heart Attack Male 59.1
+## 5 2010 Heart Failure Female 24.6
+## 6 2010 Heart Failure Male 22.3
+## 7 2010 Stroke Female 34.1
+## 8 2010 Stroke Male 28.1
+## 9 2011 Coronary Heart Disease Female 109.
+## 10 2011 Coronary Heart Disease Male 170.
+## # ℹ 70 more rows
+# By type, overall: mean rate for each year and type, leaving out the
+# "Type Unspecified" rows. No filter on group is applied here, so rows from
+# every group contribute to the mean.
+plot4 <- df_age1 %>%
+  filter(type != "Type Unspecified") %>%
+  group_by(year, type) %>%
+  summarise(avg_rate = mean(rate))
+## # A tibble: 40 × 3
+## # Groups: year [10]
+## year type avg_rate
+## <dbl> <chr> <dbl>
+## 1 2010 Coronary Heart Disease 127.
+## 2 2010 Heart Attack 44.7
+## 3 2010 Heart Failure 22.6
+## 4 2010 Stroke 31.2
+## 5 2011 Coronary Heart Disease 121.
+## 6 2011 Heart Attack 42.9
+## 7 2011 Heart Failure 22.0
+## 8 2011 Stroke 30.1
+## 9 2012 Coronary Heart Disease 119.
+## 10 2012 Heart Attack 41.3
+## # ℹ 30 more rows
+# By state: mean rate per state over all remaining rows, excluding "DC".
+plot5.1 <- df_age1 %>%
+  filter(state != "DC") %>%
+  group_by(state) %>%
+  summarise(avg_rate = mean(rate))
+## # A tibble: 51 × 2
+## state avg_rate
+## <chr> <dbl>
+## 1 AK 79.7
+## 2 AL 118.
+## 3 AR 130.
+## 4 AZ 79.0
+## 5 CA 88.5
+## 6 CO 71.8
+## 7 CT 79.8
+## 8 DE 97.0
+## 9 FL 83.6
+## 10 GA 79.1
+## # ℹ 41 more rows
+# State means sorted from highest to lowest average rate.
+plot5.2 <- plot5.1[order(-plot5.1[["avg_rate"]]), ]
+# Re-sort ascending and keep only the first five and last five rows, i.e.
+# drop every row that is both past the fifth and not among the final five.
+top_bottom <- plot5.2 %>%
+  arrange(avg_rate) %>%
+  filter(!(row_number() > 5 & row_number() <= n() - 5))
+plot5 <- top_bottom
+## # A tibble: 10 × 2
+## state avg_rate
+## <chr> <dbl>
+## 1 MA 68.1
+## 2 CO 71.8
+## 3 MN 73.7
+## 4 WA 74.3
+## 5 VA 75.7
+## 6 SD 119.
+## 7 OK 121.
+## 8 WV 124.
+## 9 AR 130.
+## 10 MS 135.
+# Line chart with points: average rate per year, one coloured series per
+# gender. Both axes are labelled with whole numbers.
+ggplot(
+  data = plot1,
+  mapping = aes(x = year, y = avg_rate, color = sub_group)
+) +
+  geom_line() +
+  geom_point() +
+  scale_x_continuous(labels = scales::number_format(accuracy = 1)) +
+  scale_y_continuous(labels = scales::number_format(accuracy = 1)) +
+  labs(
+    x = "Year",
+    y = "Avg Rate per 100K",
+    color = "Gender",
+    title = "Heart Disease Mortality Rate - Gender",
+    subtitle = "(2010-2019)"
+  ) +
+  theme_bw()
+# Line chart with points: average rate per year, one coloured series per
+# race sub-group. Both axes are labelled with whole numbers.
+ggplot(
+  data = plot2,
+  mapping = aes(x = year, y = avg_rate, color = sub_group)
+) +
+  geom_line() +
+  geom_point() +
+  scale_x_continuous(labels = scales::number_format(accuracy = 1)) +
+  scale_y_continuous(labels = scales::number_format(accuracy = 1)) +
+  labs(
+    x = "Year",
+    y = "Avg Rate per 100K",
+    color = "Race",
+    title = "Heart Disease Mortality Rate - Race",
+    subtitle = "(2010-2019)"
+  ) +
+  theme_bw()
+# Bar chart of average rate by disease type, one facet per gender.
+# The legend comes from the fill aesthetic, so its title is set with
+# labs(fill = ...); a color label would have no effect here.
+# NOTE(review): plot3 has one row per year/type/gender and nothing in this
+# plot separates the years, so the yearly bars for a type appear to be drawn
+# at the same x position - confirm whether a per-type average was intended.
+ggplot(plot3, aes(x = type, y = avg_rate, fill = type)) +
+  geom_bar(stat = "identity", position = "dodge") +
+  facet_wrap(~ sub_group) +
+  labs(
+    title = "Heart Disease Mortality Rate - Gender+Type",
+    subtitle = "(2010-2019)",
+    x = "Type",
+    y = "Avg Rate per 100K",
+    fill = "Type"
+  ) +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1))
+# Line chart with points: average rate per year, one coloured series per
+# disease type. Both axes are labelled with whole numbers.
+ggplot(
+  data = plot4,
+  mapping = aes(x = year, y = avg_rate, color = type)
+) +
+  geom_line() +
+  geom_point() +
+  scale_x_continuous(labels = scales::number_format(accuracy = 1)) +
+  scale_y_continuous(labels = scales::number_format(accuracy = 1)) +
+  labs(
+    x = "Year",
+    y = "Avg Rate per 100K",
+    color = "Type",
+    title = "Heart Disease Mortality Rate - Type",
+    subtitle = "(2010-2019)"
+  ) +
+  theme_bw()
+# Bar chart of the five lowest and five highest state averages, ordered from
+# highest to lowest rate and shaded by rate.
+# The legend comes from the fill aesthetic, so its title is set with
+# labs(fill = ...). theme_minimal() must be joined to the plot with `+`;
+# on a line of its own it is evaluated separately and never applied.
+ggplot(
+  plot5,
+  aes(x = reorder(state, -avg_rate), y = avg_rate, fill = avg_rate)
+) +
+  geom_bar(stat = "identity") +
+  labs(
+    title = "Heart Disease Mortality Rates - Top/Bottom 5 States",
+    subtitle = "(2010-2019)",
+    x = "State",
+    y = "Avg Rate per 100K",
+    fill = "Rate"
+  ) +
+  theme_minimal()
