-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path_workshop - script - solutions - ch 7.R
executable file
·93 lines (75 loc) · 3.45 KB
/
_workshop - script - solutions - ch 7.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
####--- | solution: metal fan base ---#####
# load 'metal_data'
metal_data <- read_excel("metal_data.xlsx")
metal_data
metal_meta <- read_excel("metal_meta.xlsx")
metal_meta
metal <- merge(metal_data, metal_meta, by = "ID")
metal
# correlation year formed and number of fans
metal %>%
ggplot(aes(x = formed, y = fans)) +
geom_point() +
geom_smooth(method = "lm")
cor(metal$fans, metal$formed, use = "complete.obs")
shapiro.test(metal$fans); shapiro.test(metal$formed) # both not normal
cor.test(metal$fans, metal$formed, method = "spearman")
# lump countries of origin together into 2 most frequent countries + 'other'
metal$origin <- fct_lump_n(metal$origin, n = 2)
metal$origin
# check differences in fan base between two most common countries
metal %>%
filter(!is.na(origin)) %>%
group_by(origin) %>%
summarise(mean_fans = mean(fans),
median_fans = median(fans),
sd_fans = sd(fans),
IQR_fans = IQR(fans)) -> metal_means
metal_means %>%
ggplot(aes(x = origin, y = mean_fans)) +
geom_bar(stat="identity", fill="gold") +
labs(title = "Metal fan base per country of band origin", x = "", y = "Fan base")
hist(metal$fans)
tapply(metal$fans, metal$origin, shapiro.test)
tapply(metal$fans, metal$origin, mean)
tapply(metal$fans, metal$origin, median)
wilcox.test(metal$fans[metal$origin=="USA"|metal$origin=="Sweden"] ~ metal$origin[metal$origin=="USA"|metal$origin=="Sweden"])
####--- | solution: female chess players ---#####
# load from csv and convert to tibble
ch <- read.csv("https://raw.githubusercontent.com/rikvosters/Basics-in-R/master/women_chess.csv")
ch <- as_tibble(ch)
ch
# remove variables 'Inactive_flag' and 'Gender'
ch %>%
select(-Inactive_flag, -Gender) -> ch
ch
# make variable 'Age' and split into 3 groups (young = under 30, middle = 30-60, older = 60+)
ch %>%
mutate(Age = 2020 - Year_of_birth) -> ch
ch$Age <- cut(ch$Age, breaks = c(0, 30, 60, 101), labels = c("young", "middle", "older"))
# automatically split variable 'Standard_Rating' into 3 groups ("low", "medium", "high")
ch$Rating_ordinal <- cut(ch$Standard_Rating, breaks = 3, labels = c("low", "medium", "high"))
# check out association between ordinal Rating and ordinal Age, numerically
round(prop.table(table(ch$Rating_ordinal, ch$Age),2),2)*100
# check out association between ordinal Rating and ordinal Age, graphically
library(vcd)
assoc(table(ch$Rating_ordinal, ch$Age), shade=TRUE, legend = T) # blue = more observations than expected under null hypothesis
# check out association between ordinal Rating and ordinal Age, statistically
test <- chisq.test(table(ch$Rating_ordinal, ch$Age))
test
test$residuals # notice large deviations
test$expected # test assumptions: enough data points in all expected cells
library(DescTools)
CramerV(table(ch$Rating_ordinal, ch$Age)) # Cramer's V to show strength of association (= effect size); from 0 to 1 (= perfect association)
# convert to long
ch %>%
pivot_longer(cols = `Standard_Rating`:`Blitz_rating`, names_to = "Type", values_to = "Rating") -> ch_wide
ch_wide
# plot year of birth and rating (but only for the Blitz_rating type), for both Russian (RUS) and Polish (POL) players in different colors, and add a LOESS smoother (method = "loess") for each group
ch_wide %>%
filter(Federation %in% c("RUS", "POL")) %>%
filter(Type == "Blitz_rating") %>%
group_by(Federation) %>%
ggplot(aes(x = Year_of_birth, y = Rating, col = Federation)) +
geom_point(alpha = 0.25) +
geom_smooth(method = "loess")