forked from CorrelAidSwitzerland/sdghackathon
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSDG_hackathon_notebook.Rmd
156 lines (126 loc) · 5.32 KB
/
SDG_hackathon_notebook.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
---
title: "SDG Hackathon"
author: "Kai Lim"
date: "05/11/2021"
output: html_document
editor_options:
  chunk_output_type: inline
---
```{r setup, include=FALSE}
# Set the default chunk option `echo = TRUE` for the chunks that follow.
knitr::opts_chunk$set(echo = TRUE)
```
The goal of this data visualization hackathon is twofold:
1. To better understand the commonalities and differences between the five query-based SDG labeling systems.
2. To characterize the efforts of the Swiss research landscape to address the UN SDGs.
```{r load libraries, message=FALSE, warning=FALSE}
library(tidyverse)
library(lubridate)
library(ggalt) #needed for geom_dumbbell
library(patchwork)
library(magick)
library(cowplot)
library(ggthemes)
```
```{r load csvs}
# Read the main hackathon table and the supplementary table; both paths are
# relative, so the CSV files must sit next to this notebook when it is run.
sdg_data <- read.csv(file = "sdg_hackathon_data.csv")
sdg_data_supp <- read.csv(file = "supplementary_data.csv")
```
## Which SDGs are detected most readily by all labeling systems?
```{r}
# Per-row table used by the plots below. Every row of `sdg_data` that
# survives the filter is kept and annotated with:
#   duration     - days between start_date and end_date
#   count        - number of rows sharing the row's sdg/system pair
#   med_duration - median duration within that sdg/system pair
#   med_cost     - median approved amount within that sdg/system pair
sdg_data_sources <- sdg_data %>%
  select(project_number, start_date, end_date, sdg, system, approved_amount) %>%
  mutate(
    start_date = as_date(start_date),
    end_date = as_date(end_date),
    # Both columns are already Date here, so no further parsing is needed.
    duration = as.period(interval(start_date, end_date), unit = "days")@day
  ) %>%
  # Filter SDG-17 because some sources don't have it, and drop rows whose
  # funding is only a placeholder string rather than a number.
  filter(sdg != "SDG-17", approved_amount != "data not included in P3") %>%
  group_by(sdg, system) %>%
  mutate(
    count = n(),
    med_duration = median(duration),
    med_cost = median(as.numeric(approved_amount))
  ) %>%
  # Drop the grouping before relabelling `system`: `system` is itself a
  # grouping column, and a still-grouped result would silently affect every
  # later verb applied to `sdg_data_sources`.
  ungroup() %>%
  mutate(
    system = recode(
      system,
      aurora = "Aurora",
      elsevier = "Elsevier",
      ontology = "Ontology",
      sdsn = "SDSN",
      siris = "Siris"
    )
  )
# Row count per labelling system in the raw (unfiltered) data.
table(sdg_data$system)
# Facet labels keyed by SDG code: "SDG-01" -> "1: No Poverty", and so on.
# Codes and number prefixes are derived from each title's position, so the
# titles must stay in goal order (1 to 16).
SDG_names <- local({
  goal_titles <- c(
    "No Poverty",
    "Zero Hunger",
    "Good Health & Well Being",
    "Quality Education",
    "Gender Equality",
    "Clean Water & Sanitation",
    "Affordable Clean Energy",
    "Decent Work & Economy Growth",
    "Industry, Innovation & Infrastructure",
    "Reduced Inequalities",
    "Sustainable Cities",
    "Responsible Consumption & Production",
    "Climate Action",
    "Life Below Water",
    "Life On Land",
    "Peace & Justice"
  )
  goal_ids <- seq_along(goal_titles)
  setNames(
    paste0(goal_ids, ": ", goal_titles),
    sprintf("SDG-%02d", goal_ids)
  )
})
# Lollipop chart, one panel per SDG: stem height is the number of rows each
# query system labelled with that SDG, point size is the median approved
# funding for that sdg/system pair.
plot1 <- sdg_data_sources %>%
  # `count` and `med_cost` are constant within an sdg/system pair, so keep a
  # single row per pair; otherwise the same point and segment are drawn once
  # for every project row.
  distinct(sdg, system, count, med_cost) %>%
  ggplot(aes(x = system, y = count, color = system)) +
  geom_point(aes(size = med_cost)) +
  geom_segment(aes(x = system, xend = system, y = 0, yend = count)) +
  facet_wrap(
    ~sdg,
    labeller = as_labeller(SDG_names, label_wrap_gen(width = 20))
  ) +
  theme_bw() +
  labs(
    title = "Number of projects identified by each system",
    color = "Query system",
    size = "Median approved funding\n(CHF)",
    caption = "Github:kai-lim"
  ) +
  # The systems are identified by colour, so the x axis carries no labels.
  theme(
    legend.position = "right",
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.line.x = element_blank()
  ) +
  # NOTE(review): the upper limit is hard-coded; confirm no sdg/system count
  # exceeds 20000, since values outside the scale limits are not drawn.
  scale_y_continuous(name = "Number of projects", limits = c(0, 20000)) +
  scale_x_discrete(name = "Query system used to detect SDGs") +
  theme(strip.background = element_rect(fill = "#CCB7AE"))
plot1
# tried to add image but didn't work - can try after deadline.
# ## read PNG file from web
# png <- magick::image_read("~/OneDrive - King's College London/PhD/outside_PhD/SDG_Hackathon/Hackathon_project_folder/SDG_title.png")
# ## turn image into `rasterGrob`
# img <- grid::rasterGrob(png, interpolate = TRUE)
#
# (plot1_1 <- plot1 +
# annotation_custom(img, ymin = 21.5, ymax = 30.5, xmin = 55, xmax = 65.5))
# Save plot1 by name rather than relying on whichever plot was printed last,
# and spell out `filename` instead of depending on partial argument matching.
ggsave(filename = "SDG_2.png", plot = plot1, width = 8, height = 8)
```
Looking good. Consider changing the y-axis to cost or to one of the `med_*` columns instead.
Perhaps swap `med_duration` for cost?
To do tomorrow.
Consider using the BBC's ggplot theme.
Add a second y-axis, following https://www.r-graph-gallery.com/line-chart-dual-Y-axis-ggplot2.html
```{r}
# For every query system and SDG, keep the project row(s) with the largest
# approved amount (ties are all kept) and add the amount in thousands.
funding_data <- sdg_data_sources %>%
  # `approved_amount` is still stored as text upstream; convert it once here.
  mutate(approved_amount = as.numeric(approved_amount)) %>%
  # Rows whose amount could not be parsed can never be a maximum.
  filter(!is.na(approved_amount)) %>%
  group_by(system, sdg) %>%
  filter(approved_amount == max(approved_amount)) %>%
  # Return an ungrouped table so later steps are not affected by the grouping.
  ungroup() %>%
  mutate(approved_amount_K = approved_amount / 1000)
# First ensure that the data frame is ordered by the start time
#funding_data <- funding_data[order(funding_data$start_date),]
# Now iterate through each row, calculating how many previous rows have
# earlier starts but haven't yet finished when the current row starts.
# Multiply this number by a small negative offset and add the 1.48 baseline value
#funding_data$offset <- 1.48 - 0.03 * sapply(seq(nrow(funding_data)), function(i) {
# with(funding_data[seq(i),], length(which(start_date < start_date[i] & end_date > start_date[i])))
# })
# Timeline of the highest-funded project(s) per query system, one panel per
# SDG: each segment runs from project start to end at the height of its
# approved amount.
plot2 <- funding_data %>%
  ggplot(
    aes(
      x = start_date,
      xend = end_date,
      # Plot the amount itself. Wrapping it in as.numeric(reorder(...))
      # yields factor level codes (rank positions), which contradicts the
      # "Approved amount" axis title.
      y = approved_amount_K,
      yend = approved_amount_K
    )
  ) +
  geom_segment(aes(color = system), size = 2, lineend = "round") +
  facet_wrap(~sdg) +
  scale_y_continuous(name = "Approved amount (in thousands)") +
  theme_minimal() +
  labs(
    title = "Duration of projects with highest funding detected by each query system",
    color = "Query system used to detect SDGs",
    x = "Project start and end date"
  ) +
  theme(legend.position = "bottom")
plot2
```
```{r both plots tgt}
# Compose both figures with the `/` operator (patchwork is loaded above).
plot1 / plot2
```