######## Scraping ECB Occasional Papers ##########
# Loading necessary packages for web scraping and text analysis
# If necessary: `install.packages("pacman")`
pacman::p_load(tidyverse, # Data manipulation
               rvest,     # Web scraping
               RSelenium, # Automating browser interactions
               polite,    # Polite scraping (respecting robots.txt)
               tidytext)  # Text mining and analysis
# Define the ECB domain URL for scraping
ecb_domain <- "https://www.ecb.europa.eu/"
# Create a polite session object for the ECB domain to ensure responsible scraping
session <- polite::bow(ecb_domain,
                       user_agent = "polite R package - used for academic training by Aurélien Goutsmedt (aurelien.goutsmedt@uclouvain.be)")
#### EDIT: I changed the name (it was previously Marine Bardou)
# Here is the full path to the page listing occasional papers.
ecb_pub_path <- str_c(ecb_domain, "press/research-publications/occasional-papers/html/index.en.html")
# How would you handle scraping this page? What are the big issues?
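# A few hints (sketch, not required for the rest of the script): the polite
# session created above tells us what the site allows. Printing it shows the
# crawl delay and whether the path is scrapable according to robots.txt.
session
# The other big issue is that the list of papers is loaded dynamically with
# JavaScript, so a plain rvest::read_html() of ecb_pub_path would likely return
# only the first few entries. This is why we turn to RSelenium below.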
## Setting up RSelenium for browser automation ---------
# Start a Firefox browser session for interacting with the webpage
remDr <- rsDriver(browser = "firefox", port = 4444L, chromever = NULL)
browser <- remDr[["client"]]
# Navigate to the ECB publication page
browser$navigate(ecb_pub_path)
# Wait for any delay specified by the polite session (based on server settings)
Sys.sleep(session$delay)
# Dismiss the cookie banner (if it exists) by declining cookies, so it does not block interaction with the page
browser$findElement("link text", "I do not accept the use of cookies")$clickElement()
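# The banner is not always present (e.g. when cookies were already handled in a
# previous browser session), in which case the line above throws an error. A more
# defensive variant (just a sketch, not required) wraps the click in tryCatch():
# tryCatch(
#   browser$findElement("link text", "I do not accept the use of cookies")$clickElement(),
#   error = function(e) message("No cookie banner found, moving on.")
# )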
# Let's have a first look at the code of the page
pub_page <- browser$getPageSource()[[1]] %>%
  read_html()
# We extract the paper identifiers (number) and look at them
id <- pub_page %>%
  html_elements(".category") %>%
  html_text()
print(id)
# Clearly, we only get the first papers: we need to scroll down to load all of them
## Manipulating the page to ensure all content is loaded ---------
# The first step is to scroll the page progressively to load dynamic content. Then, we need to click on the "Details" buttons to get the abstracts.
# Function to scroll the page progressively to load dynamic content
progressive_scroll <- function(browser, scroll_step = 100) {
  # Get initial scroll height of the page
  current_height <- browser$executeScript("return document.body.scrollHeight")
  # Set a variable for the scrolling position
  scroll_position <- 0
  # Continue scrolling until the end of the page
  while (scroll_position < current_height) {
    # Scroll down by 'scroll_step' pixels
    browser$executeScript(paste0("window.scrollBy(0,", scroll_step, ");"))
    Sys.sleep(runif(1, max = 0.2)) # Wait for the content to load (adjust this if the page is slower to load)
    scroll_position <- scroll_position + scroll_step # Update the scroll position
    current_height <- browser$executeScript("return document.body.scrollHeight") # Get the updated scroll height after scrolling (in case more content is loaded)
  }
}
# Scroll the ECB page to ensure all dynamic content is visible
progressive_scroll(browser, scroll_step = 1000)
# We identify and click the necessary "Details" buttons.
# We need a fairly specific CSS selector to open the "Details" sections without also opening "Annexes" or other menus.
buttons <- browser$findElements("css selector", ".ecb-langSelector+ .accordion .header:nth-child(1) .title")
for (i in seq_along(buttons)) {
  buttons[[i]]$clickElement() # Click each button to reveal hidden content
  Sys.sleep(runif(1, max = 0.2)) # Add a slight pause to avoid errors and overloading the website
}
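# If the page re-renders while we are clicking, individual clicks can fail with a
# "stale element reference" error. A more robust variant (sketch) simply skips the
# elements that can no longer be clicked:
# for (i in seq_along(buttons)) {
#   tryCatch(buttons[[i]]$clickElement(), error = function(e) NULL)
#   Sys.sleep(runif(1, max = 0.2))
# }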
## Extracting data from the page ---------
# Now, extract relevant information from the page: publication date, title, pdf url, authors, abstract, JEL codes, and network information
# Get the page source after interaction and read it into an HTML structure
pub_page <- browser$getPageSource()[[1]] %>%
  read_html()
# Extract the publication dates
date <- pub_page %>%
  html_elements(".loaded > dt") %>%
  html_text()
# Extract the paper identifiers (number)
id <- pub_page %>%
  html_elements(".category") %>%
  html_text()
# Extract the titles of the papers
title <- pub_page %>%
  html_elements(".category+ .title a") %>%
  html_text()
# Extract the URLs pointing to each paper's PDF
url <- pub_page %>%
  html_elements(".category+ .title a") %>%
  html_attr("href")
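# The href attributes may be relative paths (e.g. starting with "/pub/pdf/"). If so,
# they can be turned into absolute links with xml2::url_absolute() (xml2 is installed
# as a dependency of rvest). A sketch, only needed if the URLs are not already absolute:
# url <- xml2::url_absolute(url, ecb_domain)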
### Handling multiple authors ---------
# Extract authors (as each paper may have multiple authors listed, we will need to differentiate them)
papers_all_authors <- pub_page %>%
  html_elements(".authors")
# Extract author names from the list of papers' authors (each paper may have several authors)
authors_list <- map(papers_all_authors, ~ html_elements(., "a") %>% html_text())
# Flatten the list of authors into a single string per paper, separating authors by commas
authors <- map_chr(authors_list, ~str_flatten(., collapse = ", "))
## Extracting supplementary information (Abstracts, JEL Codes) ---------
supplementary_information <- pub_page %>%
  html_elements(".category, .content-box > dl dt, .content-box > dl dd") %>%
  html_text() %>%
  tibble(text = .) %>%
  mutate(id = if_else(str_detect(text, "^No\\. \\d+"), text, NA)) %>%
  fill(id, .direction = "down") %>% # Fill the paper ID down for every piece of information
  filter(text != id) %>% # Filter out rows that only contain the ID
  mutate(info_type = str_extract(text, "^Abstract$|^JEL Code$|^Network$")) %>% # Identify the type of supplementary information
  group_by(id) %>%
  fill(info_type, .direction = "down") %>% # Fill the info type downwards
  filter(text != info_type) %>%
  pivot_wider(names_from = info_type, values_from = text) # Reshape the data so each paper has its abstract, JEL codes, etc.
# Put all the extracted information in a single data frame
extracted_data <- tibble(date,
                         id,
                         title,
                         url,
                         authors)
all_data <- left_join(extracted_data,
                      supplementary_information,
                      by = "id")
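# Quick sanity check (sketch): the join should keep one row per paper, and most
# papers should have an abstract.
# nrow(all_data) == nrow(extracted_data)
# sum(is.na(all_data$Abstract))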
## Analyzing text from titles and abstracts ---------
# Now that we have all the papers, let's do a bit of analysis by looking at the most frequent terms in titles and abstracts
# Defining a list of stop words to filter out from text analysis
## for that purpose, you can use lexicon == "SMART"
stopwords <- stop_words %>% filter(lexicon == "SMART") %>% pull(word)
# Unnest n-grams (1- to 3-word terms) from the titles and count the most frequent ones (excluding stopwords)
all_data %>%
  unnest_ngrams(word, title, n_min = 1, n = 3) %>%
  filter(! str_detect(word, str_c("\\b", stopwords, "\\b", collapse = "|"))) %>%
  count(word, sort = TRUE)
# Unnest n-grams (1- to 3-word terms) from the abstracts and count the most frequent ones (excluding stopwords)
all_data %>%
  unnest_ngrams(word, Abstract, n_min = 1, n = 3) %>%
  filter(! str_detect(word, str_c("\\b", stopwords, "\\b", collapse = "|"))) %>%
  count(word, sort = TRUE)
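# A possible extension (sketch): visualise the 20 most frequent abstract terms with
# ggplot2 and forcats, both already attached via the tidyverse. The cut-off of 20
# terms is arbitrary.
all_data %>%
  unnest_ngrams(word, Abstract, n_min = 1, n = 3) %>%
  filter(! str_detect(word, str_c("\\b", stopwords, "\\b", collapse = "|"))) %>%
  count(word, sort = TRUE) %>%
  slice_max(n, n = 20) %>% # Keep the 20 most frequent terms
  ggplot(aes(x = n, y = fct_reorder(word, n))) + # Order terms by frequency
  geom_col() +
  labs(x = "Number of occurrences", y = NULL)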
# Checking for the mention of 'climate change' in the abstracts
all_data %>%
  unnest_ngrams(word, Abstract, n = 2) %>%
  mutate(mention_climate = str_detect(word, "climate change")) %>%
  distinct(id, mention_climate) %>%
  pull(mention_climate) %>%
  sum(na.rm = TRUE)/nrow(all_data) # Calculate the proportion of papers mentioning 'climate change'
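# An equivalent, more direct check (sketch): str_detect() on the raw abstracts,
# lower-cased so that capitalised mentions are also caught (unnest_ngrams above
# already lower-cases the text for us).
# sum(str_detect(str_to_lower(all_data$Abstract), "climate change"), na.rm = TRUE) / nrow(all_data)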