-
Notifications
You must be signed in to change notification settings - Fork 6
/
Get_Report_Corpus.r
45 lines (37 loc) · 922 Bytes
/
Get_Report_Corpus.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
## !/usr/bin/env RStudio 1.1.423
## -*- coding: utf-8 -*-
## Document Acquisition
library("rvest")
library("dplyr")
library("magrittr")
library("doParallel")
library("foreach")
## Table of report links read from ./data/Reports_links.csv, with rows
## sorted by the Year column. The Year and Links columns are used below.
Links_data <- arrange(
  read.csv("./data/Reports_links.csv", stringsAsFactors = FALSE),
  Year
)
## Download one report and store its text in the corpus directory.
##
## Looks up the row of the global `Links_data` whose `Year` equals `i`,
## reads the page behind its `Links` entry, extracts the text of the nodes
## matched by the CSS selector and writes it to "./data/Corpus/<i>.txt".
##
## Args:
##   i: year identifying the report; must be present in `Links_data$Year`
##      and be formattable with "%d".
## Returns:
##   The value of `cat()` (NULL, invisibly); called for its side effect.
## Raises:
##   An error when no row of `Links_data` has `Year == i`.
Get_Corpus_Report <- function(i) {
  ## Exact lookup: grep() would treat `i` as a regular expression and match
  ## substrings, so it could pick several rows or the wrong one.
  idx <- match(i, Links_data$Year)
  if (is.na(idx)) {
    stop("No link found in Links_data for year ", i, call. = FALSE)
  }
  url <- Links_data$Links[idx]

  read_html(url) %>%
    html_nodes("td.p1,tr > td,div.pages_content") %>%
    ## `trim` expects a logical flag; "both" is not a valid value for it.
    html_text(trim = TRUE) %>%
    cat(file = sprintf("./data/Corpus/%d.txt", i))
}
## Fetch every report on a 4-worker cluster and time the whole run.
system.time({
  ## Create the output directory if it is not there yet.
  if (!dir.exists("./data/Corpus")) dir.create("./data/Corpus")

  cl <- makeCluster(4)
  registerDoParallel(cl)

  tryCatch(
    ## One task per year; each task writes its own file.
    expr = foreach(
      i = Links_data$Year,
      .combine = c,
      .packages = c("rvest", "magrittr")
    ) %dopar% Get_Corpus_Report(i),
    ## Print the error instead of letting it abort the script.
    error = function(e) print(e),
    ## Shut the workers down whether or not the run succeeded.
    finally = stopCluster(cl)
  )
})