-
Notifications
You must be signed in to change notification settings - Fork 2
/
EMA_webScrapping.R
126 lines (96 loc) · 3.68 KB
/
EMA_webScrapping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
initRSelenium = function(){
  ##########################
  # Start automated browser.
  ##########################
  # Launches a Selenium server + browser session via RSelenium and
  # returns the remote-driver client used to navigate pages.
  #
  # NOTE(review): the server handle `rD` is still discarded here, so the
  # Selenium server cannot be stopped later (rD$server$stop()). The rest
  # of this script appears to rely on a global `remDr` — confirm how the
  # caller captures this return value before restructuring further.
  rD <- rsDriver()
  remDr <- rD[["client"]]
  # Return the client explicitly (the original returned it only
  # invisibly, as the value of the last assignment).
  return(remDr)
}
getURLs = function(page,n,driver = remDr){
  ##################################################################
  # Loop through each page of the paginated list and collect the
  # link of every drug (URLs containing "EPAR/") on pages 1..n.
  #
  # page:   base URL; the page number is appended to it.
  # n:      number of result pages to visit.
  # driver: RSelenium remote driver. Defaults to the global `remDr`
  #         (created by initRSelenium()) for backward compatibility.
  #
  # Returns a character vector of drug page URLs (NULL when n == 0).
  ##################################################################
  # Preallocate one slot per page instead of growing a vector with c().
  page_links <- vector("list", n)
  for (i in seq_len(n)){   # seq_len() is safe when n == 0 (1:n is not)
    # prepare new url for page i
    wp <- paste0(page, i)
    # go to url
    driver$navigate(wp)
    # find every element carrying an href attribute
    webElem <- driver$findElements("css selector","[href]")
    # extract the href of each element (lapply: type-stable, unlike sapply)
    css_links <- unlist(lapply(webElem,function(x){
      x$getElementAttribute("href")
    }))
    # keep only links pointing to a drug's EPAR page
    page_links[[i]] <- css_links[str_detect(css_links,"EPAR/")]
  }
  drug.webs <- unlist(page_links)
  return(drug.webs)
}
getDrugInfo = function(drug.webs,driver = remDr){
  ########################################################################
  # Obtain information from the datasheet of each drug.
  #
  # drug.webs: character vector of drug page URLs (from getURLs()).
  # driver:    RSelenium remote driver. Defaults to the global `remDr`
  #            (created by initRSelenium()) for backward compatibility.
  #
  # Returns a list: list(drug.webs, DB, nerr, it, not_done), where DB is
  # a per-drug list of list(field_names, field_values), nerr the number
  # of pages whose text could not be paired up, and it the iteration
  # count.
  ########################################################################
  DB <- list()
  done <- character(0)
  not_done <- setdiff(drug.webs,done)
  nerr <- 0
  it <- 0
  # Navigate to each drug's page, extract information of "Publication details" and "Product details", and store it:
  for (j in not_done){
    it <- it+1
    driver$navigate(j)
    # Elements to "find" were defined exploring the webpage in "Inspect" mode.
    d <- driver$findElements("css selector",".ecl-u-fs-m")
    css_text <- unlist(lapply(d,function(x){x$getElementText()}))
    # Drop empty strings and the two section headers.
    css_text_clean <- css_text[!css_text %in% c("","Publication details","Product details")]
    # Drug name = everything after "EPAR/" in the URL.
    drug_name <- gsub(".*(EPAR/)","",j)
    # An even, non-empty count means alternating field-name / value pairs.
    # (The empty-page guard avoids seq(1, -1, 2), which errors; such pages
    # are now recorded as errors instead of crashing the loop.)
    if (length(css_text_clean) > 0 && length(css_text_clean) %% 2 == 0){
      coln <- css_text_clean[seq(1,length(css_text_clean)-1,2)]
      data <- css_text_clean[seq(2,length(css_text_clean),2)]
      DB[[drug_name]] <- list(coln,data)
    }else{
      # Record errors in reading tables from webpage: store the raw text.
      DB[[drug_name]] <- css_text_clean
      nerr <- nerr+1
    }
    # Check performance / progress.
    done <- c(done,j)
    print(paste("There have been",nerr,"errors."))
    print(paste(it,"/",length(drug.webs)))
  }
  # Bundle everything for downstream processing.
  drugsDB <- list(drug.webs,DB,nerr,it,not_done)
  return(drugsDB)
}
processTable = function(drugsDB){
  ########################################################################
  # Assemble the per-drug field/value pairs into a rectangular table.
  #
  # drugsDB: list produced by getDrugInfo(); element [[2]] is DB, a list
  #          with one entry per drug, each entry list(fields, values).
  #
  # Returns a data.frame with one column per retained field and one row
  # per drug ("-" when a drug lacks the field, "ERROR" when a field name
  # is duplicated on a drug's page).
  ########################################################################
  # Retrieve DB variable containing all the information for each drug.
  DB <- drugsDB[[2]]
  # Collect every field name observed across drugs.
  fields <- unlist(lapply(DB,function(x){x[[1]]}))
  uniq <- unique(fields)
  # Count how many times each field appears, by exact string equality.
  # (The original used str_count(), which interprets the field name as a
  # regular expression: names containing metacharacters such as "(" would
  # miscount or error, and substring matches were overcounted.)
  reps <- vapply(uniq,function(x){sum(fields == x)},numeric(1))
  # Keep only fields present in more than 3 drugs. The original's extra
  # `reps < 1` clause could never be TRUE (every unique field occurs at
  # least once) and was dropped as dead code.
  fields_clean <- uniq[reps > 3]
  # Generate table placing each value under its corresponding field.
  cols <- list()
  for (k in fields_clean){
    f <- character(length(DB))   # preallocate, do not grow with c()
    for (r in seq_along(DB)){
      idx <- which(DB[[r]][[1]] == k)
      if(length(idx)==1){
        f[r] <- DB[[r]][[2]][idx]
      }else if(length(idx)==0){
        f[r] <- "-"       # field absent for this drug
      }else{
        f[r] <- "ERROR"   # field duplicated: value is ambiguous
      }
    }
    cols[[k]] <- f
  }
  sheet <- data.frame(cols)
  return(sheet)
}
saveTable = function(df,your.path){
  # Write the assembled drug table to disk as a CSV file (no row names).
  #
  # df:        data.frame to save (e.g. the output of processTable()).
  # your.path: destination file path for the CSV.
  #
  # `row.names = FALSE` is spelled out: `F` is an ordinary variable that
  # can be reassigned, so the full constant is the safe form.
  write.csv(df,file = your.path,row.names = FALSE)
}