-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcollect_method_fuse_data.R
More file actions
141 lines (99 loc) · 4.58 KB
/
collect_method_fuse_data.R
File metadata and controls
141 lines (99 loc) · 4.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
## Name: collect_method_fuse_data.R
## Author: Katherine A. Phillips
## Date Created: March 2015
## Purpose: Collects and parses HTML files for Method's ingredient disclosures and uses.
library(stringr)
library(XML)
##----------------------------------------------------------------------------##
## This finds all the links on the Method webpage. It is not really needed    ##
## now, since the hrefs have already been saved to a file.                    ##
##----------------------------------------------------------------------------##
## Address of the page that lists every Method product
method_link <- "http://methodhome.com/products/"
## Parse that page into an HTML document object
method_page <- htmlParse(method_link)
## Pull the href attribute of every anchor, drop duplicates, and flatten the
## result to a plain vector
method_links <- as.vector(unique(xpathSApply(method_page, "//a/@href")))
## Release the parsed document now that the links have been extracted
free(method_page)
## Save the links, one per line; writeLines opens and closes the file itself
## when it is handed a path
writeLines(method_links, "method_links.txt")
##----------------------------------------------------------------------------##
## This loop downloads all the URLs into the pwd. Only run IFF you do not ##
## have the HTML files saved -- this loop takes FOREVER! ##
##----------------------------------------------------------------------------##
## Read the saved links back in, one URL per line
urls <- readLines("method_links.txt", warn = FALSE)
## Loop over all URLs. seq_along() gives zero iterations for an empty link
## file, whereas 1:length(urls) would iterate over c(1, 0).
for (i in seq_along(urls)) {
  ## URL of the page to fetch
  ChemURL <- urls[i]
  ## Local file name: the last "/"-separated piece of the URL with spaces
  ## removed and hyphens turned into underscores, plus an ".html" extension
  ChemFileName <- paste0(
    gsub("-", "_",
         gsub(" ", "", tail(strsplit(ChemURL, split = "[/]")[[1]], n = 1))),
    ".html"
  )
  ## Download the file. A link that cannot be fetched is reported with a
  ## warning and skipped, so one bad href does not abort the whole (slow) run.
  ## ChemFile holds download.file()'s status code, or NA when the download
  ## failed.
  ChemFile <- tryCatch(
    download.file(url = ChemURL, destfile = ChemFileName),
    error = function(e) {
      warning("Could not download ", ChemURL, ": ", conditionMessage(e),
              call. = FALSE)
      NA_integer_
    }
  )
  ## Don't overload the server
  Sys.sleep(10)
}
##----------------------------------------------------------------------------##
## This loop uses the downloaded HTML files to create a data frame, which ##
## contains chemical names, uses, and the corresponding file ##
##----------------------------------------------------------------------------##
## Read the saved links back in, one URL per line
urls <- readLines("method_links.txt", warn = FALSE)
## One data frame per product page, keyed by the HTML file name
product_list <- list()
## Columns of the ingredient table that are not wanted downstream
drop_cols <- c("learn more", "environmental + health summary")
## Loop over all the URLs. seq_along() gives zero iterations for an empty
## link file, whereas 1:length(urls) would iterate over c(1, 0).
for (i in seq_along(urls)) {
  ## URL the page was downloaded from
  ChemURL <- urls[i]
  ## Local file name: the last "/"-separated piece of the URL with spaces
  ## removed and hyphens turned into underscores, plus an ".html" extension
  ChemFileName <- paste0(
    gsub("-", "_",
         gsub(" ", "", tail(strsplit(ChemURL, split = "[/]")[[1]], n = 1))),
    ".html"
  )
  ## Skip links whose page is not on disk (e.g. the download failed)
  if (!file.exists(ChemFileName)) {
    next
  }
  ## Parse the file once and reuse the document for both table reads
  ChemDoc <- htmlParse(ChemFileName)
  ## Check that there is a table in the HTML
  N_tables <- length(readHTMLTable(ChemDoc))
  ## Cycle loop if there is no table
  if (N_tables <= 0) {
    free(ChemDoc)
    next
  }
  ## Pull ingredient/use table (the first table in the document)
  ChemTable <- as.data.frame(readHTMLTable(ChemDoc, which = 1, header = TRUE))
  ## Release the parsed document
  free(ChemDoc)
  ## A table without rows carries no ingredients, and assigning a column to a
  ## zero-row data frame would raise an error, so skip it
  if (nrow(ChemTable) == 0) {
    next
  }
  ## Keep the part of the file name before the first "." as the product name
  ChemTable$ChemFile <- strsplit(ChemFileName, split = "[.]")[[1]][1]
  ## Delete unnecessary columns if they exist; drop = FALSE keeps the result a
  ## data frame even when only one column is left
  ChemTable <- ChemTable[, !(names(ChemTable) %in% drop_cols), drop = FALSE]
  ## Add product data frame to list of data frames
  product_list[[ChemFileName]] <- ChemTable
}
## Fail with a clear message when no product tables were collected;
## Reduce() on an empty list returns NULL and the renaming below would then
## stop with an obscure error
if (length(product_list) == 0) {
  stop("No ingredient tables were parsed; nothing to merge.", call. = FALSE)
}
## Merge all data frames in list, keeping every row from every product
MethodChem <- Reduce(function(x, y) merge(x, y, all = TRUE), product_list)
## Rename data frame columns.
## NOTE(review): the renaming is positional -- it assumes the merged frame has
## the file column first, then chemical name, then use. Confirm against the
## actual merge() output.
colnames(MethodChem)[1:3] <- c("File", "ChemicalName", "UseCategory")
## Replace commas with semicolons so the unquoted CSV stays well-formed
MethodChem$ChemicalName <- gsub(",", ";", MethodChem$ChemicalName)
MethodChem$UseCategory <- gsub(",", ";", MethodChem$UseCategory)
## Get rid of the rows that say when the product was last updated: take the
## first space-separated word of each chemical name ...
first_word <- vapply(
  strsplit(as.character(MethodChem$ChemicalName), split = " "),
  function(tokens) tokens[1],
  character(1)
)
## ... and keep only rows where that word is present and is not "updated"
## (rows with a missing first word are dropped as well)
MethodChem <- MethodChem[!is.na(first_word) & first_word != "updated", ]
## Put needed data in data frame
MethodChem <- data.frame(ChemicalName = MethodChem$ChemicalName,
                         UseCategory = MethodChem$UseCategory,
                         Product = MethodChem$File)
## Write data frame to file
write.csv(MethodChem, "MethodChemicalIngredients_RAW.csv",
          quote = FALSE, row.names = FALSE)