Skip to content

Commit a8e5f93

Browse files
committed
Initial commit
0 parents  commit a8e5f93

File tree

9 files changed

+305
-0
lines changed

9 files changed

+305
-0
lines changed

DESCRIPTION

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
Type: Package
2+
Package: gdscrapeR
3+
Title: Easily Web Scrape Glassdoor Company Reviews Into a Data Frame
4+
Description: Tool for scraping company reviews with just one function: `get_reviews()`.
5+
Uses the 'rvest' and 'purrr' packages to make it easy to scrape company reviews. For learning purposes only.
6+
Version: 0.99.0
7+
Depends:
8+
R (>= 3.0.1)
9+
Imports:
10+
httr,
11+
magrittr,
12+
purrr,
13+
rvest,
14+
xml2
15+
Suggests:
16+
License: GPL-3
17+
LazyData: true
18+
URL: https://github.com/mguideng/gdscrapeR
19+
BugReports: https://github.com/mguideng/gdscrapeR/issues
20+
Author: Maria Guideng [aut, cre],
21+
Credit Hadley Wickham [aut, cre rvest],
22+
Credit Lionel Henry [aut, cre purrr]
23+
Maintainer: Maria Guideng <imlearningthethings@gmail.com>
24+
Repository: GitHub
25+
Roxygen: list(markdown = TRUE)
26+
Encoding: UTF-8
27+
RoxygenNote: 6.1.1
28+
VignetteBuilder: knitr

NAMESPACE

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Generated by roxygen2: do not edit by hand
2+
3+
export(get_reviews)
4+
import(httr)
5+
import(purrr)
6+
import(rvest)
7+
import(xml2)
8+
importFrom(magrittr,"%>%")

NEWS.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
NEWS
2+
================
3+
Maria Guideng
4+
5+
gdscrapeR
6+
=========
7+
8+
**nf** = new feature
9+
**bf** = bug fixes
10+
11+
> The site changes frequently and with a lot of moving parts involved, the package can be expected to change accordingly.
12+
13+
#### \[0.1.0\]
14+
15+
- nf: tbd
16+
- bf: tbd
17+
18+
#### \[0.0.9\] beta
19+
20+
- Initial release.

R/get_reviews.R

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#' @title Easily Web Scrapes Glassdoor Company Reviews Into a Data Frame
2+
#'
3+
#' @description Generate a data frame of company reviews with one function: `get_reviews()`.
4+
#'
5+
#' @param companyNum A string representing a company's unique ID number. Identified by navigating to a company's Glassdoor reviews web page
6+
#' and reviewing the URL for characters between "Reviews-" and ".htm" (usually starts with an 'E' and followed by up to seven digits).
7+
#'
8+
#' @return \code{get_reviews} returns a data frame containing reviews and source information.
9+
#'
10+
#' @examples
11+
#' Reference https://www.glassdoor.com/Reviews/SpaceX-Reviews-E40371.htm
12+
#' String enclosed with quotes.
13+
#' reviews <- get_reviews(companyNum = "E40371")
14+
#' reviews <- get_reviews("E40371")
15+
#'
16+
#' @export get_reviews
17+
get_reviews <- function(companyNum) {
18+
19+
# Set URL
20+
baseurl <- "https://www.glassdoor.com/Reviews/Company-Reviews-"
21+
sort <- ".htm?sort.sortType=RD&sort.ascending=true"
22+
23+
# Nested function for getting max results
24+
get_maxResults <- function(companyNum) {
25+
totalReviews <- xml2::read_html(httr::GET(paste(baseurl, companyNum, sort, sep = ""))) %>%
26+
html_nodes(".tightVert.floatLt strong, .margRtSm.margBot.minor") %>%
27+
html_text() %>%
28+
sub(" reviews", "", .) %>%
29+
sub(",", "", .) %>%
30+
as.integer()
31+
return(ceiling(totalReviews/10))
32+
}
33+
34+
# Message
35+
Sys.sleep(2)
36+
cat("\nNumber of web pages to scrape: ")
37+
maxResults <- get_maxResults(companyNum)
38+
Sys.sleep(6)
39+
cat(maxResults)
40+
41+
# Nested functions to collapse newline (<br>) within pros & cons corpus body of text
42+
collapse_html_text <- function(x, collapse = "\n", trim = F) {
43+
UseMethod("collapse_html_text") # parse xml use method:
44+
}
45+
46+
collapse_html_text.xml_nodeset <- function(x, collapse = "\n", trim = F) {
47+
vapply(x, collapse_html_text.xml_node, character(1),
48+
trim = trim, collapse = collapse)
49+
}
50+
51+
collapse_html_text.xml_node <- function(x, collapse = "\n", trim = F) {
52+
paste(xml_find_all(x, ".//text()"), collapse = collapse)
53+
}
54+
55+
# Nested function to get info (scrape based on CSS selectors pattern)
56+
get_selectors <- function(pg, i) {
57+
data.frame(rev.date = html_text(html_nodes(pg, ".date.subtle.small, .featuredFlag")),
58+
rev.sum = html_text(html_nodes(pg, ".reviewLink .summary:not([class*='toggleBodyOff'])")),
59+
rev.rating = html_attr(html_nodes(pg, ".gdStars.gdRatings.sm .rating .value-title"), "title"),
60+
rev.title = html_text(html_nodes(pg, "span.authorInfo.tbl.hideHH")),
61+
rev.pros = collapse_html_text(html_nodes(pg, ".description .row:nth-child(1) .mainText:not([class*='toggleBodyOff'])")),
62+
rev.cons = collapse_html_text(html_nodes(pg, ".description .row:nth-child(2) .mainText:not([class*='toggleBodyOff'])")),
63+
rev.helpf = html_text(html_nodes(pg, ".tight")),
64+
source.url = paste(baseurl, companyNum, "_P", i, sort, sep = ""),
65+
source.link = html_attr(html_nodes(pg, ".reviewLink"), "href"),
66+
source.iden = html_attr(html_nodes(pg, ".empReview"), "id"),
67+
stringsAsFactors = F)
68+
}
69+
70+
# Message
71+
Sys.sleep(3)
72+
cat("\nStarting")
73+
74+
# Nested function to get data frame
75+
df <- purrr::map_df(1:maxResults, function(i) {
76+
Sys.sleep(sample(seq(3, 8, by = 0.01), 1)) # be polite
77+
cat(" P", i, sep = "")
78+
pg <- xml2::read_html(httr::GET(paste(baseurl, companyNum, "_P", i, sort, sep = "")))
79+
get_selectors(pg, i)
80+
})
81+
82+
# Return
83+
Sys.sleep(3)
84+
return(data.frame(df))
85+
}
86+

README.md

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
gdscrapeR: scrape Glassdoor company reviews in R
2+
================
3+
4+
ABOUT
5+
-----
6+
7+
**gdscrapeR** is an R package that scrapes company reviews from Glassdoor using a single function: `get_reviews`. It returns a data frame structure for holding the text data, which can be further prepped for text analytics learning projects.
8+
9+
INSTALL & LOAD
10+
--------------
11+
12+
The latest version from GitHub:
13+
14+
``` r
15+
install.packages("devtools")
16+
devtools::install_github("mguideng/gdscrapeR")
17+
18+
library(gdscrapeR)
19+
```
20+
21+
USAGE
22+
-----
23+
24+
#### Example
25+
26+
The URL to scrape the awesome **SpaceX** company will be: [www.glassdoor.com/Reviews/SpaceX-Reviews-E40371.htm](https://www.glassdoor.com/Reviews/SpaceX-Reviews-E40371.htm).
27+
28+
![spacex-url](https://raw.githubusercontent.com/mguideng/gdscrapeR/master/images/spacex-url.PNG)
29+
30+
#### Function
31+
32+
Pass the company number through the `get_reviews` function. The company number is a string representing a company's unique ID number. Identified by navigating to a company's Glassdoor reviews web page and reviewing the URL for characters between "Reviews-" and ".htm" (usually starts with an "E" and followed by digits).
33+
34+
``` r
35+
# Create data frame of: Date, Summary, Rating, Title, Pros, Cons, Helpful
36+
df <- get_reviews(companyNum = "E40371")
37+
```
38+
39+
This will scrape the following variables:
40+
41+
- Date - of when review was posted
42+
- Summary - e.g., "Great People"
43+
- Rating - star rating between 1.0 and 5.0
44+
- Title - e.g., "Current Employee - Manager in Hawthorne, CA"
45+
- Pros - upsides of the workplace
46+
- Cons - downsides of the workplace
47+
- Helpful - count marked as being helpful, if any
48+
- (and other info related to the source link)
49+
50+
PREP FOR TEXT ANALYTICS
51+
-----------------------
52+
53+
#### RegEx
54+
55+
Use regular expressions to clean and extract additional variables:
56+
57+
- Primary Key (uniquely identify rows 1 to N reviewers, sorted from first to last by date)
58+
- Year (from Date)
59+
- Location (e.g., Hawthorne CA)
60+
- Position (e.g., Manager)
61+
- Status (current or former employee)
62+
63+
``` r
64+
# Packages
65+
library(stringr) # pattern matching functions
66+
67+
# Add: PriKey
68+
df$rev.pk <- as.numeric(rownames(df))
69+
70+
# Extract: Year, Position, Location, Status
71+
df$rev.year <- as.numeric(sub(".*, ","", df$rev.date))
72+
73+
df$rev.pos <- sub(".* Employee - ", "", df$rev.title)
74+
df$rev.pos <- sub(" in .*", "", df$rev.pos)
75+
76+
df$rev.loc <- sub(".*\\ in ", "", df$rev.title)
77+
df$rev.loc <- ifelse(df$rev.loc %in%
78+
(grep("Former Employee|Current Employee", df$rev.loc, value = T)),
79+
"Not Given", df$rev.loc)
80+
81+
df$rev.stat <- str_extract(df$rev.title, ".* Employee -")
82+
df$rev.stat <- sub(" Employee -", "", df$rev.stat)
83+
84+
# Clean: Pros, Cons, Helpful
85+
df$rev.pros <- gsub("&amp;", "&", df$rev.pros)
86+
df$rev.cons <- gsub("&amp;", "&", df$rev.cons)
87+
df$rev.helpf <- as.numeric(gsub("\\D", "", df$rev.helpf))
88+
89+
# Export to csv
90+
write.csv(df, "df-results.csv", row.names = F)
91+
```
92+
93+
#### Result
94+
95+
![spacex-results](https://raw.githubusercontent.com/mguideng/gdscrapeR/master/images/spacex-results.PNG)
96+
97+
#### Exploration ideas
98+
99+
`gdscrapeR` is for learning purposes only. Analyze the unstructured text, extract relevant information, and transform it into useful insights.
100+
101+
- Apply Natural Language Processing (NLP) methods to show what is being written about the most.
102+
- Sentiment analysis by categorizing the text data to determine whether a review is considered positive, negative, or neutral as a way of deriving the emotions and attitudes of employees. Here's a sample project: ["Text Mining Company Reviews (in R) - Case of MBB Consulting"](https://mguideng.github.io/2018-07-16-text-mining-glassdoor-big3/).
103+
- Create a metrics profile for a company to track how star rating distributions are changing over time.
104+
- The ["Text Mining with R" book](https://www.tidytextmining.com/) by Julia Silge and David Robinson is highly recommended for further ideas.
105+
106+
**If you find this package useful, feel free to star :star: it. Thanks for visiting :heart: .**
107+
108+
NOTES
109+
-----
110+
111+
- Uses the `rvest` and `purrr` packages to make it easy to scrape company reviews into a data frame.
112+
- Site will change often. Errors due to CSS selector changes are shown as some variation of *"Error in 1:maxResults : argument of length 0"* or *"Error in data.frame(), : arguments imply differing number of rows: 0, 1"*.
113+
- Try it again later.
114+
- It's straightforward to work around them if you know R and how `rvest` and `purrr` work. Copy the `get_reviews` function code and paste it into an R script that you can modify to update the selector(s) in the meantime. For more on this, see the demo write-up: ["It's Harvesting Season - Scraping Ripe Data"](https://mguideng.github.io/2018-08-01-rvesting-glassdoor/).
115+
- Be polite.
116+
- A system sleeper is built in so there will be delays to slow down the scraper (expect ~1 minute for every 100 reviews).
117+
  - Also, saving the resulting data frame is suggested, to avoid redundant scraping sessions.
118+
- To contact maintainer: Maria Guideng `[imlearningthethings at gmail]`.

gdscrapeR.Rproj

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Version: 1.0
2+
3+
RestoreWorkspace: Default
4+
SaveWorkspace: Default
5+
AlwaysSaveHistory: Default
6+
7+
EnableCodeIndexing: Yes
8+
UseSpacesForTab: Yes
9+
NumSpacesForTab: 2
10+
Encoding: UTF-8
11+
12+
RnwWeave: Sweave
13+
LaTeX: pdfLaTeX
14+
15+
AutoAppendNewline: Yes
16+
StripTrailingWhitespace: Yes
17+
18+
BuildType: Package
19+
PackageUseDevtools: Yes
20+
PackageInstallArgs: --no-multiarch --with-keep.source

images/spacex-results.PNG

107 KB
Loading

images/spacex-url.PNG

113 KB
Loading

man/get_reviews.Rd

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)