Skip to content

Commit a8e5f93

Browse files
committed
Initial commit
0 parents  commit a8e5f93

File tree

9 files changed

+305
-0
lines changed

9 files changed

+305
-0
lines changed

DESCRIPTION

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
Type: Package
2+
Package: gdscrapeR
3+
Title: Easily Web Scrape Glassdoor Company Reviews Into a Data Frame
4+
Description: Tool for scraping company reviews with just one function: `get_reviews()`.
5+
Uses the 'rvest' and 'purrr' packages to make it easy to scrape company reviews. For learning purposes only.
6+
Version: 0.99.0
7+
Depends:
8+
R (>= 3.0.1)
9+
Imports:
10+
httr,
11+
magrittr,
12+
purrr,
13+
rvest,
14+
xml2
15+
Suggests:
16+
License: GPL-3
17+
LazyData: true
18+
URL: https://github.com/mguideng/gdscrapeR
19+
BugReports: https://github.com/mguideng/gdscrapeR/issues
20+
Author: Maria Guideng [aut, cre],
21+
Credit Hadley Wickham [aut, cre rvest],
22+
Credit Lionel Henry [aut, cre purrr]
23+
Maintainer: Maria Guideng <imlearningthethings@gmail.com>
24+
Repository: GitHub
25+
Roxygen: list(markdown = TRUE)
26+
Encoding: UTF-8
27+
RoxygenNote: 6.1.1
28+
VignetteBuilder: knitr

NAMESPACE

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Generated by roxygen2: do not edit by hand
2+
3+
export(get_reviews)
4+
import(httr)
5+
import(purrr)
6+
import(rvest)
7+
import(xml2)
8+
importFrom(magrittr,"%>%")

NEWS.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
NEWS
2+
================
3+
Maria Guideng
4+
5+
gdscrapeR
6+
=========
7+
8+
**nf** = new feature
9+
**bf** = bug fixes
10+
11+
> The site changes frequently and with a lot of moving parts involved, the package can be expected to change accordingly.
12+
13+
#### \[0.1.0\]
14+
15+
- nf: tbd
16+
- bf: tbd
17+
18+
#### \[0.0.9\] beta
19+
20+
- Initial release.

R/get_reviews.R

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#' @title Easily Web Scrapes Glassdoor Company Reviews Into a Data Frame
2+
#'
3+
#' @description Generate a data frame of company reviews with one function: `get_reviews()`.
4+
#'
5+
#' @param companyNum A string representing a company's unique ID number. Identified by navigating to a company's Glassdoor reviews web page
6+
#' and reviewing the URL for characters between "Reviews-" and ".htm" (usually starts with an 'E' and followed by up to seven digits).
7+
#'
8+
#' @return \code{get_reviews} returns a data frame containing reviews and source information.
9+
#'
10+
#' @examples
11+
#' Reference https://www.glassdoor.com/Reviews/SpaceX-Reviews-E40371.htm
12+
#' String enclosed with quotes.
13+
#' reviews <- get_reviews(companyNum = "E40371")
14+
#' reviews <- get_reviews("E40371")
15+
#'
16+
#' @export get_reviews
17+
get_reviews <- function(companyNum) {
18+
19+
# Set URL
20+
baseurl <- "https://www.glassdoor.com/Reviews/Company-Reviews-"
21+
sort <- ".htm?sort.sortType=RD&sort.ascending=true"
22+
23+
# Nested function for getting max results
24+
get_maxResults <- function(companyNum) {
25+
totalReviews <- xml2::read_html(httr::GET(paste(baseurl, companyNum, sort, sep = ""))) %>%
26+
html_nodes(".tightVert.floatLt strong, .margRtSm.margBot.minor") %>%
27+
html_text() %>%
28+
sub(" reviews", "", .) %>%
29+
sub(",", "", .) %>%
30+
as.integer()
31+
return(ceiling(totalReviews/10))
32+
}
33+
34+
# Message
35+
Sys.sleep(2)
36+
cat("\nNumber of web pages to scrape: ")
37+
maxResults <- get_maxResults(companyNum)
38+
Sys.sleep(6)
39+
cat(maxResults)
40+
41+
# Nested functions to collapse newline (<br>) within pros & cons corpus body of text
42+
collapse_html_text <- function(x, collapse = "\n", trim = F) {
43+
UseMethod("collapse_html_text") # parse xml use method:
44+
}
45+
46+
collapse_html_text.xml_nodeset <- function(x, collapse = "\n", trim = F) {
47+
vapply(x, collapse_html_text.xml_node, character(1),
48+
trim = trim, collapse = collapse)
49+
}
50+
51+
collapse_html_text.xml_node <- function(x, collapse = "\n", trim = F) {
52+
paste(xml_find_all(x, ".//text()"), collapse = collapse)
53+
}
54+
55+
# Nested function to get info (scrape based on CSS selectors pattern)
56+
get_selectors <- function(pg, i) {
57+
data.frame(rev.date = html_text(html_nodes(pg, ".date.subtle.small, .featuredFlag")),
58+
rev.sum = html_text(html_nodes(pg, ".reviewLink .summary:not([class*='toggleBodyOff'])")),
59+
rev.rating = html_attr(html_nodes(pg, ".gdStars.gdRatings.sm .rating .value-title"), "title"),
60+
rev.title = html_text(html_nodes(pg, "span.authorInfo.tbl.hideHH")),
61+
rev.pros = collapse_html_text(html_nodes(pg, ".description .row:nth-child(1) .mainText:not([class*='toggleBodyOff'])")),
62+
rev.cons = collapse_html_text(html_nodes(pg, ".description .row:nth-child(2) .mainText:not([class*='toggleBodyOff'])")),
63+
rev.helpf = html_text(html_nodes(pg, ".tight")),
64+
source.url = paste(baseurl, companyNum, "_P", i, sort, sep = ""),
65+
source.link = html_attr(html_nodes(pg, ".reviewLink"), "href"),
66+
source.iden = html_attr(html_nodes(pg, ".empReview"), "id"),
67+
stringsAsFactors = F)
68+
}
69+
70+
# Message
71+
Sys.sleep(3)
72+
cat("\nStarting")
73+
74+
# Nested function to get data frame
75+
df <- purrr::map_df(1:maxResults, function(i) {
76+
Sys.sleep(sample(seq(3, 8, by = 0.01), 1)) # be polite
77+
cat(" P", i, sep = "")
78+
pg <- xml2::read_html(httr::GET(paste(baseurl, companyNum, "_P", i, sort, sep = "")))
79+
get_selectors(pg, i)
80+
})
81+
82+
# Return
83+
Sys.sleep(3)
84+
return(data.frame(df))
85+
}
86+

README.md

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
gdscrapeR: scrape Glassdoor company reviews in R
2+
================
3+
4+
ABOUT
5+
-----
6+
7+
**gdscrapeR** is an R package that scrapes company reviews from Glassdoor using a single function: `get_reviews`. It returns a data frame structure for holding the text data, which can be further prepped for text analytics learning projects.
8+
9+
INSTALL & LOAD
10+
--------------
11+
12+
The latest version from GitHub:
13+
14+
``` r
15+
install.packages("devtools")
16+
devtools::install_github("mguideng/gdscrapeR")
17+
18+
library(gdscrapeR)
19+
```
20+
21+
USAGE
22+
-----
23+
24+
#### Example
25+
26+
The URL to scrape the awesome **SpaceX** company will be: [www.glassdoor.com/Reviews/SpaceX-Reviews-E40371.htm](https://www.glassdoor.com/Reviews/SpaceX-Reviews-E40371.htm).
27+
28+
![spacex-url](https://raw.githubusercontent.com/mguideng/gdscrapeR/master/images/spacex-url.PNG)
29+
30+
#### Function
31+
32+
Pass the company number through the `get_reviews` function. The company number is a string representing a company's unique ID number. Identified by navigating to a company's Glassdoor reviews web page and reviewing the URL for characters between "Reviews-" and ".htm" (usually starts with an "E" and followed by digits).
33+
34+
``` r
35+
# Create data frame of: Date, Summary, Rating, Title, Pros, Cons, Helpful
36+
df <- get_reviews(companyNum = "E40371")
37+
```
38+
39+
This will scrape the following variables:
40+
41+
- Date - of when review was posted
42+
- Summary - e.g., "Great People"
43+
- Rating - star rating between 1.0 and 5.0
44+
- Title - e.g., "Current Employee - Manager in Hawthorne, CA"
45+
- Pros - upsides of the workplace
46+
- Cons - downsides of the workplace
47+
- Helpful - count marked as being helpful, if any
48+
- (and other info related to the source link)
49+
50+
PREP FOR TEXT ANALYTICS
51+
-----------------------
52+
53+
#### RegEx
54+
55+
Use regular expressions to clean and extract additional variables:
56+
57+
- Primary Key (uniquely identify rows 1 to N reviewers, sorted from first to last by date)
58+
- Year (from Date)
59+
- Location (e.g., Hawthorne CA)
60+
- Position (e.g., Manager)
61+
- Status (current or former employee)
62+
63+
``` r
64+
# Packages
65+
library(stringr) # pattern matching functions
66+
67+
# Add: PriKey
68+
df$rev.pk <- as.numeric(rownames(df))
69+
70+
# Extract: Year, Position, Location, Status
71+
df$rev.year <- as.numeric(sub(".*, ","", df$rev.date))
72+
73+
df$rev.pos <- sub(".* Employee - ", "", df$rev.title)
74+
df$rev.pos <- sub(" in .*", "", df$rev.pos)
75+
76+
df$rev.loc <- sub(".*\\ in ", "", df$rev.title)
77+
df$rev.loc <- ifelse(df$rev.loc %in%
78+
(grep("Former Employee|Current Employee", df$rev.loc, value = T)),
79+
"Not Given", df$rev.loc)
80+
81+
df$rev.stat <- str_extract(df$rev.title, ".* Employee -")
82+
df$rev.stat <- sub(" Employee -", "", df$rev.stat)
83+
84+
# Clean: Pros, Cons, Helpful
85+
df$rev.pros <- gsub("&amp;", "&", df$rev.pros)
86+
df$rev.cons <- gsub("&amp;", "&", df$rev.cons)
87+
df$rev.helpf <- as.numeric(gsub("\\D", "", df$rev.helpf))
88+
89+
# Export to csv
90+
write.csv(df, "df-results.csv", row.names = F)
91+
```
92+
93+
#### Result
94+
95+
![spacex-results](https://raw.githubusercontent.com/mguideng/gdscrapeR/master/images/spacex-results.PNG)
96+
97+
#### Exploration ideas
98+
99+
`gdscrapeR` is for learning purposes only. Analyze the unstructured text, extract relevant information, and transform it into useful insights.
100+
101+
- Apply Natural Language Processing (NLP) methods to show what is being written about the most.
102+
- Sentiment analysis by categorizing the text data to determine whether a review is considered positive, negative, or neutral as a way of deriving the emotions and attitudes of employees. Here's a sample project: ["Text Mining Company Reviews (in R) - Case of MBB Consulting"](https://mguideng.github.io/2018-07-16-text-mining-glassdoor-big3/).
103+
- Create a metrics profile for a company to track how star rating distributions are changing over time.
104+
- The ["Text Mining with R" book](https://www.tidytextmining.com/) by Julia Silge and David Robinson is highly recommended for further ideas.
105+
106+
**If you find this package useful, feel free to star :star: it. Thanks for visiting :heart: .**
107+
108+
NOTES
109+
-----
110+
111+
- Uses the `rvest` and `purrr` packages to make it easy to scrape company reviews into a data frame.
112+
- Site will change often. Errors due to CSS selector changes are shown as some variation of *"Error in 1:maxResults : argument of length 0"* or *"Error in data.frame(), : arguments imply differing number of rows: 0, 1"*.
113+
- Try it again later.
114+
- It's straightforward to work around them if you know R and how `rvest` and `purrr` work. Copy the `get_reviews` function code and paste it into an R script that you can modify to update the selector(s) in the meantime. For more on this, see the demo write-up: ["It's Harvesting Season - Scraping Ripe Data"](https://mguideng.github.io/2018-08-01-rvesting-glassdoor/).
115+
- Be polite.
116+
- A system sleeper is built in so there will be delays to slow down the scraper (expect ~1 minute for every 100 reviews).
117+
  - Also, saving the resulting data frame is suggested, to avoid redundant scraping sessions.
118+
- To contact maintainer: Maria Guideng `[imlearningthethings at gmail]`.

gdscrapeR.Rproj

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
Version: 1.0
2+
3+
RestoreWorkspace: Default
4+
SaveWorkspace: Default
5+
AlwaysSaveHistory: Default
6+
7+
EnableCodeIndexing: Yes
8+
UseSpacesForTab: Yes
9+
NumSpacesForTab: 2
10+
Encoding: UTF-8
11+
12+
RnwWeave: Sweave
13+
LaTeX: pdfLaTeX
14+
15+
AutoAppendNewline: Yes
16+
StripTrailingWhitespace: Yes
17+
18+
BuildType: Package
19+
PackageUseDevtools: Yes
20+
PackageInstallArgs: --no-multiarch --with-keep.source

images/spacex-results.PNG

107 KB
Loading

images/spacex-url.PNG

113 KB
Loading

man/get_reviews.Rd

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)