Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spring cleaning 2024 #18

Merged
merged 8 commits into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 21 additions & 26 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,32 +1,24 @@
Package: excluder
Title: Checks for Exclusion Criteria in Online Data
Version: 0.5.0
Authors@R:
c(person(given = "Jeffrey R.",
family = "Stevens",
role = c("aut", "cre", "cph"),
email = "jeffrey.r.stevens@protonmail.com",
Authors@R: c(
person("Jeffrey R.", "Stevens", , "jeffrey.r.stevens@protonmail.com", role = c("aut", "cre", "cph"),
comment = c(ORCID = "0000-0003-2375-1360")),
person(given = "Joseph",
family = "O'Brien",
role = c("rev"),
comment = c(ORCID = "0000-0001-9851-5077")),
person(given = "Julia",
family = "Silge",
role = c("rev"),
email = "julia.silge@gmail.com",
comment = c(ORCID = "0000-0002-3671-836X")))
Description: Data that are collected through online sources such as Mechanical
Turk may require excluding rows because of IP address duplication,
geolocation, or completion duration. This package facilitates
exclusion of these data for Qualtrics datasets.
person("Joseph", "O'Brien", role = "rev",
comment = c(ORCID = "0000-0001-9851-5077")),
person("Julia", "Silge", , "julia.silge@gmail.com", role = "rev",
comment = c(ORCID = "0000-0002-3671-836X"))
)
Description: Data that are collected through online sources such as
Mechanical Turk may require excluding rows because of IP address
duplication, geolocation, or completion duration. This package
facilitates exclusion of these data for Qualtrics datasets.
License: GPL (>= 3)
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
URL: https://docs.ropensci.org/excluder/, https://github.com/ropensci/excluder/
URL: https://docs.ropensci.org/excluder/,
https://github.com/ropensci/excluder/
BugReports: https://github.com/ropensci/excluder/issues/
Depends:
R (>= 3.5.0)
Imports:
cli,
curl,
Expand All @@ -40,8 +32,6 @@ Imports:
stringr,
tidyr,
tidyselect
Depends:
R (>= 3.5.0)
Suggests:
covr,
knitr,
Expand All @@ -50,5 +40,10 @@ Suggests:
rmarkdown,
testthat (>= 3.0.0),
withr
VignetteBuilder:
knitr
Config/testthat/edition: 3
VignetteBuilder: knitr
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
10 changes: 6 additions & 4 deletions R/duplicates.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ mark_duplicates <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
ip_col <- "IPAddress"
}

# Check for presence of required columns
Expand All @@ -100,7 +102,7 @@ mark_duplicates <- function(x,
no_nas_ip <- tidyr::drop_na(x, tidyselect::all_of(ip_col))
n_nas_ip <- nrow(x) - nrow(no_nas_ip)
same_ip <- janitor::get_dupes(no_nas_ip, tidyselect::all_of(ip_col)) %>%
dplyr::select(-.data$dupe_count)
dplyr::select(-"dupe_count")
n_same_ip <- nrow(same_ip)
if (identical(quiet, FALSE)) {
cli::cli_alert_info(
Expand All @@ -125,7 +127,7 @@ mark_duplicates <- function(x,
no_nas_loc,
tidyselect::all_of(location_col)
) %>%
dplyr::select(-.data$dupe_count)
dplyr::select(-"dupe_count")
n_same_location <- nrow(same_location)
if (identical(quiet, FALSE)) {
cli::cli_alert_info(
Expand Down Expand Up @@ -244,7 +246,7 @@ check_duplicates <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_duplicates == "duplicates") %>%
keep_marked_column(.data$exclusion_duplicates, keep)
keep_marked_column("exclusion_duplicates", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -316,7 +318,7 @@ exclude_duplicates <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_duplicates != "duplicates") %>%
dplyr::select(-.data$exclusion_duplicates)
dplyr::select(-"exclusion_duplicates")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
5 changes: 3 additions & 2 deletions R/duration.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ mark_duration <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
}

# Check for presence of required columns
Expand Down Expand Up @@ -192,7 +193,7 @@ check_duration <- function(x,
) %>%
dplyr::filter(.data$exclusion_duration == "duration_quick" |
.data$exclusion_duration == "duration_slow") %>%
keep_marked_column(.data$exclusion_duration, keep)
keep_marked_column("exclusion_duration", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -256,7 +257,7 @@ exclude_duration <- function(x,
) %>%
dplyr::filter(.data$exclusion_duration != "duration_quick" &
.data$exclusion_duration != "duration_slow") %>%
dplyr::select(-.data$exclusion_duration)
dplyr::select(-"exclusion_duration")

# Print exclusion statement

Expand Down
8 changes: 5 additions & 3 deletions R/ip.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ mark_ip <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
ip_col <- "IPAddress"
}

# Check for presence of required column
Expand Down Expand Up @@ -123,7 +125,7 @@ mark_ip <- function(x,
outside_country <- !ipaddress::is_within_any(survey_ips, country_ip_ranges)
filtered_data <- dplyr::bind_cols(filtered_data, outside = outside_country)
filtered_data <- dplyr::filter(filtered_data, .data$outside == TRUE) %>%
dplyr::select(-.data$outside)
dplyr::select(-"outside")
n_outside_country <- nrow(filtered_data)

# Filter NAs when requested
Expand Down Expand Up @@ -231,7 +233,7 @@ check_ip <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_ip == "ip") %>%
keep_marked_column(.data$exclusion_ip, keep)
keep_marked_column("exclusion_ip", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -300,7 +302,7 @@ exclude_ip <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_ip != "ip") %>%
dplyr::select(-.data$exclusion_ip)
dplyr::select(-"exclusion_ip")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
8 changes: 5 additions & 3 deletions R/location.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ mark_location <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
location_col <- c("LocationLatitude", "LocationLongitude")
}

# Check for presence of required column
Expand All @@ -91,7 +93,7 @@ mark_location <- function(x,
# Determine if geolocation is within US
no_nas$country <- maps::map.where(database = "usa", longitude, latitude)
outside_us <- dplyr::filter(no_nas, is.na(.data$country)) %>%
dplyr::select(-.data$country)
dplyr::select(-"country")
n_outside_us <- nrow(outside_us)

# Combine no location with outside US
Expand Down Expand Up @@ -189,7 +191,7 @@ check_location <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_location == "location") %>%
keep_marked_column(.data$exclusion_location, keep)
keep_marked_column("exclusion_location", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -249,7 +251,7 @@ exclude_location <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_location != "location") %>%
dplyr::select(-.data$exclusion_location)
dplyr::select(-"exclusion_location")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
6 changes: 4 additions & 2 deletions R/preview.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ mark_preview <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
preview_col <- "Status"
}

# Check for presence of required column
Expand Down Expand Up @@ -148,7 +150,7 @@ check_preview <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_preview == "preview") %>%
keep_marked_column(.data$exclusion_preview, keep)
keep_marked_column("exclusion_preview", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -205,7 +207,7 @@ exclude_preview <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_preview != "preview") %>%
dplyr::select(-.data$exclusion_preview)
dplyr::select(-"exclusion_preview")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
5 changes: 3 additions & 2 deletions R/progress.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ mark_progress <- function(x,
# Rename columns
if (rename) {
x <- rename_columns(x, alert = FALSE)
id_col <- "ResponseId"
}

# Check for presence of required column
Expand Down Expand Up @@ -185,7 +186,7 @@ check_progress <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_progress == "progress") %>%
keep_marked_column(.data$exclusion_progress, keep)
keep_marked_column("exclusion_progress", keep)

# Determine whether to print results
print_data(exclusions, print)
Expand Down Expand Up @@ -253,7 +254,7 @@ exclude_progress <- function(x,
quiet = quiet
) %>%
dplyr::filter(.data$exclusion_progress != "progress") %>%
dplyr::select(-.data$exclusion_progress)
dplyr::select(-"exclusion_progress")

# Print exclusion statement
if (identical(silent, FALSE)) {
Expand Down
55 changes: 49 additions & 6 deletions R/qualtrics_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@
#'
#' A dataset containing the metadata from a standard Qualtrics survey with
#' browser metadata collected and exported with "Use numeric values".
#' These data were randomly generated using [iptools::ip_random()] and
#' These data were randomly generated using [iptools::ip_random()](
#' https://hrbrmstr.github.io/iptools/reference/ip_random.html) and
#' [rgeolocate::ip2location()](
#' https://cran.r-project.org/package=rgeolocate) functions.
#'
Expand Down Expand Up @@ -71,7 +72,8 @@
#'
#' A dataset containing the metadata from a standard Qualtrics survey with
#' browser metadata collected and exported with "Use choice text".
#' These data were randomly generated using [iptools::ip_random()] and
#' These data were randomly generated using [iptools::ip_random()](
#' https://hrbrmstr.github.io/iptools/reference/ip_random.html) and
#' [rgeolocate::ip2location()](
#' https://cran.r-project.org/package=rgeolocate) functions.
#'
Expand Down Expand Up @@ -108,10 +110,8 @@
#' were imported using
#' [`qualtRics::fetch_survey()`](
#' https://docs.ropensci.org/qualtRics/reference/fetch_survey.html).
#' and then the secondary labels were assigned as column names with
#' [`sjlabelled::get_label()`](
#' https://strengejacke.github.io/sjlabelled/reference/get_label.html).
#' These data were randomly generated using [iptools::ip_random()] and
#' These data were randomly generated using [iptools::ip_random()](
#' https://hrbrmstr.github.io/iptools/reference/ip_random.html) and
#' [rgeolocate::ip2location()](
#' https://cran.r-project.org/package=rgeolocate) functions.
#'
Expand Down Expand Up @@ -141,3 +141,46 @@
#' }
#' @family data
"qualtrics_fetch"

#' Example numeric metadata imported with `qualtRics::fetch_survey()` from
#' simulated Qualtrics study but with labels included as column names
#'
#' A dataset containing the metadata from a standard Qualtrics survey with
#' browser metadata collected and exported with "Use numeric values". The data
#' were imported using
#' [`qualtRics::fetch_survey()`](
#' https://docs.ropensci.org/qualtRics/reference/fetch_survey.html).
#' and then the secondary labels were assigned as column names with
#' [`sjlabelled::get_label()`](
#' https://strengejacke.github.io/sjlabelled/reference/get_label.html).
#' These data were randomly generated using [iptools::ip_random()](
#' https://hrbrmstr.github.io/iptools/reference/ip_random.html) and
#' [rgeolocate::ip2location()](
#' https://cran.r-project.org/package=rgeolocate) functions.
#'
#' @format A data frame with 100 rows and 17 variables:
#' \describe{
#' \item{Start Date}{date and time data collection started, in ISO 8601 format}
#' \item{End Date}{date and time data collection ended, in ISO 8601 format}
#' \item{Response Type}{numeric flag for preview (1) vs. implemented survey (0)
#' entries}
#' \item{IP Address}{participant IP address (truncated for anonymity)}
#' \item{Progress}{percentage of survey completed}
#' \item{Duration (in seconds)}{duration of time required to complete survey,
#' in seconds}
#' \item{Finished}{numeric flag for whether survey was completed (1) or
#' progress was < 100 (0)}
#' \item{Recorded Date}{date and time survey was recorded, in ISO 8601 format}
#' \item{Response ID}{random ID for participants}
#' \item{Location Latitude}{latitude geolocated from IP address}
#' \item{Location Longitude}{longitude geolocated from IP address}
#' \item{User Language}{language set in Qualtrics}
#' \item{Click to write the question text - Browser}{user web browser type}
#' \item{Click to write the question text - Version}{user web browser version}
#' \item{Click to write the question text - Operating System}{user operating system}
#' \item{Click to write the question text - Resolution}{user screen resolution}
#' \item{like}{response to question about whether the user liked the survey
#' (1 = Yes, 0 = No)}
#' }
#' @family data
"qualtrics_fetch2"
18 changes: 9 additions & 9 deletions R/rename_columns.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,15 @@ rename_columns <- function(x, alert = TRUE) {
# Rename columns
x %>%
dplyr::rename(
StartDate = .data$`Start Date`,
EndDate = .data$`End Date`,
Status = .data$`Response Type`,
IPAddress = .data$`IP Address`,
RecordedDate = .data$`Recorded Date`,
ResponseId = .data$`Response ID`,
LocationLatitude = .data$`Location Latitude`,
LocationLongitude = .data$`Location Longitude`,
UserLanguage = .data$`User Language`
StartDate = "Start Date",
EndDate = "End Date",
Status = "Response Type",
IPAddress = "IP Address",
RecordedDate = "Recorded Date",
ResponseId = "Response ID",
LocationLatitude = "Location Latitude",
LocationLongitude = "Location Longitude",
UserLanguage = "User Language"
) %>%
dplyr::rename_with(~ gsub(throwaway, "", .x), dplyr::contains(throwaway))
} else if (any(grepl("_Resolution", column_names))) {
Expand Down
Loading