From 4476a1d002e2bc42ab6c9bbda94c9d20ea557852 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Tue, 13 Feb 2024 17:15:33 +0100 Subject: [PATCH 1/6] init --- R/euclidean_logical_joins.R | 217 ++++----------------- R/jaccard_logical_joins.R | 10 +- _pkgdown.yml | 18 +- man/euclidean-joins.Rd | 128 ++++++++++++ man/euclidean_anti_join.Rd | 72 ------- man/euclidean_full_join.Rd | 70 ------- man/euclidean_inner_join.Rd | 71 ------- man/euclidean_left_join.Rd | 71 ------- man/euclidean_right_join.Rd | 70 ------- man/{logical-joins.Rd => jaccard-joins.Rd} | 0 man/zoomerjoin-package.Rd | 2 +- zoomerjoin.Rproj | 19 ++ 12 files changed, 200 insertions(+), 548 deletions(-) create mode 100644 man/euclidean-joins.Rd delete mode 100644 man/euclidean_anti_join.Rd delete mode 100644 man/euclidean_full_join.Rd delete mode 100644 man/euclidean_inner_join.Rd delete mode 100644 man/euclidean_left_join.Rd delete mode 100644 man/euclidean_right_join.Rd rename man/{logical-joins.Rd => jaccard-joins.Rd} (100%) create mode 100644 zoomerjoin.Rproj diff --git a/R/euclidean_logical_joins.R b/R/euclidean_logical_joins.R index 7b92f2c..8a4e9b5 100644 --- a/R/euclidean_logical_joins.R +++ b/R/euclidean_logical_joins.R @@ -1,203 +1,62 @@ -#' Spatial Anti Join Using LSH +#' Spatial joins Using LSH #' -#' @param a the first dataframe you wish to join. -#' @param b the second dataframe you wish to join. +#' @inheritParams jaccard_left_join #' -#' @param by a named vector indicating which columns to join on. Format should -#' be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -#' two columns must be specified in each dataset (x column and y column). Specification -#' made with `dplyr::join_by()` are also accepted. -#' -#' @param n_bands the number of bands used in the LSH algorithm (default -#' is 30). Use this in conjunction with the \code{band_width} to determine the -#' performance of the hashing. 
-#' -#' @param band_width the length of each band used in the minihashing algorithm -#' (default is 5) Use this in conjunction with the \code{n_bands} to determine -#' the performance of the hashing. -#' -#' @param threshold the distance threshold below which units should be considered a match -#' -#' @param r the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in -#' -#' @param progress set to `TRUE` to print progress -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). +#' @return A tibble fuzzily-joined on the basis of the variables in `by.` Tries +#' to adhere to the same standards as the dplyr-joins, and uses the same +#' logical joining patterns (i.e. inner-join joins and keeps only observations +#' in both datasets). #' #' @references Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni. 
-#' "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04: -#' Proceedings of the twentieth annual symposium on Computational geometry -#' (2004): 253-262 -#' -#' @examples -#'n <- 10 -#' -#'X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n) -#'X_2 <- X_1 + .0000001 -#' -#'X_1 <- as.data.frame(X_1) -#'X_2 <- as.data.frame(X_2) -#' -#'X_1$id_1 <- 1:n -#'X_2$id_2 <- 1:n -#' -#' -#'euclidean_anti_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005) -#' +#' "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG +#' '04: Proceedings of the twentieth annual symposium on Computational +#' geometry (2004): 253-262 #' #' @export -euclidean_anti_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r=.5, progress = FALSE) { - euclidean_join_core(a, b, mode = "anti", by = by, threshold = threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r) -} - -#' Spatial Inner Join Using LSH -#' -#' @param a the first dataframe you wish to join. -#' @param b the second dataframe -#' you wish to join. -#' -#' @param by a named vector indicating which columns to join on. Format should -#' be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -#' two columns must be specified in each dataset (x column and y column). -#' -#' @param n_bands the number of bands used in the LSH algorithm (default -#' is 30). Use this in conjunction with the \code{band_width} to determine the -#' performance of the hashing. -#' -#' @param band_width the length of each band used in the minihashing algorithm -#' (default is 5) Use this in conjunction with the \code{n_bands} to determine -#' the performance of the hashing. 
-#' -#' @param threshold the distance threshold below which units should be considered a match -#' -#' @param r the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in -#' -#' @param progress set to `TRUE` to print progress -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -#' -#' @references Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni. -#' "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04: -#' Proceedings of the twentieth annual symposium on Computational geometry -#' (2004): 253-262 +#' @rdname euclidean-joins #' #' @examples -#'n <- 10 +#' n <- 10 #' -#'X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n) -#'X_2 <- X_1 + .0000001 +#' # Build two matrices that have close values +#' X_1 <- matrix(c(seq(0, 1, 1 / (n - 1)), seq(0, 1, 1 / (n - 1))), nrow = n) +#' X_2 <- X_1 + .0000001 #' -#'X_1 <- as.data.frame(X_1) -#'X_2 <- as.data.frame(X_2) +#' X_1 <- as.data.frame(X_1) +#' X_2 <- as.data.frame(X_2) #' -#'X_1$id_1 <- 1:n -#'X_2$id_2 <- 1:n -#' -#'euclidean_inner_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005) +#' X_1$id_1 <- 1:n +#' X_2$id_2 <- 1:n #' +#' # only keep observations that have a match +#' euclidean_inner_join(X_1, X_2, by = c("V1", "V2"), threshold = .00005) #' +#' # keep all observations from X_1, regardless of whether they have a match +#' euclidean_left_join(X_1, X_2, by = c("V1", "V2"), threshold = .00005) +euclidean_anti_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r = .5, progress = FALSE) { + euclidean_join_core(a, b, mode = "anti", by = by, threshold = threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r) +} + +#' @rdname euclidean-joins #' @export -euclidean_inner_join <- 
function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r=.5, progress = FALSE) { - euclidean_join_core(a, b, mode = "inner", by = by, threshold = threshold, n_bands = n_bands,progress = progress, band_width = band_width, r = r) +euclidean_inner_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r = .5, progress = FALSE) { + euclidean_join_core(a, b, mode = "inner", by = by, threshold = threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r) } -#' Spatial Left Join Using LSH -#' -#' @inheritParams euclidean_anti_join -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -#' -#' @references Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni. -#' "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04: -#' Proceedings of the twentieth annual symposium on Computational geometry -#' (2004): 253-262 -#' -#' @examples -#'n <- 10 -#' -#'X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n) -#'X_2 <- X_1 + .0000001 -#' -#'X_1 <- as.data.frame(X_1) -#'X_2 <- as.data.frame(X_2) -#' -#'X_1$id_1 <- 1:n -#'X_2$id_2 <- 1:n -#' -#'euclidean_left_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005) -#' -#' +#' @rdname euclidean-joins #' @export -euclidean_left_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r=.5, progress = FALSE) { - euclidean_join_core(a, b, mode = "left", by = by, threshold = threshold, n_bands = n_bands,progress = progress, band_width = band_width, r = r) +euclidean_left_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r = .5, progress = FALSE) { + euclidean_join_core(a, b, mode = "left", by = by, threshold = threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r) 
} -#' Spatial Right Join Using LSH -#' -#' @inheritParams euclidean_anti_join -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -#' -#' @references Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni. -#' "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04: -#' Proceedings of the twentieth annual symposium on Computational geometry -#' (2004): 253-262 -#' -#' @examples -#'n <- 10 -#' -#'X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n) -#'X_2 <- X_1 + .0000001 -#'X_1 <- as.data.frame(X_1) -#'X_2 <- as.data.frame(X_2) -#' -#'X_1$id_1 <- 1:n -#'X_2$id_2 <- 1:n -#' -#'euclidean_right_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005) -#' -#' +#' @rdname euclidean-joins #' @export -euclidean_right_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r=.5, progress = FALSE) { - euclidean_join_core(a, b, mode = "right", by = by, threshold = threshold, n_bands = n_bands,progress = progress, band_width = band_width, r = r) +euclidean_right_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r = .5, progress = FALSE) { + euclidean_join_core(a, b, mode = "right", by = by, threshold = threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r) } -#' Spatial Full Join Using LSH -#' -#' @inheritParams euclidean_anti_join -#' -#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries -#' to adhere to the same standards as the dplyr-joins, and uses the same -#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -#' -#' @references Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni. 
-#' "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04: -#' Proceedings of the twentieth annual symposium on Computational geometry -#' (2004): 253-262 -#' -#' @examples -#'n <- 10 -#' -#'X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n) -#'X_2 <- X_1 + .0000001 -#' -#'X_1 <- as.data.frame(X_1) -#'X_2 <- as.data.frame(X_2) -#' -#'X_1$id_1 <- 1:n -#'X_2$id_2 <- 1:n -#' -#'euclidean_full_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005) -#' +#' @rdname euclidean-joins #' @export -euclidean_full_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r=.5, progress = FALSE) { - euclidean_join_core(a, b, mode = "full", by = by, threshold = threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r) +euclidean_full_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r = .5, progress = FALSE) { + euclidean_join_core(a, b, mode = "full", by = by, threshold = threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r) } diff --git a/R/jaccard_logical_joins.R b/R/jaccard_logical_joins.R index 2c6fa3b..67f7156 100644 --- a/R/jaccard_logical_joins.R +++ b/R/jaccard_logical_joins.R @@ -52,7 +52,7 @@ #' logical joining patterns (i.e. inner-join joins and keeps only observations #' in both datasets). 
#' -#' @rdname logical-joins +#' @rdname jaccard-joins #' @export #' #' @examples @@ -115,7 +115,7 @@ jaccard_inner_join <- function(a, b, ) } -#' @rdname logical-joins +#' @rdname jaccard-joins #' @export jaccard_anti_join <- function(a, b, by = NULL, @@ -136,7 +136,7 @@ jaccard_anti_join <- function(a, b, ) } -#' @rdname logical-joins +#' @rdname jaccard-joins #' @export jaccard_left_join <- function(a, b, by = NULL, @@ -160,7 +160,7 @@ jaccard_left_join <- function(a, b, ) } -#' @rdname logical-joins +#' @rdname jaccard-joins #' @export jaccard_right_join <- function(a, b, by = NULL, @@ -184,7 +184,7 @@ jaccard_right_join <- function(a, b, ) } -#' @rdname logical-joins +#' @rdname jaccard-joins #' @export jaccard_full_join <- function(a, b, by = NULL, diff --git a/_pkgdown.yml b/_pkgdown.yml index 6540753..d52e704 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -10,14 +10,6 @@ reference: - jaccard_right_join - jaccard_anti_join - - title: Probabilistic Matching Algorithms - contents: - - em_link - - - title: string deduplication - contents: - - jaccard_string_group - - title: dplyr-style distance joins contents: - euclidean_inner_join @@ -26,6 +18,14 @@ reference: - euclidean_right_join - euclidean_anti_join + - title: Probabilistic Matching Algorithms + contents: + - em_link + + - title: String deduplication + contents: + - jaccard_string_group + - title: Utilities contents: - jaccard_similarity @@ -38,7 +38,7 @@ reference: - euclidean_curve - euclidean_probability - - title: data + - title: Data contents: - dime_data diff --git a/man/euclidean-joins.Rd b/man/euclidean-joins.Rd new file mode 100644 index 0000000..36bbd73 --- /dev/null +++ b/man/euclidean-joins.Rd @@ -0,0 +1,128 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/euclidean_logical_joins.R +\name{euclidean_anti_join} +\alias{euclidean_anti_join} +\alias{euclidean_inner_join} +\alias{euclidean_left_join} +\alias{euclidean_right_join} +\alias{euclidean_full_join} 
+\title{Spatial joins Using LSH} +\usage{ +euclidean_anti_join( + a, + b, + by = NULL, + threshold = 1, + n_bands = 30, + band_width = 5, + r = 0.5, + progress = FALSE +) + +euclidean_inner_join( + a, + b, + by = NULL, + threshold = 1, + n_bands = 30, + band_width = 5, + r = 0.5, + progress = FALSE +) + +euclidean_left_join( + a, + b, + by = NULL, + threshold = 1, + n_bands = 30, + band_width = 5, + r = 0.5, + progress = FALSE +) + +euclidean_right_join( + a, + b, + by = NULL, + threshold = 1, + n_bands = 30, + band_width = 5, + r = 0.5, + progress = FALSE +) + +euclidean_full_join( + a, + b, + by = NULL, + threshold = 1, + n_bands = 30, + band_width = 5, + r = 0.5, + progress = FALSE +) +} +\arguments{ +\item{a, b}{The two dataframes to join.} + +\item{by}{A named vector indicating which columns to join on. Format should +be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but two columns must be specified in each dataset +(x column and y column). Specification made with \code{dplyr::join_by()} are +also accepted.} + +\item{threshold}{The Jaccard similarity threshold above which two strings +should be considered a match (default is .95). The similarity is equal to +1 - the Jaccard distance between the two strings, so 1 implies the strings +are identical, while a similarity of zero implies the strings are completely +dissimilar.} + +\item{n_bands}{The number of bands used in the minihash algorithm (default is +40). Use this in conjunction with the \code{band_width} to determine the +performance of the hashing. The default settings are for a +(.2, .8, .001, .999)-sensitive hash i.e. 
that pairs with a similarity of less +than .2 have a >.1\% chance of being compared, while pairs with a similarity +of greater than .8 have a >99.9\% chance of being compared.} + +\item{band_width}{The length of each band used in the minihashing algorithm +(default is 8) Use this in conjunction with the \code{n_bands} to determine the +performance of the hashing. The default settings are for a +(.2, .8, .001, .999)-sensitive hash i.e. that pairs with a similarity of less +than .2 have a >.1\% chance of being compared, while pairs with a similarity +of greater than .8 have a >99.9\% chance of being compared.} + +\item{progress}{Set to \code{TRUE} to print progress.} +} +\value{ +A tibble fuzzily-joined on the basis of the variables in \code{by.} Tries +to adhere to the same standards as the dplyr-joins, and uses the same +logical joining patterns (i.e. inner-join joins and keeps only observations +in both datasets). +} +\description{ +Spatial joins Using LSH +} +\examples{ +n <- 10 + +X_1 <- matrix(c(seq(0, 1, 1 / (n - 1)), seq(0, 1, 1 / (n - 1))), nrow = n) +X_2 <- X_1 + .0000001 + +X_1 <- as.data.frame(X_1) +X_2 <- as.data.frame(X_2) + +X_1$id_1 <- 1:n +X_2$id_2 <- 1:n + +# only keep observations that have a match +euclidean_inner_join(X_1, X_2, by = c("V1", "V2"), threshold = .00005) + +# keep all observations from X_1, regardless of whether they have a match +euclidean_left_join(X_1, X_2, by = c("V1", "V2"), threshold = .00005) +} +\references{ +Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni. 
+"Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG +'04: Proceedings of the twentieth annual symposium on Computational +geometry (2004): 253-262 +} diff --git a/man/euclidean_anti_join.Rd b/man/euclidean_anti_join.Rd deleted file mode 100644 index e66f807..0000000 --- a/man/euclidean_anti_join.Rd +++ /dev/null @@ -1,72 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/euclidean_logical_joins.R -\name{euclidean_anti_join} -\alias{euclidean_anti_join} -\title{Spatial Anti Join Using LSH} -\usage{ -euclidean_anti_join( - a, - b, - by = NULL, - threshold = 1, - n_bands = 30, - band_width = 5, - r = 0.5, - progress = FALSE -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe you wish to join.} - -\item{by}{a named vector indicating which columns to join on. Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column). Specification -made with \code{dplyr::join_by()} are also accepted.} - -\item{threshold}{the distance threshold below which units should be considered a match} - -\item{n_bands}{the number of bands used in the LSH algorithm (default -is 30). Use this in conjunction with the \code{band_width} to determine the -performance of the hashing.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 5) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing.} - -\item{r}{the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in} - -\item{progress}{set to \code{TRUE} to print progress} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. 
inner-join joins and keeps only observations in both datasets). -} -\description{ -Spatial Anti Join Using LSH -} -\examples{ -n <- 10 - -X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n) -X_2 <- X_1 + .0000001 - -X_1 <- as.data.frame(X_1) -X_2 <- as.data.frame(X_2) - -X_1$id_1 <- 1:n -X_2$id_2 <- 1:n - - -euclidean_anti_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005) - - -} -\references{ -Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni. -"Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04: -Proceedings of the twentieth annual symposium on Computational geometry -(2004): 253-262 -} diff --git a/man/euclidean_full_join.Rd b/man/euclidean_full_join.Rd deleted file mode 100644 index a3e93be..0000000 --- a/man/euclidean_full_join.Rd +++ /dev/null @@ -1,70 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/euclidean_logical_joins.R -\name{euclidean_full_join} -\alias{euclidean_full_join} -\title{Spatial Full Join Using LSH} -\usage{ -euclidean_full_join( - a, - b, - by = NULL, - threshold = 1, - n_bands = 30, - band_width = 5, - r = 0.5, - progress = FALSE -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe you wish to join.} - -\item{by}{a named vector indicating which columns to join on. Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column). Specification -made with \code{dplyr::join_by()} are also accepted.} - -\item{threshold}{the distance threshold below which units should be considered a match} - -\item{n_bands}{the number of bands used in the LSH algorithm (default -is 30). 
Use this in conjunction with the \code{band_width} to determine the -performance of the hashing.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 5) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing.} - -\item{r}{the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in} - -\item{progress}{set to \code{TRUE} to print progress} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -} -\description{ -Spatial Full Join Using LSH -} -\examples{ -n <- 10 - -X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n) -X_2 <- X_1 + .0000001 - -X_1 <- as.data.frame(X_1) -X_2 <- as.data.frame(X_2) - -X_1$id_1 <- 1:n -X_2$id_2 <- 1:n - -euclidean_full_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005) - -} -\references{ -Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni. -"Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04: -Proceedings of the twentieth annual symposium on Computational geometry -(2004): 253-262 -} diff --git a/man/euclidean_inner_join.Rd b/man/euclidean_inner_join.Rd deleted file mode 100644 index f1e3bfa..0000000 --- a/man/euclidean_inner_join.Rd +++ /dev/null @@ -1,71 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/euclidean_logical_joins.R -\name{euclidean_inner_join} -\alias{euclidean_inner_join} -\title{Spatial Inner Join Using LSH} -\usage{ -euclidean_inner_join( - a, - b, - by = NULL, - threshold = 1, - n_bands = 30, - band_width = 5, - r = 0.5, - progress = FALSE -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe -you wish to join.} - -\item{by}{a named vector indicating which columns to join on. 
Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column).} - -\item{threshold}{the distance threshold below which units should be considered a match} - -\item{n_bands}{the number of bands used in the LSH algorithm (default -is 30). Use this in conjunction with the \code{band_width} to determine the -performance of the hashing.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 5) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing.} - -\item{r}{the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in} - -\item{progress}{set to \code{TRUE} to print progress} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -} -\description{ -Spatial Inner Join Using LSH -} -\examples{ -n <- 10 - -X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n) -X_2 <- X_1 + .0000001 - -X_1 <- as.data.frame(X_1) -X_2 <- as.data.frame(X_2) - -X_1$id_1 <- 1:n -X_2$id_2 <- 1:n - -euclidean_inner_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005) - - -} -\references{ -Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni. 
-"Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04: -Proceedings of the twentieth annual symposium on Computational geometry -(2004): 253-262 -} diff --git a/man/euclidean_left_join.Rd b/man/euclidean_left_join.Rd deleted file mode 100644 index b8c9e54..0000000 --- a/man/euclidean_left_join.Rd +++ /dev/null @@ -1,71 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/euclidean_logical_joins.R -\name{euclidean_left_join} -\alias{euclidean_left_join} -\title{Spatial Left Join Using LSH} -\usage{ -euclidean_left_join( - a, - b, - by = NULL, - threshold = 1, - n_bands = 30, - band_width = 5, - r = 0.5, - progress = FALSE -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe you wish to join.} - -\item{by}{a named vector indicating which columns to join on. Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column). Specification -made with \code{dplyr::join_by()} are also accepted.} - -\item{threshold}{the distance threshold below which units should be considered a match} - -\item{n_bands}{the number of bands used in the LSH algorithm (default -is 30). Use this in conjunction with the \code{band_width} to determine the -performance of the hashing.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 5) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing.} - -\item{r}{the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in} - -\item{progress}{set to \code{TRUE} to print progress} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. 
inner-join joins and keeps only observations in both datasets). -} -\description{ -Spatial Left Join Using LSH -} -\examples{ -n <- 10 - -X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n) -X_2 <- X_1 + .0000001 - -X_1 <- as.data.frame(X_1) -X_2 <- as.data.frame(X_2) - -X_1$id_1 <- 1:n -X_2$id_2 <- 1:n - -euclidean_left_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005) - - -} -\references{ -Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni. -"Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04: -Proceedings of the twentieth annual symposium on Computational geometry -(2004): 253-262 -} diff --git a/man/euclidean_right_join.Rd b/man/euclidean_right_join.Rd deleted file mode 100644 index 0bfe8b8..0000000 --- a/man/euclidean_right_join.Rd +++ /dev/null @@ -1,70 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/euclidean_logical_joins.R -\name{euclidean_right_join} -\alias{euclidean_right_join} -\title{Spatial Right Join Using LSH} -\usage{ -euclidean_right_join( - a, - b, - by = NULL, - threshold = 1, - n_bands = 30, - band_width = 5, - r = 0.5, - progress = FALSE -) -} -\arguments{ -\item{a}{the first dataframe you wish to join.} - -\item{b}{the second dataframe you wish to join.} - -\item{by}{a named vector indicating which columns to join on. Format should -be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but -two columns must be specified in each dataset (x column and y column). Specification -made with \code{dplyr::join_by()} are also accepted.} - -\item{threshold}{the distance threshold below which units should be considered a match} - -\item{n_bands}{the number of bands used in the LSH algorithm (default -is 30). 
Use this in conjunction with the \code{band_width} to determine the -performance of the hashing.} - -\item{band_width}{the length of each band used in the minihashing algorithm -(default is 5) Use this in conjunction with the \code{n_bands} to determine -the performance of the hashing.} - -\item{r}{the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in} - -\item{progress}{set to \code{TRUE} to print progress} -} -\value{ -a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries -to adhere to the same standards as the dplyr-joins, and uses the same -logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets). -} -\description{ -Spatial Right Join Using LSH -} -\examples{ -n <- 10 - -X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n) -X_2 <- X_1 + .0000001 -X_1 <- as.data.frame(X_1) -X_2 <- as.data.frame(X_2) - -X_1$id_1 <- 1:n -X_2$id_2 <- 1:n - -euclidean_right_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005) - - -} -\references{ -Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni. 
-"Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04: -Proceedings of the twentieth annual symposium on Computational geometry -(2004): 253-262 -} diff --git a/man/logical-joins.Rd b/man/jaccard-joins.Rd similarity index 100% rename from man/logical-joins.Rd rename to man/jaccard-joins.Rd diff --git a/man/zoomerjoin-package.Rd b/man/zoomerjoin-package.Rd index ef83641..da4dca2 100644 --- a/man/zoomerjoin-package.Rd +++ b/man/zoomerjoin-package.Rd @@ -23,7 +23,7 @@ Useful links: Other contributors: \itemize{ - \item Etienne Bacher \email{etienne.bacher@protonmail.com} [contributor] + \item Etienne Bacher \email{etienne.bacher@protonmail.com} (\href{https://orcid.org/0000-0002-9271-5075}{ORCID}) [contributor] \item The authors of the dependency Rust crates (see inst/AUTHORS file for details) [contributor, copyright holder] } diff --git a/zoomerjoin.Rproj b/zoomerjoin.Rproj new file mode 100644 index 0000000..49f2092 --- /dev/null +++ b/zoomerjoin.Rproj @@ -0,0 +1,19 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source From 8613d278570bc3f7ba1d22b6ca7737ad0c8c6e80 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Tue, 13 Feb 2024 17:19:28 +0100 Subject: [PATCH 2/6] missing param --- R/euclidean_logical_joins.R | 2 ++ man/euclidean-joins.Rd | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/R/euclidean_logical_joins.R b/R/euclidean_logical_joins.R index 8a4e9b5..e38938e 100644 --- a/R/euclidean_logical_joins.R +++ b/R/euclidean_logical_joins.R @@ -1,6 +1,8 @@ #' Spatial joins Using LSH #' #' @inheritParams jaccard_left_join +#' @param r Hyperparameter used to govern the sensitivity of the 
locality +#' sensitive hash. #' #' @return A tibble fuzzily-joined on the basis of the variables in `by.` Tries #' to adhere to the same standards as the dplyr-joins, and uses the same diff --git a/man/euclidean-joins.Rd b/man/euclidean-joins.Rd index 36bbd73..ed56dc1 100644 --- a/man/euclidean-joins.Rd +++ b/man/euclidean-joins.Rd @@ -91,6 +91,9 @@ performance of the hashing. The default settings are for a than .2 have a >.1\% chance of being compared, while pairs with a similarity of greater than .8 have a >99.9\% chance of being compared.} +\item{r}{Hyperparameter used to govern the sensitivity of the locality +sensitive hash.} + \item{progress}{Set to \code{TRUE} to print progress.} } \value{ @@ -105,6 +108,7 @@ Spatial joins Using LSH \examples{ n <- 10 +# Build two matrices that have close values X_1 <- matrix(c(seq(0, 1, 1 / (n - 1)), seq(0, 1, 1 / (n - 1))), nrow = n) X_2 <- X_1 + .0000001 From 5e873a19eaf234e2b831bc43a51ef40b217129fd Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Tue, 13 Feb 2024 17:23:51 +0100 Subject: [PATCH 3/6] param threshold has a different meaning --- R/euclidean_logical_joins.R | 4 ++++ man/euclidean-joins.Rd | 9 ++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/R/euclidean_logical_joins.R b/R/euclidean_logical_joins.R index e38938e..59ca0a2 100644 --- a/R/euclidean_logical_joins.R +++ b/R/euclidean_logical_joins.R @@ -1,6 +1,10 @@ #' Spatial joins Using LSH #' #' @inheritParams jaccard_left_join +#' @param threshold The distance threshold below which units should be +#' considered a match. Note that contrary to Jaccard joins, this value is +#' about the distance and not the similarity. Therefore, a lower value means a +#' higher similarity. #' @param r Hyperparameter used to govern the sensitivity of the locality #' sensitive hash. 
#' diff --git a/man/euclidean-joins.Rd b/man/euclidean-joins.Rd index ed56dc1..7f10bbc 100644 --- a/man/euclidean-joins.Rd +++ b/man/euclidean-joins.Rd @@ -71,11 +71,10 @@ be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b" (x column and y column). Specification made with \code{dplyr::join_by()} are also accepted.} -\item{threshold}{The Jaccard similarity threshold above which two strings -should be considered a match (default is .95). The similarity is equal to -1 - the Jaccard distance between the two strings, so 1 implies the strings -are identical, while a similarity of zero implies the strings are completely -dissimilar.} +\item{threshold}{The distance threshold below which units should be +considered a match. Note that contrary to Jaccard joins, this value is +about the distance and not the similarity. Therefore, a lower value means a +higher similarity.} \item{n_bands}{The number of bands used in the minihash algorithm (default is 40). Use this in conjunction with the \code{band_width} to determine the From f5a234f1d1a6b0c8f1956c778452ebf6388d2f7c Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Tue, 13 Feb 2024 17:34:47 +0100 Subject: [PATCH 4/6] remove .Rproj file --- .Rbuildignore | 2 ++ .gitignore | 1 + zoomerjoin.Rproj | 19 ------------------- 3 files changed, 3 insertions(+), 19 deletions(-) delete mode 100644 zoomerjoin.Rproj diff --git a/.Rbuildignore b/.Rbuildignore index 2c3ae73..8d87092 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -18,3 +18,5 @@ src/rust/rehash.py src/rust/uncomment.sh ^cran-comments\.md$ ^CRAN-SUBMISSION$ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore index 1e97c30..0b66bc9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ docs inst/doc src/rust/vendor +.Rproj.user diff --git a/zoomerjoin.Rproj b/zoomerjoin.Rproj deleted file mode 100644 index 49f2092..0000000 --- a/zoomerjoin.Rproj +++ /dev/null @@ -1,19 +0,0 @@ 
-Version: 1.0 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX - -StripTrailingWhitespace: Yes - -BuildType: Package -PackageUseDevtools: Yes -PackageInstallArgs: --no-multiarch --with-keep.source From 081f1088693c155fbfe8f9d6e7445b4dde7c0ae2 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Tue, 13 Feb 2024 17:35:57 +0100 Subject: [PATCH 5/6] ignore Rproj file --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 0b66bc9..89ea11f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ docs inst/doc src/rust/vendor .Rproj.user +zoomerjoin.Rproj \ No newline at end of file From 19b995f880b1995fdaa57132ce11685ec4c4d6b2 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Tue, 13 Feb 2024 18:03:42 +0100 Subject: [PATCH 6/6] add more details for arg `r` --- R/euclidean_logical_joins.R | 6 +++++- man/euclidean-joins.Rd | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/R/euclidean_logical_joins.R b/R/euclidean_logical_joins.R index 59ca0a2..82b54a8 100644 --- a/R/euclidean_logical_joins.R +++ b/R/euclidean_logical_joins.R @@ -6,7 +6,11 @@ #' about the distance and not the similarity. Therefore, a lower value means a #' higher similarity. #' @param r Hyperparameter used to govern the sensitivity of the locality -#' sensitive hash. +#' sensitive hash. Corresponds to the width of the hash bucket in the LSH +#' algorithm. Increasing values of `r` mean more hash collisions and higher +#' sensitivity (fewer false-negatives) at the cost of lower specificity (more +#' false-positives and longer run time). For more information, see the +#' description in <doi:10.1145/997817.997857>.
#' #' @return A tibble fuzzily-joined on the basis of the variables in `by.` Tries #' to adhere to the same standards as the dplyr-joins, and uses the same diff --git a/man/euclidean-joins.Rd b/man/euclidean-joins.Rd index 7f10bbc..ce7e8c9 100644 --- a/man/euclidean-joins.Rd +++ b/man/euclidean-joins.Rd @@ -91,7 +91,11 @@ than .2 have a >.1\% chance of being compared, while pairs with a similarity of greater than .8 have a >99.9\% chance of being compared.} \item{r}{Hyperparameter used to govern the sensitivity of the locality -sensitive hash.} +sensitive hash. Corresponds to the width of the hash bucket in the LSH +algorithm. Increasing values of \code{r} mean more hash collisions and higher +sensitivity (fewer false-negatives) at the cost of lower specificity (more +false-positives and longer run time). For more information, see the +description in \url{doi:10.1145/997817.997857}.} \item{progress}{Set to \code{TRUE} to print progress.} }