From 4476a1d002e2bc42ab6c9bbda94c9d20ea557852 Mon Sep 17 00:00:00 2001
From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
Date: Tue, 13 Feb 2024 17:15:33 +0100
Subject: [PATCH 1/6] Consolidate euclidean join docs into a single help page

---
 R/euclidean_logical_joins.R                | 217 ++++-----------------
 R/jaccard_logical_joins.R                  |  10 +-
 _pkgdown.yml                               |  18 +-
 man/euclidean-joins.Rd                     | 128 ++++++++++++
 man/euclidean_anti_join.Rd                 |  72 -------
 man/euclidean_full_join.Rd                 |  70 -------
 man/euclidean_inner_join.Rd                |  71 -------
 man/euclidean_left_join.Rd                 |  71 -------
 man/euclidean_right_join.Rd                |  70 -------
 man/{logical-joins.Rd => jaccard-joins.Rd} |   0
 man/zoomerjoin-package.Rd                  |   2 +-
 zoomerjoin.Rproj                           |  19 ++
 12 files changed, 200 insertions(+), 548 deletions(-)
 create mode 100644 man/euclidean-joins.Rd
 delete mode 100644 man/euclidean_anti_join.Rd
 delete mode 100644 man/euclidean_full_join.Rd
 delete mode 100644 man/euclidean_inner_join.Rd
 delete mode 100644 man/euclidean_left_join.Rd
 delete mode 100644 man/euclidean_right_join.Rd
 rename man/{logical-joins.Rd => jaccard-joins.Rd} (100%)
 create mode 100644 zoomerjoin.Rproj

diff --git a/R/euclidean_logical_joins.R b/R/euclidean_logical_joins.R
index 7b92f2c..8a4e9b5 100644
--- a/R/euclidean_logical_joins.R
+++ b/R/euclidean_logical_joins.R
@@ -1,203 +1,62 @@
-#' Spatial Anti Join Using LSH
+#' Spatial joins Using LSH
 #'
-#' @param a the first dataframe you wish to join.
-#' @param b the second dataframe you wish to join.
+#' @inheritParams jaccard_left_join
 #'
-#' @param by a named vector indicating which columns to join on. Format should
-#' be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
-#' two columns must be specified in each dataset (x column and y column). Specification
-#' made with `dplyr::join_by()` are also accepted.
-#'
-#' @param n_bands the number of bands used in the LSH algorithm (default
-#' is 30). Use this in conjunction with the \code{band_width} to determine the
-#' performance of the hashing.
-#'
-#' @param band_width the length of each band used in the minihashing algorithm
-#' (default is 5) Use this in conjunction with the \code{n_bands} to determine
-#' the performance of the hashing.
-#'
-#' @param threshold the distance threshold below which units should be considered a match
-#'
-#' @param r the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in
-#'
-#' @param progress set to `TRUE` to print progress
-#'
-#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries
-#' to adhere to the same standards as the dplyr-joins, and uses the same
-#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets).
+#' @return A tibble fuzzily-joined on the basis of the variables in `by.` Tries
+#'   to adhere to the same standards as the dplyr-joins, and uses the same
+#'   logical joining patterns (i.e. inner-join joins and keeps only observations
+#'   in both datasets).
 #'
 #' @references Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni.
-#' "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04:
-#' Proceedings of the twentieth annual symposium on Computational geometry
-#' (2004): 253-262
-#'
-#' @examples
-#'n <- 10
-#'
-#'X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n)
-#'X_2 <- X_1 + .0000001
-#'
-#'X_1 <- as.data.frame(X_1)
-#'X_2 <- as.data.frame(X_2)
-#'
-#'X_1$id_1 <- 1:n
-#'X_2$id_2 <- 1:n
-#'
-#'
-#'euclidean_anti_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005)
-#'
+#'   "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG
+#'   '04: Proceedings of the twentieth annual symposium on Computational
+#'   geometry (2004): 253-262
 #'
 #' @export
-euclidean_anti_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5,  r=.5, progress = FALSE) {
-    euclidean_join_core(a, b, mode = "anti", by = by, threshold =  threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r)
-}
-
-#' Spatial Inner Join Using LSH
-#'
-#' @param a the first dataframe you wish to join.
-#' @param b the second dataframe
-#' you wish to join.
-#'
-#' @param by a named vector indicating which columns to join on. Format should
-#' be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
-#' two columns must be specified in each dataset (x column and y column).
-#'
-#' @param n_bands the number of bands used in the LSH algorithm (default
-#' is 30). Use this in conjunction with the \code{band_width} to determine the
-#' performance of the hashing.
-#'
-#' @param band_width the length of each band used in the minihashing algorithm
-#' (default is 5) Use this in conjunction with the \code{n_bands} to determine
-#' the performance of the hashing.
-#'
-#' @param threshold the distance threshold below which units should be considered a match
-#'
-#' @param r the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in
-#'
-#' @param progress set to `TRUE` to print progress
-#'
-#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries
-#' to adhere to the same standards as the dplyr-joins, and uses the same
-#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets).
-#'
-#' @references Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni.
-#' "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04:
-#' Proceedings of the twentieth annual symposium on Computational geometry
-#' (2004): 253-262
+#' @rdname euclidean-joins
 #'
 #' @examples
-#'n <- 10
+#' n <- 10
 #'
-#'X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n)
-#'X_2 <- X_1 + .0000001
+#' # Build two matrices that have close values
+#' X_1 <- matrix(c(seq(0, 1, 1 / (n - 1)), seq(0, 1, 1 / (n - 1))), nrow = n)
+#' X_2 <- X_1 + .0000001
 #'
-#'X_1 <- as.data.frame(X_1)
-#'X_2 <- as.data.frame(X_2)
+#' X_1 <- as.data.frame(X_1)
+#' X_2 <- as.data.frame(X_2)
 #'
-#'X_1$id_1 <- 1:n
-#'X_2$id_2 <- 1:n
-#'
-#'euclidean_inner_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005)
+#' X_1$id_1 <- 1:n
+#' X_2$id_2 <- 1:n
 #'
+#' # only keep observations that have a match
+#' euclidean_inner_join(X_1, X_2, by = c("V1", "V2"), threshold = .00005)
 #'
+#' # keep all observations from X_1, regardless of whether they have a match
+#' euclidean_left_join(X_1, X_2, by = c("V1", "V2"), threshold = .00005)
+euclidean_anti_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r = .5, progress = FALSE) {
+  euclidean_join_core(a, b, mode = "anti", by = by, threshold = threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r)
+}
+
+#' @rdname euclidean-joins
 #' @export
-euclidean_inner_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r=.5, progress = FALSE) {
-    euclidean_join_core(a, b, mode = "inner", by = by, threshold =  threshold, n_bands = n_bands,progress = progress,  band_width = band_width, r = r)
+euclidean_inner_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r = .5, progress = FALSE) {
+  euclidean_join_core(a, b, mode = "inner", by = by, threshold = threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r)
 }
 
-#' Spatial Left Join Using LSH
-#'
-#' @inheritParams euclidean_anti_join
-#'
-#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries
-#' to adhere to the same standards as the dplyr-joins, and uses the same
-#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets).
-#'
-#' @references Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni.
-#' "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04:
-#' Proceedings of the twentieth annual symposium on Computational geometry
-#' (2004): 253-262
-#'
-#' @examples
-#'n <- 10
-#'
-#'X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n)
-#'X_2 <- X_1 + .0000001
-#'
-#'X_1 <- as.data.frame(X_1)
-#'X_2 <- as.data.frame(X_2)
-#'
-#'X_1$id_1 <- 1:n
-#'X_2$id_2 <- 1:n
-#'
-#'euclidean_left_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005)
-#'
-#'
+#' @rdname euclidean-joins
 #' @export
-euclidean_left_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r=.5, progress = FALSE) {
-    euclidean_join_core(a, b, mode = "left", by = by, threshold =  threshold, n_bands = n_bands,progress = progress,  band_width = band_width, r = r)
+euclidean_left_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r = .5, progress = FALSE) {
+  euclidean_join_core(a, b, mode = "left", by = by, threshold = threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r)
 }
 
-#' Spatial Right Join Using LSH
-#'
-#' @inheritParams euclidean_anti_join
-#'
-#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries
-#' to adhere to the same standards as the dplyr-joins, and uses the same
-#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets).
-#'
-#' @references Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni.
-#' "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04:
-#' Proceedings of the twentieth annual symposium on Computational geometry
-#' (2004): 253-262
-#'
-#' @examples
-#'n <- 10
-#'
-#'X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n)
-#'X_2 <- X_1 + .0000001
-#'X_1 <- as.data.frame(X_1)
-#'X_2 <- as.data.frame(X_2)
-#'
-#'X_1$id_1 <- 1:n
-#'X_2$id_2 <- 1:n
-#'
-#'euclidean_right_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005)
-#'
-#'
+#' @rdname euclidean-joins
 #' @export
-euclidean_right_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r=.5, progress = FALSE) {
-    euclidean_join_core(a, b, mode = "right", by = by, threshold =  threshold, n_bands = n_bands,progress = progress,  band_width = band_width, r = r)
+euclidean_right_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r = .5, progress = FALSE) {
+  euclidean_join_core(a, b, mode = "right", by = by, threshold = threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r)
 }
 
-#' Spatial Full Join Using LSH
-#'
-#' @inheritParams euclidean_anti_join
-#'
-#' @return a tibble fuzzily-joined on the basis of the variables in `by.` Tries
-#' to adhere to the same standards as the dplyr-joins, and uses the same
-#' logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets).
-#'
-#' @references Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni.
-#' "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04:
-#' Proceedings of the twentieth annual symposium on Computational geometry
-#' (2004): 253-262
-#'
-#' @examples
-#'n <- 10
-#'
-#'X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n)
-#'X_2 <- X_1 + .0000001
-#'
-#'X_1 <- as.data.frame(X_1)
-#'X_2 <- as.data.frame(X_2)
-#'
-#'X_1$id_1 <- 1:n
-#'X_2$id_2 <- 1:n
-#'
-#'euclidean_full_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005)
-#'
+#' @rdname euclidean-joins
 #' @export
-euclidean_full_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r=.5, progress = FALSE) {
-    euclidean_join_core(a, b, mode = "full", by = by, threshold =  threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r)
+euclidean_full_join <- function(a, b, by = NULL, threshold = 1, n_bands = 30, band_width = 5, r = .5, progress = FALSE) {
+  euclidean_join_core(a, b, mode = "full", by = by, threshold = threshold, n_bands = n_bands, progress = progress, band_width = band_width, r = r)
 }
diff --git a/R/jaccard_logical_joins.R b/R/jaccard_logical_joins.R
index 2c6fa3b..67f7156 100644
--- a/R/jaccard_logical_joins.R
+++ b/R/jaccard_logical_joins.R
@@ -52,7 +52,7 @@
 #'   logical joining patterns (i.e. inner-join joins and keeps only observations
 #'   in both datasets).
 #'
-#' @rdname logical-joins
+#' @rdname jaccard-joins
 #' @export
 #'
 #' @examples
@@ -115,7 +115,7 @@ jaccard_inner_join <- function(a, b,
   )
 }
 
-#' @rdname logical-joins
+#' @rdname jaccard-joins
 #' @export
 jaccard_anti_join <- function(a, b,
                               by = NULL,
@@ -136,7 +136,7 @@ jaccard_anti_join <- function(a, b,
   )
 }
 
-#' @rdname logical-joins
+#' @rdname jaccard-joins
 #' @export
 jaccard_left_join <- function(a, b,
                               by = NULL,
@@ -160,7 +160,7 @@ jaccard_left_join <- function(a, b,
   )
 }
 
-#' @rdname logical-joins
+#' @rdname jaccard-joins
 #' @export
 jaccard_right_join <- function(a, b,
                                by = NULL,
@@ -184,7 +184,7 @@ jaccard_right_join <- function(a, b,
   )
 }
 
-#' @rdname logical-joins
+#' @rdname jaccard-joins
 #' @export
 jaccard_full_join <- function(a, b,
                               by = NULL,
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 6540753..d52e704 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -10,14 +10,6 @@ reference:
       - jaccard_right_join
       - jaccard_anti_join
 
-  - title: Probabilistic Matching Algorithms
-    contents:
-      - em_link
-
-  - title: string deduplication
-    contents:
-      - jaccard_string_group
-
   - title: dplyr-style distance joins
     contents:
       - euclidean_inner_join
@@ -26,6 +18,14 @@ reference:
       - euclidean_right_join
       - euclidean_anti_join
 
+  - title: Probabilistic Matching Algorithms
+    contents:
+      - em_link
+
+  - title: String deduplication
+    contents:
+      - jaccard_string_group
+
   - title: Utilities
     contents:
       - jaccard_similarity
@@ -38,7 +38,7 @@ reference:
       - euclidean_curve
       - euclidean_probability
 
-  - title: data
+  - title: Data
     contents:
       - dime_data
 
diff --git a/man/euclidean-joins.Rd b/man/euclidean-joins.Rd
new file mode 100644
index 0000000..36bbd73
--- /dev/null
+++ b/man/euclidean-joins.Rd
@@ -0,0 +1,128 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/euclidean_logical_joins.R
+\name{euclidean_anti_join}
+\alias{euclidean_anti_join}
+\alias{euclidean_inner_join}
+\alias{euclidean_left_join}
+\alias{euclidean_right_join}
+\alias{euclidean_full_join}
+\title{Spatial joins Using LSH}
+\usage{
+euclidean_anti_join(
+  a,
+  b,
+  by = NULL,
+  threshold = 1,
+  n_bands = 30,
+  band_width = 5,
+  r = 0.5,
+  progress = FALSE
+)
+
+euclidean_inner_join(
+  a,
+  b,
+  by = NULL,
+  threshold = 1,
+  n_bands = 30,
+  band_width = 5,
+  r = 0.5,
+  progress = FALSE
+)
+
+euclidean_left_join(
+  a,
+  b,
+  by = NULL,
+  threshold = 1,
+  n_bands = 30,
+  band_width = 5,
+  r = 0.5,
+  progress = FALSE
+)
+
+euclidean_right_join(
+  a,
+  b,
+  by = NULL,
+  threshold = 1,
+  n_bands = 30,
+  band_width = 5,
+  r = 0.5,
+  progress = FALSE
+)
+
+euclidean_full_join(
+  a,
+  b,
+  by = NULL,
+  threshold = 1,
+  n_bands = 30,
+  band_width = 5,
+  r = 0.5,
+  progress = FALSE
+)
+}
+\arguments{
+\item{a, b}{The two dataframes to join.}
+
+\item{by}{A named vector indicating which columns to join on. Format should
+be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but two columns must be specified in each dataset
+(x column and y column). Specification made with \code{dplyr::join_by()} are
+also accepted.}
+
+\item{threshold}{The Jaccard similarity threshold above which two strings
+should be considered a match (default is .95). The similarity is equal to
+1 - the Jaccard distance between the two strings, so 1 implies the strings
+are identical, while a similarity of zero implies the strings are completely
+dissimilar.}
+
+\item{n_bands}{The number of bands used in the minihash algorithm (default is
+40). Use this in conjunction with the \code{band_width} to determine the
+performance of the hashing. The default settings are for a
+(.2, .8, .001, .999)-sensitive hash i.e. that pairs with a similarity of less
+than .2 have a >.1\% chance of being compared, while pairs with a similarity
+of greater than .8 have a >99.9\% chance of being compared.}
+
+\item{band_width}{The length of each band used in the minihashing algorithm
+(default is 8) Use this in conjunction with the \code{n_bands} to determine the
+performance of the hashing. The default settings are for a
+(.2, .8, .001, .999)-sensitive hash i.e. that pairs with a similarity of less
+than .2 have a >.1\% chance of being compared, while pairs with a similarity
+of greater than .8 have a >99.9\% chance of being compared.}
+
+\item{progress}{Set to \code{TRUE} to print progress.}
+}
+\value{
+A tibble fuzzily-joined on the basis of the variables in \code{by.} Tries
+to adhere to the same standards as the dplyr-joins, and uses the same
+logical joining patterns (i.e. inner-join joins and keeps only observations
+in both datasets).
+}
+\description{
+Spatial joins Using LSH
+}
+\examples{
+n <- 10
+
+X_1 <- matrix(c(seq(0, 1, 1 / (n - 1)), seq(0, 1, 1 / (n - 1))), nrow = n)
+X_2 <- X_1 + .0000001
+
+X_1 <- as.data.frame(X_1)
+X_2 <- as.data.frame(X_2)
+
+X_1$id_1 <- 1:n
+X_2$id_2 <- 1:n
+
+# only keep observations that have a match
+euclidean_inner_join(X_1, X_2, by = c("V1", "V2"), threshold = .00005)
+
+# keep all observations from X_1, regardless of whether they have a match
+euclidean_left_join(X_1, X_2, by = c("V1", "V2"), threshold = .00005)
+}
+\references{
+Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni.
+"Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG
+'04: Proceedings of the twentieth annual symposium on Computational
+geometry (2004): 253-262
+}
diff --git a/man/euclidean_anti_join.Rd b/man/euclidean_anti_join.Rd
deleted file mode 100644
index e66f807..0000000
--- a/man/euclidean_anti_join.Rd
+++ /dev/null
@@ -1,72 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/euclidean_logical_joins.R
-\name{euclidean_anti_join}
-\alias{euclidean_anti_join}
-\title{Spatial Anti Join Using LSH}
-\usage{
-euclidean_anti_join(
-  a,
-  b,
-  by = NULL,
-  threshold = 1,
-  n_bands = 30,
-  band_width = 5,
-  r = 0.5,
-  progress = FALSE
-)
-}
-\arguments{
-\item{a}{the first dataframe you wish to join.}
-
-\item{b}{the second dataframe you wish to join.}
-
-\item{by}{a named vector indicating which columns to join on. Format should
-be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
-two columns must be specified in each dataset (x column and y column). Specification
-made with \code{dplyr::join_by()} are also accepted.}
-
-\item{threshold}{the distance threshold below which units should be considered a match}
-
-\item{n_bands}{the number of bands used in the LSH algorithm (default
-is 30). Use this in conjunction with the \code{band_width} to determine the
-performance of the hashing.}
-
-\item{band_width}{the length of each band used in the minihashing algorithm
-(default is 5) Use this in conjunction with the \code{n_bands} to determine
-the performance of the hashing.}
-
-\item{r}{the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in}
-
-\item{progress}{set to \code{TRUE} to print progress}
-}
-\value{
-a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries
-to adhere to the same standards as the dplyr-joins, and uses the same
-logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets).
-}
-\description{
-Spatial Anti Join Using LSH
-}
-\examples{
-n <- 10
-
-X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n)
-X_2 <- X_1 + .0000001
-
-X_1 <- as.data.frame(X_1)
-X_2 <- as.data.frame(X_2)
-
-X_1$id_1 <- 1:n
-X_2$id_2 <- 1:n
-
-
-euclidean_anti_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005)
-
-
-}
-\references{
-Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni.
-"Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04:
-Proceedings of the twentieth annual symposium on Computational geometry
-(2004): 253-262
-}
diff --git a/man/euclidean_full_join.Rd b/man/euclidean_full_join.Rd
deleted file mode 100644
index a3e93be..0000000
--- a/man/euclidean_full_join.Rd
+++ /dev/null
@@ -1,70 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/euclidean_logical_joins.R
-\name{euclidean_full_join}
-\alias{euclidean_full_join}
-\title{Spatial Full Join Using LSH}
-\usage{
-euclidean_full_join(
-  a,
-  b,
-  by = NULL,
-  threshold = 1,
-  n_bands = 30,
-  band_width = 5,
-  r = 0.5,
-  progress = FALSE
-)
-}
-\arguments{
-\item{a}{the first dataframe you wish to join.}
-
-\item{b}{the second dataframe you wish to join.}
-
-\item{by}{a named vector indicating which columns to join on. Format should
-be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
-two columns must be specified in each dataset (x column and y column). Specification
-made with \code{dplyr::join_by()} are also accepted.}
-
-\item{threshold}{the distance threshold below which units should be considered a match}
-
-\item{n_bands}{the number of bands used in the LSH algorithm (default
-is 30). Use this in conjunction with the \code{band_width} to determine the
-performance of the hashing.}
-
-\item{band_width}{the length of each band used in the minihashing algorithm
-(default is 5) Use this in conjunction with the \code{n_bands} to determine
-the performance of the hashing.}
-
-\item{r}{the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in}
-
-\item{progress}{set to \code{TRUE} to print progress}
-}
-\value{
-a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries
-to adhere to the same standards as the dplyr-joins, and uses the same
-logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets).
-}
-\description{
-Spatial Full Join Using LSH
-}
-\examples{
-n <- 10
-
-X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n)
-X_2 <- X_1 + .0000001
-
-X_1 <- as.data.frame(X_1)
-X_2 <- as.data.frame(X_2)
-
-X_1$id_1 <- 1:n
-X_2$id_2 <- 1:n
-
-euclidean_full_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005)
-
-}
-\references{
-Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni.
-"Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04:
-Proceedings of the twentieth annual symposium on Computational geometry
-(2004): 253-262
-}
diff --git a/man/euclidean_inner_join.Rd b/man/euclidean_inner_join.Rd
deleted file mode 100644
index f1e3bfa..0000000
--- a/man/euclidean_inner_join.Rd
+++ /dev/null
@@ -1,71 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/euclidean_logical_joins.R
-\name{euclidean_inner_join}
-\alias{euclidean_inner_join}
-\title{Spatial Inner Join Using LSH}
-\usage{
-euclidean_inner_join(
-  a,
-  b,
-  by = NULL,
-  threshold = 1,
-  n_bands = 30,
-  band_width = 5,
-  r = 0.5,
-  progress = FALSE
-)
-}
-\arguments{
-\item{a}{the first dataframe you wish to join.}
-
-\item{b}{the second dataframe
-you wish to join.}
-
-\item{by}{a named vector indicating which columns to join on. Format should
-be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
-two columns must be specified in each dataset (x column and y column).}
-
-\item{threshold}{the distance threshold below which units should be considered a match}
-
-\item{n_bands}{the number of bands used in the LSH algorithm (default
-is 30). Use this in conjunction with the \code{band_width} to determine the
-performance of the hashing.}
-
-\item{band_width}{the length of each band used in the minihashing algorithm
-(default is 5) Use this in conjunction with the \code{n_bands} to determine
-the performance of the hashing.}
-
-\item{r}{the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in}
-
-\item{progress}{set to \code{TRUE} to print progress}
-}
-\value{
-a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries
-to adhere to the same standards as the dplyr-joins, and uses the same
-logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets).
-}
-\description{
-Spatial Inner Join Using LSH
-}
-\examples{
-n <- 10
-
-X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n)
-X_2 <- X_1 + .0000001
-
-X_1 <- as.data.frame(X_1)
-X_2 <- as.data.frame(X_2)
-
-X_1$id_1 <- 1:n
-X_2$id_2 <- 1:n
-
-euclidean_inner_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005)
-
-
-}
-\references{
-Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni.
-"Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04:
-Proceedings of the twentieth annual symposium on Computational geometry
-(2004): 253-262
-}
diff --git a/man/euclidean_left_join.Rd b/man/euclidean_left_join.Rd
deleted file mode 100644
index b8c9e54..0000000
--- a/man/euclidean_left_join.Rd
+++ /dev/null
@@ -1,71 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/euclidean_logical_joins.R
-\name{euclidean_left_join}
-\alias{euclidean_left_join}
-\title{Spatial Left Join Using LSH}
-\usage{
-euclidean_left_join(
-  a,
-  b,
-  by = NULL,
-  threshold = 1,
-  n_bands = 30,
-  band_width = 5,
-  r = 0.5,
-  progress = FALSE
-)
-}
-\arguments{
-\item{a}{the first dataframe you wish to join.}
-
-\item{b}{the second dataframe you wish to join.}
-
-\item{by}{a named vector indicating which columns to join on. Format should
-be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
-two columns must be specified in each dataset (x column and y column). Specification
-made with \code{dplyr::join_by()} are also accepted.}
-
-\item{threshold}{the distance threshold below which units should be considered a match}
-
-\item{n_bands}{the number of bands used in the LSH algorithm (default
-is 30). Use this in conjunction with the \code{band_width} to determine the
-performance of the hashing.}
-
-\item{band_width}{the length of each band used in the minihashing algorithm
-(default is 5) Use this in conjunction with the \code{n_bands} to determine
-the performance of the hashing.}
-
-\item{r}{the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in}
-
-\item{progress}{set to \code{TRUE} to print progress}
-}
-\value{
-a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries
-to adhere to the same standards as the dplyr-joins, and uses the same
-logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets).
-}
-\description{
-Spatial Left Join Using LSH
-}
-\examples{
-n <- 10
-
-X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n)
-X_2 <- X_1 + .0000001
-
-X_1 <- as.data.frame(X_1)
-X_2 <- as.data.frame(X_2)
-
-X_1$id_1 <- 1:n
-X_2$id_2 <- 1:n
-
-euclidean_left_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005)
-
-
-}
-\references{
-Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni.
-"Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04:
-Proceedings of the twentieth annual symposium on Computational geometry
-(2004): 253-262
-}
diff --git a/man/euclidean_right_join.Rd b/man/euclidean_right_join.Rd
deleted file mode 100644
index 0bfe8b8..0000000
--- a/man/euclidean_right_join.Rd
+++ /dev/null
@@ -1,70 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/euclidean_logical_joins.R
-\name{euclidean_right_join}
-\alias{euclidean_right_join}
-\title{Spatial Right Join Using LSH}
-\usage{
-euclidean_right_join(
-  a,
-  b,
-  by = NULL,
-  threshold = 1,
-  n_bands = 30,
-  band_width = 5,
-  r = 0.5,
-  progress = FALSE
-)
-}
-\arguments{
-\item{a}{the first dataframe you wish to join.}
-
-\item{b}{the second dataframe you wish to join.}
-
-\item{by}{a named vector indicating which columns to join on. Format should
-be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b")}, but
-two columns must be specified in each dataset (x column and y column). Specification
-made with \code{dplyr::join_by()} are also accepted.}
-
-\item{threshold}{the distance threshold below which units should be considered a match}
-
-\item{n_bands}{the number of bands used in the LSH algorithm (default
-is 30). Use this in conjunction with the \code{band_width} to determine the
-performance of the hashing.}
-
-\item{band_width}{the length of each band used in the minihashing algorithm
-(default is 5) Use this in conjunction with the \code{n_bands} to determine
-the performance of the hashing.}
-
-\item{r}{the r hyperparameter used to govern the sensitivity of the locality sensitive hash, as described in}
-
-\item{progress}{set to \code{TRUE} to print progress}
-}
-\value{
-a tibble fuzzily-joined on the basis of the variables in \code{by.} Tries
-to adhere to the same standards as the dplyr-joins, and uses the same
-logical joining patterns (i.e. inner-join joins and keeps only observations in both datasets).
-}
-\description{
-Spatial Right Join Using LSH
-}
-\examples{
-n <- 10
-
-X_1 <- matrix(c(seq(0,1,1/(n-1)), seq(0,1,1/(n-1))), nrow=n)
-X_2 <- X_1 + .0000001
-X_1 <- as.data.frame(X_1)
-X_2 <- as.data.frame(X_2)
-
-X_1$id_1 <- 1:n
-X_2$id_2 <- 1:n
-
-euclidean_right_join(X_1, X_2, by = c("V1", "V2"), threshold =.00005)
-
-
-}
-\references{
-Datar, Mayur, Nicole Immorlica, Pitor Indyk, and Vahab Mirrokni.
-"Locality-Sensitive Hashing Scheme Based on p-Stable Distributions" SCG '04:
-Proceedings of the twentieth annual symposium on Computational geometry
-(2004): 253-262
-}
diff --git a/man/logical-joins.Rd b/man/jaccard-joins.Rd
similarity index 100%
rename from man/logical-joins.Rd
rename to man/jaccard-joins.Rd
diff --git a/man/zoomerjoin-package.Rd b/man/zoomerjoin-package.Rd
index ef83641..da4dca2 100644
--- a/man/zoomerjoin-package.Rd
+++ b/man/zoomerjoin-package.Rd
@@ -23,7 +23,7 @@ Useful links:
 
 Other contributors:
 \itemize{
-  \item Etienne Bacher \email{etienne.bacher@protonmail.com} [contributor]
+  \item Etienne Bacher \email{etienne.bacher@protonmail.com} (\href{https://orcid.org/0000-0002-9271-5075}{ORCID}) [contributor]
   \item The authors of the dependency Rust crates (see inst/AUTHORS file for details) [contributor, copyright holder]
 }
 
diff --git a/zoomerjoin.Rproj b/zoomerjoin.Rproj
new file mode 100644
index 0000000..49f2092
--- /dev/null
+++ b/zoomerjoin.Rproj
@@ -0,0 +1,19 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source

From 8613d278570bc3f7ba1d22b6ca7737ad0c8c6e80 Mon Sep 17 00:00:00 2001
From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
Date: Tue, 13 Feb 2024 17:19:28 +0100
Subject: [PATCH 2/6] Document missing `r` param for euclidean joins

---
 R/euclidean_logical_joins.R | 2 ++
 man/euclidean-joins.Rd      | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/R/euclidean_logical_joins.R b/R/euclidean_logical_joins.R
index 8a4e9b5..e38938e 100644
--- a/R/euclidean_logical_joins.R
+++ b/R/euclidean_logical_joins.R
@@ -1,6 +1,8 @@
 #' Spatial joins Using LSH
 #'
 #' @inheritParams jaccard_left_join
+#' @param r Hyperparameter used to govern the sensitivity of the locality
+#'   sensitive hash.
 #'
 #' @return A tibble fuzzily-joined on the basis of the variables in `by.` Tries
 #'   to adhere to the same standards as the dplyr-joins, and uses the same
diff --git a/man/euclidean-joins.Rd b/man/euclidean-joins.Rd
index 36bbd73..ed56dc1 100644
--- a/man/euclidean-joins.Rd
+++ b/man/euclidean-joins.Rd
@@ -91,6 +91,9 @@ performance of the hashing. The default settings are for a
 than .2 have a >.1\% chance of being compared, while pairs with a similarity
 of greater than .8 have a >99.9\% chance of being compared.}
 
+\item{r}{Hyperparameter used to govern the sensitivity of the locality
+sensitive hash.}
+
 \item{progress}{Set to \code{TRUE} to print progress.}
 }
 \value{
@@ -105,6 +108,7 @@ Spatial joins Using LSH
 \examples{
 n <- 10
 
+# Build two matrices that have close values
 X_1 <- matrix(c(seq(0, 1, 1 / (n - 1)), seq(0, 1, 1 / (n - 1))), nrow = n)
 X_2 <- X_1 + .0000001
 

From 5e873a19eaf234e2b831bc43a51ef40b217129fd Mon Sep 17 00:00:00 2001
From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
Date: Tue, 13 Feb 2024 17:23:51 +0100
Subject: [PATCH 3/6] param threshold has a different meaning

---
 R/euclidean_logical_joins.R | 4 ++++
 man/euclidean-joins.Rd      | 9 ++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/R/euclidean_logical_joins.R b/R/euclidean_logical_joins.R
index e38938e..59ca0a2 100644
--- a/R/euclidean_logical_joins.R
+++ b/R/euclidean_logical_joins.R
@@ -1,6 +1,10 @@
 #' Spatial joins Using LSH
 #'
 #' @inheritParams jaccard_left_join
+#' @param threshold The distance threshold below which units should be
+#'   considered a match. Note that contrary to Jaccard joins, this value is
+#'   about the distance and not the similarity. Therefore, a lower value means a
+#'   higher similarity.
 #' @param r Hyperparameter used to govern the sensitivity of the locality
 #'   sensitive hash.
 #'
diff --git a/man/euclidean-joins.Rd b/man/euclidean-joins.Rd
index ed56dc1..7f10bbc 100644
--- a/man/euclidean-joins.Rd
+++ b/man/euclidean-joins.Rd
@@ -71,11 +71,10 @@ be the same as dplyr: \code{by = c("column_name_in_df_a" = "column_name_in_df_b"
 (x column and y column). Specification made with \code{dplyr::join_by()} are
 also accepted.}
 
-\item{threshold}{The Jaccard similarity threshold above which two strings
-should be considered a match (default is .95). The similarity is equal to
-1 - the Jaccard distance between the two strings, so 1 implies the strings
-are identical, while a similarity of zero implies the strings are completely
-dissimilar.}
+\item{threshold}{The distance threshold below which units should be
+considered a match. Note that contrary to Jaccard joins, this value is
+about the distance and not the similarity. Therefore, a lower value means a
+higher similarity.}
 
 \item{n_bands}{The number of bands used in the minihash algorithm (default is
 40). Use this in conjunction with the \code{band_width} to determine the

From f5a234f1d1a6b0c8f1956c778452ebf6388d2f7c Mon Sep 17 00:00:00 2001
From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
Date: Tue, 13 Feb 2024 17:34:47 +0100
Subject: [PATCH 4/6] remove .Rproj file

---
 .Rbuildignore    |  2 ++
 .gitignore       |  1 +
 zoomerjoin.Rproj | 19 -------------------
 3 files changed, 3 insertions(+), 19 deletions(-)
 delete mode 100644 zoomerjoin.Rproj

diff --git a/.Rbuildignore b/.Rbuildignore
index 2c3ae73..8d87092 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -18,3 +18,5 @@ src/rust/rehash.py
 src/rust/uncomment.sh
 ^cran-comments\.md$
 ^CRAN-SUBMISSION$
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
index 1e97c30..0b66bc9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 docs
 inst/doc
 src/rust/vendor
+.Rproj.user
diff --git a/zoomerjoin.Rproj b/zoomerjoin.Rproj
deleted file mode 100644
index 49f2092..0000000
--- a/zoomerjoin.Rproj
+++ /dev/null
@@ -1,19 +0,0 @@
-Version: 1.0
-
-RestoreWorkspace: Default
-SaveWorkspace: Default
-AlwaysSaveHistory: Default
-
-EnableCodeIndexing: Yes
-UseSpacesForTab: Yes
-NumSpacesForTab: 2
-Encoding: UTF-8
-
-RnwWeave: Sweave
-LaTeX: pdfLaTeX
-
-StripTrailingWhitespace: Yes
-
-BuildType: Package
-PackageUseDevtools: Yes
-PackageInstallArgs: --no-multiarch --with-keep.source

From 081f1088693c155fbfe8f9d6e7445b4dde7c0ae2 Mon Sep 17 00:00:00 2001
From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
Date: Tue, 13 Feb 2024 17:35:57 +0100
Subject: [PATCH 5/6] ignore Rproj file

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 0b66bc9..89ea11f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ docs
 inst/doc
 src/rust/vendor
 .Rproj.user
+zoomerjoin.Rproj
\ No newline at end of file

From 19b995f880b1995fdaa57132ce11685ec4c4d6b2 Mon Sep 17 00:00:00 2001
From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
Date: Tue, 13 Feb 2024 18:03:42 +0100
Subject: [PATCH 6/6] add more details for arg `r`

---
 R/euclidean_logical_joins.R | 6 +++++-
 man/euclidean-joins.Rd      | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/R/euclidean_logical_joins.R b/R/euclidean_logical_joins.R
index 59ca0a2..82b54a8 100644
--- a/R/euclidean_logical_joins.R
+++ b/R/euclidean_logical_joins.R
@@ -6,7 +6,11 @@
 #'   about the distance and not the similarity. Therefore, a lower value means a
 #'   higher similarity.
 #' @param r Hyperparameter used to govern the sensitivity of the locality
-#'   sensitive hash.
+#' sensitive hash. Corresponds to the width of the hash bucket in the LSH
+#' algorithm. Increasing values of `r` mean more hash collisions and higher
+#' sensitivity (fewer false-negatives) at the cost of lower specificity (more
+#' false-positives and longer run time). For more information, see the
+#' description in \doi{10.1145/997817.997857}.
 #'
 #' @return A tibble fuzzily-joined on the basis of the variables in `by.` Tries
 #'   to adhere to the same standards as the dplyr-joins, and uses the same
diff --git a/man/euclidean-joins.Rd b/man/euclidean-joins.Rd
index 7f10bbc..ce7e8c9 100644
--- a/man/euclidean-joins.Rd
+++ b/man/euclidean-joins.Rd
@@ -91,7 +91,11 @@ than .2 have a >.1\% chance of being compared, while pairs with a similarity
 of greater than .8 have a >99.9\% chance of being compared.}
 
 \item{r}{Hyperparameter used to govern the sensitivity of the locality
-sensitive hash.}
+sensitive hash. Corresponds to the width of the hash bucket in the LSH
+algorithm. Increasing values of \code{r} mean more hash collisions and higher
+sensitivity (fewer false-negatives) at the cost of lower specificity (more
+false-positives and longer run time). For more information, see the
+description in \doi{10.1145/997817.997857}.}
 
 \item{progress}{Set to \code{TRUE} to print progress.}
 }