From e7caf579f1573bc1203694e481f94dd35f3fc9cd Mon Sep 17 00:00:00 2001 From: jw2249a Date: Mon, 25 Mar 2024 18:29:05 +0000 Subject: [PATCH] updating documentation --- README.md | 91 ++++++++++++++++++++++++++++++++++++++++ src/fastlink/fastlink.jl | 36 +++++++++------- 2 files changed, 111 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 8324da0..01272d3 100644 --- a/README.md +++ b/README.md @@ -5,3 +5,94 @@ Fast Probabilistic Record Linkage for the Julia Language The purpose of FastLink.jl is to bring a fast record linkage package to the julia language. When attempting to match large datasets using existing libraries in R and Python, I found they can be very slow and succumb to issues with memory pressure. This implementation of the fastlink algorithm is intended to scale effeciently in parallel and be able to easily handle matches between tabular data that span millions of rows. [![Run tests](https://github.com/jw2249a/FastLink.jl/actions/workflows/test.yml/badge.svg)](https://github.com/jw2249a/FastLink.jl/actions/workflows/test.yml) + + +Using the fastLink function +___________________________ + +The basic arguments for the `fastLink` function to run are: + +- `dfA`: A `DataFrame` table of records to be matched. + +- `dfB`: A `DataFrame` table of records to be matched. + +- `varnames`: A `Vector{String}` that contains the variable names present in both tables. + +- `idvar`: A `Tuple{String,String}` that has the unique ids for records in both tables (should differ from each other). Should be formatted so that the arguments are ordered in ("dfa","dfb") order. + +In addition, there are a number of optional parameters to assist with record linkage. For each of the parameters, you can either specify each variable's value or specify a vector with 1 value to apply it to all relevant variables. If irrelevant, like `stringdist_method` for a numeric match method, it will be ignored. + +The optional parameters are: 
+ +- `term_freq_adjustment`: A `Vector{Bool}` that determines whether you want the term frequencies for each comparison for a given variable. Note: does not adjust match weight. Default value `[false]`. + +- `match_method`: A `Vector{String}` that specifies the matching method you would like to do. Match methods supported are "string", "exact", "fuzzy" (jaro-winkler strings only), "numeric", "float", and "int". Default value inferred from column type. + +- `partials`: A `Vector{Bool}` that specifies whether you want to do 2 (true) or 1 (false) comparison levels for a given variable. Default value `[true]`. + +- `upper_case`: A `Vector{Bool}` that specifies whether a string column's values are upper case or lower case (only if `match_method` for column is "fuzzy"). Default value is `[true]`. + +- `stringdist_method`: A `Vector{String}` that specifies the desired string distance method ("jw" Jaro-Winkler (Default), "dl" Damerau-Levenshtein, "jaro" Jaro, "lv" Levenshtein, and "ham" Hamming). Default `["jw"]`. + +- `cut_a`: A `Vector{Float}` that specifies the first lower bound for string distance cutoff for each comparison. Default `[0.92]`. + +- `cut_p`: A `Vector{Float}` that specifies the second lower bound for string distance (if varnames in partial) for each comparison. Default `[0.88]`. + +- `jw_weight`: A `Vector{Float}` that specifies the Winkler weight for jw string distance for each comparison. Default `[0.1]`. + +- `address_field`: A `Vector{Bool}` that specifies whether a comparison contains an address field. Default `[false]`. + +- `tol_em`: Convergence tolerance for the EM Algorithm. (default `1e-05`) + +- `threshold_match`: Lower bound for the posterior probability that will act as a cutoff for matches. Default `0.85`. + +- `prior_lambda::Float64`: Default 0.0. + +- `w_lambda::Float64`: Default 0.0. + +- `prior_pi::Float64`: Default 0.0. + +- `w_pi::Float64`: Default 0.0. + +- `dedupe_matches`: Whether to dedupe the matches within the dataset. Default `true`. 
+ + +`fastLink`'s output +__________________ +- A `NamedTuple` with these fields: +- `indices` + +- `iter_converge` + +- `matched_ids` + +- `obs_a` + +- `obs_b` + +- `p_m` + +- `p_u` + +- `patterns_b` + +- `patterns_w` + +- `pgamma_jm` + +- `pgamma_ju` + +- `pgamma_km` + +- `pgamma_ku` + +- `tf_adj_table` + +- `varnames` + +- `zeta_j` + +# Examples +```julia +matched_data = fastLink(dfA, dfB, ["firstname", "lastname", "city"], ("id","id2")) +``` diff --git a/src/fastlink/fastlink.jl b/src/fastlink/fastlink.jl index 3e67969..eeefcc3 100755 --- a/src/fastlink/fastlink.jl +++ b/src/fastlink/fastlink.jl @@ -65,21 +65,27 @@ Algorithm taken from: - Enamorado, Ted, Benjamin Fifield, and Kosuke Imai. 2017. fastLink: Fast Probabilistic Record Linkage with Missing Data. Version 0.6. # Arguments -- `dfA::DataFrame`: The first dataset to be matched. -- `dfB::DataFrame`: The second dataset to be matched. -- `varnames::Vector{String}`: Variable names to be used in the matching. -- `fuzzy::Bool`: Whether to match using a fuzzy string distance for speed (default false). -- `string_case::String`: "upper" or "lower" (only if fuzzy) -- `stringdist_method::String`: String distance method ("jw" Jaro-Winkler (Default), "dl" Damerau-Levenshtein, "jaro" Jaro, "lv" Levenshtein, and "ham" Hamming). -- `cut_a::Float`: Upper bound for string distance cutoff. -- `cut_p::Float`: Lower bound for string distance (if varnames in partial). +- `dfA::DataFrame`: Table of records to be matched. +- `dfB::DataFrame`: Table of records to be matched. +- `varnames::Vector{String}`: A vector that contains the variable names present in both tables. +- `idvar::Tuple{String,String}`: Tuple of unique ids for records in both tables (should differ from each other). Should be formatted so that the arguments are ordered in ("dfa","dfb") order. +- `term_freq_adjustment::Vector{Bool}`: Determines whether you want the term frequencies for each comparison for a given variable. Note: does not adjust match weight. 
+- `match_method::Vector{String}`: Specifies the matching method you would like to do. Match methods supported are "string", "exact", "fuzzy" (jaro-winkler strings only), "numeric", "float", and "int". +- `partials::Vector{Bool}`: Specifies whether you want to do 2 (true) or 1 (false) comparison levels for a given variable. +- `upper_case::Vector{Bool}`: Specifies whether a string column's values are upper case or lower case (only if `match_method` for column is "fuzzy"). +- `stringdist_method::Vector{String}`: String distance method ("jw" Jaro-Winkler (Default), "dl" Damerau-Levenshtein, "jaro" Jaro, "lv" Levenshtein, and "ham" Hamming). +- `cut_a::Vector{Float}`: First lower bound for string distance cutoff. +- `cut_p::Vector{Float}`: Second lower bound for string distance (if varnames in partial). - `jw_weight`: Winkler weight for jw string distance. -- `tol_em`: Convergence tolerance for the EM Algorithm. (default 1e-04) -- `threshold_match`: Lower bound for the posterior probability that will act as a cutoff for matches. -- `dedupe_matches`: Whether to dedupe the matches within the dataset. +- `address_field::Vector{Bool}`: Specifies whether a field is an address field. + # Returns -- `MatchedData::DataFrame`: The resulting DataFrame after matching. +- A `NamedTuple` with these fields: +indices iter_converge matched_ids obs_a +obs_b p_m p_u patterns_b +patterns_w pgamma_jm pgamma_ju pgamma_km +pgamma_ku tf_adj_table varnames zeta_j # Examples ```julia @@ -102,8 +108,7 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, prior_pi = 0.0, w_pi = 0.0, threshold_match = 0.85, - dedupe_matches = true, - verbose = false) + dedupe_matches = true) # dims numvars=length(varnames) obs_a=nrow(dfA) @@ -254,8 +259,7 @@ function fastLink(dfA::DataFrame, dfB::DataFrame; prior_pi = 0.0, w_pi = 0.0, threshold_match = 0.85, - dedupe_matches = true, - verbose = false) + dedupe_matches = true) # idvar to Tuple idvar = (idvar[1],idvar[2])