adding benchmark fastlink func

jw2249a · May 28, 2024 · c33b6ab · c33b6ab
1 parent 268572e
commit c33b6ab
Show file tree

Hide file tree

Showing 2 changed files with 161 additions and 32 deletions.
diff --git a/src/benchmark.jl b/src/benchmark.jl
@@ -1,21 +1,53 @@
-using DataFrames
+using Base: postoutput
+using Pkg
+Pkg.develop(path="..")
 using FastLink
+using DataFrames
 using BenchmarkTools
 using CSV
 import Pkg.Artifacts: @artifact_str
 using Profile
 
+outputfile = "../benchmark.csv"
+
 include("utils/prettyprinting.jl")
 
-a_fil="../../rstudio/test_merge/data/test_a.csv"
-b_fil="../../rstudio/test_merge/data/test_b.csv"
+a_fil="../../../rstudio/test_merge/data/test_a.csv"
+b_fil="../../../rstudio/test_merge/data/test_b.csv"
+
+
+# Link configuration passed to `fastLink` by the benchmark call in this script.
+# Three comparison variables under one "total" comparison with a 0.88 match
+# threshold; every variable has term-frequency adjustment ("tf_adjust") switched on.
+config = Dict("link_type"=>"link_only",
+     "idvar"=> ["TV_ID", "TS_ID"],
+     "comparisons"=> Dict("name" => "total",
+                         "threshold_match" => 0.88,
+                         "variables" => [
+                             Dict("varname" => "FIRST_NAME",
+                                  "method" => "jarowinkler",
+                                  "tf_adjust" => true),
+                             Dict("varname" => "MIDDLE_NAME",
+                                  "method" => "exact",
+                                  "tf_adjust" => true),
+                             Dict("varname" => "STREET_NAME",
+                                  "method" => "jarowinkler",
+                                  "tf_adjust" => true) ]))
+
+
+# Same id columns, threshold and comparison variables as `config`, but no variable
+# carries a "tf_adjust" entry.
+# NOTE(review): the `_tf` suffix suggests this was meant to be the tf-adjusted
+# variant, yet it is `config` that sets "tf_adjust" => true — confirm the two names
+# are not swapped.
+config_tf = Dict("link_type"=>"link_only",
+     "idvar"=> ["TV_ID", "TS_ID"],
+     "comparisons"=> Dict("name" => "total",
+                         "threshold_match" => 0.88,
+                         "variables" => [
+                             Dict("varname" => "FIRST_NAME",
+                                  "method" => "jarowinkler"),
+                             Dict("varname" => "MIDDLE_NAME",
+                                  "method" => "exact"),
+                             Dict("varname" => "STREET_NAME",
+                                  "method" => "jarowinkler")]))
+
+# Create (or truncate) the results file and write its CSV header row.
+# The `do` block must follow the closed call so that `open` closes the file for us.
+# NOTE(review): the header has no trailing newline — add one before appending rows.
+open(outputfile, "w") do file
+    write(file,"\"N1\",\"N2\",\"u_FIRST_NAME\",\"u_MIDDLE_NAME\"")
+end
+
 
-#varnames=["FIRST_NAME"]
-varnames=["FIRST_NAME", "MIDDLE_NAME", "LAST_NAME", "STREET_NAME"]
-match_type=["fuzzy","fuzzy","fuzzy","fuzzy"]
-#varnames=["FIRST_NAME", "MIDDLE_NAME", "LAST_NAME", "STREET_NAME", "STATE"]
-#[100,200,500,1_000,2_000,4_000, 5_000, 10_000,20_000, 40_000, 50_000,100_000,1_000_000]
-idvars=("TS_ID","TV_ID")
 N2=20_000
 N1_N=[10_000,50_000,100_000,500_000,750_000,1_000_000]
 println("## $(length(varnames)) vars")
@@ -35,7 +67,7 @@ for N1 in N1_N
                  ntasks=1,
                  pool=true,
                  missingstring=["", "NA", "NaN", "NULL", "Null"])
-
-    @btime fastLink($dfA,$dfB,$varnames,$idvars,match_method=$match_type)
+    
+    res = @benchmark fastLink($dfA,$dfB,$config)
 
 end
diff --git a/src/fastlink/fastlink.jl b/src/fastlink/fastlink.jl
@@ -9,29 +9,10 @@ Algorithm taken from:
 # Arguments
 - `dfA::DataFrame`: Table of records to be matched.
 - `dfB::DataFrame`: Table of records to be matched.
-- `varnames::Vector{String}`: A vector that contains the variable names present in both tables.
-- `idvar::Tuple{String,String}`: Tuple of unique ids for records in both tables (should differ from each other). Should be formatted as so that the arguments are ordered in ("dfa","dfb") order.
-- `term_freq_adjustment::Vector{Bool}`: Determines whether you want the term frequencies for each comparision for a given variable. Note: does not adjust match weight.
-- `match_method::Vector{String}`: Specifies the matching method you would like to do. Match methods supported are "string","exact","fuzzy" (jaro-winkler strings only),"numeric","float",and "int")
-- `partials::Vector{Bool}` Specifies whether you want to do 2 (true) or 1 (false) comparison levels for a given variable.
-- `upper_case::Vector{Bool}` that specifies whether a strings column value is upper or lower (only if `match_method` for column is "fuzzy").
-- `stringdist_method::Vector{String}`: String distance method ("jw" Jaro-Winkler (Default), "dl" Damerau-Levenshtein, "jaro" Jaro, "lv" Levenshtein, and "ham" Hamming).
-- `cut_a::Float`: First lower bound for string distance cutoff.
-- `cut_p::Float`: Second lower bound for string distance (if varnames in partial).
-- `jw_weight`: Winkler weight for jw string distance.
-- `address_field::Vector{Bool}`: Specifies whether a field is an address field.
-
-
-# Returns
-- NamedTuple with these vars
-indices        iter_converge  matched_ids    obs_a
-obs_b          p_m            p_u            patterns_b
-patterns_w     pgamma_jm      pgamma_ju      pgamma_km
-pgamma_ku      tf_adj_table   varnames       zeta_j
-
+- `config::Dict{String,Any}`: Configuration for the match. Must contain `"idvar"`, the pair of unique id column names ordered as (dfA, dfB).
 # Examples
 ```julia
-matched_data = fastLink(dfA, dfB, ["firstname", "lastname", "city"])
+matched_data = fastLink(dfA, dfB, config)
+```
 """
 function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any})
     # idvar to Tuple
@@ -143,3 +124,119 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any})
     end
 end
 
+
+"""
+    fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any}, benchmark_N::Bool)
+
+Benchmarking variant of `fastLink`: run the per-variable comparisons described by
+`config`, record the wall-clock seconds spent on each variable, and return those
+timings alongside the linkage results.
+
+# Arguments
+- `dfA::DataFrame`, `dfB::DataFrame`: Tables of records to be matched. Both are passed to `allowmissing!`.
+- `config::Dict{String,Any}`: Configuration for the match. `config["idvar"]` holds the id column names ordered as (dfA, dfB).
+- `benchmark_N::Bool`: Only selects this method through dispatch; its value is never read.
+
+# Returns
+A `Dict` with keys `"ids"`, `"resultsEM"` and `"benchtimes"` (one `Float64` of elapsed
+seconds per variable, in `varnames` order), plus `"resultsTF"` when
+`process_comparisons` returns three elements.
+"""
+function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any}, benchmark_N::Bool)
+    # idvar to Tuple; looked up first so a missing "idvar" key fails before any work is done
+    idvar = Tuple(config["idvar"])
+    # dims
+    varnames = retrieve(config, "varname")
+    _dims = (nrow(dfA), nrow(dfB))
+
+    # results table
+    res = Dict(v => DiBitMatrix(_dims...) for v in varnames)
+
+    # fetch parameters for varnames
+    parameters = Dict(v => fetch_parameters(config, v) for v in varnames)
+
+    # term-frequency tables exist only for variables that request tf_adjust
+    tf_tables = Dict{String, Vector{Vector{Float16}}}(
+        v => [ones(Float16, _dims[1]), ones(Float16, _dims[2])]
+        for v in varnames if haskey(parameters[v], "tf_adjust") && parameters[v]["tf_adjust"]
+    )
+
+    # structure of the expectation maximization function in order of the ability to be executed
+    emlink_configuration = parse_configuration(config)
+
+    # allow missing for comparisons
+    allowmissing!(dfA)
+    allowmissing!(dfB)
+
+    # elapsed seconds per variable; typed so callers do not receive a Vector{Any}
+    benchtimes = Float64[]
+    for v in varnames
+        starttime = time()
+        match_method = lowercase(parameters[v]["method"])
+        tf_flags = retrieve(parameters[v], "tf_adjust")
+        term_freq_adjustment = !isempty(tf_flags) && first(tf_flags)
+
+        @info "Now matching var $(v) using $(match_method) with tf_adjust: $term_freq_adjustment"
+        if term_freq_adjustment
+            comparisons_args = namedtuple(remove_keys(parameters[v], ["method", "varname", "tf_adjust", "tf_adjustment_weight"]))
+
+            if match_method == "fuzzy"
+                gammaCKfuzzy!(dfA[!,v],
+                              dfB[!,v],
+                              res[v],
+                              view(tf_tables[v][1],:),
+                              view(tf_tables[v][2],:);
+                              comparisons_args...)
+            elseif match_method == "string"
+                gammaCKpar!(dfA[!,v],
+                            dfB[!,v],
+                            res[v],
+                            view(tf_tables[v][1],:),
+                            view(tf_tables[v][2],:);
+                            comparisons_args...)
+            elseif match_method ∈ keys(STRING_DISTANCE_METHODS)
+                gammaCKpar!(dfA[!,v],
+                            dfB[!,v],
+                            res[v],
+                            view(tf_tables[v][1],:),
+                            view(tf_tables[v][2],:);
+                            distmethod=STRING_DISTANCE_METHODS[match_method],
+                            comparisons_args...)
+            elseif match_method == "exact" || match_method == "bool"
+                gammaKpar!(dfA[!,v],
+                           dfB[!,v],
+                           res[v],
+                           view(tf_tables[v][1],:),
+                           view(tf_tables[v][2],:);
+                           comparisons_args...)
+            elseif match_method == "numeric" || match_method == "float" || match_method == "int"
+                # numeric comparisons take no term-frequency tables
+                gammaNUMCKpar!(dfA[!,v],
+                               dfB[!,v],
+                               res[v];
+                               comparisons_args...)
+            else
+                # previously a silent no-op, which left res[v] untouched and the timing meaningless
+                @warn "Unrecognized match method $(match_method) for var $(v); no comparison was run"
+            end
+        else
+            comparisons_args = namedtuple(remove_keys(parameters[v], ["method", "varname"]))
+
+            if match_method == "fuzzy"
+                gammaCKfuzzy!(dfA[!,v],
+                              dfB[!,v],
+                              res[v];
+                              comparisons_args...)
+            elseif match_method == "string"
+                gammaCKpar!(dfA[!,v],
+                            dfB[!,v],
+                            res[v];
+                            comparisons_args...)
+            elseif match_method ∈ keys(STRING_DISTANCE_METHODS)
+                gammaCKpar!(dfA[!,v],
+                            dfB[!,v],
+                            res[v];
+                            distmethod=STRING_DISTANCE_METHODS[match_method],
+                            comparisons_args...)
+            elseif match_method == "exact" || match_method == "bool"
+                gammaKpar!(dfA[!,v],
+                           dfB[!,v],
+                           res[v];
+                           comparisons_args...)
+            elseif match_method == "numeric" || match_method == "float" || match_method == "int"
+                gammaNUMCKpar!(dfA[!,v],
+                               dfB[!,v],
+                               res[v];
+                               comparisons_args...)
+            else
+                # previously a silent no-op, which left res[v] untouched and the timing meaningless
+                @warn "Unrecognized match method $(match_method) for var $(v); no comparison was run"
+            end
+        end
+        push!(benchtimes, time() - starttime)
+    end
+
+    results = process_comparisons(res, emlink_configuration, _dims, parameters, tf_tables)
+
+    # record ids are the same in both return shapes, so resolve them once
+    ids = indices_to_uids(dfA[!, idvar[1]], dfB[!, idvar[2]], results[1].indices)
+
+    if length(results) == 3
+        return Dict("ids" => ids,
+                    "resultsEM" => results[2],
+                    "resultsTF" => results[3],
+                    "benchtimes" => benchtimes)
+    else
+        return Dict("ids" => ids,
+                    "resultsEM" => results[2],
+                    "benchtimes" => benchtimes)
+    end
+end
+