Skip to content

Commit

Permalink
adding benchmark fastlink func
Browse files Browse the repository at this point in the history
  • Loading branch information
jw2249a committed May 28, 2024
1 parent 268572e commit c33b6ab
Show file tree
Hide file tree
Showing 2 changed files with 161 additions and 32 deletions.
54 changes: 43 additions & 11 deletions src/benchmark.jl
Original file line number Diff line number Diff line change
@@ -1,21 +1,53 @@
using DataFrames
using Base: postoutput
using Pkg
Pkg.develop(path="..")
using FastLink
using DataFrames
using BenchmarkTools
using CSV
import Pkg.Artifacts: @artifact_str
using Profile

outputfile = "../benchmark.csv"

include("utils/prettyprinting.jl")

a_fil="../../rstudio/test_merge/data/test_a.csv"
b_fil="../../rstudio/test_merge/data/test_b.csv"
a_fil="../../../rstudio/test_merge/data/test_a.csv"
b_fil="../../../rstudio/test_merge/data/test_b.csv"


# Linkage configuration used by the benchmark: a "link_only" run over three
# comparison variables, each of which carries "tf_adjust" => true.
config = Dict(
    "link_type" => "link_only",
    "idvar" => ["TV_ID", "TS_ID"],
    "comparisons" => Dict(
        "name" => "total",
        "threshold_match" => 0.88,
        "variables" => [
            Dict("varname" => field, "method" => method, "tf_adjust" => true)
            for (field, method) in (
                ("FIRST_NAME", "jarowinkler"),
                ("MIDDLE_NAME", "exact"),
                ("STREET_NAME", "jarowinkler"),
            )
        ],
    ),
)


# Second linkage configuration: same link type, id variables, threshold and
# comparison variables as `config`, but no variable carries a "tf_adjust" key.
config_tf = Dict(
    "link_type" => "link_only",
    "idvar" => ["TV_ID", "TS_ID"],
    "comparisons" => Dict(
        "name" => "total",
        "threshold_match" => 0.88,
        "variables" => [
            Dict("varname" => field, "method" => method)
            for (field, method) in (
                ("FIRST_NAME", "jarowinkler"),
                ("MIDDLE_NAME", "exact"),
                ("STREET_NAME", "jarowinkler"),
            )
        ],
    ),
)

open(outputfile, "w" do file
write(file,"\"N1\",\"N2\",\"u_FIRST_NAME\",\"u_MIDDLE_NAME\"")


#varnames=["FIRST_NAME"]
varnames=["FIRST_NAME", "MIDDLE_NAME", "LAST_NAME", "STREET_NAME"]
match_type=["fuzzy","fuzzy","fuzzy","fuzzy"]
#varnames=["FIRST_NAME", "MIDDLE_NAME", "LAST_NAME", "STREET_NAME", "STATE"]
#[100,200,500,1_000,2_000,4_000, 5_000, 10_000,20_000, 40_000, 50_000,100_000,1_000_000]
idvars=("TS_ID","TV_ID")
N2=20_000
N1_N=[10_000,50_000,100_000,500_000,750_000,1_000_000]
println("## $(length(varnames)) vars")
Expand All @@ -35,7 +67,7 @@ for N1 in N1_N
ntasks=1,
pool=true,
missingstring=["", "NA", "NaN", "NULL", "Null"])

@btime fastLink($dfA,$dfB,$varnames,$idvars,match_method=$match_type)
res = @benchmark fastLink($dfA,$dfB,$config)

end
139 changes: 118 additions & 21 deletions src/fastlink/fastlink.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,10 @@ Algorithm taken from:
# Arguments
- `dfA::DataFrame`: Table of records to be matched.
- `dfB::DataFrame`: Table of records to be matched.
- `varnames::Vector{String}`: A vector that contains the variable names present in both tables.
- `idvar::Tuple{String,String}`: Tuple of unique ids for records in both tables (should differ from each other). The ids must be given in ("dfa","dfb") order.
- `term_freq_adjustment::Vector{Bool}`: Determines whether you want the term frequencies for each comparison for a given variable. Note: does not adjust match weight.
- `match_method::Vector{String}`: Specifies the matching method you would like to do. Match methods supported are "string","exact","fuzzy" (jaro-winkler strings only),"numeric","float",and "int")
- `partials::Vector{Bool}` Specifies whether you want to do 2 (true) or 1 (false) comparison levels for a given variable.
- `upper_case::Vector{Bool}` that specifies whether a strings column value is upper or lower (only if `match_method` for column is "fuzzy").
- `stringdist_method::Vector{String}`: String distance method ("jw" Jaro-Winkler (Default), "dl" Damerau-Levenshtein, "jaro" Jaro, "lv" Levenshtein, and "ham" Hamming).
- `cut_a::Float`: First lower bound for string distance cutoff.
- `cut_p::Float`: Second lower bound for string distance (if varnames in partial).
- `jw_weight`: Winkler weight for jw string distance.
- `address_field::Vector{Bool}`: Specifies whether a field is an address field.
# Returns
- NamedTuple with these vars
indices iter_converge matched_ids obs_a
obs_b p_m p_u patterns_b
patterns_w pgamma_jm pgamma_ju pgamma_km
pgamma_ku tf_adj_table varnames zeta_j
- `config::Dict`: Configuration for the match.
# Examples
```julia
matched_data = fastLink(dfA, dfB, ["firstname", "lastname", "city"])
matched_data = fastLink(dfA, dfB, config)
```
"""
function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any})
# idvar to Tuple
Expand Down Expand Up @@ -143,3 +124,119 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any})
end
end


"""
    fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any}, benchmark_N::Bool)

Benchmarking variant of `fastLink`. Runs the per-variable comparisons described by
`config`, records the wall-clock seconds spent on each variable, and returns those
timings together with the linkage results.

# Arguments
- `dfA::DataFrame`: Table of records to be matched.
- `dfB::DataFrame`: Table of records to be matched.
- `config::Dict`: Configuration for the match. `config["idvar"]` supplies the id column
  of `dfA` (first entry) and of `dfB` (second entry); the compared variables and their
  settings are obtained through `retrieve` and `fetch_parameters`.
- `benchmark_N::Bool`: Never read in the body; passing it selects this method.

# Returns
A `Dict` with the keys `"ids"`, `"resultsEM"` and `"benchtimes"` (one elapsed time in
seconds per compared variable, in `varnames` order), plus `"resultsTF"` when
`process_comparisons` returns three results.

# Notes
`allowmissing!` is called on both `dfA` and `dfB`, so the inputs are modified in place.
"""
function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any}, benchmark_N::Bool)
    # idvar to Tuple
    idvar = Tuple(config["idvar"])
    # dims
    varnames = retrieve(config, "varname")
    _dims = (nrow(dfA), nrow(dfB))

    # results table
    res = Dict(v => DiBitMatrix(_dims...) for v in varnames)

    # fetch parameters for varnames
    parameters = Dict(v => fetch_parameters(config, v) for v in varnames)

    # term-frequency tables exist only for variables whose parameters enable "tf_adjust"
    tf_tables = Dict{String, Vector{Vector{Float16}}}(
        v => [ones(Float16, _dims[1]), ones(Float16, _dims[2])]
        for v in varnames
        if haskey(parameters[v], "tf_adjust") && parameters[v]["tf_adjust"]
    )

    # structure of the expectation maximization function in order of the ability to be executed
    emlink_configuration = parse_configuration(config)

    # allow missing for comparisons
    allowmissing!(dfA)
    allowmissing!(dfB)

    # elapsed seconds per variable, in the same order as `varnames`
    benchtimes = Float64[]
    for v in varnames
        starttime = time()
        match_method = lowercase(parameters[v]["method"])
        tf_values = retrieve(parameters[v], "tf_adjust")
        term_freq_adjustment = !isempty(tf_values) && first(tf_values)

        @info "Now matching var $(v) using $(match_method) with tf_adjust: $term_freq_adjustment"
        if term_freq_adjustment
            comparisons_args = namedtuple(remove_keys(parameters[v], ["method", "varname", "tf_adjust", "tf_adjustment_weight"]))

            if match_method == "fuzzy"
                gammaCKfuzzy!(dfA[!, v],
                              dfB[!, v],
                              res[v],
                              view(tf_tables[v][1], :),
                              view(tf_tables[v][2], :);
                              comparisons_args...)
            elseif match_method == "string"
                gammaCKpar!(dfA[!, v],
                            dfB[!, v],
                            res[v],
                            view(tf_tables[v][1], :),
                            view(tf_tables[v][2], :);
                            comparisons_args...)
            elseif match_method in keys(STRING_DISTANCE_METHODS)
                gammaCKpar!(dfA[!, v],
                            dfB[!, v],
                            res[v],
                            view(tf_tables[v][1], :),
                            view(tf_tables[v][2], :);
                            distmethod=STRING_DISTANCE_METHODS[match_method],
                            comparisons_args...)
            elseif match_method == "exact" || match_method == "bool"
                gammaKpar!(dfA[!, v],
                           dfB[!, v],
                           res[v],
                           view(tf_tables[v][1], :),
                           view(tf_tables[v][2], :);
                           comparisons_args...)
            elseif match_method == "numeric" || match_method == "float" || match_method == "int"
                # the numeric comparison is called without the term-frequency tables
                gammaNUMCKpar!(dfA[!, v],
                               dfB[!, v],
                               res[v];
                               comparisons_args...)
            else
                @warn "Unsupported match method $(match_method) for var $(v); no comparison performed"
            end
        else
            comparisons_args = namedtuple(remove_keys(parameters[v], ["method", "varname"]))

            if match_method == "fuzzy"
                gammaCKfuzzy!(dfA[!, v],
                              dfB[!, v],
                              res[v];
                              comparisons_args...)
            elseif match_method == "string"
                gammaCKpar!(dfA[!, v],
                            dfB[!, v],
                            res[v];
                            comparisons_args...)
            elseif match_method in keys(STRING_DISTANCE_METHODS)
                gammaCKpar!(dfA[!, v],
                            dfB[!, v],
                            res[v];
                            distmethod=STRING_DISTANCE_METHODS[match_method],
                            comparisons_args...)
            elseif match_method == "exact" || match_method == "bool"
                gammaKpar!(dfA[!, v],
                           dfB[!, v],
                           res[v];
                           comparisons_args...)
            elseif match_method == "numeric" || match_method == "float" || match_method == "int"
                gammaNUMCKpar!(dfA[!, v],
                               dfB[!, v],
                               res[v];
                               comparisons_args...)
            else
                @warn "Unsupported match method $(match_method) for var $(v); no comparison performed"
            end
        end
        push!(benchtimes, time() - starttime)
    end

    results = process_comparisons(res, emlink_configuration, _dims, parameters, tf_tables)

    # idvar[1] is the id column of dfA, idvar[2] the id column of dfB
    ids = indices_to_uids(dfA[!, idvar[1]], dfB[!, idvar[2]], results[1].indices)

    if length(results) == 3
        return Dict("ids" => ids,
                    "resultsEM" => results[2],
                    "resultsTF" => results[3],
                    "benchtimes" => benchtimes)
    else
        return Dict("ids" => ids,
                    "resultsEM" => results[2],
                    "benchtimes" => benchtimes)
    end
end

0 comments on commit c33b6ab

Please sign in to comment.