diff --git a/src/fastlink/fastlink.jl b/src/fastlink/fastlink.jl index dc1824f..9f118af 100755 --- a/src/fastlink/fastlink.jl +++ b/src/fastlink/fastlink.jl @@ -132,7 +132,7 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any}) end results = process_comparisons(res, emlink_configuration, _dims, parameters, tf_tables) - + if length(results) == 3 return Dict("ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices), "resultsEM" => results[2], diff --git a/src/gammas/gammaCKpar.jl b/src/gammas/gammaCKpar.jl index 3214cc8..affd737 100644 --- a/src/gammas/gammaCKpar.jl +++ b/src/gammas/gammaCKpar.jl @@ -184,7 +184,7 @@ function gammaCKpar!(vecA::PooledVector,vecB::PooledVector, # term frequency adjustment for x tf_val_x = length(missingindices)/_dims[1] - for tf_i in missingindices + Threads.@threads for tf_i in missingindices tf_table_x[tf_i] = max(tf_val_y, tf_minimum_u_value) end @@ -200,7 +200,7 @@ function gammaCKpar!(vecA::PooledVector,vecB::PooledVector, missingindices = findall(vecB.refs .== missingvals_y) # term frequency adjustment for y tf_val_y = length(missingindices)/_dims[2] - for tf_i in missingindices + Threads.@threads for tf_i in missingindices tf_table_y[tf_i] = max(tf_val_y, tf_minimum_u_value) end diff --git a/src/patterns.jl b/src/patterns.jl index 16ee2c4..c123212 100644 --- a/src/patterns.jl +++ b/src/patterns.jl @@ -90,16 +90,14 @@ end function get_match_patterns(res::Vector{DiBitMatrix}, tf_tables::Dict{String, Vector{Vector{Float16}}}, tf_vars::Vector{String}, tf_indices::Vector{Int64}, isexact=Bool[]) - tf_patterns = Dict("relevant_tf_indices"=>Vector{Int64}[], - "tf_denom_vals"=>Vector{Vector{Float16}}[]) - + "tf_denom_vals"=>Vector{Vector{Float16}}[]) matches=MatchPatterns() N = length(res) dimy=res[1].nrows len=Int(res[1].data.len) lk = ReentrantLock() - Threads.@threads for first_loc in 0:1024:len + for first_loc in 0:1024:len last_loc = first_loc + 1024 if last_loc > len last_loc=len @@ -108,22 +106,20 @@ function get_match_patterns(res::Vector{DiBitMatrix}, tf_tables::Dict{String, Ve patterns=get_local_patterns(x,N,last_loc-first_loc) for i in eachindex(patterns.hashes) lock(lk) do - id = findfirst(patterns.hashes[i] .=== matches.hashes) pattern_indices = get_2Dindex.(first_loc .+ patterns.indices[i],dimy) if isnothing(id) push!(matches.patterns,patterns.patterns[i]) push!(matches.hashes,patterns.hashes[i]) push!(matches.indices, pattern_indices) - relevant_tf_indices = find_tf_pattern_vars(patterns.patterns[i], tf_indices) + tfi_ids = [findfirst(tfi .== tf_indices) for tfi in relevant_tf_indices] push!(tf_patterns["relevant_tf_indices"], relevant_tf_indices) - push!(tf_patterns["tf_denom_vals"], [match_level_tf_lookup(tf_tables[tf_vars[tfi]], pattern_indices, isexact[tfi]) - for tfi in relevant_tf_indices]) + push!(tf_patterns["tf_denom_vals"], [match_level_tf_lookup(tf_tables[tf_vars[tfi]], pattern_indices, isexact[tfi]) for tfi in tfi_ids]) else + tfi_ids = [findfirst(tfi .== tf_indices) for tfi in tf_patterns["relevant_tf_indices"][id]] append!(matches.indices[id], pattern_indices) - - for (tfi_loc, tfi) in enumerate(tf_patterns["relevant_tf_indices"][id]) + for (tfi_loc, tfi) in enumerate(tfi_ids) append!(tf_patterns["tf_denom_vals"][id][tfi_loc], match_level_tf_lookup(tf_tables[tf_vars[tfi]], pattern_indices, isexact[tfi])) end end @@ -224,7 +220,7 @@ function match_and_link(patterns::Vector{DiBitMatrix}, e::Dict{String, Any}, _di e["parameters"]...) tf_prior_weights = get_tf_adjustment_prior_weights(parameters, tf_vars) - resultsTF = generate_tf_adjustment_dict(resultsEM, tf_patterns, tf_prior_weights; base="log") + resultsTF = generate_tf_adjustment_dict(resultsEM, e, tf_vars, tf_patterns, tf_prior_weights; base="log") if e["name"] != final_name return patterns_to_DiBit(resultsTF, counts.indices, _dims) diff --git a/src/term_frequency_adjustment.jl b/src/term_frequency_adjustment.jl index 3fc3a65..eea8bd9 100644 --- a/src/term_frequency_adjustment.jl +++ b/src/term_frequency_adjustment.jl @@ -72,14 +72,17 @@ function pattern_tf_adjustment!(tf_result::Dict{String, Any}, colnames::Vector{S return nothing end -function generate_tf_adjustment_dict(EMOutput::Dict{String,Any}, tfPatterns::Dict{String,Vector}, tf_prior_weights::Vector{Float64}; base="log2") +function generate_tf_adjustment_dict(EMOutput::Dict{String,Any},e::Dict{String, Any}, tf_vars::Vector{String}, tfPatterns::Dict{String,Vector}, tf_prior_weights::Vector{Float64}; base="log2") tfResults = generate_tf_skeleton(EMOutput, tfPatterns["relevant_tf_indices"]) threshold_match = EMOutput["threshold_match"] for pattern_id in collect(1:EMOutput["number_of_unique_patterns"]) - colindices = tfPatterns["relevant_tf_indices"][pattern_id] + colindices = tfPatterns["relevant_tf_indices"][pattern_id] + count = EMOutput["patterns_w"].counts[pattern_id] tf_uvals = get_tf_u_values(EMOutput["patterns_w"], colindices, pattern_id) - tf_pw = tf_prior_weights[colindices] + + ci = [findfirst(v .== tf_vars) for v in e["variables"][colindices]] + tf_pw = tf_prior_weights[ci] if tfResults[pattern_id]["tf_adjusted"] diff --git a/test_parameters.json b/test_parameters.json index 1a52fc8..d7c62d2 100644 --- a/test_parameters.json +++ b/test_parameters.json @@ -9,13 +9,14 @@ "variables": [ {"varname": "firstname", "method": "fuzzy", "partial": true, "cut_a": 0.92, "cut_b": 0.88, "upper": true, "tf_adjust": true, "w": 0.1}, {"varname": "middlename", "method": "exact"}, - {"varname": "lastname", "method": "jarowinkler"}, + {"varname": "lastname", "method": "jarowinkler", "tf_adjust": true}, + {"varname": "birthyear", "method": "exact"}, { "comparisons": { "name": "address", "threshold_match": 0.92, "variables": [ - {"varname": "housenum", "method": "exact", "tf_adjust": true, "tf_adjustment_weight":0.5, "tf_minimum_u_value": 0.001}, + {"varname": "housenum", "method": "exact", "tf_adjust": true}, {"varname": "streetname", "method": "jarowinkler", "w": 0.1, "tf_adjust": true, "tf_adjustment_weight":0.25, "tf_minimum_u_value": 0.001}, {"varname": "city", "method": "jarowinkler", "tf_adjustment_weight":0.15, "tf_adjust": true} ]