From c54a70439e77a7630aea4db745dca8c9b5c9aa5a Mon Sep 17 00:00:00 2001 From: jw2249a Date: Mon, 25 Mar 2024 04:47:54 -0400 Subject: [PATCH] term frequency adjustment fix --- .dir-locals.el | 0 Project.toml | 2 +- scratch.jl | 17 ++-- src/encode/soundex.jl | 67 +++++++++++++++ src/fastlink/fastlink.jl | 108 ++++++++++++++++++------- src/gammas/gammaCKfuzzy.jl | 85 +++++++++++++++++++ src/gammas/gammaCKpar.jl | 162 +++++++++++++++++++++++++++++++++++++ src/getMatches.jl | 19 ++++- src/matchPatterns.jl | 3 +- 9 files changed, 422 insertions(+), 41 deletions(-) delete mode 100644 .dir-locals.el create mode 100644 src/encode/soundex.jl diff --git a/.dir-locals.el b/.dir-locals.el deleted file mode 100644 index e69de29..0000000 diff --git a/Project.toml b/Project.toml index 79244d1..dfd8b06 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +StaticStrings = "4db0a0c5-418a-4e1d-8806-cb305fe13294" StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404" [extras] @@ -16,6 +17,5 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - [targets] test = ["Test", "CSV", "Pkg"] diff --git a/scratch.jl b/scratch.jl index 44a0b76..ee80e9f 100755 --- a/scratch.jl +++ b/scratch.jl @@ -1,5 +1,6 @@ using Pkg -#Pkg.develop(path=".") +Pkg.develop(path=".") +Pkg.precompile() using DataFrames using BenchmarkTools using CSV @@ -26,6 +27,8 @@ dfB=CSV.read("$(b_fil)/dfB.csv", DataFrame, ntasks=1, pool=true, missingstring=["", "NA"]) +dfA.id = hash.(eachrow(dfA)) +dfB.id2 = hash.(eachrow(dfB)) for var in varnames[1:3] @@ -33,16 +36,12 @@ for var in varnames[1:3] dfB[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfB[:,var])) end -config = 
fastLink(dfA,dfB,varnames,match_method=match_method,cut_a=cut_a,cut_p=cut_p, +results=fastLink(dfA,dfB,varnames,("id","id2"), + match_method=match_method, + term_freq_adjustment=[true], + cut_a=cut_a,cut_p=cut_p, threshold_match = 0.85) -dump(config.fastlink_settings.comparison_funs[4]) -results=fastLink(dfA,dfB,varnames,match_method=match_method,cut_a=cut_a,cut_p=cut_p, - threshold_match = 0.85)() - -x=results[1].patterns_w -x[findall(ismissing.(x.gamma_4) .== false .&& x.gamma_4 .== 1),:] -x[findall(ismissing.(x.gamma_4)),:] diff --git a/src/encode/soundex.jl b/src/encode/soundex.jl new file mode 100644 index 0000000..79044ae --- /dev/null +++ b/src/encode/soundex.jl @@ -0,0 +1,67 @@ +module Soundex +export soundex +import StaticStrings: StaticString + +function encode(chr::Char)::Char + if in(chr, ['a', 'e', 'i', 'o','y', 'u']) + return '9' + elseif in(chr,['s','c','k','j','g','z','x','q']) + return '2' + elseif in(chr,['n','m']) + return '5' + elseif 'l' == chr + return '4' + elseif 'r' == chr + return '6' + elseif in(chr,['t','d']) + return '3' + elseif in(chr,['h','w']) + return '8' + elseif in(chr,['v','b','p','f']) + return '1' + elseif chr === ' ' + return '7' + else + @error "Unknown character encountered $chr" + end +end + +function soundex(s::T)::StaticString{4} where T <: AbstractString + output = ['0' for _ in 1:4] + index = 1 + chr=lowercase(s[1]) + previous_encoding = encode(chr) + output[index] = uppercase(chr) + for chr in s[2:end] + if index <= 3 + encoding = encode(lowercase(chr)) + if encoding === '7' + # continue if space + continue + end + # if vowel or ignorable vars 'h' or 'w' + if encoding !== '9' && encoding !== '8' + if encoding !== previous_encoding + # Rule 4 on Consonant Separators + if encoding === output[index] + # If a vowel (A, E, I, O, U) separates two consonants that have the same code, code consonant to the right of vowel + if previous_encoding === '9' + index += 1 + output[index] = encoding + end + # If "H" or "W" separate two 
consonants that have code, the consonant to the right of the vowel is not coded + else + # different letter code soundex + index += 1 + output[index] = encoding + end + end + end + else + break + end + previous_encoding = encoding + end + return StaticString(String(output)) +end +end # module diff --git a/src/fastlink/fastlink.jl b/src/fastlink/fastlink.jl index 3b70f54..24c2a47 100755 --- a/src/fastlink/fastlink.jl +++ b/src/fastlink/fastlink.jl @@ -88,6 +88,7 @@ matched_data = fastLink(dfA, dfB, ["firstname", "lastname", "city"]) function fastLink(dfA::DataFrame, dfB::DataFrame, varnames::Vector{String}, idvar::Tuple{String,String}; + term_freq_adjustment=[false], match_method=String[], partials=[true], upper_case=[true], @@ -116,11 +117,17 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, cut_a = check_input_lengths(cut_a, numvars, "cut_a") cut_p = check_input_lengths(cut_p, numvars, "cut_p") address_field = check_input_lengths(address_field, numvars, "address_field") + term_freq_adjustment = check_input_lengths(term_freq_adjustment, numvars, "term_freq_adjustment") stringdist_method = check_input_lengths(stringdist_method, numvars, "stringdist_method") vartypes, comparison_levels = check_var_types(dfA,dfB,varnames,match_method,partials) - + + # results table res = [DiBitMatrix(obs_a,obs_b) for _ in varnames] + + # term frequency tables + tf_table_x = [ones(Float16,dims[1]) for _ in varnames] + tf_table_y = [ones(Float16,dims[2]) for _ in varnames] # allow missing for comparisons allowmissing!(dfA) @@ -131,31 +138,67 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, for i in eachindex(varnames) @info "Now matching var $(varnames[i]) using $(match_method[i])" if match_method[i] == "fuzzy" - gammaCKfuzzy!(dfA[!,varnames[i]], - dfB[!,varnames[i]], - res[i], - dims, - cut_a=cut_a[i], - cut_b=cut_p[i], - upper=upper_case[i], - w=jw_weight[i], - partial=partials[i]) + if term_freq_adjustment[i] + gammaCKfuzzy!(dfA[!,varnames[i]], + dfB[!,varnames[i]], + res[i], 
+ dims, + view(tf_table_x[i],:), + view(tf_table_y[i],:), + cut_a=cut_a[i], + cut_b=cut_p[i], + upper=upper_case[i], + w=jw_weight[i], + partial=partials[i]) + else + gammaCKfuzzy!(dfA[!,varnames[i]], + dfB[!,varnames[i]], + res[i], + dims, + cut_a=cut_a[i], + cut_b=cut_p[i], + upper=upper_case[i], + w=jw_weight[i], + partial=partials[i]) + end elseif match_method[i] == "string" - gammaCKpar!(dfA[!,varnames[i]], - dfB[!,varnames[i]], - res[i], - dims, - distmethod=stringdist_method[i], - cut_a=cut_a[i], - cut_b=cut_p[i], - w=jw_weight[i], - partial=partials[i]) - + if term_freq_adjustment[i] + gammaCKpar!(dfA[!,varnames[i]], + dfB[!,varnames[i]], + res[i], + dims, + view(tf_table_x[i],:), + view(tf_table_y[i],:), + distmethod=stringdist_method[i], + cut_a=cut_a[i], + cut_b=cut_p[i], + w=jw_weight[i], + partial=partials[i]) + else + gammaCKpar!(dfA[!,varnames[i]], + dfB[!,varnames[i]], + res[i], + dims, + distmethod=stringdist_method[i], + cut_a=cut_a[i], + cut_b=cut_p[i], + w=jw_weight[i], + partial=partials[i]) + end elseif match_method[i] == "exact" || match_method[i] == "bool" - gammaKpar!(dfA[!,varnames[i]], - dfB[!,varnames[i]], - res[i], - dims) + if term_freq_adjustment[i] + gammaKpar!(dfA[!,varnames[i]], + dfB[!,varnames[i]], + res[i], + dims, + view(tf_table_x[i],:), + view(tf_table_y[i],:)) + else + gammaKpar!(dfA[!,varnames[i]], + dfB[!,varnames[i]], + res[i], + dims) + end elseif match_method == "numeric" || match_method=="float" || match_method == "int" gammaNUMCKpar!(dfA[!,varnames[i]], dfB[!,varnames[i]], @@ -176,11 +219,20 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, address_field=address_field) # testing removing uncessessary indices (where no obs exist) #remove_no_matched_var_indices(resultsEM) - # adding uids - resultsEM = merge(resultsEM, (matched_ids = indices_to_uids(dfA[!, idvar[1]],dfB[!, idvar[2]],resultsEM.indices),)) + # adding uids + + if any(term_freq_adjustment) + resultsEM = merge(resultsEM, (matched_ids = 
indices_to_uids(dfA[!, idvar[1]],dfB[!, idvar[2]],resultsEM.indices), + tf_adj_table = tf_adj_table(resultsEM,varnames,tf_table_x,tf_table_y))) + + else + resultsEM = merge(resultsEM, (matched_ids = indices_to_uids(dfA[!, idvar[1]],dfB[!, idvar[2]],resultsEM.indices),)) + end + + @info "Retrieving matches" - getMatches(resultsEM,threshold_match=threshold_match) + getMatches!(resultsEM,threshold_match=threshold_match) return (resultsEM) end @@ -282,7 +334,7 @@ function fastLink(dfA::DataFrame, dfB::DataFrame; @info "Retrieving matches" - getMatches(resultsEM,threshold_match=threshold_match) + getMatches!(resultsEM,threshold_match=threshold_match) return (resultsEM) end diff --git a/src/gammas/gammaCKfuzzy.jl b/src/gammas/gammaCKfuzzy.jl index 2084cdf..82616c0 100644 --- a/src/gammas/gammaCKfuzzy.jl +++ b/src/gammas/gammaCKfuzzy.jl @@ -235,3 +235,88 @@ function gammaCKfuzzy!(vecA::PooledVector,vecB::PooledVector,results::DiBitMatri end return nothing end + + +function gammaCKfuzzy!(vecA::PooledVector,vecB::PooledVector,results::DiBitMatrix,dims::Tuple{Int,Int}, + tf_table_x::SubArray{Float16}, + tf_table_y::SubArray{Float16}; + cut_a::Float64=0.92,cut_b::Float64=0.88,upper::Bool=true, + w::Float64=0.1,partial::Bool=true) + + + # functions that update the results view + if partial + score_value! = score_value2 + else + score_value! 
= score_value + end + + # change the range of ascii characters dependent on case + if upper + space_char,max_char = 0x40,0x5a + else + space_char,max_char = 0x60,0x7a + end + # length of unique values + lenA = UInt32(length(vecA.pool)) + lenB = UInt32(length(vecB.pool)) + + # vector of pool value indices + lookup_a_by_id=pool_lookup_table(vecA.refs, lenA) + lookup_b_by_id=pool_lookup_table(vecB.refs, lenB) + + dims=(length(vecA),length(vecB)) + + # term frequency for x + Threads.@threads for i in lookup_a_by_id + tf_val=length(i)/dims[1] + for ii in i + tf_table_x[ii] = tf_val + end + end + + # term frequency for y + Threads.@threads for i in lookup_b_by_id + tf_val=length(i)/dims[2] + for ii in i + tf_table_y[ii] = tf_val + end + end + + missingindexA = find_missing_index(vecA.invpool) + + base_candidate_lookup = build_candidate_lookup(vecB.pool,spaceletter=space_char,lastletter=max_char) + base_candidate_scores = build_candidate_scores(vecB.pool) + + Threads.@threads for (query_name,new_a_id) in collect(vecA.invpool) + # pass if query is missing val + if new_a_id === missingindexA + update_results!(results, lookup_a_by_id[new_a_id],UInt32(1):UInt32(dims[2]),missingval) + continue + end + + query_len = UInt8(min(ncodeunits(query_name),16)) + query_masks_lookup = maskify(query_name,query_len,space_char=space_char,max_char=max_char) + query_partial = UInt16(1024 ÷ query_len) + candidate_scores = deepcopy(base_candidate_scores) + + for (query_index, (letter_index, query_mask_by_candidate_len)) in enumerate(query_masks_lookup) + for c_info in base_candidate_lookup[letter_index] + candidate_score = candidate_scores[c_info.name_index] + query_mask = query_mask_by_candidate_len[c_info.len] + score_letter!(candidate_score, query_mask, c_info.mask, query_index) + end + end + + a_ids = lookup_a_by_id[new_a_id] + for (score_i, score) in enumerate(candidate_scores) + if score.len_partial === UInt16(1) + update_results!(results, a_ids, lookup_b_by_id[score_i], missingval) + 
continue + end + # if present calculate scores + score_value!(results, score,query_partial, a_ids,lookup_b_by_id,score_i, w, cut_a, cut_b) + end + end + return nothing +end diff --git a/src/gammas/gammaCKpar.jl b/src/gammas/gammaCKpar.jl index 4165cbc..ee90643 100644 --- a/src/gammas/gammaCKpar.jl +++ b/src/gammas/gammaCKpar.jl @@ -96,3 +96,165 @@ function gammaCKpar!(vecA::PooledVector,vecB::PooledVector, # Return nothing return nothing end + +# term frequency adjusted version +function gammaCKpar!(vecA::PooledVector,vecB::PooledVector, + results::DiBitMatrix,dims::Tuple{Int,Int}, + tf_table_x::SubArray{Float16}, + tf_table_y::SubArray{Float16}; + distmethod="jw",cut_a=0.92,cut_b=0.88,partial=true,w=0.1) + + # assign distance function + if distmethod=="jw" + distance = JaroWinkler(p=w) + elseif distmethod=="dl" + distance = DamerauLevenshtein() + elseif distmethod=="jaro" + distance = Jaro(p=w) + elseif distmethod=="lv" + distance = Levenshtein() + end + + if partial + score_value! = score_value2 + else + score_value! 
= score_value + end + + # Segment unique keys from missing key + missingvals_x = findfirst(ismissing.(vecA.pool)) + iter_x=filter(x -> x != missingvals_x, UInt32(1):UInt32(length(vecA.pool))) + + missingvals_y = findfirst(ismissing.(vecB.pool)) + iter_y=filter(x -> x != missingvals_y, UInt32(1):UInt32(length(vecB.pool))) + + # Form match matrices based on differing levels of matches + Threads.@threads for x in iter_x + # all values in x that match unique value + indices_x = findall(vecA.refs .=== x) + + # term frequency adjustment for x + tf_val_x = length(indices_x)/dims[1] + for tf_i in indices_x + tf_table_x[tf_i] =tf_val_x + end + + for y in iter_y + # all values in y that match unique value + indices_y = findall(vecB.refs .=== y) + + # term frequency adjustment for y + tf_val_y = length(indices_y)/dims[2] + for tf_i in indices_y + tf_table_y[tf_i] = tf_val_y + end + + # string comparison + dist=round(compare(vecA.pool[x],vecB.pool[y], distance),digits=4) #this always normalizes dist 0 to 1 + score_value!(dist, indices_x,indices_y, cut_a,cut_b, results) + end + end + + # set all to missing where x is missing + if !isnothing(missingvals_x) + missingindices = findall(vecA.refs .== missingvals_x) + + # term frequency adjustment for x + tf_val_x = length(missingindices)/dims[1] + for tf_i in missingindices + tf_table_x[tf_i] =tf_val_x + end + + Threads.@threads for iy in 1:dims[2] + for ix in missingindices + results[ix,iy] = missingval + end + end + end + + # set all to missing where y is missing + if !isnothing(missingvals_y) + missingindices = findall(vecB.refs .== missingvals_y) + # term frequency adjustment for y + tf_val_y = length(missingindices)/dims[2] + for tf_i in missingindices + tf_table_y[tf_i] =tf_val_y + end + + Threads.@threads for ix in 1:dims[1] + for iy in missingindices + results[ix,iy] = missingval + end + end + end + # Return nothing + return nothing +end + +# term frequency adjusted version +function 
gammaKpar!(vecA::PooledVector,vecB::PooledVector,results::DiBitMatrix, dims::Tuple, + tf_table_x::SubArray{Float16}, + tf_table_y::SubArray{Float16}) + # Segment unique keys from missing key + missingvals_x = findfirst(ismissing.(vecA.pool)) + iter_x=filter(x -> x != missingvals_x, 0x00000001:UInt32(length(vecA.pool))) + + missingvals_y = findfirst(ismissing.(vecB.pool)) + iter_y=filter(x -> x != missingvals_y, 0x00000001:UInt32(length(vecB.pool))) + + # Form match matrices based on differing levels of matches + Threads.@threads for x in iter_x + indices_x = findall(vecA.refs .=== x) + # term frequency adjustment for x + tf_val_x = length(indices_x)/dims[1] + for tf_i in indices_x + tf_table_x[tf_i] =tf_val_x + end + for y in iter_y + indices_y = findall(vecB.refs .=== y) + # term frequency adjustment for y + tf_val_y = length(indices_y)/dims[2] + for tf_i in indices_y + tf_table_y[tf_i] = tf_val_y + end + # if matches at a threshold, go through result vector and assign new value + if vecA.pool[x] == vecB.pool[y] + for ix in indices_x,iy in indices_y + results[ix,iy] = match2 + end + end + end + end + + # set all to missing where x is missing + if !isnothing(missingvals_x) + missingindices = findall(vecA.refs .== missingvals_x) + # term frequency adjustment for x + tf_val_x = length(missingindices)/dims[1] + for tf_i in missingindices + tf_table_x[tf_i] =tf_val_x + end + Threads.@threads for iy in 1:dims[2] + for ix in missingindices + results[ix,iy] = missingval + end + end + end + # set all to missing where y is missing + if !isnothing(missingvals_y) + missingindices = findall(vecB.refs .== missingvals_y) + # term frequency adjustment for y + tf_val_y = length(missingindices)/dims[2] + for tf_i in missingindices + tf_table_y[tf_i] =tf_val_y + end + + Threads.@threads for ix in 1:dims[1] + for iy in missingindices + results[ix,iy] = missingval + end + end + end + # Return nothing + return nothing +end diff --git a/src/getMatches.jl b/src/getMatches.jl index 
252a3d7..47317b3 100755 --- a/src/getMatches.jl +++ b/src/getMatches.jl @@ -13,10 +13,27 @@ Converts the matches from the tableCounts function based on the predefined thres - `resultsEM::NamedTuple`: Output of the expectation maximization fuction (eg emlinkMARmov()) - `threshold_match`: Lower bound for the posterior probability that will act as a cutoff for matches. """ -function getMatches(resultsEM::NamedTuple; +function getMatches!(resultsEM::NamedTuple; threshold_match=0.85,u_b=1e10) resultsEM.patterns_w.ismatch = resultsEM.zeta_j .>= threshold_match .&& resultsEM.patterns_w.weights .<= u_b return nothing end +# applies term frequency adjustments to table +function tf_adj_table(resultsEM::NamedTuple,varnames::Vector{String},tf_table_x::Vector{Vector{Float16}},tf_table_y::Vector{Vector{Float16}}) + tf_vec = [DataFrame() for _ in eachindex(resultsEM.indices)] + new_names=vcat(varnames .* "_x", varnames .* "_y") + for i in eachindex(resultsEM.indices) + result_len=length(resultsEM.indices[i]) + tf_results=DataFrame(ones(Float16,(result_len, 2*length(varnames))),new_names) + Threads.@threads for ii in 1:result_len + val=resultsEM.indices[i][ii] + rowval=vcat([tf_table_x[varid][val.row] for varid in eachindex(varnames)],[tf_table_y[varid][val.col] for varid in eachindex(varnames)]) + tf_results[ii,:] = rowval + end + tf_vec[i] = tf_results + end + + return tf_vec +end diff --git a/src/matchPatterns.jl b/src/matchPatterns.jl index ad5c55b..6467b18 100644 --- a/src/matchPatterns.jl +++ b/src/matchPatterns.jl @@ -23,7 +23,7 @@ struct MatchPatterns end function indices_to_uids(vecA, vecB, - indices::Vector{Vector{ComparisonIndex}} + indices::Vector{Vector{ComparisonIndex}} ) batch_size=500 inds=eachindex(indices) @@ -57,7 +57,6 @@ function get_local_patterns(x::Vector{Vector{UInt8}}, N::Int, S::Int) hashes=Vector{UInt64}() indices=Vector{Vector{UInt16}}() - for i in 1:S pattern=zeros(UInt8,N) for n in 1:N