From c54a70439e77a7630aea4db745dca8c9b5c9aa5a Mon Sep 17 00:00:00 2001
From: jw2249a <jack@democracy.works>
Date: Mon, 25 Mar 2024 04:47:54 -0400
Subject: [PATCH] term frequency adjustment fix

---
 .dir-locals.el             |   0
 Project.toml               |   2 +-
 scratch.jl                 |  17 ++--
 src/encode/soundex.jl      |  67 +++++++++++++++
 src/fastlink/fastlink.jl   | 108 ++++++++++++++++++-------
 src/gammas/gammaCKfuzzy.jl |  85 +++++++++++++++++++
 src/gammas/gammaCKpar.jl   | 162 +++++++++++++++++++++++++++++++++++++
 src/getMatches.jl          |  19 ++++-
 src/matchPatterns.jl       |   3 +-
 9 files changed, 422 insertions(+), 41 deletions(-)
 delete mode 100644 .dir-locals.el
 create mode 100644 src/encode/soundex.jl

diff --git a/.dir-locals.el b/.dir-locals.el
deleted file mode 100644
index e69de29..0000000
diff --git a/Project.toml b/Project.toml
index 79244d1..dfd8b06 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,6 +8,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
+StaticStrings = "4db0a0c5-418a-4e1d-8806-cb305fe13294"
 StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
 
 [extras]
@@ -16,6 +17,5 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
-
 [targets]
 test = ["Test", "CSV", "Pkg"]
diff --git a/scratch.jl b/scratch.jl
index 44a0b76..ee80e9f 100755
--- a/scratch.jl
+++ b/scratch.jl
@@ -1,5 +1,6 @@
 using Pkg
-#Pkg.develop(path=".")
+Pkg.develop(path=".")
+Pkg.precompile()
 using DataFrames
 using BenchmarkTools
 using CSV
@@ -26,6 +27,8 @@ dfB=CSV.read("$(b_fil)/dfB.csv", DataFrame,
              ntasks=1,
              pool=true,
              missingstring=["", "NA"])
+dfA.id = hash.(eachrow(dfA))
+dfB.id2 = hash.(eachrow(dfB))
 
 
 for var in varnames[1:3]
@@ -33,16 +36,12 @@ for var in varnames[1:3]
     dfB[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfB[:,var]))
 end
 
-config = fastLink(dfA,dfB,varnames,match_method=match_method,cut_a=cut_a,cut_p=cut_p,
+results=fastLink(dfA,dfB,varnames,("id","id2"),
+                 match_method=match_method,
+                 term_freq_adjustment=[true],
+                 cut_a=cut_a,cut_p=cut_p,
                  threshold_match = 0.85)
 
-dump(config.fastlink_settings.comparison_funs[4])
 
-results=fastLink(dfA,dfB,varnames,match_method=match_method,cut_a=cut_a,cut_p=cut_p,
-                 threshold_match = 0.85)()
-
-x=results[1].patterns_w
-x[findall(ismissing.(x.gamma_4) .== false .&& x.gamma_4 .== 1),:]
-x[findall(ismissing.(x.gamma_4)),:]
 
 
diff --git a/src/encode/soundex.jl b/src/encode/soundex.jl
new file mode 100644
index 0000000..79044ae
--- /dev/null
+++ b/src/encode/soundex.jl
@@ -0,0 +1,67 @@
+module Soundex
+export soundex
+import StaticStrings: StaticString
+
+# Classify one lowercase character for `soundex`.
+# Returns '1'-'6' for the coded consonant groups, '9' for vowels (including 'y'),
+# '8' for 'h'/'w' and '7' for a space.
+# Throws ArgumentError for any other character. The earlier `@error` call only logged;
+# its `nothing` result could never be converted to the declared `Char` return type.
+function encode(chr::Char)::Char
+    chr in ('a', 'e', 'i', 'o', 'y', 'u') && return '9'
+    chr in ('s', 'c', 'k', 'j', 'g', 'z', 'x', 'q') && return '2'
+    chr in ('n', 'm') && return '5'
+    chr == 'l' && return '4'
+    chr == 'r' && return '6'
+    chr in ('t', 'd') && return '3'
+    chr in ('h', 'w') && return '8'
+    chr in ('v', 'b', 'p', 'f') && return '1'
+    chr == ' ' && return '7'
+    throw(ArgumentError("Unknown character encountered $chr"))
+end
+
+# Compute the four-character Soundex code of `s` (first letter upper-cased, followed by
+# up to three class digits, right-padded with '0').
+#
+# Rules applied to the characters after the first one:
+#   * spaces are skipped;
+#   * 'h' and 'w' are never coded and do not separate consonants, so a consonant with
+#     the same code on both sides of them is coded once;
+#   * vowels are never coded but do separate consonants, so an equal code that follows
+#     a vowel is coded again;
+#   * adjacent consonants with the same code are coded once.
+#
+# The comparison is made against the class of the last coded consonant, which is seeded
+# with the class of the first letter. Comparing against `output[index]` instead, as
+# before, never matched while `index == 1` (that slot holds a letter, not a digit), so
+# e.g. "Schs" produced S200 instead of S000.
+#
+# Throws ArgumentError when `s` is empty or contains a character `encode` rejects.
+function soundex(s::T)::StaticString{4} where T <: AbstractString
+    isempty(s) && throw(ArgumentError("cannot compute the soundex of an empty string"))
+    output = fill('0', 4)
+    first_chr = lowercase(first(s))
+    output[1] = uppercase(first_chr)
+    # class of the most recent consonant that still blocks an equal code
+    last_code = encode(first_chr)
+    index = 1
+    # iterate characters rather than slicing by byte offset (`s[2:end]`)
+    for chr in Iterators.drop(s, 1)
+        # all three digit slots are filled
+        index > 3 && break
+        code = encode(lowercase(chr))
+        if code === '7' || code === '8'
+            # space, 'h' or 'w': transparent, keep `last_code`
+            continue
+        elseif code === '9'
+            # vowel: not coded, but lets the same consonant code appear again
+            last_code = code
+        elseif code !== last_code
+            index += 1
+            output[index] = code
+            last_code = code
+        end
+    end
+    return StaticString(String(output))
+end
+end # module
diff --git a/src/fastlink/fastlink.jl b/src/fastlink/fastlink.jl
index 3b70f54..24c2a47 100755
--- a/src/fastlink/fastlink.jl
+++ b/src/fastlink/fastlink.jl
@@ -88,6 +88,7 @@ matched_data = fastLink(dfA, dfB, ["firstname", "lastname", "city"])
 function fastLink(dfA::DataFrame, dfB::DataFrame,
                   varnames::Vector{String},
                   idvar::Tuple{String,String};
+                  term_freq_adjustment=[false],
                   match_method=String[],
                   partials=[true],
                   upper_case=[true],
@@ -116,11 +117,17 @@ function fastLink(dfA::DataFrame, dfB::DataFrame,
     cut_a = check_input_lengths(cut_a, numvars, "cut_a")
     cut_p = check_input_lengths(cut_p, numvars, "cut_p")
     address_field = check_input_lengths(address_field, numvars, "address_field")
+    term_freq_adjustment = check_input_lengths(term_freq_adjustment, numvars, "term_freq_adjustment")
     stringdist_method = check_input_lengths(stringdist_method, numvars, "stringdist_method")
     
     vartypes, comparison_levels = check_var_types(dfA,dfB,varnames,match_method,partials)
-    
+
+    # results table
     res = [DiBitMatrix(obs_a,obs_b) for _ in varnames]
+
+    # term frequency tables
+    tf_table_x = [ones(Float16,dims[1]) for _ in varnames]
+    tf_table_y = [ones(Float16,dims[2]) for _ in varnames]
     
     # allow missing for comparisons
     allowmissing!(dfA)
@@ -131,31 +138,67 @@ function fastLink(dfA::DataFrame, dfB::DataFrame,
     for i in eachindex(varnames)
         @info "Now matching var $(varnames[i]) using $(match_method[i])"
         if match_method[i] == "fuzzy"
-            gammaCKfuzzy!(dfA[!,varnames[i]],
-                          dfB[!,varnames[i]],
-                          res[i],
-                          dims,
-                          cut_a=cut_a[i], 
-                          cut_b=cut_p[i],
-                          upper=upper_case[i],
-                          w=jw_weight[i],
-                          partial=partials[i])
+            if term_freq_adjustment[i]
+                gammaCKfuzzy!(dfA[!,varnames[i]],
+                              dfB[!,varnames[i]],
+                              res[i],
+                              dims,
+                              view(tf_table_x[i],:),
+                              view(tf_table_y[i],:),
+                              cut_a=cut_a[i], 
+                              cut_b=cut_p[i],
+                              upper=upper_case[i],
+                              w=jw_weight[i],
+                              partial=partials[i])
+            else
+                gammaCKfuzzy!(dfA[!,varnames[i]],
+                              dfB[!,varnames[i]],
+                              res[i],
+                              dims,
+                              cut_a=cut_a[i], 
+                              cut_b=cut_p[i],
+                              upper=upper_case[i],
+                              w=jw_weight[i],
+                              partial=partials[i])
+            end
         elseif match_method[i] == "string"
-            gammaCKpar!(dfA[!,varnames[i]],
-                        dfB[!,varnames[i]],
-                        res[i],
-                        dims,
-                        distmethod=stringdist_method[i],
-                        cut_a=cut_a[i], 
-                        cut_b=cut_p[i],
-                        w=jw_weight[i],
-                        partial=partials[i])
-
+            if term_freq_adjustment[i]
+                gammaCKpar!(dfA[!,varnames[i]],
+                            dfB[!,varnames[i]],
+                            res[i],
+                            dims,
+                            view(tf_table_x[i],:),
+                            view(tf_table_y[i],:),
+                            distmethod=stringdist_method[i],
+                            cut_a=cut_a[i], 
+                            cut_b=cut_p[i],
+                            w=jw_weight[i],
+                            partial=partials[i])
+            else
+                gammaCKpar!(dfA[!,varnames[i]],
+                            dfB[!,varnames[i]],
+                            res[i],
+                            dims,
+                            distmethod=stringdist_method[i],
+                            cut_a=cut_a[i], 
+                            cut_b=cut_p[i],
+                            w=jw_weight[i],
+                            partial=partials[i])
+            end
         elseif match_method[i] == "exact" || match_method[i] == "bool"
-            gammaKpar!(dfA[!,varnames[i]],
-                       dfB[!,varnames[i]],
-                       res[i],
-                       dims)
+            if term_freq_adjustment[i]
+                gammaKpar!(dfA[!,varnames[i]],
+                           dfB[!,varnames[i]],
+                           res[i],
+                           dims,
+                           view(tf_table_x[i],:),
+                           view(tf_table_y[i],:))
+            else
+                gammaKpar!(dfA[!,varnames[i]],
+                           dfB[!,varnames[i]],
+                           res[i],
+                           dims)
+            end
         elseif match_method == "numeric" || match_method=="float" || match_method == "int"
             gammaNUMCKpar!(dfA[!,varnames[i]],
                            dfB[!,varnames[i]],
@@ -176,11 +219,20 @@ function fastLink(dfA::DataFrame, dfB::DataFrame,
                              address_field=address_field)
     # testing removing uncessessary indices (where no obs exist)
     #remove_no_matched_var_indices(resultsEM)
-    # adding uids 
-    resultsEM = merge(resultsEM, (matched_ids = indices_to_uids(dfA[!, idvar[1]],dfB[!, idvar[2]],resultsEM.indices),))
+    # adding uids
+
+    if any(term_freq_adjustment)
+        resultsEM = merge(resultsEM, (matched_ids = indices_to_uids(dfA[!, idvar[1]],dfB[!, idvar[2]],resultsEM.indices),
+                                      tf_adj_table = tf_adj_table(resultsEM,varnames,tf_table_x,tf_table_y)))        
+        
+    else
+        resultsEM = merge(resultsEM, (matched_ids = indices_to_uids(dfA[!, idvar[1]],dfB[!, idvar[2]],resultsEM.indices),))        
+    end
+    
+
     
     @info "Retrieving matches"
-    getMatches(resultsEM,threshold_match=threshold_match)
+    getMatches!(resultsEM,threshold_match=threshold_match)
     return (resultsEM) 
 end
 
@@ -282,7 +334,7 @@ function fastLink(dfA::DataFrame, dfB::DataFrame;
 
     
     @info "Retrieving matches"
-    getMatches(resultsEM,threshold_match=threshold_match)
+    getMatches!(resultsEM,threshold_match=threshold_match)
     return (resultsEM) 
 end
 
diff --git a/src/gammas/gammaCKfuzzy.jl b/src/gammas/gammaCKfuzzy.jl
index 2084cdf..82616c0 100644
--- a/src/gammas/gammaCKfuzzy.jl
+++ b/src/gammas/gammaCKfuzzy.jl
@@ -235,3 +235,88 @@ function gammaCKfuzzy!(vecA::PooledVector,vecB::PooledVector,results::DiBitMatri
     end
     return nothing
 end
+
+
+function gammaCKfuzzy!(vecA::PooledVector,vecB::PooledVector,results::DiBitMatrix,dims::Tuple{Int,Int},
+                       tf_table_x::SubArray{Float16},
+                       tf_table_y::SubArray{Float16};
+                       cut_a::Float64=0.92,cut_b::Float64=0.88,upper::Bool=true,
+                       w::Float64=0.1,partial::Bool=true)
+    # Term-frequency variant: scores each pooled value of vecA against the pooled values of vecB
+    # into `results`, and writes per-record relative value frequencies to tf_table_x / tf_table_y.
+    # functions that update the results view
+    if partial
+        score_value! = score_value2
+    else
+        score_value! = score_value
+    end
+    
+    # change the range of ascii characters dependent on case
+    if upper 
+        space_char,max_char = 0x40,0x5a
+    else
+        space_char,max_char = 0x60,0x7a
+    end
+    # length of unique values
+    lenA = UInt32(length(vecA.pool))
+    lenB = UInt32(length(vecB.pool))
+
+    # vector of pool value indices
+    lookup_a_by_id=pool_lookup_table(vecA.refs, lenA)
+    lookup_b_by_id=pool_lookup_table(vecB.refs, lenB)
+    # NOTE(review): this rebinds the `dims` argument to the actual vector lengths
+    dims=(length(vecA),length(vecB))
+
+    # term frequency for x: each index in a lookup entry gets (entry length) / (length of vecA)
+    Threads.@threads for i in lookup_a_by_id
+        tf_val=length(i)/dims[1]
+        for ii in i
+            tf_table_x[ii] = tf_val
+        end
+    end
+    
+    # term frequency for y: same computation against the length of vecB
+    Threads.@threads for i in lookup_b_by_id
+        tf_val=length(i)/dims[2]
+        for ii in i
+            tf_table_y[ii] = tf_val
+        end
+    end
+    
+    missingindexA = find_missing_index(vecA.invpool)
+    # candidate structures are built once from vecB's pool and shared by every query below
+    base_candidate_lookup = build_candidate_lookup(vecB.pool,spaceletter=space_char,lastletter=max_char)
+    base_candidate_scores = build_candidate_scores(vecB.pool)
+    # collect() materializes the (value, ref) pairs of invpool so @threads can partition them
+    Threads.@threads for (query_name,new_a_id) in collect(vecA.invpool)
+        # pass if query is missing val
+        if new_a_id === missingindexA
+            update_results!(results, lookup_a_by_id[new_a_id],UInt32(1):UInt32(dims[2]),missingval)
+            continue
+        end
+        # NOTE(review): a zero-length query_name gives query_len 0 and the `÷` below throws - confirm inputs
+        query_len = UInt8(min(ncodeunits(query_name),16))
+        query_masks_lookup = maskify(query_name,query_len,space_char=space_char,max_char=max_char)
+        query_partial = UInt16(1024 ÷ query_len)
+        candidate_scores = deepcopy(base_candidate_scores)
+        # private copy of the score state per query; the base scores are never mutated
+        for (query_index, (letter_index, query_mask_by_candidate_len)) in enumerate(query_masks_lookup)
+            for c_info in base_candidate_lookup[letter_index]
+                candidate_score = candidate_scores[c_info.name_index]
+                query_mask = query_mask_by_candidate_len[c_info.len]
+                score_letter!(candidate_score, query_mask, c_info.mask, query_index)
+            end
+        end
+        # write one outcome per candidate: missing marker or a scored value
+        a_ids = lookup_a_by_id[new_a_id]
+        for (score_i, score) in enumerate(candidate_scores)
+            if score.len_partial === UInt16(1)               
+                update_results!(results, a_ids, lookup_b_by_id[score_i], missingval)
+                continue
+            end
+            # if present calculate scores
+            score_value!(results, score,query_partial, a_ids,lookup_b_by_id,score_i, w, cut_a, cut_b)      
+        end
+    end
+    return nothing
+end
diff --git a/src/gammas/gammaCKpar.jl b/src/gammas/gammaCKpar.jl
index 4165cbc..ee90643 100644
--- a/src/gammas/gammaCKpar.jl
+++ b/src/gammas/gammaCKpar.jl
@@ -96,3 +96,165 @@ function gammaCKpar!(vecA::PooledVector,vecB::PooledVector,
     # Return nothing
     return nothing
 end
+
+# term frequency adjusted version
+function gammaCKpar!(vecA::PooledVector,vecB::PooledVector,
+                     results::DiBitMatrix,dims::Tuple{Int,Int},
+                     tf_table_x::SubArray{Float16},
+                     tf_table_y::SubArray{Float16};
+                     distmethod="jw",cut_a=0.92,cut_b=0.88,partial=true,w=0.1)
+    # String-distance comparison of two pooled vectors that also records term frequencies.
+    # - vecA, vecB: pooled columns; every unique non-missing value pair is compared once.
+    # - results: comparison matrix updated in place (through score_value / score_value2,
+    #   and with `missingval` wherever either side is missing).
+    # - dims: (number of records in vecA, number of records in vecB); denominators of the
+    #   term frequencies and bounds of the missing-value fill.
+    # - tf_table_x, tf_table_y: entry k becomes the share of records carrying the same
+    #   pooled value as record k (missing counts as a value of its own).
+    # - distmethod: "jw", "dl", "jaro" or "lv"; anything else throws ArgumentError
+    #   (previously `distance` stayed undefined and failed later with UndefVarError).
+    # - cut_a, cut_b: thresholds forwarded to the scoring function.
+    # - partial: selects score_value2 (true) or score_value (false).
+    # - w: prefix weight `p` of JaroWinkler.
+    # Returns nothing.
+
+    # assign distance function
+    if distmethod=="jw"
+        distance = JaroWinkler(p=w)
+    elseif distmethod=="dl"
+        distance = DamerauLevenshtein()
+    elseif distmethod=="jaro"
+        # NOTE(review): StringDistances' Jaro appears to take no prefix weight, so `Jaro(p=w)`
+        # cannot be constructed; `w` only applies to JaroWinkler.
+        distance = Jaro()
+    elseif distmethod=="lv"
+        distance = Levenshtein()
+    else
+        throw(ArgumentError("unknown distmethod \"$distmethod\"; expected \"jw\", \"dl\", \"jaro\" or \"lv\""))
+    end
+
+    score_value! = partial ? score_value2 : score_value
+
+    # Segment unique keys from missing key
+    missingvals_x = findfirst(ismissing, vecA.pool)
+    iter_x=filter(x -> x != missingvals_x, UInt32(1):UInt32(length(vecA.pool)))
+    missingvals_y = findfirst(ismissing, vecB.pool)
+    iter_y=filter(x -> x != missingvals_y, UInt32(1):UInt32(length(vecB.pool)))
+
+    # Record positions and term frequency of every non-missing value of vecB, computed
+    # once. Inside the threaded loop over vecA this scan was repeated for every x, all
+    # threads rewrote the same tf_table_y entries, and nothing was written when vecA had
+    # no non-missing value.
+    # `==` (not `===`) so refs stored in a narrower integer type still match.
+    indices_by_y = [findall(==(y), vecB.refs) for y in iter_y]
+    for indices_y in indices_by_y
+        tf_table_y[indices_y] .= length(indices_y)/dims[2]
+    end
+
+    # Form match matrices based on differing levels of matches
+    Threads.@threads for x in iter_x
+        # all records of vecA holding this unique value, and their term frequency
+        indices_x = findall(==(x), vecA.refs)
+        tf_table_x[indices_x] .= length(indices_x)/dims[1]
+
+        # NOTE(review): assumes the scoring functions only read the index vectors, since
+        # each indices_y is now shared by all threads - confirm.
+        for (y, indices_y) in zip(iter_y, indices_by_y)
+            # string comparison
+            dist=round(compare(vecA.pool[x],vecB.pool[y], distance),digits=4) #this always normalizes dist 0 to 1
+            score_value!(dist, indices_x,indices_y, cut_a,cut_b, results)
+        end
+    end
+
+    # set all to missing where x is missing
+    if !isnothing(missingvals_x)
+        missing_x = findall(==(missingvals_x), vecA.refs)
+        tf_table_x[missing_x] .= length(missing_x)/dims[1]
+        Threads.@threads for iy in 1:dims[2]
+            for ix in missing_x
+                results[ix,iy] = missingval
+            end
+        end
+    end
+
+    # set all to missing where y is missing
+    if !isnothing(missingvals_y)
+        missing_y = findall(==(missingvals_y), vecB.refs)
+        tf_table_y[missing_y] .= length(missing_y)/dims[2]
+        Threads.@threads for ix in 1:dims[1]
+            for iy in missing_y
+                results[ix,iy] = missingval
+            end
+        end
+    end
+    # Return nothing
+    return nothing
+end
+
+# term frequency adjusted version
+function gammaKpar!(vecA::PooledVector,vecB::PooledVector,results::DiBitMatrix, dims::Tuple,
+                    tf_table_x::SubArray{Float16},
+                    tf_table_y::SubArray{Float16})
+    # Exact-match comparison of two pooled vectors that also records term frequencies.
+    #
+    # - results: updated in place; `match2` where the pooled values are equal and
+    #   `missingval` wherever either side is missing. Other entries are left untouched.
+    # - dims: (number of records in vecA, number of records in vecB); denominators of the
+    #   term frequencies and bounds of the missing-value fill.
+    # - tf_table_x, tf_table_y: entry k becomes the share of records carrying the same
+    #   pooled value as record k (missing counts as a value of its own).
+    # Returns nothing.
+
+    # Segment unique keys from missing key
+    missingvals_x = findfirst(ismissing, vecA.pool)
+    iter_x=filter(x -> x != missingvals_x, 0x00000001:UInt32(length(vecA.pool)))
+    missingvals_y = findfirst(ismissing, vecB.pool)
+    iter_y=filter(x -> x != missingvals_y, 0x00000001:UInt32(length(vecB.pool)))
+
+    # Record positions and term frequency of every non-missing value of vecB, computed
+    # once. Inside the threaded loop over vecA this scan was repeated for every x, all
+    # threads rewrote the same tf_table_y entries, and nothing was written when vecA had
+    # no non-missing value.
+    # `==` (not `===`) so refs stored in a narrower integer type still match.
+    indices_by_y = [findall(==(y), vecB.refs) for y in iter_y]
+    for indices_y in indices_by_y
+        tf_table_y[indices_y] .= length(indices_y)/dims[2]
+    end
+
+    # Form match matrices based on differing levels of matches
+    Threads.@threads for x in iter_x
+        indices_x = findall(==(x), vecA.refs)
+        # term frequency adjustment for x
+        tf_table_x[indices_x] .= length(indices_x)/dims[1]
+        for (y, indices_y) in zip(iter_y, indices_by_y)
+            # equal pooled values: mark every record pair as a full match
+            if vecA.pool[x] == vecB.pool[y]
+                for ix in indices_x,iy in indices_y
+                    results[ix,iy] = match2
+                end
+            end
+        end
+    end
+
+    # set all to missing where x is missing
+    if !isnothing(missingvals_x)
+        missing_x = findall(==(missingvals_x), vecA.refs)
+        tf_table_x[missing_x] .= length(missing_x)/dims[1]
+        Threads.@threads for iy in 1:dims[2]
+            for ix in missing_x
+                results[ix,iy] = missingval
+            end
+        end
+    end
+    # set all to missing where y is missing
+    if !isnothing(missingvals_y)
+        missing_y = findall(==(missingvals_y), vecB.refs)
+        tf_table_y[missing_y] .= length(missing_y)/dims[2]
+        Threads.@threads for ix in 1:dims[1]
+            for iy in missing_y
+                results[ix,iy] = missingval
+            end
+        end
+    end
+    return nothing
+end
diff --git a/src/getMatches.jl b/src/getMatches.jl
index 252a3d7..47317b3 100755
--- a/src/getMatches.jl
+++ b/src/getMatches.jl
@@ -13,10 +13,27 @@ Converts the matches from the tableCounts function based on the predefined thres
 - `resultsEM::NamedTuple`: Output of the expectation maximization fuction (eg emlinkMARmov())
 - `threshold_match`: Lower bound for the posterior probability that will act as a cutoff for matches.
 """
-function getMatches(resultsEM::NamedTuple;
+function getMatches!(resultsEM::NamedTuple;
                     threshold_match=0.85,u_b=1e10)
     resultsEM.patterns_w.ismatch = resultsEM.zeta_j .>= threshold_match .&& resultsEM.patterns_w.weights .<= u_b
     return nothing
 end
 
+# builds, for each list of compared pairs, a table of both records' term frequencies
+function tf_adj_table(resultsEM::NamedTuple,varnames::Vector{String},tf_table_x::Vector{Vector{Float16}},tf_table_y::Vector{Vector{Float16}})
+    # Builds one DataFrame per entry of resultsEM.indices; row k describes pair k of that entry:
+    # "<var>_x" = tf_table_x[var][pair.row], "<var>_y" = tf_table_y[var][pair.col], per varnames.
+    new_names=vcat(varnames .* "_x", varnames .* "_y")
+    tf_vec = DataFrame[]
+    for pairs_i in resultsEM.indices
+        rows = Int[p.row for p in pairs_i]
+        cols = Int[p.col for p in pairs_i]
+        # Gather each column with one indexing call; the former per-row vcat plus threaded
+        # DataFrame row assignment allocated a temporary for every pair.
+        x_cols = Vector{Float16}[tf_table_x[v][rows] for v in eachindex(varnames)]
+        y_cols = Vector{Float16}[tf_table_y[v][cols] for v in eachindex(varnames)]
+        push!(tf_vec, DataFrame(vcat(x_cols, y_cols), new_names; copycols=false))
+    end
 
+    return tf_vec
+end
diff --git a/src/matchPatterns.jl b/src/matchPatterns.jl
index ad5c55b..6467b18 100644
--- a/src/matchPatterns.jl
+++ b/src/matchPatterns.jl
@@ -23,7 +23,7 @@ struct MatchPatterns
 end
 
 function indices_to_uids(vecA, vecB,
-                                    indices::Vector{Vector{ComparisonIndex}}
+                         indices::Vector{Vector{ComparisonIndex}}
                          )
     batch_size=500
     inds=eachindex(indices)
@@ -57,7 +57,6 @@ function get_local_patterns(x::Vector{Vector{UInt8}}, N::Int, S::Int)
     hashes=Vector{UInt64}()
     indices=Vector{Vector{UInt16}}()
 
-    
     for i in 1:S
         pattern=zeros(UInt8,N)
         for n in 1:N