
Commit

term frequency adjustment fix

jw2249a committed Mar 25, 2024
1 parent e64c130 commit c54a704
Showing 9 changed files with 422 additions and 41 deletions.
Empty file removed .dir-locals.el
Empty file.
2 changes: 1 addition & 1 deletion Project.toml
@@ -8,6 +8,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
StaticStrings = "4db0a0c5-418a-4e1d-8806-cb305fe13294"
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"

[extras]
@@ -16,6 +17,5 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"


[targets]
test = ["Test", "CSV", "Pkg"]
17 changes: 8 additions & 9 deletions scratch.jl
@@ -1,5 +1,6 @@
using Pkg
#Pkg.develop(path=".")
Pkg.develop(path=".")
Pkg.precompile()
using DataFrames
using BenchmarkTools
using CSV
@@ -26,23 +27,21 @@ dfB=CSV.read("$(b_fil)/dfB.csv", DataFrame,
ntasks=1,
pool=true,
missingstring=["", "NA"])
dfA.id = hash.(eachrow(dfA))
dfB.id2 = hash.(eachrow(dfB))


for var in varnames[1:3]
dfA[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfA[:,var]))
dfB[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfB[:,var]))
end

config = fastLink(dfA,dfB,varnames,match_method=match_method,cut_a=cut_a,cut_p=cut_p,
results=fastLink(dfA,dfB,varnames,("id","id2"),
match_method=match_method,
term_freq_adjustment=[true],
cut_a=cut_a,cut_p=cut_p,
threshold_match = 0.85)

dump(config.fastlink_settings.comparison_funs[4])

results=fastLink(dfA,dfB,varnames,match_method=match_method,cut_a=cut_a,cut_p=cut_p,
threshold_match = 0.85)()

x=results[1].patterns_w
x[findall(ismissing.(x.gamma_4) .== false .&& x.gamma_4 .== 1),:]
x[findall(ismissing.(x.gamma_4)),:]


67 changes: 67 additions & 0 deletions src/encode/soundex.jl
@@ -0,0 +1,67 @@
module Soundex
export soundex
import StaticStrings: StaticString

function encode(chr::Char)::Char
if in(chr, ['a', 'e', 'i', 'o','y', 'u'])
return '9'
elseif in(chr,['s','c','k','j','g','z','x','q'])
return '2'
elseif in(chr,['n','m'])
return '5'
elseif 'l' == chr
return '4'
elseif 'r' == chr
return '6'
elseif in(chr,['t','d'])
return '3'
elseif in(chr,['h','w'])
return '8'
elseif in(chr,['v','b','p','f'])
return '1'
elseif chr === ' '
return '7'
else
@error "Unknown character encountered $chr"
end
end

function soundex(s::T)::StaticString{4} where T <: AbstractString
output = ['0' for _ in 1:4]
index = 1
chr=lowercase(s[1])
previous_encoding = encode(chr)
output[index] = uppercase(chr)
for chr in s[2:end]
if index <= 3
encoding = encode(lowercase(chr))
if encoding === '7'
# continue if space
continue
end
# only code consonants; vowels ('9') and the ignorable letters 'h'/'w' ('8') are skipped
if encoding !== '9' && encoding !== '8'
if encoding !== previous_encoding
# Rule 4 on Consonant Separators
if encoding === output[index]
# If a vowel (A, E, I, O, U) separates two consonants that have the same code, code consonant to the right of vowel
if previous_encoding === '9'
index += 1
output[index] = encoding
end
# If "H" or "W" separate two consonants that have code, the consonant to the right of the vowel is not coded
else
# different letter code soundex
index += 1
output[index] = encoding
end
end
end
else
break
end
previous_encoding = encoding
end
return StaticString(String(output))
end
end # module
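A minimal usage sketch of the new encoder; the include path and the example surnames are illustrative, not part of the commit, and the codes follow the standard Soundex mapping that the encode table above implements:

include("src/encode/soundex.jl")   # assumed load path for the new module
using .Soundex

soundex("Robert")    # "R163"
soundex("Jackson")   # "J250"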
108 changes: 80 additions & 28 deletions src/fastlink/fastlink.jl
@@ -88,6 +88,7 @@ matched_data = fastLink(dfA, dfB, ["firstname", "lastname", "city"])
function fastLink(dfA::DataFrame, dfB::DataFrame,
varnames::Vector{String},
idvar::Tuple{String,String};
term_freq_adjustment=[false],
match_method=String[],
partials=[true],
upper_case=[true],
@@ -116,11 +117,17 @@ function fastLink(dfA::DataFrame, dfB::DataFrame,
cut_a = check_input_lengths(cut_a, numvars, "cut_a")
cut_p = check_input_lengths(cut_p, numvars, "cut_p")
address_field = check_input_lengths(address_field, numvars, "address_field")
term_freq_adjustment = check_input_lengths(term_freq_adjustment, numvars, "term_freq_adjustment")
stringdist_method = check_input_lengths(stringdist_method, numvars, "stringdist_method")

vartypes, comparison_levels = check_var_types(dfA,dfB,varnames,match_method,partials)


# results table
res = [DiBitMatrix(obs_a,obs_b) for _ in varnames]

# term frequency tables
tf_table_x = [ones(Float16,dims[1]) for _ in varnames]
tf_table_y = [ones(Float16,dims[2]) for _ in varnames]

# allow missing for comparisons
allowmissing!(dfA)
@@ -131,31 +138,67 @@
for i in eachindex(varnames)
@info "Now matching var $(varnames[i]) using $(match_method[i])"
if match_method[i] == "fuzzy"
gammaCKfuzzy!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
cut_a=cut_a[i],
cut_b=cut_p[i],
upper=upper_case[i],
w=jw_weight[i],
partial=partials[i])
if term_freq_adjustment[i]
gammaCKfuzzy!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
view(tf_table_x[i],:),
view(tf_table_y[i],:),
cut_a=cut_a[i],
cut_b=cut_p[i],
upper=upper_case[i],
w=jw_weight[i],
partial=partials[i])
else
gammaCKfuzzy!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
cut_a=cut_a[i],
cut_b=cut_p[i],
upper=upper_case[i],
w=jw_weight[i],
partial=partials[i])
end
elseif match_method[i] == "string"
gammaCKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
distmethod=stringdist_method[i],
cut_a=cut_a[i],
cut_b=cut_p[i],
w=jw_weight[i],
partial=partials[i])

if term_freq_adjustment[i]
gammaCKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
view(tf_table_x[i],:),
view(tf_table_y[i],:),
distmethod=stringdist_method[i],
cut_a=cut_a[i],
cut_b=cut_p[i],
w=jw_weight[i],
partial=partials[i])
else
gammaCKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
distmethod=stringdist_method[i],
cut_a=cut_a[i],
cut_b=cut_p[i],
w=jw_weight[i],
partial=partials[i])
end
elseif match_method[i] == "exact" || match_method[i] == "bool"
gammaKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims)
if term_freq_adjustment[i]
gammaKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
view(tf_table_x[i],:),
view(tf_table_y[i],:))
else
gammaKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims)
end
elseif match_method == "numeric" || match_method=="float" || match_method == "int"
gammaNUMCKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
@@ -176,11 +219,20 @@
address_field=address_field)
# testing removing unnecessary indices (where no obs exist)
#remove_no_matched_var_indices(resultsEM)
# adding uids
resultsEM = merge(resultsEM, (matched_ids = indices_to_uids(dfA[!, idvar[1]],dfB[!, idvar[2]],resultsEM.indices),))
# adding uids

if any(term_freq_adjustment)
resultsEM = merge(resultsEM, (matched_ids = indices_to_uids(dfA[!, idvar[1]],dfB[!, idvar[2]],resultsEM.indices),
tf_adj_table = tf_adj_table(resultsEM,varnames,tf_table_x,tf_table_y)))

else
resultsEM = merge(resultsEM, (matched_ids = indices_to_uids(dfA[!, idvar[1]],dfB[!, idvar[2]],resultsEM.indices),))
end



@info "Retrieving matches"
getMatches(resultsEM,threshold_match=threshold_match)
getMatches!(resultsEM,threshold_match=threshold_match)
return (resultsEM)
end

@@ -282,7 +334,7 @@ function fastLink(dfA::DataFrame, dfB::DataFrame;


@info "Retrieving matches"
getMatches(resultsEM,threshold_match=threshold_match)
getMatches!(resultsEM,threshold_match=threshold_match)
return (resultsEM)
end

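Putting the new pieces together, a minimal sketch of how the updated fastLink signature is called, mirroring scratch.jl; the column names and per-variable match methods are made up, and dfA/dfB are assumed to already carry the id/id2 columns. The id columns are passed as a tuple, and term_freq_adjustment is a vector of Bools that check_input_lengths expands to one entry per match variable:

varnames = ["firstname", "lastname", "city"]

results = fastLink(dfA, dfB, varnames, ("id", "id2"),
                   match_method = ["fuzzy", "fuzzy", "exact"],
                   term_freq_adjustment = [true],
                   cut_a = 0.92, cut_p = 0.88,
                   threshold_match = 0.85)

# when any entry of term_freq_adjustment is true, the returned results also
# carry a tf_adj_table alongside matched_ids (see the merge call above)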
85 changes: 85 additions & 0 deletions src/gammas/gammaCKfuzzy.jl
@@ -235,3 +235,88 @@ function gammaCKfuzzy!(vecA::PooledVector,vecB::PooledVector,results::DiBitMatrix
end
return nothing
end


function gammaCKfuzzy!(vecA::PooledVector,vecB::PooledVector,results::DiBitMatrix,dims::Tuple{Int,Int},
tf_table_x::SubArray{Float16},
tf_table_y::SubArray{Float16};
cut_a::Float64=0.92,cut_b::Float64=0.88,upper::Bool=true,
w::Float64=0.1,partial::Bool=true)


# functions that update the results view
if partial
score_value! = score_value2
else
score_value! = score_value
end

# change the range of ascii characters dependent on case
if upper
space_char,max_char = 0x40,0x5a
else
space_char,max_char = 0x60,0x7a
end
# length of unique values
lenA = UInt32(length(vecA.pool))
lenB = UInt32(length(vecB.pool))

# vector of pool value indices
lookup_a_by_id=pool_lookup_table(vecA.refs, lenA)
lookup_b_by_id=pool_lookup_table(vecB.refs, lenB)

dims=(length(vecA),length(vecB))

# term frequency for x
Threads.@threads for i in lookup_a_by_id
tf_val=length(i)/dims[1]
for ii in i
tf_table_x[ii] = tf_val
end
end

# term frequency for y
Threads.@threads for i in lookup_b_by_id
tf_val=length(i)/dims[2]
for ii in i
tf_table_y[ii] = tf_val
end
end

missingindexA = find_missing_index(vecA.invpool)

base_candidate_lookup = build_candidate_lookup(vecB.pool,spaceletter=space_char,lastletter=max_char)
base_candidate_scores = build_candidate_scores(vecB.pool)

Threads.@threads for (query_name,new_a_id) in collect(vecA.invpool)
# pass if query is missing val
if new_a_id === missingindexA
update_results!(results, lookup_a_by_id[new_a_id],UInt32(1):UInt32(dims[2]),missingval)
continue
end

query_len = UInt8(min(ncodeunits(query_name),16))
query_masks_lookup = maskify(query_name,query_len,space_char=space_char,max_char=max_char)
query_partial = UInt16(1024 ÷ query_len)
candidate_scores = deepcopy(base_candidate_scores)

for (query_index, (letter_index, query_mask_by_candidate_len)) in enumerate(query_masks_lookup)
for c_info in base_candidate_lookup[letter_index]
candidate_score = candidate_scores[c_info.name_index]
query_mask = query_mask_by_candidate_len[c_info.len]
score_letter!(candidate_score, query_mask, c_info.mask, query_index)
end
end

a_ids = lookup_a_by_id[new_a_id]
for (score_i, score) in enumerate(candidate_scores)
if score.len_partial === UInt16(1)
update_results!(results, a_ids, lookup_b_by_id[score_i], missingval)
continue
end
# if present calculate scores
score_value!(results, score,query_partial, a_ids,lookup_b_by_id,score_i, w, cut_a, cut_b)
end
end
return nothing
end
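For reference, a small, hypothetical illustration of the quantity the two term-frequency loops in this method write into tf_table_x / tf_table_y: each row receives the share of its column occupied by its value, so common values end up with large term frequencies that the downstream adjustment can discount:

using PooledArrays

names = PooledArray(["SMITH", "SMITH", "SMITH", "JONES"])
n = length(names)
tf = [count(==(names[i]), names) / n for i in eachindex(names)]
# tf == [0.75, 0.75, 0.75, 0.25]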
