
Commit

term frequency adjustment fix

jw2249a committed Mar 25, 2024
1 parent e64c130 commit c54a704
Showing 9 changed files with 422 additions and 41 deletions.
Empty file removed .dir-locals.el
Empty file.
2 changes: 1 addition & 1 deletion Project.toml
@@ -8,6 +8,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
StaticStrings = "4db0a0c5-418a-4e1d-8806-cb305fe13294"
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"

[extras]
@@ -16,6 +17,5 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"


[targets]
test = ["Test", "CSV", "Pkg"]
17 changes: 8 additions & 9 deletions scratch.jl
@@ -1,5 +1,6 @@
using Pkg
#Pkg.develop(path=".")
Pkg.develop(path=".")
Pkg.precompile()
using DataFrames
using BenchmarkTools
using CSV
@@ -26,23 +27,21 @@ dfB=CSV.read("$(b_fil)/dfB.csv", DataFrame,
ntasks=1,
pool=true,
missingstring=["", "NA"])
dfA.id = hash.(eachrow(dfA))
dfB.id2 = hash.(eachrow(dfB))


for var in varnames[1:3]
dfA[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfA[:,var]))
dfB[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfB[:,var]))
end

config = fastLink(dfA,dfB,varnames,match_method=match_method,cut_a=cut_a,cut_p=cut_p,
results=fastLink(dfA,dfB,varnames,("id","id2"),
match_method=match_method,
term_freq_adjustment=[true],
cut_a=cut_a,cut_p=cut_p,
threshold_match = 0.85)

dump(config.fastlink_settings.comparison_funs[4])

results=fastLink(dfA,dfB,varnames,match_method=match_method,cut_a=cut_a,cut_p=cut_p,
threshold_match = 0.85)()

x=results[1].patterns_w
x[findall(ismissing.(x.gamma_4) .== false .&& x.gamma_4 .== 1),:]
x[findall(ismissing.(x.gamma_4)),:]


67 changes: 67 additions & 0 deletions src/encode/soundex.jl
@@ -0,0 +1,67 @@
module Soundex
export soundex
import StaticStrings: StaticString

function encode(chr::Char)::Char
if in(chr, ['a', 'e', 'i', 'o','y', 'u'])
return '9'
elseif in(chr,['s','c','k','j','g','z','x','q'])
return '2'
elseif in(chr,['n','m'])
return '5'
elseif 'l' == chr
return '4'
elseif 'r' == chr
return '6'
elseif in(chr,['t','d'])
return '3'
elseif in(chr,['h','w'])
return '8'
elseif in(chr,['v','b','p','f'])
return '1'
elseif chr === ' '
return '7'
else
@error "Unknown character encountered $chr"
end
end

function soundex(s::T)::StaticString{4} where T <: AbstractString
output = ['0' for _ in 1:4]
index = 1
chr=lowercase(s[1])
previous_encoding = encode(chr)
output[index] = uppercase(chr)
for chr in s[2:end]
if index <= 3
encoding = encode(lowercase(chr))
if encoding === '7'
# continue if space
continue
end
# only code consonants; vowels ('9') and the ignorable letters 'h'/'w' ('8') are skipped
if encoding !== '9' && encoding !== '8'
if encoding !== previous_encoding
# Rule 4 on Consonant Separators
if encoding === output[index]
# If a vowel (A, E, I, O, U) separates two consonants that have the same code, code consonant to the right of vowel
if previous_encoding === '9'
index += 1
output[index] = encoding
end
# If "H" or "W" separate two consonants that have code, the consonant to the right of the vowel is not coded
else
# different letter code soundex
index += 1
output[index] = encoding
end
end
end
else
break
end
previous_encoding = encoding
end
return StaticString(String(output))
end
end # module
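A minimal usage sketch of the new encoder; the include path and the example surnames are illustrative, not part of the commit, and the codes follow the standard Soundex mapping that the encode table above implements:

include("src/encode/soundex.jl")   # assumed load path for the new module
using .Soundex

soundex("Robert")    # "R163"
soundex("Jackson")   # "J250"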
108 changes: 80 additions & 28 deletions src/fastlink/fastlink.jl
@@ -88,6 +88,7 @@ matched_data = fastLink(dfA, dfB, ["firstname", "lastname", "city"])
function fastLink(dfA::DataFrame, dfB::DataFrame,
varnames::Vector{String},
idvar::Tuple{String,String};
term_freq_adjustment=[false],
match_method=String[],
partials=[true],
upper_case=[true],
@@ -116,11 +117,17 @@ function fastLink(dfA::DataFrame, dfB::DataFrame,
cut_a = check_input_lengths(cut_a, numvars, "cut_a")
cut_p = check_input_lengths(cut_p, numvars, "cut_p")
address_field = check_input_lengths(address_field, numvars, "address_field")
term_freq_adjustment = check_input_lengths(term_freq_adjustment, numvars, "term_freq_adjustment")
stringdist_method = check_input_lengths(stringdist_method, numvars, "stringdist_method")

vartypes, comparison_levels = check_var_types(dfA,dfB,varnames,match_method,partials)


# results table
res = [DiBitMatrix(obs_a,obs_b) for _ in varnames]

# term frequency tables
tf_table_x = [ones(Float16,dims[1]) for _ in varnames]
tf_table_y = [ones(Float16,dims[2]) for _ in varnames]

# allow missing for comparisons
allowmissing!(dfA)
@@ -131,31 +138,67 @@
for i in eachindex(varnames)
@info "Now matching var $(varnames[i]) using $(match_method[i])"
if match_method[i] == "fuzzy"
gammaCKfuzzy!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
cut_a=cut_a[i],
cut_b=cut_p[i],
upper=upper_case[i],
w=jw_weight[i],
partial=partials[i])
if term_freq_adjustment[i]
gammaCKfuzzy!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
view(tf_table_x[i],:),
view(tf_table_y[i],:),
cut_a=cut_a[i],
cut_b=cut_p[i],
upper=upper_case[i],
w=jw_weight[i],
partial=partials[i])
else
gammaCKfuzzy!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
cut_a=cut_a[i],
cut_b=cut_p[i],
upper=upper_case[i],
w=jw_weight[i],
partial=partials[i])
end
elseif match_method[i] == "string"
gammaCKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
distmethod=stringdist_method[i],
cut_a=cut_a[i],
cut_b=cut_p[i],
w=jw_weight[i],
partial=partials[i])

if term_freq_adjustment[i]
gammaCKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
view(tf_table_x[i],:),
view(tf_table_y[i],:),
distmethod=stringdist_method[i],
cut_a=cut_a[i],
cut_b=cut_p[i],
w=jw_weight[i],
partial=partials[i])
else
gammaCKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
distmethod=stringdist_method[i],
cut_a=cut_a[i],
cut_b=cut_p[i],
w=jw_weight[i],
partial=partials[i])
end
elseif match_method[i] == "exact" || match_method[i] == "bool"
gammaKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims)
if term_freq_adjustment[i]
gammaKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims,
view(tf_table_x[i],:),
view(tf_table_y[i],:))
else
gammaKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
res[i],
dims)
end
elseif match_method == "numeric" || match_method=="float" || match_method == "int"
gammaNUMCKpar!(dfA[!,varnames[i]],
dfB[!,varnames[i]],
@@ -176,11 +219,20 @@
address_field=address_field)
# testing removing unnecessary indices (where no obs exist)
#remove_no_matched_var_indices(resultsEM)
# adding uids
resultsEM = merge(resultsEM, (matched_ids = indices_to_uids(dfA[!, idvar[1]],dfB[!, idvar[2]],resultsEM.indices),))
# adding uids

if any(term_freq_adjustment)
resultsEM = merge(resultsEM, (matched_ids = indices_to_uids(dfA[!, idvar[1]],dfB[!, idvar[2]],resultsEM.indices),
tf_adj_table = tf_adj_table(resultsEM,varnames,tf_table_x,tf_table_y)))

else
resultsEM = merge(resultsEM, (matched_ids = indices_to_uids(dfA[!, idvar[1]],dfB[!, idvar[2]],resultsEM.indices),))
end



@info "Retrieving matches"
getMatches(resultsEM,threshold_match=threshold_match)
getMatches!(resultsEM,threshold_match=threshold_match)
return (resultsEM)
end

@@ -282,7 +334,7 @@ function fastLink(dfA::DataFrame, dfB::DataFrame;


@info "Retrieving matches"
getMatches(resultsEM,threshold_match=threshold_match)
getMatches!(resultsEM,threshold_match=threshold_match)
return (resultsEM)
end

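Putting the new pieces together, a minimal sketch of how the updated fastLink signature is called, mirroring scratch.jl; the column names and per-variable match methods are made up, and dfA/dfB are assumed to already carry the id/id2 columns. The id columns are passed as a tuple, and term_freq_adjustment is a vector of Bools that check_input_lengths expands to one entry per match variable:

varnames = ["firstname", "lastname", "city"]

results = fastLink(dfA, dfB, varnames, ("id", "id2"),
                   match_method = ["fuzzy", "fuzzy", "exact"],
                   term_freq_adjustment = [true],
                   cut_a = 0.92, cut_p = 0.88,
                   threshold_match = 0.85)

# when any entry of term_freq_adjustment is true, the returned results also
# carry a tf_adj_table alongside matched_ids (see the merge call above)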
85 changes: 85 additions & 0 deletions src/gammas/gammaCKfuzzy.jl
@@ -235,3 +235,88 @@ function gammaCKfuzzy!(vecA::PooledVector,vecB::PooledVector,results::DiBitMatrix
end
return nothing
end


function gammaCKfuzzy!(vecA::PooledVector,vecB::PooledVector,results::DiBitMatrix,dims::Tuple{Int,Int},
tf_table_x::SubArray{Float16},
tf_table_y::SubArray{Float16};
cut_a::Float64=0.92,cut_b::Float64=0.88,upper::Bool=true,
w::Float64=0.1,partial::Bool=true)


# functions that update the results view
if partial
score_value! = score_value2
else
score_value! = score_value
end

# change the range of ascii characters dependent on case
if upper
space_char,max_char = 0x40,0x5a
else
space_char,max_char = 0x60,0x7a
end
# length of unique values
lenA = UInt32(length(vecA.pool))
lenB = UInt32(length(vecB.pool))

# vector of pool value indices
lookup_a_by_id=pool_lookup_table(vecA.refs, lenA)
lookup_b_by_id=pool_lookup_table(vecB.refs, lenB)

dims=(length(vecA),length(vecB))

# term frequency for x
Threads.@threads for i in lookup_a_by_id
tf_val=length(i)/dims[1]
for ii in i
tf_table_x[ii] = tf_val
end
end

# term frequency for y
Threads.@threads for i in lookup_b_by_id
tf_val=length(i)/dims[2]
for ii in i
tf_table_y[ii] = tf_val
end
end

missingindexA = find_missing_index(vecA.invpool)

base_candidate_lookup = build_candidate_lookup(vecB.pool,spaceletter=space_char,lastletter=max_char)
base_candidate_scores = build_candidate_scores(vecB.pool)

Threads.@threads for (query_name,new_a_id) in collect(vecA.invpool)
# pass if query is missing val
if new_a_id === missingindexA
update_results!(results, lookup_a_by_id[new_a_id],UInt32(1):UInt32(dims[2]),missingval)
continue
end

query_len = UInt8(min(ncodeunits(query_name),16))
query_masks_lookup = maskify(query_name,query_len,space_char=space_char,max_char=max_char)
query_partial = UInt16(1024 ÷ query_len)
candidate_scores = deepcopy(base_candidate_scores)

for (query_index, (letter_index, query_mask_by_candidate_len)) in enumerate(query_masks_lookup)
for c_info in base_candidate_lookup[letter_index]
candidate_score = candidate_scores[c_info.name_index]
query_mask = query_mask_by_candidate_len[c_info.len]
score_letter!(candidate_score, query_mask, c_info.mask, query_index)
end
end

a_ids = lookup_a_by_id[new_a_id]
for (score_i, score) in enumerate(candidate_scores)
if score.len_partial === UInt16(1)
update_results!(results, a_ids, lookup_b_by_id[score_i], missingval)
continue
end
# if present calculate scores
score_value!(results, score,query_partial, a_ids,lookup_b_by_id,score_i, w, cut_a, cut_b)
end
end
return nothing
end
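For reference, a small, hypothetical illustration of the quantity the two term-frequency loops in this method write into tf_table_x / tf_table_y: each row receives the share of its column occupied by its value, so common values end up with large term frequencies that the downstream adjustment can discount:

using PooledArrays

names = PooledArray(["SMITH", "SMITH", "SMITH", "JONES"])
n = length(names)
tf = [count(==(names[i]), names) / n for i in eachindex(names)]
# tf == [0.75, 0.75, 0.75, 0.25]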
