Skip to content

Commit

Permalink
adding tf version of the new gammakpar!
Browse files Browse the repository at this point in the history
  • Loading branch information
jw2249a committed May 13, 2024
1 parent 174cfd9 commit d62819d
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 7 deletions.
11 changes: 5 additions & 6 deletions scratch.jl
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
using FastLink
using DataFrames
using CSV
using PooledArrays
import Pkg.Artifacts: @artifact_str
#Pkg.add(url="https://github.com/jw2249a/FastLink.jl")
using StatsBase
import Pkg
Pkg.add(url="https://github.com/jw2249a/FastLink.jl")
using FastLink
using JSON

a_fil = @artifact_str "dfA"
b_fil = @artifact_str "dfB"
a_fil = Pkg.Artifacts.@artifact_str "dfA"
b_fil = Pkg.Artifacts.@artifact_str "dfB"


dfA=CSV.read("$(a_fil)/dfA.csv", DataFrame,
Expand Down
77 changes: 76 additions & 1 deletion src/gammas/gammaKpar.jl
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,82 @@ function gammaKpar!(vecA::PooledVector,vecB::PooledVector,results::DiBitMatrix,
return nothing
end



# Group the positions of `refs` by pool level in a single pass: entry `k` of the result
# holds every index `i` for which `refs[i] == k`.
function _tf_indices_by_level(refs::AbstractVector{<:Integer}, nlevels::Integer)
    groups = [Int[] for _ in 1:nlevels]
    for (i, r) in pairs(refs)
        push!(groups[r], i)
    end
    return groups
end

# Write the term frequency of each level (group size / `n`, floored at `floor_value`)
# into `tf_table` at every position belonging to that level.
function _tf_fill_frequencies!(tf_table, groups, n::Integer, floor_value)
    for idx in groups
        tf = max(length(idx) / n, floor_value)
        for i in idx
            tf_table[i] = tf
        end
    end
    return nothing
end

"""
    gammaKpar!(vecA::Vector, vecB::Vector, results::DiBitMatrix,
               tf_table_x::SubArray{Float16}, tf_table_y::SubArray{Float16};
               tf_minimum_u_value=0.001)

Vector version of `gammaKpar!` with term-frequency tables, intended for highly entropic
data (many distinct values relative to the number of observations).

Both vectors are pooled with `PooledArray`. For every pair of non-missing pool values
that compare equal with `==`, each cell `results[ix, iy]` whose row `ix` holds that value
in `vecA` and whose column `iy` holds it in `vecB` is set to `match2`. Afterwards every
row where `vecA` is missing and every column where `vecB` is missing is set to
`missingval`, overriding any earlier write.

`tf_table_x[i]` receives the share of `vecA` entries equal to `vecA[i]` (missing counts
as its own value), floored at `tf_minimum_u_value`; `tf_table_y` likewise for `vecB`.

# Arguments
- `vecA`, `vecB`: the two columns being compared; they are not modified.
- `results`: comparison matrix indexed as `results[index_in_vecA, index_in_vecB]`.
- `tf_table_x`, `tf_table_y`: output views indexed like `vecA` / `vecB`.

# Keywords
- `tf_minimum_u_value`: lower bound applied to every stored term frequency.

# Returns
`nothing`; `results`, `tf_table_x` and `tf_table_y` are mutated in place.

`match2` and `missingval` are not defined in this function; they are resolved from the
enclosing scope (presumably module-level constants -- TODO confirm).
"""
function gammaKpar!(vecA::Vector,vecB::Vector,results::DiBitMatrix,
                    tf_table_x::SubArray{Float16},
                    tf_table_y::SubArray{Float16};
                    tf_minimum_u_value=0.001)
    # Fresh names instead of rebinding the arguments: the threaded loops below capture
    # these, and a variable that changes type and is captured would be boxed.
    pooled_a = PooledArray(vecA)
    pooled_b = PooledArray(vecB)
    n_a = length(pooled_a)
    n_b = length(pooled_b)
    pool_a = pooled_a.pool
    pool_b = pooled_b.pool

    # One linear pass per vector instead of one `findall` scan per pool level.
    groups_a = _tf_indices_by_level(pooled_a.refs, length(pool_a))
    groups_b = _tf_indices_by_level(pooled_b.refs, length(pool_b))

    # Term frequencies are filled serially, before any threaded work, and for every level
    # (missing included). Doing this inside the threaded loop let threads share one
    # temporary and rewrite the same `tf_table_y` cells concurrently.
    _tf_fill_frequencies!(tf_table_x, groups_a, n_a, tf_minimum_u_value)
    _tf_fill_frequencies!(tf_table_y, groups_b, n_b, tf_minimum_u_value)

    # Segment unique keys from the missing key (`nothing` when there is no missing level,
    # in which case the filter keeps every level).
    miss_a = findfirst(ismissing, pool_a)
    miss_b = findfirst(ismissing, pool_b)
    levels_a = filter(!=(miss_a), eachindex(pool_a))
    levels_b = filter(!=(miss_b), eachindex(pool_b))

    # Mark every (row, column) pair whose pooled values are equal.
    # NOTE(review): threads write distinct cells of `results`; this assumes DiBitMatrix
    # `setindex!` tolerates concurrent writes to different cells -- confirm.
    Threads.@threads for ka in levels_a
        rows = groups_a[ka]
        value_a = pool_a[ka]
        for kb in levels_b
            if value_a == pool_b[kb]
                for ix in rows, iy in groups_b[kb]
                    results[ix,iy] = match2
                end
            end
        end
    end

    # set all to missing where x is missing
    if !isnothing(miss_a)
        missing_rows = groups_a[miss_a]
        Threads.@threads for iy in 1:n_b
            for ix in missing_rows
                results[ix,iy] = missingval
            end
        end
    end

    # set all to missing where y is missing
    if !isnothing(miss_b)
        missing_cols = groups_b[miss_b]
        Threads.@threads for ix in 1:n_a
            for iy in missing_cols
                results[ix,iy] = missingval
            end
        end
    end

    return nothing
end

# vector version of gammakpar for highly entropic data (aka high ordinality compared to obs)
function gammaKpar!(vecA::Vector,vecB::Vector,results::DiBitMatrix)
if @isdefined(_dims) == false
Expand Down Expand Up @@ -162,4 +238,3 @@ function gammaKpar!(vecA::Vector,vecB::Vector,results::DiBitMatrix)
# Return nothing
return nothing
end

0 comments on commit d62819d

Please sign in to comment.