From d62819d43d8c7d3f8ce9cd40c838a69c14bcde26 Mon Sep 17 00:00:00 2001
From: jw2249a
Date: Mon, 13 May 2024 11:15:56 +0000
Subject: [PATCH] adding tf version of the new gammakpar!

---
Reviewer note: line breaks and indentation of this patch were reconstructed
from a whitespace-collapsed copy, and the added gammaKpar! method was revised
in review. The post-image blob id in the gammaKpar.jl index line still refers
to the pre-review content.

 scratch.jl              | 11 +++---
 src/gammas/gammaKpar.jl | 94 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 98 insertions(+), 7 deletions(-)

diff --git a/scratch.jl b/scratch.jl
index fe8b999..d00febb 100755
--- a/scratch.jl
+++ b/scratch.jl
@@ -1,14 +1,13 @@
-using FastLink
 using DataFrames
 using CSV
 using PooledArrays
-import Pkg.Artifacts: @artifact_str
-#Pkg.add(url="https://github.com/jw2249a/FastLink.jl")
-using StatsBase
+import Pkg
+Pkg.add(url="https://github.com/jw2249a/FastLink.jl")
+using FastLink
 using JSON
 
-a_fil = @artifact_str "dfA"
-b_fil = @artifact_str "dfB"
+a_fil = Pkg.Artifacts.@artifact_str "dfA"
+b_fil = Pkg.Artifacts.@artifact_str "dfB"
 
 
 dfA=CSV.read("$(a_fil)/dfA.csv", DataFrame,
diff --git a/src/gammas/gammaKpar.jl b/src/gammas/gammaKpar.jl
index c1b1ffd..e82ad59 100755
--- a/src/gammas/gammaKpar.jl
+++ b/src/gammas/gammaKpar.jl
@@ -129,6 +129,99 @@ function gammaKpar!(vecA::PooledVector,vecB::PooledVector,results::DiBitMatrix,
     return nothing
 end
 
+
+
+# vector version of gammakpar for highly entropic data (aka high cardinality compared to obs)
+# that also fills the term frequency (tf) tables.
+#
+# vecA, vecB: values to compare; both are pooled internally.
+# results: results[i,j] is set to match2 when vecA[i] and vecB[j] are equal non-missing
+#   values and to missingval when either value is missing. Other cells are not written.
+# tf_table_x, tf_table_y: entry i receives the share of positions in vecA (vecB) that
+#   hold the same value as position i (missing counts as a value of its own), floored
+#   at tf_minimum_u_value.
+#
+# match2 and missingval are not defined here and must be in scope where this is included.
+function gammaKpar!(vecA::Vector,vecB::Vector,results::DiBitMatrix,
+                    tf_table_x::SubArray{Float16},
+                    tf_table_y::SubArray{Float16};
+                    tf_minimum_u_value=0.001)
+    # Pool under fresh names; the arguments are never rebound, so nothing captured by
+    # the Threads.@threads closures below is reassigned.
+    pooled_x = PooledArray(vecA)
+    pooled_y = PooledArray(vecB)
+    pool_x = pooled_x.pool
+    pool_y = pooled_y.pool
+    _dims = (length(pooled_x), length(pooled_y))
+
+    # Positions of every pooled value, gathered in one pass over each refs vector
+    groups_x = [Int[] for _ in eachindex(pool_x)]
+    for (i, ref) in enumerate(pooled_x.refs)
+        push!(groups_x[ref], i)
+    end
+    groups_y = [Int[] for _ in eachindex(pool_y)]
+    for (i, ref) in enumerate(pooled_y.refs)
+        push!(groups_y[ref], i)
+    end
+
+    # term frequency adjustment for x, done serially so no value is shared across threads
+    for indices_x in groups_x
+        tf_val_x = max(length(indices_x)/_dims[1], tf_minimum_u_value)
+        for tf_i in indices_x
+            tf_table_x[tf_i] = tf_val_x
+        end
+    end
+    # term frequency adjustment for y; independent of x, so it is filled even when
+    # vecA has no non-missing value
+    for indices_y in groups_y
+        tf_val_y = max(length(indices_y)/_dims[2], tf_minimum_u_value)
+        for tf_i in indices_y
+            tf_table_y[tf_i] = tf_val_y
+        end
+    end
+
+    # Segment unique keys from missing key
+    missingvals_x = findfirst(ismissing, pool_x)
+    iter_x = filter(x -> x != missingvals_x, eachindex(pool_x))
+
+    missingvals_y = findfirst(ismissing, pool_y)
+    iter_y = filter(y -> y != missingvals_y, eachindex(pool_y))
+
+    # Form match matrices based on differing levels of matches
+    Threads.@threads for x in iter_x
+        indices_x = groups_x[x]
+        for y in iter_y
+            # if matches at a threshold, go through result vector and assign new value
+            if pool_x[x] == pool_y[y]
+                for ix in indices_x, iy in groups_y[y]
+                    results[ix,iy] = match2
+                end
+            end
+        end
+    end
+
+    # set all to missing where x is missing
+    if !isnothing(missingvals_x)
+        missing_rows = groups_x[missingvals_x]
+        Threads.@threads for iy in 1:_dims[2]
+            for ix in missing_rows
+                results[ix,iy] = missingval
+            end
+        end
+    end
+    # set all to missing where y is missing
+    if !isnothing(missingvals_y)
+        missing_cols = groups_y[missingvals_y]
+        Threads.@threads for ix in 1:_dims[1]
+            for iy in missing_cols
+                results[ix,iy] = missingval
+            end
+        end
+    end
+    # Return nothing
+    return nothing
+end
+
 # vector version of gammakpar for highly entropic data (aka high ordinality compared to obs)
 function gammaKpar!(vecA::Vector,vecB::Vector,results::DiBitMatrix)
     if @isdefined(_dims) == false
@@ -162,4 +255,3 @@ function gammaKpar!(vecA::Vector,vecB::Vector,results::DiBitMatrix)
     # Return nothing
     return nothing
 end
-