Skip to content

Commit

Permalink
full json and nested parameters update
Browse files Browse the repository at this point in the history
  • Loading branch information
jw2249a committed May 6, 2024
1 parent c313e8c commit a193080
Show file tree
Hide file tree
Showing 18 changed files with 1,136 additions and 651 deletions.
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
name = "FastLink"
uuid = "11f39cfd-5548-489f-be9a-f4ad0ff6eadc"
authors = ["Jack R. Williams <contact@jackryanwilliams.com>"]
version = "0.0.2"
version = "0.0.5"

[deps]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
StaticStrings = "4db0a0c5-418a-4e1d-8806-cb305fe13294"
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
Expand Down
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ The purpose of FastLink.jl is to bring a fast record linkage package to the juli
___________________________
### Using the fastLink function


The basic arguments required to run the `fastLink` function are:

- `dfA`: A `DataFrame` table of records to be matched.
Expand Down Expand Up @@ -44,7 +43,7 @@ The optional parameters are

- `tol_em`: Convergence tolerance for the EM Algorithm. (default `1e-05`)

- `threshold_match`: Lower bound for the posterior probability that will act as a cutoff for matches.
- `threshold_match`: Lower bound for the posterior probability that will act as a cutoff for matches. Default `[0.85]`.

- `prior_lambda::Float64`: Default 0.0.

Expand All @@ -54,7 +53,7 @@ The optional parameters are

- `w_pi::Float64`: Default 0.0.

- `dedupe_matches`: Whether to dedupe the matches within the dataset. Default `[0.85]`.
- `dedupe_matches`: Whether to dedupe the matches within the dataset.


__________________
Expand Down Expand Up @@ -93,7 +92,7 @@ A `NamedTuple` with these vars:
- `tf_adj_table` - A `Vector{DataFrame}` that has a DataFrame for each match pattern and a row in each DataFrame for each comparison appended with the letter of their corresponding dataset.

- `varnames` - A `Vector{String}` of the input variable names

- `zeta_j` - A `Vector{Float64}` with the posterior match probabilities for each agreement pattern.

# Examples
Expand Down
42 changes: 19 additions & 23 deletions scratch.jl
Original file line number Diff line number Diff line change
@@ -1,23 +1,15 @@
using Pkg
Pkg.develop(path=".")
Pkg.precompile()
using FastLink
using DataFrames
using BenchmarkTools
using CSV
using FastLink
using PooledArrays
import Pkg.Artifacts: @artifact_str

#Pkg.add(url="https://github.com/jw2249a/FastLink.jl")
using StatsBase
using JSON

# Resolve the artifacts named "dfA" and "dfB"; the results are interpolated
# into the CSV paths read further down.
a_fil = @artifact_str "dfA"
b_fil = @artifact_str "dfB"

# Columns used for linkage and, position by position, the comparison method
# applied to each one (three string comparisons, one numeric).
varnames=["firstname","middlename", "lastname","housenum"]
match_method=["string", "string","string", "float"]
# Per-variable cutoffs handed to `fastLink` as `cut_a` / `cut_p`.
# NOTE(review): presumably the full-match and partial-match thresholds — confirm
# against the package documentation.
cut_a=[0.92,0.92,0.92,1]
cut_p=[0.88,0.88,0.88,2]



dfA=CSV.read("$(a_fil)/dfA.csv", DataFrame,
ntasks=1,
Expand All @@ -27,21 +19,25 @@ dfB=CSV.read("$(b_fil)/dfB.csv", DataFrame,
ntasks=1,
pool=true,
missingstring=["", "NA"])
# Give every row an identifier computed by hashing the row's contents.
dfA.id = hash.(eachrow(dfA))
dfB.id2 = hash.(eachrow(dfB))


# Upper-case the first three linkage variables in both tables and store them
# as pooled arrays; `passmissing` lets `missing` entries pass through unchanged.
for var in varnames[1:3]
dfA[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfA[:,var]))
dfB[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfB[:,var]))
end
# Parse the linkage settings from a JSON file (path is relative to the
# current working directory).
config = JSON.parsefile("test_parameters.json")

# Keyword-argument form of the call.
# NOTE(review): ("id","id2") look like the id column names for dfA and dfB
# respectively — confirm against the `fastLink` signature.
results=fastLink(dfA,dfB,varnames,("id","id2"),
match_method=match_method,
term_freq_adjustment=[true],
cut_a=cut_a,cut_p=cut_p,
threshold_match = 0.85)
# Same row-hash identifiers as assigned above.
dfA.id = hash.(eachrow(dfA))
dfB.id2 = hash.(eachrow(dfB))


# Columns that take part in the linkage.
varnames = ["firstname", "middlename", "lastname", "housenum"]

# Normalise every linkage column in both tables before matching:
#   * string columns are upper-cased (missing values pass through untouched)
#     and stored as a PooledArray;
#   * every other column is materialised as a plain Vector.
for var in varnames
    for df in (dfA, dfB)
        col = df[!, var]  # no copy needed; the column is replaced below
        # Strip `Missing` from the element type before testing it: a column
        # typed `Union{Missing,String}` is NOT `<: AbstractString`, so a bare
        # `eltype(col) <: AbstractString` check would skip upper-casing for any
        # string column that contains a missing value. The decision is also made
        # per table rather than from dfA's column alone.
        T = nonmissingtype(eltype(col))
        # `Union{}` (an all-missing column) is a subtype of everything, so
        # exclude it explicitly and treat such a column as non-string.
        if T !== Union{} && T <: AbstractString
            df[!, var] = PooledArray(passmissing(uppercase).(col))
        else
            df[!, var] = Vector(col)
        end
    end
end

# Run the linkage using the settings in `config`.
result = fastLink(dfA, dfB, config)

42 changes: 36 additions & 6 deletions src/FastLink.jl
Original file line number Diff line number Diff line change
@@ -1,25 +1,55 @@
# Top-level module of the FastLink package: declares shared constants, pulls in
# the implementation files with `include`, and exports the public names.
module FastLink
using DataFrames
import PooledArrays: PooledVector
import Distributions: Dirichlet,rand

# Match constants: UInt8 codes 0–3 naming the possible comparison outcomes.
# NOTE(review): presumably `match1`/`match2` distinguish two levels of
# agreement (e.g. partial vs. full) — confirm against the gamma routines.
const nonmatch::UInt8 = UInt8(0)
const match1::UInt8 = UInt8(1)
const match2::UInt8 = UInt8(2)
const missingval::UInt8 = UInt8(3)

# Lookup table from the accepted spellings of a string-distance method name
# (short code, run-together, space-separated and hyphenated forms) to one
# canonical short code: "jw", "jaro", "dl", "lv", "hamming", "ro" or "osa".
# All keys are lower-case.
const STRING_DISTANCE_METHODS = Dict("jw" => "jw",
"jarowinkler" => "jw",
"jaro winkler" => "jw",
"jaro-winkler" => "jw",
"jaro" => "jaro",
"dl" => "dl",
"dameraulevenshtein" => "dl",
"damerau levenshtein" => "dl",
"damerau-levenshtein" => "dl",
"lv" => "lv",
"levenshtein" => "lv",
"hamming" => "hamming",
"ro" => "ro",
"ratcliffobershelp" => "ro",
"ratcliff obershelp" => "ro",
"ratcliff-obershelp" => "ro",
"osa" => "osa",
"optimal string alignment" => "osa",
"optimalstringalignment" => "osa"
)

# Implementation files. Each is evaluated inside this module, so the constants
# declared above are already defined when they load.
# NOTE(review): "matchPatterns.jl" and "emlink.jl" are included twice in this
# listing, and `using .DiBitMat` appears twice — confirm whether the
# repetition is intentional.
include("settings/settings.jl")
include("DiBitMatrix.jl")
using .DiBitMat
include("matchPatterns.jl")
include("gammas/Gammas.jl")
include("term_frequency_adjustment.jl")
include("emlink.jl")
include("patterns.jl")

# Bring the names exported by the submodules into FastLink's own namespace.
using .settings
using .DiBitMat
using .matchpatterns
using .Gammas
using .emlink
using .tf
using .patterns

include("matchPatterns.jl")
include("emlink.jl")
include("getMatches.jl")
include("fastlink/fastlink.jl")

# Public API. The second `export` lists `fastLink` together with lower-level
# routines, helpers and the constants declared above.
export(fastLink)
export gammaCKpar!, gammaKpar!, gammaCKfuzzy!, gammaNUMCKpar!, DiBitMatrix, namedtuple, fetch_parameters, retrieve, parse_configuration, remove_keys, emlinkMARmov, STRING_DISTANCE_METHODS, match1, match2, missingval, nonmatch, indices_to_uids, process_comparisons, fastLink

#export(fastLink)


end # module FastLink
Loading

0 comments on commit a193080

Please sign in to comment.