full json and nested parameters update

jw2249a · May 6, 2024 · a193080 · a193080
1 parent c313e8c
commit a193080
Show file tree

Hide file tree

Showing 18 changed files with 1,136 additions and 651 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,12 +1,13 @@
 name = "FastLink"
 uuid = "11f39cfd-5548-489f-be9a-f4ad0ff6eadc"
 authors = ["Jack R. Williams <contact@jackryanwilliams.com>"]
-version = "0.0.2"
+version = "0.0.5"
 
 [deps]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
 StaticStrings = "4db0a0c5-418a-4e1d-8806-cb305fe13294"
 StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"

diff --git a/README.md b/README.md
@@ -9,7 +9,6 @@ The purpose of FastLink.jl is to bring a fast record linkage package to the juli
 ___________________________
 ### Using the fastLink function
 
-
 The basic arguments for the `fastLink` function to run are
 
 - `dfA`: A `DataFrame` table of records to be matched.
@@ -44,7 +43,7 @@ The optional parameters are
 
 - `tol_em`: Convergence tolerance for the EM Algorithm. (default `1e-05`)
 
-- `threshold_match`: Lower bound for the posterior probability that will act as a cutoff for matches.
+- `threshold_match`: Lower bound for the posterior probability that will act as a cutoff for matches. Default `[0.85]`.
 
 - `prior_lambda::Float64`: Default 0.0.
 
@@ -54,7 +53,7 @@ The optional parameters are
 
 - `w_pi::Float64`: Default 0.0.
 
-- `dedupe_matches`: Whether to dedupe the matches within the dataset. Default `[0.85]`.
+- `dedupe_matches`: Whether to dedupe the matches within the dataset.
 
 
 __________________
@@ -93,7 +92,7 @@ A `NamedTuple` with these vars:
 - `tf_adj_table` - A `Vector{DataFrame}` that has a DataFrame for each match pattern and a row in each DataFrame for each comparison appended with the letter of their corresponding dataset.
 
 - `varnames` - A `Vector{String}` of the input variable names
-
+ 
 - `zeta_j` - A `Vector{Float64}` with the posterior match probabilities for each agreement pattern. 
 
 # Examples

diff --git a/scratch.jl b/scratch.jl
@@ -1,23 +1,15 @@
-using Pkg
-Pkg.develop(path=".")
-Pkg.precompile()
+using FastLink
 using DataFrames
-using BenchmarkTools
 using CSV
-using FastLink
 using PooledArrays
 import Pkg.Artifacts: @artifact_str
-
+#Pkg.add(url="https://github.com/jw2249a/FastLink.jl")
+using StatsBase
+using JSON
 
 a_fil = @artifact_str "dfA"
 b_fil = @artifact_str "dfB"
 
-varnames=["firstname","middlename", "lastname","housenum"]
-match_method=["string", "string","string", "float"]
-cut_a=[0.92,0.92,0.92,1]
-cut_p=[0.88,0.88,0.88,2]
-
-
 
 dfA=CSV.read("$(a_fil)/dfA.csv", DataFrame,
              ntasks=1,
@@ -27,21 +19,25 @@ dfB=CSV.read("$(b_fil)/dfB.csv", DataFrame,
              ntasks=1,
              pool=true,
              missingstring=["", "NA"])
-dfA.id = hash.(eachrow(dfA))
-dfB.id2 = hash.(eachrow(dfB))
 
 
-for var in varnames[1:3]
-    dfA[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfA[:,var]))
-    dfB[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfB[:,var]))
-end
+config = JSON.parsefile("test_parameters.json")
 
-results=fastLink(dfA,dfB,varnames,("id","id2"),
-                 match_method=match_method,
-                 term_freq_adjustment=[true],
-                 cut_a=cut_a,cut_p=cut_p,
-                 threshold_match = 0.85)
+dfA.id = hash.(eachrow(dfA))
+dfB.id2 = hash.(eachrow(dfB))
 
 
+varnames=["firstname","middlename", "lastname","housenum"]
+
+for var in varnames  # normalize every linkage column in both tables
+    if nonmissingtype(eltype(dfA[!,var])) <: AbstractString  # eltype is Union{Missing,String} when missings exist
+        dfA[!,var] = PooledArray(passmissing(uppercase).(dfA[!,var]))  # upper-case; missings pass through
+        dfB[!,var] = PooledArray(passmissing(uppercase).(dfB[!,var]))
+    else  # non-string columns: replace the column with a plain Vector
+        dfA[!,var] = Vector(dfA[!,var])
+        dfB[!,var] = Vector(dfB[!,var])
+    end
+end
 
+result=fastLink(dfA, dfB, config)
 
diff --git a/src/FastLink.jl b/src/FastLink.jl
@@ -1,25 +1,55 @@
 module FastLink
 using DataFrames
 import PooledArrays: PooledVector
-import Distributions: Dirichlet,rand
 
 # match constants
 const nonmatch::UInt8 = UInt8(0)
 const match1::UInt8 = UInt8(1)
 const match2::UInt8 = UInt8(2)
 const missingval::UInt8 = UInt8(3)
 
+const STRING_DISTANCE_METHODS = Dict("jw" => "jw",  # accepted spelling => short canonical code
+                                     "jarowinkler" => "jw",
+                                     "jaro winkler" => "jw",
+                                     "jaro-winkler" => "jw",
+                                     "jaro" => "jaro",
+                                     "dl" => "dl",  # "dl" stands for the Damerau-Levenshtein spellings below
+                                     "dameraulevenshtein" => "dl",
+                                     "damerau levenshtein" => "dl",
+                                     "damerau-levenshtein" => "dl",
+                                     "lv" => "lv",  # "lv" stands for "levenshtein"
+                                     "levenshtein" => "lv",
+                                     "hamming" => "hamming",
+                                     "ro" => "ro",  # "ro" stands for the Ratcliff-Obershelp spellings below
+                                     "ratcliffobershelp" => "ro",
+                                     "ratcliff obershelp" => "ro",
+                                     "ratcliff-obershelp" => "ro",
+                                     "osa" => "osa",  # "osa" stands for "optimal string alignment"
+                                     "optimal string alignment" => "osa",
+                                     "optimalstringalignment" => "osa"
+                                     )
+
+include("settings/settings.jl")
 include("DiBitMatrix.jl")
-using .DiBitMat
+include("matchPatterns.jl")
 include("gammas/Gammas.jl")
+include("term_frequency_adjustment.jl")
+include("emlink.jl")
+include("patterns.jl")
+
+using .settings
+using .DiBitMat
+using .matchpatterns
 using .Gammas
+using .emlink
+using .tf
+using .patterns
 
-include("matchPatterns.jl")
-include("emlink.jl")
-include("getMatches.jl")
 include("fastlink/fastlink.jl")
 
-export(fastLink)
+export gammaCKpar!, gammaKpar!, gammaCKfuzzy!, gammaNUMCKpar!, DiBitMatrix, namedtuple, fetch_parameters, retrieve, parse_configuration, remove_keys, emlinkMARmov,  STRING_DISTANCE_METHODS, match1, match2, missingval, nonmatch, indices_to_uids, process_comparisons, fastLink
+
+#export(fastLink)
 
 
 end # module FastLink