-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* major update. numeric completely working, priors added for expectation maximization * removing temp files * addendum. removing unnecessary temp file from emacs... again
- Loading branch information
Showing
19 changed files
with
772 additions
and
903 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,5 +23,7 @@ docs/site/ | |
# environment. | ||
Manifest.toml | ||
|
||
|
||
actual_scratch.jl | ||
scratch.jl | ||
.#* | ||
\#* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,21 @@ | ||
name = "FastLink" | ||
uuid = "11f39cfd-5548-489f-be9a-f4ad0ff6eadc" | ||
authors = ["Jack R. Williams <contact@jackryanwilliams.com>"] | ||
version = "0.1.1" | ||
version = "0.0.2" | ||
|
||
[deps] | ||
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" | ||
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" | ||
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" | ||
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" | ||
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404" | ||
|
||
[extras] | ||
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" | ||
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" | ||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" | ||
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" | ||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" | ||
|
||
|
||
[targets] | ||
test = ["Test", "CSV", "Pkg"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,7 @@ | ||
# FastLink.jl | ||
Fast Probabilistic Record Linkage for the Julia Language | ||
## What is FastLink.jl | ||
|
||
The purpose of FastLink.jl is to bring a fast record linkage package to the Julia language. When attempting to match large datasets using existing libraries in R and Python, I found that they can be very slow and succumb to memory pressure. This implementation of the fastlink algorithm is intended to scale efficiently in parallel and to easily handle matches between tabular datasets that span millions of rows. | ||
|
||
[![Run tests](https://github.com/jw2249a/FastLink.jl/actions/workflows/test.yml/badge.svg)](https://github.com/jw2249a/FastLink.jl/actions/workflows/test.yml) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,108 +1,48 @@ | ||
using Pkg | ||
Pkg.develop(path=".") | ||
#Pkg.develop(path=".") | ||
using DataFrames | ||
using BenchmarkTools | ||
using CSV | ||
using FastLink | ||
using PooledArrays | ||
import Pkg.Artifacts: @artifact_str | ||
|
||
numeric=false | ||
# files for performance | ||
test=true | ||
if test | ||
a_fil="../dfA.csv" | ||
b_fil="../dfB.csv" | ||
if numeric | ||
varnames=["housenum"] | ||
match_method=["float"] | ||
cut_a=[1] | ||
cut_p=[2] | ||
else | ||
varnames=["firstname","middlename", "lastname","housenum"] | ||
match_method=["string", "string","string", "float"] | ||
cut_a=[0.92,0.92,0.92,1] | ||
cut_p=[0.88,0.88,0.88,2] | ||
end | ||
else | ||
a_fil="../../rstudio/test_merge/data/test_a.csv" | ||
b_fil="../../rstudio/test_merge/data/test_b.csv" | ||
|
||
if numeric | ||
varnames=["ZIP", "DOB_YEAR", "ZIP4"] | ||
match_method=["float", "float", "float"] | ||
cut_a=[1,1,1] | ||
cut_p=[2,2,2] | ||
else | ||
varnames=["FIRST_NAME", "MIDDLE_NAME", "LAST_NAME", "STREET_NAME"] | ||
cut_a=[0.92,0.92,0.92,0.92] | ||
cut_p=[0.88,0.88,0.88,0.88] | ||
#varnames=["FIRST_NAME", "MIDDLE_NAME", "LAST_NAME", "STREET_NAME", "STATE"] | ||
end | ||
end | ||
a_fil = @artifact_str "dfA" | ||
b_fil = @artifact_str "dfB" | ||
|
||
varnames=["firstname","middlename", "lastname","housenum"] | ||
match_method=["string", "string","string", "float"] | ||
cut_a=[0.92,0.92,0.92,1] | ||
cut_p=[0.88,0.88,0.88,2] | ||
|
||
#[100,200,500,1_000,2_000,4_000, 5_000, 10_000,20_000, 40_000, 50_000,100_000,1_000_000] | ||
N1=10_000 | ||
N2=500_000 | ||
|
||
|
||
if test | ||
dfA=CSV.read(a_fil, DataFrame, | ||
ntasks=1, | ||
pool=true, | ||
missingstring=["", "NA"]) | ||
dfB=CSV.read(b_fil, DataFrame, | ||
ntasks=1, | ||
pool=true, | ||
missingstring=["", "NA"]) | ||
else | ||
dfA=CSV.read(a_fil, DataFrame, | ||
limit=N1, | ||
ignoreemptyrows=true, | ||
ntasks=1, | ||
pool=true, | ||
missingstring=["", "NA", "NaN", "NULL", "Null"]) | ||
dfB=CSV.read(b_fil, DataFrame, | ||
limit=N2, | ||
ignoreemptyrows=true, | ||
ntasks=1, | ||
pool=true, | ||
missingstring=["", "NA", "NaN", "NULL", "Null"]) | ||
end | ||
dfA=CSV.read("$(a_fil)/dfA.csv", DataFrame, | ||
ntasks=1, | ||
pool=true, | ||
missingstring=["", "NA"]) | ||
dfB=CSV.read("$(b_fil)/dfB.csv", DataFrame, | ||
ntasks=1, | ||
pool=true, | ||
missingstring=["", "NA"]) | ||
|
||
|
||
if !test && numeric | ||
for var in varnames | ||
dfA[!,var]=passmissing(x-> try return parse(Float64,x) catch e return 0.0 end).(dfA[:,var]) | ||
dfB[!,var]=passmissing(x-> try return parse(Float64,x) catch e return 0.0 end).(dfB[:,var]) | ||
end | ||
for var in varnames[1:3] | ||
dfA[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfA[:,var])) | ||
dfB[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfB[:,var])) | ||
end | ||
|
||
# if test && !numeric | ||
# for var in varnames | ||
# dfA[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfA[:,var])) | ||
# dfB[!,var] = PooledArray(passmissing(x->uppercase(x)).(dfB[:,var])) | ||
# end | ||
# end | ||
|
||
|
||
config = fastLink(dfA,dfB,varnames,match_method=match_method,cut_a=cut_a,cut_p=cut_p, | ||
threshold_match = 0.85) | ||
|
||
|
||
|
||
dump(config.fastlink_settings.comparison_funs[4]) | ||
|
||
results=fastLink(dfA,dfB,varnames,match_method=match_method,cut_a=cut_a,cut_p=cut_p, | ||
|
||
|
||
|
||
threshold_match = 0.85)() | ||
|
||
|
||
|
||
|
||
x=results[1].patterns_w | ||
x[findall(ismissing.(x.gamma_4) .== false .&& x.gamma_4 .== 1),:] | ||
x[findall(ismissing.(x.gamma_4)),:] | ||
44+7+1+43+79+1 | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
module DiBitMat
import Base: getindex, setindex!, view
import DataStructures: DiBitVector
export DiBitMatrix

"""
    DiBitMatrix(nrows::Integer, ncols::Integer)

Matrix built on top of a `DiBitVector` from DataStructures.jl.

Cells are stored column-major in the single backing vector `data`, so cell
`(i, j)` lives at linear index `(j - 1) * nrows + i`. The two-argument
constructor allocates `nrows * ncols` cells, all initialised to `0`.

# Throws
- `ArgumentError` if either dimension is negative.
"""
struct DiBitMatrix
    data::DiBitVector
    # Concrete `Int` fields keep the index arithmetic below type-stable.
    nrows::Int
    ncols::Int
end

function DiBitMatrix(nrows::Integer, ncols::Integer)
    (nrows >= 0 && ncols >= 0) ||
        throw(ArgumentError("dimensions must be non-negative, got ($nrows, $ncols)"))
    # Convert before multiplying so narrow integer types cannot overflow.
    r, c = Int(nrows), Int(ncols)
    return DiBitMatrix(DiBitVector(r * c, 0), r, c)
end

# Bounds helpers. The backing vector cannot catch these errors: an out-of-range
# row index still maps to a valid linear index that belongs to a neighbouring
# column, so it would silently read or write the wrong cell.
checkrow(vm::DiBitMatrix, i::Integer) =
    1 <= i <= vm.nrows || throw(BoundsError(vm, (i, :)))
checkcol(vm::DiBitMatrix, j::Integer) =
    1 <= j <= vm.ncols || throw(BoundsError(vm, (:, j)))

# Column-major position of cell (i, j) inside `vm.data`.
linearindex(vm::DiBitMatrix, i::Integer, j::Integer) = (Int(j) - 1) * vm.nrows + Int(i)

"""
    getindex(vm::DiBitMatrix, i, j)
    getindex(vm::DiBitMatrix, :, j)
    getindex(vm::DiBitMatrix, i, :)

Return the cell `(i, j)`, or a freshly allocated `Vector{UInt8}` holding a copy
of column `j` / row `i`. Throws `BoundsError` when `i` or `j` is out of range.
"""
@inline function getindex(vm::DiBitMatrix, i::Integer, j::Integer)
    @boundscheck begin
        checkrow(vm, i)
        checkcol(vm, j)
    end
    return vm.data[linearindex(vm, i, j)]
end

function getindex(vm::DiBitMatrix, ::Colon, j::Integer)
    start, finish = getIndices(vm, :, j)
    return UInt8[vm.data[k] for k in start:finish]
end

function getindex(vm::DiBitMatrix, i::Integer, ::Colon)
    return UInt8[vm.data[k] for k in getIndices(vm, i, :)]
end

"""
    setindex!(vm::DiBitMatrix, value::Integer, i::Integer, j::Integer)

Store `value` in cell `(i, j)`. Range validation of `value` is left to the
backing `DiBitVector`. Throws `BoundsError` when `i` or `j` is out of range.
"""
@inline function setindex!(vm::DiBitMatrix, value::Integer, i::Integer, j::Integer)
    @boundscheck begin
        checkrow(vm, i)
        checkcol(vm, j)
    end
    vm.data[linearindex(vm, i, j)] = value
    return value
end

"""
    getIndices(vm::DiBitMatrix, :, j) -> (start, finish)
    getIndices(vm::DiBitMatrix, i, :) -> indices

Linear indices into `vm.data` covered by column `j` (returned as the inclusive
bounds of a contiguous run) or by row `i` (returned as a strided range with
step `vm.nrows`, one entry per column).
"""
function getIndices(vm::DiBitMatrix, ::Colon, j::Integer)
    checkcol(vm, j)
    offset = (Int(j) - 1) * vm.nrows
    return offset + 1, offset + vm.nrows
end

function getIndices(vm::DiBitMatrix, i::Integer, ::Colon)
    checkrow(vm, i)
    first_index = Int(i)
    # Same positions the element-by-element loop would produce, without
    # allocating an abstractly typed vector.
    return first_index:vm.nrows:((vm.ncols - 1) * vm.nrows + first_index)
end

"""
    view(vm::DiBitMatrix, :, j)
    view(vm::DiBitMatrix, i, :)

Return a view into the backing `DiBitVector` covering column `j` or row `i`;
writes through the view modify `vm`.
"""
function view(vm::DiBitMatrix, ::Colon, j::Integer)
    start, finish = getIndices(vm, :, j)
    return view(vm.data, start:finish)
end

function view(vm::DiBitMatrix, i::Integer, ::Colon)
    return view(vm.data, getIndices(vm, i, :))
end

end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.