Skip to content

Commit

Permalink
Skip projection for diagonal matrices in random projection
Browse files Browse the repository at this point in the history
  • Loading branch information
zgornel committed Jan 25, 2019
1 parent 3f5ecf8 commit 7d2eef7
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 6 deletions.
29 changes: 23 additions & 6 deletions src/rp.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ based on the effects of the
* `nwords::T` average number of words in a document
* `κ::Int` the `κ` parameter of the BM25 statistic
* `β::Float64` the `β` parameter of the BM25 statistic
* `project::Bool` specifies whether the model actually performs the projection or not; it is false if the number of dimensions provided is zero or negative
# References:
* [Kaski 1998](http://www.academia.edu/371863/Dimensionality_Reduction_by_Random_Mapping_Fast_Similarity_Computation_for_Clustering)
Expand All @@ -33,6 +34,7 @@ struct RPModel{S<:AbstractString, T<:AbstractFloat, A<:AbstractMatrix{T}, H<:Int
nwords::T # average words/document in corpus
κ::Int # κ parameter for Okapi BM25 (used if stats==:bm25)
β::Float64 # β parameter for Okapi BM25 (used if stats==:bm25)
project::Bool
end

function RPModel(dtm::DocumentTermMatrix{T};
Expand All @@ -56,9 +58,10 @@ function RPModel(dtm::DocumentTermMatrix{T};
idf = log.(n ./ documents_containing_term) .+ one(T)
nwords = mean(sum(dtm.dtm, dims=1))
R = random_projection_matrix(k, m, T, density)
project = ifelse(k > 0, true, false)
# Return the model
return RPModel(dtm.terms, dtm.row_indices, R,
stats, idf, nwords, κ, β)
stats, idf, nwords, κ, β, project)
end

function RPModel(dtm::DocumentTermMatrix{T}; kwargs...) where T<:Integer
Expand Down Expand Up @@ -117,7 +120,9 @@ end

function Base.show(io::IO, rpm::RPModel{S,T,A,H}) where {S,T,A,H}
len_vecs, num_terms = size(rpm.R)
print(io, "Random Projection Model ($(rpm.stats)), " *
str_proj = ifelse(rpm.project, "Random Projection model",
"Identity Projection")
print(io, "$str_proj ($(rpm.stats)), " *
"$(num_terms) terms, dimensionality $(len_vecs), $(T) vectors")
end

Expand Down Expand Up @@ -242,7 +247,12 @@ function embed_document(rpm::RPModel{S,T,A,H}, dtv::Vector{T}) where {S,T,A,H}
(k * (one(T) - b + b * words_in_document/rpm.nwords) .+ tf)
end
# Embed
d̂ = rpm.R * v # embed
local d̂
if rpm.project
d̂ = rpm.R * v # embed
else
d̂ = v
end
d̂ = d̂ ./ (norm(d̂,2) .+ eps(T)) # normalize
return d̂
end
Expand All @@ -259,7 +269,12 @@ function embed_document(rpm::RPModel{S,T,A,H}, dtm::DocumentTermMatrix{T}) where
elseif rpm.stats == :bm25
X = bm_25(dtm, κ=rpm.κ, β=rpm.β)
end
U = rpm.R * X
local U
if rpm.project
U = rpm.R * X
else
U = X
end
U ./= (sqrt.(sum(U.^2, dims=1)) .+ eps(T))
return U
end
Expand All @@ -282,6 +297,7 @@ function save_rp_model(rpm::RPModel{S,T,A,H}, filename::AbstractString) where {S
open(filename, "w") do fid
println(fid, "Random Projection Model saved at $(Dates.now())")
println(fid, "$nwords $k") # number of words, k
println(fid, rpm.project)
println(fid, rpm.stats)
writedlm(fid, rpm.idf', " ")
println(fid, rpm.nwords)
Expand Down Expand Up @@ -309,7 +325,7 @@ function load_rp_model(filename::AbstractString, ::Type{T}=DEFAULT_FLOAT_TYPE;
# Matrix type for random projection model
A = ifelse(sparse, SparseMatrixCSC{T, Int}, Matrix{T})
# Define parsed variables local to outer scope of do statement
local vocab, vocab_hash, R, stats, idf, nwords, κ, β
local vocab, vocab_hash, R, stats, idf, nwords, κ, β, project
open(filename, "r") do fid
readline(fid) # first line, header
line = readline(fid)
Expand All @@ -321,6 +337,7 @@ function load_rp_model(filename::AbstractString, ::Type{T}=DEFAULT_FLOAT_TYPE;
R = zeros(T, k, vocab_size)
end
# Start parsing the rest of the file
project = parse(Bool, strip(readline(fid)))
stats = Symbol(strip(readline(fid)))
idf = map(x->parse(T, x), split(readline(fid), ' '))
nwords = parse(T, readline(fid))
Expand All @@ -332,5 +349,5 @@ function load_rp_model(filename::AbstractString, ::Type{T}=DEFAULT_FLOAT_TYPE;
R[i,:] = map(x->parse(T,x), split(readline(fid), ' '))
end
end
return RPModel{String, T, A, Int}(vocab, vocab_hash, R, stats, idf, nwords, κ, β)
return RPModel{String, T, A, Int}(vocab, vocab_hash, R, stats, idf, nwords, κ, β, project)
end
2 changes: 2 additions & 0 deletions test/rp.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@
@test size(model.R, 2) == m
if k > 0
@test size(model.R, 1) == k
@test model.project
else
@test size(model.R, 1) == m # no projection occurs
@test !model.project
end
idxs, corrs = cosine(model, dtm, query)
@test length(idxs) == length(corrs) == length(crps)
Expand Down

0 comments on commit 7d2eef7

Please sign in to comment.