Skip to content

Commit

Permalink
add filter for alingmnents, returning an alignment
Browse files Browse the repository at this point in the history
  • Loading branch information
PierreBarrat committed Oct 21, 2024
1 parent a420fdf commit 3f818d1
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/BioSequenceMappings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ using Random
import Base: length, size
import Base: in, ==, hash, convert, copy
import Base: getindex, firstindex, lastindex, eachindex, view, keys
import Base: iterate, eltype, unique
import Base: iterate, eltype, unique, filter
import Base: cat
import Base: write

Expand Down
26 changes: 25 additions & 1 deletion src/alignment.jl
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,11 @@ of `data`. They do not have to be unique, and can be ignored
@assert isapprox(sum(weights), 1; rtol = 1e-8) """
Weights must sum to 1 - got $(sum(weights))
"""
@assert Meff <= size(data, 2) "Effective M larger than number of sequences $(Meff) > $(size(data,2))"
if Meff - size(data, 2) > 1e-6
error("Effective M larger than number of sequences $(Meff) > $(size(data,2))")
elseif Meff - size(data, 2) > 0
Meff = round(Int, Meff)
end

alphabet_copy = isnothing(alphabet) ? nothing : copy(alphabet)
return new{A,T}(Matrix(data), alphabet_copy, copy(weights), Meff, string.(names))
Expand Down Expand Up @@ -331,6 +335,7 @@ function subsample(X::AbstractAlignment, indices)
data_copy = copy(X[indices])
Y = Alignment(data_copy, copy(X.alphabet))
Y.weights = X.weights[indices] / sum(X.weights[indices])
Y.Meff = sum(X.weights[indices]) * X.Meff
Y.names = X.names[indices]
return Y
end
Expand All @@ -353,6 +358,7 @@ end
Find sequence with name `label` in `aln`, and return `(index, sequence)`.
Scales as the number of sequences.
Return the first sequence that matches the label.
!!! Return a *view* of the sequence.
"""
Expand All @@ -373,6 +379,24 @@ function match_sequences(pattern, aln::AbstractAlignment)
return idx, eachsequence(aln, idx)
end

"""
filter(f, aln::AbstractAlignment)
Filter sequences of `aln` using boolean function `f`. Return another `Alignment`.
Use `filter(f, eachsequence(aln))` to obtain an array of sequences.
"""
function filter(f, aln::A) where A <: AbstractAlignment
idx = findall(f, eachsequence(aln))

return A(
data = copy(aln.data[:, idx]),
alphabet = aln.alphabet,
names = aln.names[idx],
weights = aln.weights[idx] / sum(aln.weights[idx]),
Meff = sum(aln.weights[idx])*aln.Meff,
)
end

#=============================================#
################ Concatenating ################
#=============================================#
Expand Down

0 comments on commit 3f818d1

Please sign in to comment.