Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 11 additions & 15 deletions .github/workflows/UnitTests.yml
Original file line number Diff line number Diff line change
@@ -1,34 +1,30 @@
name: Unit Tests

on:
- push
- pull_request
push:
branches:
- master
- develop
pull_request:

jobs:
test:
name: Julia ${{ matrix.julia-version }} - ${{ matrix.os }} - ${{ matrix.julia-arch }} - ${{ github.event_name }}
runs-on: ${{ matrix.os }}
continue-on-error: ${{ matrix.julia-version == 'nightly' }}
continue-on-error: ${{ matrix.experimental }}
strategy:
fail-fast: false
matrix:
julia-version:
- '1.0'
- '1.5'
- '1.6' # LTS
- '1'
julia-arch: [x64, x86]
os: [ubuntu-latest, windows-latest, macOS-latest]
exclude:
- os: macOS-latest
julia-arch: x86
- '1.10'
os: [ubuntu-latest, macOS-latest, windows-latest]
experimental: [false]
include:
# Include nightly, but experimental, so it's allowed to fail without
# failing CI.
- julia-version: nightly
julia-arch: x64
os: ubuntu-latest
experimental: true

fail_ci_if_error: false
steps:
- name: Checkout Repository
uses: actions/checkout@v2
Expand Down
16 changes: 15 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,23 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [Unreleased]
## [Unreleased]
### Added
- :arrow_up: Added Project.toml
- Automa v1 compatibility: Upgraded the Automa dependency to "1", enabling the new Automa v1 API.
- BioGenerics support: Imported metadata functions from BioGenerics to unify VCF/BCF header handling.
- TranscodingStreams integration: Added `using TranscodingStreams` for more efficient stream transformations in VCF/BCF readers.
- New VCF record reader: Introduced `src/vcf/readrecord.jl` to encapsulate record parsing logic.

### Changed
- Streamlined imports: Limited BioSequences imports, upgraded BGZFStreams and BufferedStreams usage, and replaced BioCore I/O types with BioGenerics abstractions.
- BCF reader refactoring: Transitioned Reader to subtype BioGenerics.IO.AbstractReader, centralized exception handling, and cleaned up parse logic.
- VCF header & metainfo: Fixed header parsing and improved metainfo tag/value functions to use BioGenerics APIs.
- Project.toml targets: Reorganized `[extras]` and `[targets]` sections.

### Removed
- Deprecated dependencies: Dropped older Automa versions (0.7, 0.8) and obsolete IO imports from BioCore.


## [0.4.0] - 2018-11-22
### Added
Expand Down
34 changes: 20 additions & 14 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,33 +1,39 @@
name = "GeneticVariation"
uuid = "9bc6ac9d-e6b2-5f70-b0a8-242a01662520"
authors = ["Kenta Sato <bicycle1885@gmail.com>", "Sabrina J. Ward <sabrinajward@protonmail.com>"]

version = "0.4.1"
authors = ["Kenta Sato <bicycle1885@gmail.com>", "Sabrina J. Ward <sabrinajward@protonmail.com>", "Abhinav Singh <abhinavsns7@gmail.com>"]
version = "0.5.0"

[deps]
Automa = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6"
BioCore = "37cfa864-2cd6-5c12-ad9e-b6597d696c81"
BioGenerics = "47718e42-2ac5-11e9-14af-e5595289c2ea"
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
BufferedStreams = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
Indexes = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d"
IntervalTrees = "524e6230-43b7-53ae-be76-1e9e4d08d11b"
MinHash = "4b3c9753-2685-44e9-8a29-365b96c023ed"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
Twiddle = "7200193e-83a8-5a55-b20d-5d36d44a0795"

[compat]
Automa = "0.7, 0.8"
Automa = "1"
BGZFStreams = "0.3"
BioCore = "2.0.5"
BioSequences = "1"
BufferedStreams = "1"
Combinatorics = "0.7"
IntervalTrees = "1"
Twiddle = "1.1"
BioGenerics = "0.1"
BioSequences = "3.4"
Combinatorics = "1"
Indexes = "0.1, 0.2"
IntervalTrees = "1.1"
MinHash = "0.2"
Statistics = "1"
TranscodingStreams = "0.9"
Twiddle = "1"
julia = "1"

[extras]
FormatSpecimens = "3372ea36-2a1a-11e9-3eb7-996970b6ffbd"
TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"

[targets]
test = ["Test", "YAML"]
test = ["Test", "FormatSpecimens", "TOML"]
52 changes: 18 additions & 34 deletions src/GeneticVariation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,18 @@ __precompile__()
module GeneticVariation

export
# Site types
Conserved,
Mutated,
#Transition,
#Transversion,
Segregating,
segregating_sites,
count_segregating_sites,

# Distances
Proportion,
Jaccard,
MASH,
create_sketch,
jaccard,
mash,
distance,
pdistance,
mash,
jaccard,

pdistance_mutated,
jukes_cantor,
kimura_distance,
# Allele frequencies
gene_frequencies,

Expand All @@ -37,36 +33,24 @@ export
# VCF and BCF
VCF,
BCF,
header,
metainfotag,
metainfoval,
isfilled,
MissingFieldException
header

# Import only the necessary symbols from BioSequences v3.
import BioSequences:
BioSequences,
Alphabet,
AA_Term,
BioSequence,
bp_chunk_count,
Certain,
Composition,
DNAAlphabet,
GeneticCode,
ispurine,
Kmer,
Match,
Mismatch,
MinHashSketch,
NucAlphs,
Position,
RNAAlphabet,
Sequence

import BioCore:
metainfotag,
metainfoval,
header
RNAAlphabet

# Import metadata functions from BioGenerics
import BioGenerics: header, metainfotag, metainfoval, isfilled
using Indexes
using TranscodingStreams
#import BioGenerics.Exceptions: MissingFieldException, missingerror

import Combinatorics.permutations
import IntervalTrees: Interval, IntervalValue
Expand All @@ -80,9 +64,9 @@ import Twiddle:
include("vcf/vcf.jl")
include("bcf/bcf.jl")
include("site_counting.jl")
include("seg_sites.jl")
include("distances/minhash.jl")
include("distances/proportion.jl")
include("distances/evodistances.jl")
include("allele_freq.jl")
include("diversity_measures.jl")

Expand Down
51 changes: 12 additions & 39 deletions src/allele_freq.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,50 +6,23 @@
# This file is a part of BioJulia.
# License is MIT: https://github.com/BioJulia/GeneticVariation.jl/blob/master/LICENSE.md

"""
gene_frequencies(seqcounts::Composition{T}) where T <: Sequence

Compute gene_frequencies from a `BioSequences.Composition` variable that contains
unique sequence counts.
"""
function gene_frequencies(seqcounts::Composition{T}) where T <: Sequence
n = sum(values(seqcounts))
frequencies = Dict{T, Float64}()
@inbounds for (seq, count) in seqcounts
frequencies[seq] = count / n
end
return frequencies
end
using BioSequences

"""
gene_frequencies(iterable)

Compute the gene frequencies for any iterable with an `eltype` which is a
concrete subtype of the abstract `Sequence` type.
Compute allele frequencies from any iterable whose element type is a subtype of BioSequence.
This function iterates over the input, counts occurrences of unique sequences,
and then returns a dictionary mapping each sequence to its relative frequency.
Example:
freqs = gene_frequencies(["ATGC", "ATGC", "ATGT"])
"""
function gene_frequencies(iterable)
return _gene_frequencies(iterable, eltype(iterable), Base.IteratorSize(iterable))
end

# Default for most iterables, throws an error.
_gene_frequencies(iterable, eltype, is) = error("Iterable not supported.")

# Action to take for a sequence iterable which has no known size.
function _gene_frequencies(iterable, ::Type{<:Sequence}, is::Base.SizeUnknown)
composition = BioSequences.composition(iterable)
return gene_frequencies(composition)
end

# Action to take for a sequence iterable which has a known size.
# This version computes frequencies directly as n is known in advance.
# The other method first computes composition, and computes frequencies from
# that.
function _gene_frequencies(sequences, ::Type{T}, is::Union{Base.HasLength,Base.HasShape}) where T<:Sequence
inc = 1 / length(sequences)
frequencies = Dict{T, Float64}()
@inbounds for seq in sequences
old = get(frequencies, seq, 0.0)
frequencies[seq] = old + inc
counts = Dict{eltype(iterable),Int}()
total = 0
for seq in iterable
counts[seq] = get(counts, seq, 0) + 1
total += 1
end
return frequencies
return Dict(k => v / total for (k, v) in counts)
end
11 changes: 9 additions & 2 deletions src/bcf/bcf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,17 @@

module BCF

import BioCore: BioCore, isfilled
#import BioGenerics: BioGenerics, isfilled
import GeneticVariation.VCF
import BGZFStreams
import BufferedStreams
import TranscodingStreams
using BioGenerics
import BioGenerics: BioGenerics, isfilled
import BioGenerics.Exceptions: MissingFieldException, missingerror

"""
    parsehex(str)

Parse a whitespace-separated string of hexadecimal byte values (for example
`"1f 8b 08"`) and return them as a `Vector{UInt8}`.

Tokens are separated by any run of whitespace, so repeated, leading or trailing
spaces are ignored, and an empty or all-whitespace string yields an empty
vector. `parse` throws if a token is not a valid base-16 `UInt8`.
"""
function parsehex(str)
    # `split(str)` with no delimiter splits on whitespace and drops empty
    # tokens; splitting on a single ' ' would hand "" to `parse` and fail.
    return UInt8[parse(UInt8, token; base=16) for token in split(str)]
end

include("record.jl")
include("reader.jl")
Expand Down
15 changes: 9 additions & 6 deletions src/bcf/reader.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@
#
# This file is a part of BioJulia.
# License is MIT: https://github.com/BioJulia/GeneticVariation.jl/blob/master/LICENSE

struct Reader{T<:IO} <: BioCore.IO.AbstractReader
struct Reader{T<:IO} <: BioGenerics.IO.AbstractReader
version::Tuple{UInt8,UInt8} # (major, minor)
header::VCF.Header
stream::BGZFStreams.BGZFStream{T}
Expand Down Expand Up @@ -39,7 +38,7 @@ function Reader(input::IO)
data = read(stream, l_header)

# parse VCF header
vcfreader = VCF.Reader(BufferedStreams.BufferedInputStream(data))
vcfreader = VCF.Reader(IOBuffer(data))

return Reader((major, minor), vcfreader.header, stream)
end
Expand All @@ -48,7 +47,7 @@ function Base.eltype(::Type{Reader{T}}) where T
return Record
end

function BioCore.IO.stream(reader::Reader)
"""
    stream(reader::Reader)

Return the stream stored in the `stream` field of `reader`.
"""
# NOTE(review): this appears to define a module-local `stream` rather than a
# method of `BioGenerics.IO.stream` — confirm that is intended.
stream(reader::Reader) = reader.stream

Expand All @@ -60,8 +59,11 @@ function header(reader::Reader)
return reader.header
end

function BioCore.header(reader::Reader)
return header(reader)
"""
    close(reader::Reader)

Close the stream held by `reader`. Always returns `nothing`.
"""
function Base.close(reader::Reader)
    io = reader.stream
    # Only values that are `IO` objects are closed; anything else is left alone.
    io isa IO && close(io)
    return nothing
end

function Base.read!(reader::Reader, record::Record)
Expand All @@ -75,3 +77,4 @@ function Base.read!(reader::Reader, record::Record)
record.indivlen = indivlen
return record
end

11 changes: 9 additions & 2 deletions src/bcf/writer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# This file is a part of BioJulia.
# License is MIT: https://github.com/BioJulia/GeneticVariation.jl/blob/master/LICENSE

struct Writer{T<:IO} <: BioCore.IO.AbstractWriter
"""
    Writer{T<:IO}

Writer type of the `BCF` module, a subtype of `BioGenerics.IO.AbstractWriter`.

It has a single field, `stream`, a `BGZFStreams.BGZFStream{T}` where `T` is the
type parameter of the writer.
"""
struct Writer{T<:IO} <: BioGenerics.IO.AbstractWriter
stream::BGZFStreams.BGZFStream{T}
end

Expand All @@ -32,7 +32,7 @@ function Writer(output::IO, header::VCF.Header)
return Writer(stream)
end

function BioCore.IO.stream(writer::Writer)
"""
    stream(writer::Writer)

Return the stream stored in the `stream` field of `writer`.
"""
# NOTE(review): this appears to define a module-local `stream` rather than a
# method of `BioGenerics.IO.stream` — confirm that is intended.
stream(writer::Writer) = writer.stream

Expand All @@ -43,3 +43,10 @@ function Base.write(writer::Writer, record::Record)
n += write(writer.stream, record.data)
return n
end

"""
    close(writer::Writer)

Close the stream held by `writer`. Always returns `nothing`.
"""
function Base.close(writer::Writer)
    out = writer.stream
    # Only values that are `IO` objects are closed; anything else is left alone.
    if out isa IO
        close(out)
    end
    return nothing
end
Loading