diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..69cb760 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1 @@ +comment: false diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..53173f8 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1 @@ +open_collective: biojulia \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 0000000..f39b24c --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,40 @@ + + +> _This template is rather extensive. Fill out all that you can, if you are a new contributor or you're unsure about any section, leave it unchanged and a reviewer will help you_ :smile:. _This template is simply a tool to help everyone remember the BioJulia guidelines, if you feel anything in this template is not relevant, simply delete it._ + +## Expected Behavior + + + +## Current Behavior + + + +## Possible Solution / Implementation + + + +## Steps to Reproduce (for bugs) + +1. +2. +3. +4. + + + + + + +## Context + + + +## Your Environment + +- Package Version used: +- Julia Version used: +- Operating System and version (desktop or mobile): +- Link to your project: + + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..3575d00 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,47 @@ +# A clear and descriptive title (No issue numbers please) + +> _This template is rather extensive. Fill out all that you can, if you are a new contributor or you're unsure about any section, leave it unchanged and a reviewer will help you_ :smile:. 
_This template is simply a tool to help everyone remember the BioJulia guidelines, if you feel anything in this template is not relevant, simply delete it._ + +## Types of changes + +This PR implements the following changes: +_(Please tick any or all of the following that are applicable)_ + +* [ ] :sparkles: New feature (A non-breaking change which adds functionality). +* [ ] :bug: Bug fix (A non-breaking change, which fixes an issue). +* [ ] :boom: Breaking change (fix or feature that would cause existing functionality to change). + +## :clipboard: Additional detail + +- If you have implemented new features or behaviour + - **Provide a description of the addition** in as many details as possible. + + - **Provide justification of the addition**. + + - **Provide a runnable example of use of your addition**. This lets reviewers + and others try out the feature before it is merged or makes its way to release. + +- If you have changed current behaviour... + - **Describe the behaviour prior to your changes** + + - **Describe the behaviour after your changes** and justify why you have made the changes, + Please describe any breakages you anticipate as a result of these changes. + + - **Does your change alter APIs or existing exposed methods/types?** + If so, this may cause dependency issues and breakages, so the maintainer + will need to consider this when versioning the next release. + + - If you are implementing changes that are intended to increase performance, you + should provide the results of a simple performance benchmark exercise + demonstrating the improvement. Especially if the changes make code less legible. + +## :ballot_box_with_check: Checklist + +- [ ] :art: The changes implemented are consistent with the [julia style guide](https://docs.julialang.org/en/stable/manual/style-guide/). 
+- [ ] :blue_book: I have updated and added relevant docstrings, in a manner consistent with the [documentation styleguide](https://docs.julialang.org/en/stable/manual/documentation/). +- [ ] :blue_book: I have added or updated relevant user and developer manuals/documentation in `docs/src/`. +- [ ] :ok: There are unit tests that cover the code changes I have made. +- [ ] :ok: The unit tests cover my code changes AND they pass. +- [ ] :pencil: I have added an entry to the `[UNRELEASED]` section of the manually curated `CHANGELOG.md` file for this repository. +- [ ] :ok: All changes should be compatible with the latest stable version of Julia. +- [ ] :thought_balloon: I have commented liberally for any complex pieces of internal code. diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..7d6bc1c --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,33 @@ +name: CompatHelper + +on: + schedule: + - cron: '00 00 * * *' + +jobs: + CompatHelper: + runs-on: ubuntu-latest + steps: + - uses: julia-actions/setup-julia@latest + with: + version: 1.3 + - name: Add CompatHelper + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: Run CompatHelper + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: julia -e ' + using CompatHelper, Pkg; + my_registries = [ + Pkg.RegistrySpec( + name = "BioJuliaRegistry", + uuid = "ccbd2cc2-2954-11e9-1ccf-f3e7900901ca", + url = "https://github.com/BioJulia/BioJuliaRegistry.git" + ), + Pkg.RegistrySpec( + name = "General", + uuid = "23338594-aafe-5451-b93e-139f81909106", + url = "https://github.com/JuliaRegistries/General.git" + ) + ]; + CompatHelper.main(; registries = my_registries, master_branch = "master");' diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml new file mode 100644 index 0000000..9610301 --- /dev/null +++ b/.github/workflows/Documentation.yml @@ -0,0 +1,34 @@ +name: Documentation + +on: + push: + 
branches: + - 'master' + - 'develop' + - 'release/.*' + tags: '*' + pull_request: + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + julia-version: [1.3.0] + julia-arch: [x86] + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v1.0.0 + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.julia-version }} + - name: Install dependencies + run: | + julia ci_prep.jl; + julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' + - name: Build and deploy + env: + # https://github.com/JuliaDocs/Documenter.jl/issues/1177 + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token + DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key + run: julia --project=docs/ --color=yes docs/make.jl diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..e65374b --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,13 @@ +name: TagBot +on: + schedule: + - cron: '0 * * * *' +jobs: + TagBot: + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + ssh: ${{ secrets.TAGBOT_KEY }} + registry: BioJulia/BioJuliaRegistry \ No newline at end of file diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml new file mode 100644 index 0000000..cbaf083 --- /dev/null +++ b/.github/workflows/UnitTests.yml @@ -0,0 +1,22 @@ +name: Unit tests + +on: [push, pull_request] + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + julia-version: ['1.1', '1.2', '1.3'] + julia-arch: [x64] + os: [ubuntu-latest, windows-latest, macOS-latest] + + steps: + - uses: actions/checkout@v1.0.0 + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.julia-version }} + arch: ${{ matrix.julia-arch }} + - name: Install dependencies + run: julia ci_prep.jl + - uses: julia-actions/julia-runtest@master diff --git 
a/Project.toml b/Project.toml new file mode 100644 index 0000000..f111fd4 --- /dev/null +++ b/Project.toml @@ -0,0 +1,23 @@ +name = "Indexes" +uuid = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d" +authors = ["Kenta Sato ", "Ben J. Ward ", "Ciarán O’Mara "] +version = "0.1.0" + +[deps] +BGZFStreams = "28d598bf-9b8f-59f1-b38c-5a06b4a0f5e6" +BioCore = "37cfa864-2cd6-5c12-ad9e-b6597d696c81" +BufferedStreams = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d" +GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446" + +[compat] +BGZFStreams = "0.3" +BioCore = "2" +BufferedStreams = "1" +GenomicFeatures = "2" +julia = "1.1" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] diff --git a/README.md b/README.md index 64c97e4..1d0eeba 100644 --- a/README.md +++ b/README.md @@ -1 +1,82 @@ -# Indexes.jl +# Indexes.jl + +[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) +[![Latest Release](https://img.shields.io/github/release/BioJulia/Indexes.jl.svg)](https://github.com/BioJulia/Indexes.jl/releases/latest) +[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/Indexes.jl/blob/master/LICENSE) +[![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/Indexes.jl/stable) +[![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/Indexes.jl/dev/) +[![Join the chat at https://gitter.im/BioJulia/Indexes.jl](https://badges.gitter.im/BioJulia/Indexes.jl.svg)](https://gitter.im/BioJulia/Indexes.jl) + +> This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/ "original +blog post"). + +## Description +Handles indexes required to iterate through various IO streams. 
+For example: +- Generic index for tab-delimited files. +- An index type for BGZFStream. + +## Installation +`Indexes` is bundled into packages like [BED](https://github.com/BioJulia/BED.jl), [GFF3](https://github.com/BioJulia/GFF3.jl), and [XAM](https://github.com/BioJulia/XAM.jl) to assist them with IO streams, so you may not need to install this package explicitly. +However, if you do, `Indexes` is made available to install through BioJulia's package registry. +By default, Julia's package manager only uses the "General" package registry. +Your Julia configuration needs to include the BioJulia registry to be able to install the latest version of `Indexes`. + +To add the BioJulia registry from the [Julia REPL](https://docs.julialang.org/en/v1/manual/getting-started/), press `]` to enter [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/), then enter the following command: +```julia +registry add https://github.com/BioJulia/BioJuliaRegistry.git +``` + +After adding the registry to your configuration, you can install `Indexes` while in [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/) with the following: +```julia +add Indexes +``` + +If you are interested in the cutting edge of the development, please check out the [develop branch](https://github.com/BioJulia/Indexes.jl/tree/develop) to try new features before release. + + +## Testing +Indexes is tested against Julia `1.X` on Linux, OS X, and Windows. 
+ +**Latest build status:** + +[![Unit tests](https://github.com/BioJulia/indexes.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/indexes.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster) +[![Documentation](https://github.com/BioJulia/indexes.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/indexes.jl/actions?query=workflow%3ADocumentation+branch%3Amaster) +[![codecov](https://codecov.io/gh/BioJulia/Indexes.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/Indexes.jl) + +## Contributing +We appreciate [contributions](https://github.com/BioJulia/Indexes.jl/graphs/contributors) from users including reporting bugs, fixing issues, improving performance and adding new features. + +Take a look at the [contributing files](https://github.com/BioJulia/Contributing) for detailed contributor and maintainer guidelines, and code of conduct. + +### Financial contributions +We also welcome financial contributions in full transparency on our [open collective](https://opencollective.com/biojulia). +Anyone can file an expense. +If the expense makes sense for the development, it will be approved by the core contributors and the person who filed the expense will be reimbursed. + + +## Backers & Sponsors +Thank you to all our backers and sponsors! + +Love our work and community? [Become a backer](https://opencollective.com/biojulia#backer). + +[![backers](https://opencollective.com/biojulia/backers.svg?width=890)](https://opencollective.com/biojulia#backers) + +Does your company use BioJulia? +Help keep BioJulia feature rich and healthy by [sponsoring the project](https://opencollective.com/biojulia#sponsor). +Your logo will show up here with a link to your website. 
+ +[![](https://opencollective.com/biojulia/sponsor/0/avatar.svg)](https://opencollective.com/biojulia/sponsor/0/website) +[![](https://opencollective.com/biojulia/sponsor/1/avatar.svg)](https://opencollective.com/biojulia/sponsor/1/website) +[![](https://opencollective.com/biojulia/sponsor/2/avatar.svg)](https://opencollective.com/biojulia/sponsor/2/website) +[![](https://opencollective.com/biojulia/sponsor/3/avatar.svg)](https://opencollective.com/biojulia/sponsor/3/website) +[![](https://opencollective.com/biojulia/sponsor/4/avatar.svg)](https://opencollective.com/biojulia/sponsor/4/website) +[![](https://opencollective.com/biojulia/sponsor/5/avatar.svg)](https://opencollective.com/biojulia/sponsor/5/website) +[![](https://opencollective.com/biojulia/sponsor/6/avatar.svg)](https://opencollective.com/biojulia/sponsor/6/website) +[![](https://opencollective.com/biojulia/sponsor/7/avatar.svg)](https://opencollective.com/biojulia/sponsor/7/website) +[![](https://opencollective.com/biojulia/sponsor/8/avatar.svg)](https://opencollective.com/biojulia/sponsor/8/website) +[![](https://opencollective.com/biojulia/sponsor/9/avatar.svg)](https://opencollective.com/biojulia/sponsor/9/website) + + +## Questions? +If you have a question about contributing or using BioJulia software, come on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). 
diff --git a/ci_prep.jl b/ci_prep.jl new file mode 100644 index 0000000..f3a7535 --- /dev/null +++ b/ci_prep.jl @@ -0,0 +1,3 @@ +using Pkg.Registry +Registry.add(Registry.RegistrySpec(url = "https://github.com/BioJulia/BioJuliaRegistry.git")) +Registry.add(Registry.RegistrySpec(url = "https://github.com/JuliaRegistries/General.git")) diff --git a/coverage/Project.toml b/coverage/Project.toml new file mode 100644 index 0000000..4fbdc47 --- /dev/null +++ b/coverage/Project.toml @@ -0,0 +1,2 @@ +[deps] +Coverage = "a2441757-f6aa-5fb2-8edb-039e3f45d037" diff --git a/coverage/coverage.jl b/coverage/coverage.jl new file mode 100644 index 0000000..3d33ed9 --- /dev/null +++ b/coverage/coverage.jl @@ -0,0 +1,11 @@ +get(ENV, "TRAVIS_OS_NAME", "") == "linux" || exit() +get(ENV, "TRAVIS_JULIA_VERSION", "") == "1.3" || exit() + +using Pkg +Pkg.instantiate() + +using Coverage + +cd(joinpath(@__DIR__, "..")) do + Codecov.submit(Codecov.process_folder()) +end diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 0000000..3506869 --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,6 @@ +[deps] +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[compat] +Documenter = "0.24" diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..01f7d6b --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,20 @@ +using Pkg +using Documenter, Indexes + +makedocs( + format = Documenter.HTML( + edit_link = "develop" + ), + modules = [Indexes], + sitename = "Indexes.jl", + pages = [ + "Home" => "index.md", + "API Reference" => "man/api.md" + ], + authors = replace(join(Pkg.TOML.parsefile("Project.toml")["authors"], ", "), r" <.*?>" => "" ) * ", The BioJulia Organisation, and other contributors." 
+) +deploydocs( + repo = "github.com/BioJulia/Indexes.jl.git", + devbranch = "develop", + push_preview = true +) diff --git a/docs/src/assets/logo.svg b/docs/src/assets/logo.svg new file mode 100644 index 0000000..1b2ea24 --- /dev/null +++ b/docs/src/assets/logo.svg @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..5c1e3c6 --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,80 @@ +# Indexes.jl + +[![Project Status: Active – The project has reached a stable, usable state and is being actively developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) +[![Latest Release](https://img.shields.io/github/release/BioJulia/Indexes.jl.svg)](https://github.com/BioJulia/Indexes.jl/releases/latest) +[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/Indexes.jl/blob/master/LICENSE) +[![Stable documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/Indexes.jl/stable) +[![Latest documentation](https://img.shields.io/badge/docs-dev-blue.svg)](https://biojulia.github.io/Indexes.jl/dev/) +[![Join the chat at https://gitter.im/BioJulia/Indexes.jl](https://badges.gitter.im/BioJulia/Indexes.jl.svg)](https://gitter.im/BioJulia/Indexes.jl) + +> This project follows the [semver](http://semver.org) pro forma and uses the [git-flow branching model](https://nvie.com/posts/a-successful-git-branching-model/). + +## Description +Handles indexes required to iterate through various IO streams. +For example: +- Generic index for tab-delimited files. +- An index type for BGZFStream. + +## Installation +`Indexes` is bundled into packages like [BED](https://github.com/BioJulia/BED.jl), [GFF3](https://github.com/BioJulia/GFF3.jl), and [XAM](https://github.com/BioJulia/XAM.jl) to assist them with IO streams, so you may not need to install this package explicitly. 
+However, if you do, `Indexes` is made available to install through BioJulia's package registry. +By default, Julia's package manager only uses the "General" package registry. +Your Julia configuration needs to include the BioJulia registry to be able to install the latest version of `Indexes`. + +To add the BioJulia registry from the [Julia REPL](https://docs.julialang.org/en/v1/manual/getting-started/), press `]` to enter [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/), then enter the following command: +```julia +registry add https://github.com/BioJulia/BioJuliaRegistry.git +``` + +After adding the registry to your configuration, you can install `Indexes` while in [pkg mode](https://docs.julialang.org/en/v1/stdlib/Pkg/) with the following: +```julia +add Indexes +``` + +If you are interested in the cutting edge of the development, please check out the [develop branch](https://github.com/BioJulia/Indexes.jl/tree/develop) to try new features before release. + +## Testing +Indexes is tested against Julia `1.X` on Linux, OS X, and Windows. + +**Latest build status:** + +[![Unit tests](https://github.com/BioJulia/indexes.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/indexes.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster) +[![Documentation](https://github.com/BioJulia/indexes.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/indexes.jl/actions?query=workflow%3ADocumentation+branch%3Amaster) +[![codecov](https://codecov.io/gh/BioJulia/Indexes.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/Indexes.jl) + +## Contributing +We appreciate [contributions](https://github.com/BioJulia/Indexes.jl/graphs/contributors) from users including reporting bugs, fixing issues, improving performance and adding new features. + +Take a look at the [contributing files](https://github.com/BioJulia/Contributing) for detailed contributor and maintainer guidelines, and code of conduct. 
+ +### Financial contributions +We also welcome financial contributions in full transparency on our [open collective](https://opencollective.com/biojulia). +Anyone can file an expense. +If the expense makes sense for the development, it will be approved by the core contributors and the person who filed the expense will be reimbursed. + + +## Backers & Sponsors +Thank you to all our backers and sponsors! + +Love our work and community? [Become a backer](https://opencollective.com/biojulia#backer). + +[![backers](https://opencollective.com/biojulia/backers.svg?width=890)](https://opencollective.com/biojulia#backers) + +Does your company use BioJulia? +Help keep BioJulia feature rich and healthy by [sponsoring the project](https://opencollective.com/biojulia#sponsor). +Your logo will show up here with a link to your website. + +[![](https://opencollective.com/biojulia/sponsor/0/avatar.svg)](https://opencollective.com/biojulia/sponsor/0/website) +[![](https://opencollective.com/biojulia/sponsor/1/avatar.svg)](https://opencollective.com/biojulia/sponsor/1/website) +[![](https://opencollective.com/biojulia/sponsor/2/avatar.svg)](https://opencollective.com/biojulia/sponsor/2/website) +[![](https://opencollective.com/biojulia/sponsor/3/avatar.svg)](https://opencollective.com/biojulia/sponsor/3/website) +[![](https://opencollective.com/biojulia/sponsor/4/avatar.svg)](https://opencollective.com/biojulia/sponsor/4/website) +[![](https://opencollective.com/biojulia/sponsor/5/avatar.svg)](https://opencollective.com/biojulia/sponsor/5/website) +[![](https://opencollective.com/biojulia/sponsor/6/avatar.svg)](https://opencollective.com/biojulia/sponsor/6/website) +[![](https://opencollective.com/biojulia/sponsor/7/avatar.svg)](https://opencollective.com/biojulia/sponsor/7/website) +[![](https://opencollective.com/biojulia/sponsor/8/avatar.svg)](https://opencollective.com/biojulia/sponsor/8/website) 
+[![](https://opencollective.com/biojulia/sponsor/9/avatar.svg)](https://opencollective.com/biojulia/sponsor/9/website) + + +## Questions? +If you have a question about contributing or using BioJulia software, come on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio). diff --git a/docs/src/man/api.md b/docs/src/man/api.md new file mode 100644 index 0000000..3d61cab --- /dev/null +++ b/docs/src/man/api.md @@ -0,0 +1,14 @@ +# API Reference + +## Public +```@autodocs +Modules = [Indexes] +private = false +``` + +## Internal + +```@autodocs +Modules = [Indexes] +public = false +``` diff --git a/src/Indexes.jl b/src/Indexes.jl new file mode 100644 index 0000000..9bc3078 --- /dev/null +++ b/src/Indexes.jl @@ -0,0 +1,21 @@ +# Index +# ===== +# +# Index types for genomic intervals. +# +# This file is a part of BioJulia. +# License is MIT: https://github.com/BioJulia/Bio.jl/blob/master/LICENSE.md + +module Indexes + +import BGZFStreams +import BioCore +import BufferedStreams +import GenomicFeatures: Interval + +include("chunk.jl") +include("bgzfindex.jl") +include("tabix.jl") +include("overlap.jl") + +end # module diff --git a/src/bgzfindex.jl b/src/bgzfindex.jl new file mode 100644 index 0000000..83081e6 --- /dev/null +++ b/src/bgzfindex.jl @@ -0,0 +1,147 @@ +# BGZF Index +# ========== +# +# An index type for BGZFStream. +# +# The details of the internals are specified in +# https://samtools.github.io/hts-specs/SAMv1.pdf. +# +# This file is a part of BioJulia. +# License is MIT: https://github.com/BioJulia/Bio.jl/blob/master/LICENSE.md + +# binning index: maps a bin number to the chunks stored in that bin +const BinIndex = Dict{UInt32,Vector{Chunk}} + +# linear index: virtual offsets, looked up by 16Kbp window number +const LinearIndex = Vector{BGZFStreams.VirtualOffset} + +# Metadata providing a summary of the number of mapped/unmapped reads. 
+struct PseudoBin + # file range of unmapped reads + unmapped::Chunk + + # number of mapped read segments + n_mapped::Int64 + + # number of unmapped read segments + n_unmapped::Int64 +end + +# Index for BGZFStream; used in BAI and Tabix index. +struct BGZFIndex + # one (binning index, linear index, optional pseudo-bin) tuple per contig (chromosome) + data::Vector{Tuple{BinIndex,LinearIndex,Union{PseudoBin, Nothing}}} +end + +# width of one linear index window: 16Kbp +const LinearWindowSize = 16 * 1024 + +# Find chunks possibly overlapping with `(seqid, interval)` in `index`; the result is sorted and merged. Throws an `ArgumentError` if `seqid` is out of range. +function overlapchunks(index::BGZFIndex, seqid::Integer, interval::UnitRange) + if !(1 ≤ seqid ≤ lastindex(index.data)) + throw(ArgumentError("sequence id $(seqid) is out of range")) + end + + if isempty(interval) + return Chunk[] + end + + binindex, linindex, pbin = index.data[seqid] + bins = reg2bins(first(interval), last(interval)) + ret = Chunk[] + idx = cld(first(interval), LinearWindowSize) + if lastindex(linindex) ≥ idx + # `linindex` may be empty for contigs with no records; chunks that stop at or before `offset` are skipped + offset = linindex[idx] + for bin in bins + if haskey(binindex, bin) + for chunk in binindex[bin] + if chunk.stop > offset + push!(ret, chunk) + end + end + end + end + end + + # tidy up the list of chunks: sort them, then merge overlapping or close ones + sort!(ret) + reduce!(ret) + + return ret +end + +# Calculate bins overlapping a region [from, to] (one-based), scanning six levels from shift 29 (512Mbp-wide bins) down to shift 14 (16Kbp-wide bins). +function reg2bins(from, to) + bins = UInt32[] + bin_start = 0 + for scale in 29:-3:14 + for k in ((from - 1) >> scale):((to - 1) >> scale) + push!(bins, bin_start + k) + end + bin_start = 8 * bin_start + 1 + end + return bins +end + +# Merge chunks so as to minimize the number of seek operations. 
+function reduce!(chunks) + @assert issorted(chunks) + # NOTE: the maximum size of a BGZF block is 64KiB, so the threshold below spans two such blocks + merge_threshold = 64 * 1024 * 2 + i = 1 + while i < lastindex(chunks) + chunk = chunks[i] + next = chunks[i+1] + if chunk.stop > next.start || next.start[1] - chunk.stop[1] ≤ merge_threshold + # merge overlapping or close chunks into `chunks[i]`, then re-check it against its new neighbour + chunks[i] = Chunk(chunk.start, max(chunk.stop, next.stop)) + deleteat!(chunks, i + 1) + continue + end + i += 1 + end + return chunks +end + +# Read `n_refs` BAI/Tabix-compatible indexes from `input`. +function read_bgzfindex(input, n_refs) + indexes = Tuple{BinIndex,LinearIndex,Union{PseudoBin, Nothing}}[] + for _ in 1:n_refs + # load a binning index (and a pseudo bin) + n_bins = read(input, Int32) + binindex = BinIndex() + pbin::Union{PseudoBin,Nothing} = nothing + for _ in 1:n_bins + bin = read(input, UInt32) + n_chunks = read(input, Int32) + if bin == 37450 + # pseudo-bin: carries metadata (unmapped range and read counts) instead of ordinary chunks + @assert n_chunks == 2 + chunk_beg = read(input, UInt64) + chunk_end = read(input, UInt64) + n_mapped = read(input, UInt64) + n_unmapped = read(input, UInt64) + pbin = PseudoBin(Chunk(chunk_beg, chunk_end), n_mapped, n_unmapped) + else + chunks = Chunk[] + for i in 1:n_chunks + chunk_beg = read(input, UInt64) + chunk_end = read(input, UInt64) + push!(chunks, Chunk(chunk_beg, chunk_end)) + end + binindex[bin] = chunks + end + end + + # load a linear index + n_intvs = read(input, Int32) + linindex = LinearIndex() + for _ in 1:n_intvs + push!(linindex, read(input, UInt64)) + end + + push!(indexes, (binindex, linindex, pbin)) + end + return BGZFIndex(indexes) +end diff --git a/src/chunk.jl b/src/chunk.jl new file mode 100644 index 0000000..5c3e878 --- /dev/null +++ b/src/chunk.jl @@ -0,0 +1,38 @@ +# Chunk +# ===== +# +# Consecutive range of a BGZF file. +# +# This file is a part of BioJulia. +# License is MIT: https://github.com/BioJulia/Bio.jl/blob/master/LICENSE.md + +# BGZF file chunk [.start, .stop). 
+struct Chunk + start::BGZFStreams.VirtualOffset + stop::BGZFStreams.VirtualOffset +end + +function Base.in(voffset::BGZFStreams.VirtualOffset, chunk::Chunk) + return chunk.start ≤ voffset < chunk.stop +end + +function Base.:(==)(chunk1::Chunk, chunk2::Chunk) + return chunk1.start == chunk2.start && chunk1.stop == chunk2.stop +end + +function Base.isless(chunk1::Chunk, chunk2::Chunk) + + if isless(chunk1.start, chunk2.start) + return true + end + + if chunk1.start == chunk2.start && isless(chunk1.stop, chunk2.stop) + return true + end + + return false +end + +function Base.seek(stream::BGZFStreams.BGZFStream, chunk::Chunk) + return seek(stream, chunk.start) +end diff --git a/src/overlap.jl b/src/overlap.jl new file mode 100644 index 0000000..02f5f8b --- /dev/null +++ b/src/overlap.jl @@ -0,0 +1,90 @@ +# Tabix Overlap Iterator +# ====================== + +struct TabixOverlapIterator{T} + reader::T + interval::Interval +end + +function Base.eltype(::Type{TabixOverlapIterator{T}}) where T + return eltype(T) +end + +function Base.IteratorSize(::Type{TabixOverlapIterator{T}}) where T + return Base.SizeUnknown() +end + +mutable struct TabixOverlapIteratorState{T} + chunks::Vector{Indexes.Chunk} + chunkid::Int + done::Bool + record::T +end + +function Base.iterate(iter::TabixOverlapIterator) + @assert iter.reader.index !== nothing + # TODO: Use a method that resets the reading position instead of rebuilding the reader state by hand. 
+ buffer = BioCore.IO.stream(iter.reader) + iter.reader.state = BioCore.Ragel.State(1, BufferedStreams.BufferedInputStream(buffer.source)) + state = TabixOverlapIteratorState(Indexes.overlapchunks(iter.reader.index, iter.interval), 0, false, eltype(iter)()) + + return iterate(iter, state) +end + +function done(iter::TabixOverlapIterator, state) + buffer = BioCore.IO.stream(iter.reader) + source = buffer.source + if state.chunkid == 0 + if isempty(state.chunks) + return true + end + state.chunkid += 1 + seek(source, state.chunks[state.chunkid].start) + end + while state.chunkid ≤ lastindex(state.chunks) + chunk = state.chunks[state.chunkid] + #= + The `virtualoffset(source)` is not synchronized with the current reading position because data are buffered in `buffer` for parsing text. + So we need to check not only `virtualoffset` but also `bytesavailable`, which returns the current buffered data size. + =# + while bytesavailable(buffer) > 0 || BGZFStreams.virtualoffset(source) < chunk.stop + read!(iter.reader, state.record) + c = icmp(state.record, iter.interval) + if c == 0 # overlapping + return false + end + if c > 0 + # no more overlapping records in this chunk + break + end + end + state.chunkid += 1 + if state.chunkid ≤ lastindex(state.chunks) + seek(source, state.chunks[state.chunkid].start) + end + end + # no more overlapping records + return true +end + +function Base.iterate(iter::TabixOverlapIterator, state) + if done(iter, state) + return nothing + end + + return copy(state.record), state +end + +function icmp(record, interval) + c = cmp(BioCore.seqname(record), interval.seqname) + + if c < 0 || (c == 0 && BioCore.rightposition(record) < interval.first) + return -1 + end + + if c > 0 || (c == 0 && BioCore.leftposition(record) > interval.last) + return +1 + end + + return 0 +end diff --git a/src/tabix.jl b/src/tabix.jl new file mode 100644 index 0000000..bb621e2 --- /dev/null +++ b/src/tabix.jl @@ -0,0 +1,151 @@ +# Tabix +# ===== +# +# Generic index for 
tab-delimited files. +# +# Li, Heng. "Tabix: fast retrieval of sequence features from generic TAB-delimited files." Bioinformatics 27.5 (2011): 718-719. +# Specification: http://samtools.github.io/hts-specs/tabix.pdf +# +# This file is a part of BioJulia. +# License is MIT: https://github.com/BioJulia/Bio.jl/blob/master/LICENSE.md + +# An index type for tab-delimited files. +struct Tabix + # file format + # * 0: generic + # * 1: SAM + # * 2: VCF + # note: `format & 0x10000 != 0` indicates the BED rule. + format::Int32 + + # triplet of columns (sequence name, start of a region, end of a region) + columns::NTuple{3,Int} + + # leading character for comment lines + meta::Char + + # number of lines to skip at the beginning + skip::Int + + # sequence names + names::Vector{String} + + # BGZF file index + index::BGZFIndex + + # number of unmapped reads (`nothing` when the index file ends before this field) + n_no_coor::Union{Int, Nothing} +end + +function Base.show(io::IO, index::Tabix) + println(io, summary(index), ":") + println(io, " format: ", format2str(index.format)) + println(io, " columns: ", index.columns) + println(io, " meta char: '", index.meta, "'") + println(io, " skip lines: ", index.skip) + print(io, " names: ", index.names) +end + +""" + Tabix(filename::AbstractString) + Tabix(input::IO) + +Load a Tabix index from `filename` or `input`. +""" +function Tabix(filename::AbstractString) + return open(read_tabix, filename) +end + +function Tabix(input::IO) + return read_tabix(input) +end + +function findtabix(filepath::AbstractString) + ret = string(filepath, ".tbi") + if isfile(ret) + return ret + end + return nothing +end + + +""" + overlapchunks(tabix::Tabix, interval::Interval) + +Return chunks possibly overlapping with the range specified by `interval`. + +Note that records within the returned chunks are not guaranteed to actually overlap the query interval. Throws an `ArgumentError` if the sequence name of `interval` is not found in `tabix`. 
+""" +function overlapchunks(tabix::Tabix, interval::Interval) + seqid = findfirst(isequal(interval.seqname), tabix.names) + if seqid === nothing # `findfirst` returns `nothing` (not 0) when there is no match + throw(ArgumentError("failed to find sequence name '$(interval.seqname)'")) + end + return overlapchunks(tabix.index, seqid, interval.first:interval.last) +end + +# Check if `format` follows the BED rule (half-closed-half-open and 0-based). +function is_bed_rule(format) + return format & 0x10000 != 0 +end + +# Convert a tabix file format integer to a string. +function format2str(format) + if format == 1 + return "SAM" + end + + if format == 2 + return "VCF" + end + + if is_bed_rule(format) + return "generic (BED rule)" + end + + return "generic" +end + +# Read a Tabix object from `input_`, which is wrapped in a `BGZFStream` for decoding. +function read_tabix(input_::IO) + input = BGZFStreams.BGZFStream(input_) + + # check magic bytes ("TBI" followed by 0x01) + T = read(input, UInt8) + B = read(input, UInt8) + I = read(input, UInt8) + x = read(input, UInt8) + if T != UInt8('T') || B != UInt8('B') || I != UInt8('I') || x != 0x01 + error("invalid tabix magic bytes") + end + + # read contents + n_refs = read(input, Int32) + format = read(input, Int32) + col_seq = read(input, Int32) + col_beg = read(input, Int32) + col_end = read(input, Int32) + meta = read(input, Int32) + skip = read(input, Int32) + l_nm = read(input, Int32) + data = read(input, l_nm) + names = split(String(data), '\0', keepempty=false) + if length(names) != n_refs + error("the number of sequence names doesn't match the expected value") + end + index = read_bgzfindex(input, n_refs) + if !eof(input) + n_no_coor::Union{Int, Nothing} = read(input, UInt64) + else + n_no_coor = nothing + end + + return Tabix( + format, + (col_seq, col_beg, col_end), + meta, + skip, + names, + index, + n_no_coor) +end diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 0000000..8e5ed8a --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,7 @@ +using Test +using Indexes + +@testset "Indexes" begin + # TODO + # @test 
GenomicFeatures.Indexes.Tabix === GenomicFeatures.Indexes.Tabix +end