From eaa5f4a18e47ac09b02ba4b0722495f057919e22 Mon Sep 17 00:00:00 2001 From: Tom Kurowski Date: Thu, 18 Apr 2019 14:43:54 +0100 Subject: [PATCH] Release Candidate 0.12.0 --- .gitignore | 26 ++ CMakeLists.txt | 55 +++ LICENSE | 19 + README.md | 449 +++++++++++++++++++ docs/.gitkeep | 0 include/alleles.h | 54 +++ include/ast.h | 60 +++ include/bitarray.h | 113 +++++ include/build.h | 32 ++ include/chroms.h | 32 ++ include/distance.h | 32 ++ include/errorc.h | 62 +++ include/hashmap.h | 71 +++ include/heap.h | 46 ++ include/query.h | 33 ++ include/rename.h | 35 ++ include/samples.h | 32 ++ include/snv.h | 56 +++ include/tersect.h | 30 ++ include/tersect_db.h | 139 ++++++ include/vcf_parser.h | 73 +++ include/vcf_writer.h | 47 ++ include/view.h | 32 ++ src/CMakeLists.txt | 27 ++ src/alleles.c | 27 ++ src/ast.c | 147 ++++++ src/bitarray.c | 729 ++++++++++++++++++++++++++++++ src/build.c | 367 +++++++++++++++ src/chroms.c | 97 ++++ src/distance.c | 447 +++++++++++++++++++ src/errorc.c | 64 +++ src/hashmap.c | 123 +++++ src/heap.c | 136 ++++++ src/query.l | 57 +++ src/query.y | 281 ++++++++++++ src/rename.c | 130 ++++++ src/samples.c | 129 ++++++ src/snv.c | 62 +++ src/tersect.c | 91 ++++ src/tersect_db.c | 914 ++++++++++++++++++++++++++++++++++++++ src/tersect_db_internal.h | 89 ++++ src/vcf_parser.c | 221 +++++++++ src/vcf_writer.c | 102 +++++ src/version.h.in | 33 ++ src/view.c | 124 ++++++ 45 files changed, 5925 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 LICENSE create mode 100644 README.md create mode 100644 docs/.gitkeep create mode 100644 include/alleles.h create mode 100644 include/ast.h create mode 100644 include/bitarray.h create mode 100644 include/build.h create mode 100644 include/chroms.h create mode 100644 include/distance.h create mode 100644 include/errorc.h create mode 100644 include/hashmap.h create mode 100644 include/heap.h create mode 100644 include/query.h create mode 100644 include/rename.h create mode 100644 include/samples.h create mode 100644 include/snv.h create mode 100644 include/tersect.h create mode 100644 include/tersect_db.h create mode 100644 include/vcf_parser.h create mode 100644 include/vcf_writer.h create mode 100644 include/view.h create mode 100644 src/CMakeLists.txt create mode 100644 src/alleles.c create mode 100644 src/ast.c create mode 100644 src/bitarray.c create mode 100644 src/build.c create mode 100644 src/chroms.c create mode 100644 src/distance.c create mode 100644 src/errorc.c create mode 100644 src/hashmap.c create mode 100644 src/heap.c create mode 100644 src/query.l create mode 100644 src/query.y create mode 100644 src/rename.c create mode 100644 src/samples.c create mode 100644 src/snv.c create mode 100644 src/tersect.c create mode 100644 src/tersect_db.c create mode 100644 src/tersect_db_internal.h create mode 100644 src/vcf_parser.c create mode 100644 src/vcf_writer.c create mode 100644 src/version.h.in create mode 100644 src/view.c diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f1b47be --- /dev/null +++ b/.gitignore @@ -0,0 +1,26 @@ +# Build & compilation output +/build +/bin +/dist + +# Test data +/data +/test + +# IDEs and editors +/.idea +.project +.c9/ +*.launch +.settings/ +*.sublime-workspace +/.vscode +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json + +# System Files +.DS_Store +Thumbs.db diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..dbcbb88 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,55 @@ +cmake_minimum_required(VERSION 3.1) +project(tersect) + +add_definitions(-D_POSIX_C_SOURCE=200809L) + +set(DEFAULT_BUILD_TYPE "Release") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE ${DEFAULT_BUILD_TYPE}) +endif() + +set(RELEASE_OPTIONS -pedantic -Wall -Wextra -O3 -march=native -std=c99) +set(DEBUG_OPTIONS -pedantic -Wall -Wextra -O3 -march=native -std=c99 -g) + +add_executable(tersect "") +target_compile_options(tersect +PRIVATE + "$<$:${RELEASE_OPTIONS}>" + "$<$:${DEBUG_OPTIONS}>" +) + +set_target_properties(tersect +PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/../bin +) + +execute_process( + COMMAND git describe --tags + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_VERSION_TAG + OUTPUT_STRIP_TRAILING_WHITESPACE +) +string(REGEX REPLACE "^v" "" TERSECT_VERSION_TAG "${GIT_VERSION_TAG}") +set(VERSION_FILE ${CMAKE_BINARY_DIR}/src/version.h) +configure_file(src/version.h.in ${VERSION_FILE}) + +include_directories(include ${CMAKE_BINARY_DIR}/src) + +find_package(BISON 2.6 REQUIRED) +find_package(FLEX 2.5 REQUIRED) +bison_target(QueryParser src/query.y ${CMAKE_BINARY_DIR}/src/query.tab.c) +flex_target(QueryScanner src/query.l ${CMAKE_BINARY_DIR}/src/lex.yy.c) +add_flex_bison_dependency(QueryScanner QueryParser) + +add_subdirectory(src) + +install(TARGETS tersect DESTINATION bin) + +set(CPACK_GENERATOR DEB RPM TGZ) +set(CPACK_PACKAGE_NAME tersect) +set(CPACK_PACKAGE_VERSION "${TERSECT_VERSION_TAG}") +set(CPACK_PACKAGE_CONTACT "Tomasz Kurowski ") +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY + "set theoretical operations on genomic variant data") +set(CPACK_DEBIAN_PACKAGE_SECTION "science") +include(CPack) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..461f690 --- /dev/null +++ b/LICENSE @@ -0,0 +1,19 @@ +Copyright (C) 2019 Cranfield University + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..4ee563b --- /dev/null +++ b/README.md @@ -0,0 +1,449 @@ +# Tersect + +Tersect is a command-line utility for conducting fast set theoretical operations and genetic distance estimation on biological sequence variant data. The tool [generates index files](#building-a-tersect-index) based on provided variant data (VCF files) which can then be used to rapidly [execute flexible set theoretical queries](#set-operations) and output the resulting lists of variants in selected regions. + +Tersect is intended to allow for highly responsive, exploratory interaction with variant data as well as for integration with larger tools and pipelines. It follows the Samtools/tabix convention for specifying [genomic regions](#regions) which allows for much faster operations and more manageable output sizes. + +Tersect can also be used to provide estimates of genetic distance between sets of samples, using the number of differing sites as a proxy for distance measures. + +## Table of Contents + +- [Tersect](#tersect) + - [Table of Contents](#table-of-contents) + - [Installation](#installation) + - [Pre-compiled releases](#pre-compiled-releases) + - [Linux](#linux) + - [macOS](#macos) + - [Building Tersect from source](#building-tersect-from-source) + - [1. Cloning the repository](#1-cloning-the-repository) + - [2. Building](#2-building) + - [3. Installing](#3-installing) + - [Example data](#example-data) + - [Building a Tersect index](#building-a-tersect-index) + - [Inspecting a Tersect index](#inspecting-a-tersect-index) + - [Set operations](#set-operations) + - [Overview](#overview) + - [Queries](#queries) + - [Genomes](#genomes) + - [Binary operators](#binary-operators) + - [Genome list](#genome-list) + - [Functional operators](#functional-operators) + - [Regions](#regions) + +## Installation + +### Pre-compiled releases + +Tersect packages and binaries are available for download below: + +#### Linux + +- 64-bit binaries: [tersect-0.12.0-Linux.tar.gz](https://github.com/tomkurowski/tersect/releases/download/v0.12.0/tersect-0.12.0-Linux.tar.gz) +- 64-bit .deb package (Debian, Ubuntu): [tersect-0.12.0-Linux.deb](https://github.com/tomkurowski/tersect/releases/download/v0.12.0/tersect-0.12.0-Linux.deb) +- 64-bit .rpm package (Fedora, openSUSE): [tersect-0.12.0-Linux.rpm](https://github.com/tomkurowski/tersect/releases/download/v0.12.0/tersect-0.12.0-Linux.rpm) + +#### macOS + +- 64-bit binaries: [tersect-0.12.0-macOS.tar.gz](https://github.com/tomkurowski/tersect/releases/download/v0.12.0/tersect-0.12.0-macOS.tar.gz) + +### Building Tersect from source + +Building Tersect from source requires CMake version 3.1+ as well as Flex (lexical analyzer) version 2.5+ and Bison (parser generator) version 2.6+. + +#### 1. Cloning the repository + +```bash +git clone https://github.com/tomkurowski/tersect.git +``` + +#### 2. Building + +For an out-of-source build after cloning the repository use the following commands: + +```bash +cd tersect +mkdir build +cd build +cmake .. +make +``` + +#### 3. Installing + +This step may require elevated permissions (e.g. prefacing the command with ``sudo``). The default installation location for Tersect is `/usr/local/bin`. + +```bash +make install +``` + +## Example data + +Two archives containing example Tersect index files (.tsi) are available for download below to allow you to try out the utility without needing to create an index file yourself. + +The first index contains human genomic variant data for 2504 individuals from the 1000 Genomes Project. While Tersect is capable of handling the entire human genome, the index below is limited to chromosome 1 to make the example archive smaller and quicker to download. + +The second index contains tomato genomic variant data for 360 tomato accessions from the AGIS Tomato 360 Resequencing Project and 84 accessions from the Wageningen UR 150 Tomato Genome Resequencing Project for a combined data set of 444 accessions. Samples have been renamed according to a provided key ([accession_names.tsv](https://github.com/tomkurowski/tersect/releases/download/v0.12.0/accession_names.tsv)) to make them more informative and consistent between the two source data sets. + +**Note:** the index files provided below are compressed using gzip and need to be uncompressed before use. + +- [2504 human genomes, chromosome 1](https://github.com/tomkurowski/tersect/releases/download/v0.12.0/human_chr1.tsi.gz) +- [444 tomato genomes](https://github.com/tomkurowski/tersect/releases/download/v0.12.0/tomato.tsi.gz) [ sample names: [accession_names.tsv](https://github.com/tomkurowski/tersect/releases/download/v0.12.0/accession_names.tsv) ] + +## Building a Tersect index + +You can build your own Tersect index based on a set of VCF files using the `tersect build` command. You need to provide a name for your index file (a .tsi extension will be added if you omit it) as the first argument, followed by any number of input VCF files (which may be compressed using gzip) to be included in the index. + +**Example:** + +The command below builds a Tersect index file named *index.tsi* which includes variants from all *.vcf.gz* files in the *data* directory. Depending on the input size this can take several minutes. + +```console +foo@bar:~$ tersect build tomato.tsi ./data/*.vcf.gz +``` + +Optionally, you can also provide a ``--name-file`` input file containing custom sample names to be used by Tersect. These names will replace the default sample IDs defined in the input VCF header lines. The ``--name-file`` should be a tab-delimited file containing two columns, the first with the sample IDs to be replaced and the second with the names to be used by Tersect. An example is shown below: + +```console +TS-1 S.lyc B TS-1 +TS-10 S.lyc B TS-10 +TS-100 S.lyc B TS-100 +TS-101 S.lyc B TS-101 +TS-102 S.lyc B TS-102 +TS-103 S.lyc B TS-103 +TS-104 S.lyc B TS-104 +TS-108 S.lyc B TS-108 +TS-11 S.lyc B TS-11 +TS-110 S.lyc B TS-110 +``` + +You can also modify sample names in an existing Tersect index file by using the `tersect rename` command. + +## Inspecting a Tersect index + +The data contained in a Tersect index file can be inspected using several commands. The `tersect chroms` command prints information on the number of variants present in each of the reference chromosomes as well as the chromosome names and size. **Note:** In the absence of a reference file, the *length* of each chromosome is represented by the position of the last variant, which will always be an underestimate. + +**Example:** + +The command below prints the per-chromosome variant content of the example Tersect index file named *tomato.tsi* (you can download it [here](#example-data)). + +```console +foo@bar:~$ tersect chroms tomato.tsi +Chromosome Length Variants +SL2.50ch00 21805702 1343815 +SL2.50ch01 98543411 9965680 +SL2.50ch02 55340384 5189338 +SL2.50ch03 70787603 6741448 +SL2.50ch04 66470926 7257520 +SL2.50ch05 65875078 6830857 +SL2.50ch06 49751619 4870941 +SL2.50ch07 68044764 6868152 +SL2.50ch08 65866627 6504025 +SL2.50ch09 72481975 7102356 +SL2.50ch10 65527500 6712293 +SL2.50ch11 56302478 5367032 +SL2.50ch12 67145147 6719621 +``` + +The `tersect samples` command prints the names of samples present in a Tersect index file. These can be either all samples or a subset based on a naming pattern (the `--match` parameter) and/or on the presence of specific variants (the `--contains` parameter). + +Sample name patterns can include wildcard symbols (\*) which match zero or more characters. For example, a pattern like "S.lyc\*" will match all samples whose names begin with "S.lyc". A lone wildcard character matches all samples stored in the Tersect index file. + +If you specify a list of variants via the `--contains` paramter, only samples which contain each of those variants will be printed. The variant format should look as follows: `chromosome:position:ref:alt` where `ref` and `alt` are reference and alternate alleles. Multiple variants can be included, separated by commas (e.g. `chr1:1245:A:G,chr8:5300:T:A`). + +**Examples:** + +The command below prints the names of samples matching the *"S.gal\*"* wildcard pattern contained in the example Tersect index file *tomato.tsi*. + +```console +foo@bar:~$ tersect samples tomato.tsi -m "S.gal*" +Sample +S.gal W TS-208 +S.gal LA1044 +S.gal LA1401 +S.gal LA0483 +``` + +The command below prints the names of all samples containing both a T/G SNV at position 100642 on chromosome 3 and an A/G SNV at position 5001015 on chromosome 6 contained in the example Tersect index file *tomato.tsi*. + +```console +foo@bar:~$ tersect samples tomato.tsi -c "SL2.50ch03:100642:T:G,SL2.50ch06:5001015:A:G" +Sample +S.lyc LA1479 +S.pen LA0716 +S.hab LYC4 +S.hab LA0407 +S.hab LA1777 +S.hab LA1718 +S.hab CGN15792 +S.hab PI134418 +S.hab CGN15791 +S.chm LA2695 +``` + +## Set operations + +### Overview + +Tersect can interpret and display the results of set theoretical commands using the `tersect view` command. This is the primary and most flexible functionality of the application and allows the user to construct arbitrarily complex queries. The expected format of a `tersect view` query is as follows: + +```console +tersect view [options] index.tsi QUERY [REGION1...] +``` + +### Queries + +A query is a command interpreted and evaluated by Tersect which (if successful) prints either a list of variants (if the result is a single genome or virtual genome) or a list of genome sample names (if the result is a list of genomes). The simplest query consists of a genome name and prints out the variants belonging to that genome. More advanced queries can contain complex combinations of operations described in the sections below. + +**Note:** The term *virtual genome* refers to a collection of variants not representing a specific genome - for example, the symmetric difference of two genomes (the collection of variants which appear in one but not both of the genomes). Tersect treats these *virtual genomes* the same way it treats "real" genomes so they can be used as operands in nested queries. + +#### Genomes + +Genomes can be referred to by their sample name, which is either taken from the header line of the source VCF file or set by the user either manually (see `tersect rename`) or through a tab-delimited name file (see `--name-file` in `tersect build` and `tersect rename`). A sample name can be of any length and can include any characters (including whitespace) except for single quotes ('). However, if a sample name includes whitespace, parentheses, or characters used as Tersect operators (-^&|>,\\), it has to be surrounded by single quotes ('). + +If the query is (or results in) a single genome or virtual genome, the variants contained by that one genome are printed out. + +**Example:** + +Print out all the variants belonging to the "S.hab LYC4" genome in the *tomato.tsi* Tersect index file: + +```console +foo@bar:~$ tersect view tomato.tsi "'S.hab LYC4'" +##fileformat=VCFv4.3 +##tersectVersion=0.11.0 +##tersectCommand='S.hab LYC4' +#CHROM POS ID REF ALT QUAL FILTER INFO +SL2.50ch00 391 . C T . . . +SL2.50ch00 416 . T A . . . +SL2.50ch00 734 . T G . . . +SL2.50ch00 759 . C T . . . +SL2.50ch00 771 . A G . . . +SL2.50ch00 778 . T A . . . +... +``` + +**Note:** The sample name had to be surrounded by single quotes because it contains a whitespace character. + +#### Binary operators + +Tersect supports four basic binary operators. Each operand has to be a **single** genome. All four operators have the same precedence and are left-associative. You can use parentheses to enforce precedence other than simple left-to-right. + +| Operator | Name | Usage | Result | +|:---------:|:--------------------:|:------------------:|:------------------------------------------------------------:| +| & | intersection | GENOME1 & GENOME2 | Virtual genome containing variants found in both GENOME1 and GENOME2 | +| \| | union | GENOME1 \| GENOME2 | Virtual genome containing variants found in GENOME1, GENOME2, or both | +| \ | difference | GENOME1 \ GENOME2 | Virtual genome containing variants found in GENOME1 but not in GENOME2 | +| ^ | symmetric difference | GENOME1 ^ GENOME2 | Virtual genome containing variants found in GENOME1 or GENOME2 but not in both | + +The result of a binary operation is treated as a single genome (though it does not have a sample name) called a *virtual genome*, which can be used in further operations. + +**Examples:** + +Print out the variants shared by 'S.hua LA1983' and 'S.pim LYC2798': + +```console +foo@bar:~$ tersect view tomato.tsi "'S.hua LA1983' & 'S.pim LYC2798'" +##fileformat=VCFv4.3 +##tersectVersion=0.11.0 +##tersectCommand='S.hua LA1983' & 'S.pim LYC2798' +#CHROM POS ID REF ALT QUAL FILTER INFO +SL2.50ch00 3235 . A G . . . +SL2.50ch00 3277 . A G . . . +SL2.50ch00 3873 . C T . . . +SL2.50ch00 4083 . A G . . . +SL2.50ch00 4112 . T G . . . +SL2.50ch00 4314 . A C . . . +... +``` + +Print out the variants which appear in only one of 'S.gal LA1044' or 'S.gal W TS-208': + +```console +foo@bar:~$ tersect view tomato.tsi "'S.gal LA1044' ^ 'S.gal W TS-208'" +##fileformat=VCFv4.3 +##tersectVersion=0.11.0 +##tersectCommand='S.gal LA1044' ^ 'S.gal W TS-208' +#CHROM POS ID REF ALT QUAL FILTER INFO +SL2.50ch00 362 . G T . . . +SL2.50ch00 867 . G T . . . +SL2.50ch00 1198 . G A . . . +SL2.50ch00 3235 . A G . . . +SL2.50ch00 3567 . T G . . . +SL2.50ch00 3873 . C T . . . +... +``` + +Print out the variants which appear in 'S.chi CGN15532' but not 'S.chi CGN15530' or 'S.chi W TS-408': + +```console +foo@bar:~$ tersect view tomato.tsi "'S.chi CGN15532' \ 'S.chi CGN15530' \ 'S.chi W TS-408'" +##fileformat=VCFv4.3 +##tersectVersion=0.11.0 +##tersectCommand='S.chi CGN15532' \ 'S.chi CGN15530' \ 'S.chi W TS-408' +#CHROM POS ID REF ALT QUAL FILTER INFO +SL2.50ch00 1163 . C G . . . +SL2.50ch00 1811 . C G . . . +SL2.50ch00 1818 . C A . . . +SL2.50ch00 1818 . C G . . . +SL2.50ch00 2432 . G A . . . +SL2.50ch00 2544 . A T . . . +... +``` + +**Note:** A more convenient way to conduct the same operation on many genomes is by using [functional operators](#functional-operators). + +#### Genome list + +Instead of individual genomes, Tersect can also operate on lists of genomes. These can be selected using wildcard patterns matching genome sample names, with the most general pattern of a lone wildcard operator (`*`) matching *all* the genomes in the Tersect index file. Individual genomes can also be appended to lists using commas (`,`) or removed from lists using minus signs (`-`). + +Genome lists can also be filtered (using the `>` operator) by whether they contain a specified list of variants. The variant format should look as follows: `chromosome:position:ref:alt` where `ref` and `alt` are reference and alternate alleles. Multiple variants can be included, separated by commas (e.g. `chr1:1245:A:G,chr8:5300:T:A`). + +| Operator | Name | Usage | Result | +|:--------:|:--------:|:---------:|:------------:| +| * | wildcard | PATTERN | Genome list containing all genomes whose sample names match the provided wildcard pattern | +| , | append | GENOMELIST, GENOME | Genome list containing all genomes in GENOMELIST and GENOME | +| - | remove | GENOMELIST - GENOME | Genome list containing all genomes in GENOMELIST except GENOME | +| > | superset | GENOMELIST > VARIANTLIST | Genome list containing all genomes in GENOMELIST which contain all variants in VARIANTLIST + +**Note:** Tersect does not distinguish between a genome list which contains only one genome and a single genome. The former can be used in binary operations and the latter can be used in functional operations or in constructing genome lists. + +If the query is (or results in) a genome list, the list of their genome sample names is printed out. + +**Examples:** + +Print out all the names of genomes which begin with "S.pim": + +```console +foo@bar:~$ tersect view tomato.tsi "S.pim*" +S.pim P TS-92 +S.pim P TS-79 +S.pim P TS-77 +S.pim P TS-50 +S.pim P TS-441 +S.pim P TS-440 +S.pim P TS-439 +S.pim P TS-438 +... +``` + +Print out all the names of genomes which contain an A/G single nucleotide polymorphism at position 828587 in chromosome 7: + +```console +foo@bar:~$ tersect view tomato.tsi "* > SL2.50ch07:828587:A:G" +S.lyc C TS-97 +S.lyc C TS-94 +S.pim P TS-79 +S.pim P TS-77 +S.lyc B TS-68 +S.lyc C TS-53 +S.pim P TS-50 +S.pim P TS-441 +... +``` + +Print out all the names of genomes which contain a G/A SNP at position 1590608 in chromosome 5 and a T/C SNP at position 5230 in chromosome 12, except for 'S.gal LA1401' and those whose names begin with "S.pim": + +```console +foo@bar:~$ tersect view tomato.tsi "* > SL2.50ch05:1590608:G:A,SL2.50ch12:5230:T:C - ('S.pim*','S.gal LA1401')" +S.lyc C TS-431 +S.lyc C TS-430 +S.lyc LA1314 +``` + +#### Functional operators + +Functional operators are used to conduct operations on genome lists instead of individual genomes. + +| Operator | Name | Usage | Result | +|:--------:|:----:|:-----:|:------:| +| union()
u() | arbitrary union | union(GENOMELIST)
u(GENOMELIST) | Virtual genome containing all variants contained in any of the genomes in GENOMELIST | +| intersect()
i() | arbitrary intersection | intersect(GENOMELIST)
i(GENOMELIST) | Virtual genome containing all variants which appear in every genome in GENOMELIST | + +The result of a functional operation is treated as a single genome (though it does not have a sample name). + +**Examples:** + +Union of all genomes, containing every variant recorded in the *tomato.tsi* Tersect index file: + +```console +foo@bar:~$ tersect view tomato.tsi "u(*)" +##fileformat=VCFv4.3 +##tersectVersion=0.11.0 +##tersectCommand=u(*) +#CHROM POS ID REF ALT QUAL FILTER INFO +SL2.50ch00 280 . A C . . . +SL2.50ch00 284 . A G . . . +SL2.50ch00 316 . C T . . . +SL2.50ch00 323 . C T . . . +SL2.50ch00 332 . A T . . . +SL2.50ch00 362 . G T . . . +... +``` + +Intersection of all genomes which contain a T/A single nucleotide polymorphism at position 12547 in chromosome 12, containing all variants that are shared by each of those genomes: + +```console +foo@bar:~$ tersect view tomato.tsi "i(* > SL2.50ch12:12547:T:A)" +##fileformat=VCFv4.3 +##tersectVersion=0.11.0 +##tersectCommand=i(* > SL2.50ch12:12547:T:A) +#CHROM POS ID REF ALT QUAL FILTER INFO +SL2.50ch00 16576 . T C . . . +SL2.50ch00 26171 . G T . . . +SL2.50ch00 29880 . A G . . . +SL2.50ch00 37486 . T G . . . +SL2.50ch00 40476 . G T . . . +SL2.50ch00 436850 . A G . . . +... +``` + +Print all the variants which appear only in genome S.hab CGN15792. This is achieved by finding the difference of that genome and the union of all genomes except S.hab CGN15792: + +```console +foo@bar:~$ tersect view tomato.tsi "'S.hab CGN15792' \ u(* - 'S.hab CGN15792')" +##fileformat=VCFv4.3 +##tersectVersion=0.11.0 +##tersectCommand='S.hab CGN15792' \ u(* - 'S.hab CGN15792') +#CHROM POS ID REF ALT QUAL FILTER INFO +SL2.50ch00 1163 . C T . . . +SL2.50ch00 1596 . G A . . . +SL2.50ch00 2048 . G A . . . +SL2.50ch00 2933 . G A . . . +SL2.50ch00 2987 . A T . . . +SL2.50ch00 4349 . C T . . . +... +``` + +### Regions + +By default, queries are executed and results are returned for the entire genome. However, it is possible to selectively execute a query only on a specified region. The familiar tabix/samtools format `chromosome:beginPos-endPos` is used to specify those regions. The coordinates are one-based and inclusive. + +Limiting queries to regions allows for much faster execution since far fewer positions need to be processed and printed, capturing only intervals of interest. This feature makes it possible to use Tersect's flexible queries as a high-performance part of a larger pipeline or the back-end of a highly responsive, interactive application. + +**Example:** + +Print a union, that is all the variants appearing either in genome 'S.lyc SG16', 'S.lyc LA1421', or both, from the first 90 kbp of chromosome 2 in the *tomato.tsi* index file: + +```console +foo@bar:~$ tersect view tomato.tsi "'S.lyc SG16' | 'S.lyc LA1421'" SL2.50ch02:1-90000 +##fileformat=VCFv4.3 +##tersectVersion=0.11.0 +##tersectCommand='S.lyc SG16' | 'S.lyc LA1421' +##tersectRegion=SL2.50ch02:1-90000 +#CHROM POS ID REF ALT QUAL FILTER INFO +SL2.50ch02 204 . A G . . . +SL2.50ch02 255 . TCC TCCC . . . +SL2.50ch02 255 . TCC TCCCC . . . +SL2.50ch02 2382 . G A . . . +SL2.50ch02 13383 . G A . . . +SL2.50ch02 21752 . C T . . . +SL2.50ch02 24538 . T C . . . +SL2.50ch02 29276 . G T . . . +SL2.50ch02 71245 . A C . . . +SL2.50ch02 73326 . C T . . . +SL2.50ch02 86236 . C A . . . +SL2.50ch02 86601 . A G . . . +SL2.50ch02 86635 . T A . . . +SL2.50ch02 86695 . T C . . . +SL2.50ch02 86769 . G A . . . +SL2.50ch02 87079 . T A . . . +``` diff --git a/docs/.gitkeep b/docs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/include/alleles.h b/include/alleles.h new file mode 100644 index 0000000..2fd4468 --- /dev/null +++ b/include/alleles.h @@ -0,0 +1,54 @@ +/* alleles.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef ALLELES_H +#define ALLELES_H + +#include +#include + +struct allele { + uint32_t position; + char *ref; + char *alt; +}; + +/* +** Compare SNVs, first by position then (alphabetically) by alternate allele +** letter. Used for sorting SNVs. +*/ +inline int allele_cmp(const struct allele *a, const struct allele *b) +{ + int64_t position_diff; + if ((position_diff = a->position - b->position)) { + return position_diff; + } + int ref_diff; + if ((ref_diff = strcmp(a->ref, b->ref))) { + return ref_diff; + } + return strcmp(a->alt, b->alt); +} + +#endif diff --git a/include/ast.h b/include/ast.h new file mode 100644 index 0000000..3536a52 --- /dev/null +++ b/include/ast.h @@ -0,0 +1,60 @@ +/* ast.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef AST_H +#define AST_H + +#include "bitarray.h" +#include "tersect_db.h" + +/** + * AST node types (set theoretical operations / bit arrays / genomes). + */ +#define AST_INTERSECTION 1 +#define AST_UNION 2 +#define AST_DIFFERENCE 3 +#define AST_SYMMETRIC_DIFFERENCE 4 +#define AST_GENOME 10 + +struct ast_node { + int type; + struct ast_node *l; + struct ast_node *r; + struct genome *genome; +}; + +/** + * Create AST subtree describing an intersection/union/symmetric difference of a + * list of genomes and return its root. + */ +struct ast_node *create_subtree(int operation_type, size_t ngenomes, + struct genome *genomes); +struct ast_node *create_ast_node(int operation_type, struct ast_node *l, + struct ast_node *r); +struct ast_node *create_genome_node(struct genome *genome); +struct bitarray *eval_ast(struct ast_node *root, const tersect_db *tdb, + const struct tersect_db_interval *ti); +void free_ast(struct ast_node *root); + +#endif diff --git a/include/bitarray.h b/include/bitarray.h new file mode 100644 index 0000000..834b369 --- /dev/null +++ b/include/bitarray.h @@ -0,0 +1,113 @@ +/* bitarray.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef BITARRAY_H +#define BITARRAY_H + +#include +#include + +#ifndef WAH_BITARRAY_WORD +#define WAH_BITARRAY_WORD uint64_t +#endif + +typedef WAH_BITARRAY_WORD bitarray_word; // type used internally to store the array +extern const uint16_t bitarray_word_capacity; // number of booleans in word + +/* + * The BitArray is a structure meant for compact storage and fast set + * theoretical operations on sets of boolean values. The structure stores two + * "sizes": the size element corresponds to the actual number of boolean values + * stored, while the word_capacity element is an internal variable which tracks + * the size of allocated storage (*array) in terms of multiples of an unsigned + * integer type. + * + * The start and end masks are used to delimit the valid bits in a sub-bit array + * extracted from a larger one. Note that the internal *array of such an + * extracted bit array points to the original array, not a copy. + */ +struct bitarray { + size_t size; // Size in terms of bitarray_word variables + size_t last_word; // Position of previously set bit + size_t ncompressed; // Number of compressed words + bitarray_word *array; + bitarray_word start_mask; + bitarray_word end_mask; +}; + +/** + * Used to store start and end position of bit array intervals. + */ +struct bitarray_interval { + uint64_t start_index; + uint64_t end_index; +}; + +/* + * Initialisation/allocation, zeroing and deallocation routines. + */ +struct bitarray *init_bitarray(uint64_t bit_size); +struct bitarray *copy_bitarray(const struct bitarray *ba); +void clear_bitarray(struct bitarray *ba); +void free_bitarray(struct bitarray *ba); +void bitarray_shrinkwrap(struct bitarray *ba); +void bitarray_resize(struct bitarray *ba, uint64_t new_size); + +/* + * Routines meant for randomising, printing, and writing the contents of bit + * arrays. + */ +void print_bitarray(const struct bitarray *ba); +void print_set_indices(const struct bitarray *ba); + +/* + * Routines for set theoretical operations. + */ +void bitarray_intersection(const struct bitarray *a, + const struct bitarray *b, + struct bitarray **out); +void bitarray_difference(const struct bitarray *a, + const struct bitarray *b, + struct bitarray **out); +void bitarray_symmetric_difference(const struct bitarray *a, + const struct bitarray *b, + struct bitarray **out); +void bitarray_union(const struct bitarray *a, + const struct bitarray *b, + struct bitarray **out); +uint64_t bitarray_distance(const struct bitarray *a, const struct bitarray *b); +uint64_t bitarray_weight(const struct bitarray *ba); +void bitarray_extract_region(struct bitarray *dest_ba, + const struct bitarray *src_ba, + const struct bitarray_interval *region); + +/* + * Routines for manipulating individual bits. + */ +int bitarray_set_bit(struct bitarray *bitset, size_t pos); +int bitarray_get_bit(const struct bitarray *ba, size_t pos); +int bitarray_get_set_indices(const struct bitarray *ba, + size_t *nset_indices, size_t **set_indices); + +#endif diff --git a/include/build.h b/include/build.h new file mode 100644 index 0000000..8f53e68 --- /dev/null +++ b/include/build.h @@ -0,0 +1,32 @@ +/* build.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef BUILD_DB_H +#define BUILD_DB_H + +#include "errorc.h" + +error_t tersect_build_database(int argc, char **argv); + +#endif diff --git a/include/chroms.h b/include/chroms.h new file mode 100644 index 0000000..61bcf8c --- /dev/null +++ b/include/chroms.h @@ -0,0 +1,32 @@ +/* chroms.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef CHROMS_H +#define CHROMS_H + +#include "errorc.h" + +error_t tersect_print_chromosomes(int argc, char **argv); + +#endif diff --git a/include/distance.h b/include/distance.h new file mode 100644 index 0000000..1214622 --- /dev/null +++ b/include/distance.h @@ -0,0 +1,32 @@ +/* distance.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef DISTANCE_H +#define DISTANCE_H + +#include "errorc.h" + +error_t tersect_distance(int argc, char **argv); + +#endif diff --git a/include/errorc.h b/include/errorc.h new file mode 100644 index 0000000..b74051d --- /dev/null +++ b/include/errorc.h @@ -0,0 +1,62 @@ +/* errorc.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef TERSECT_ERROR_H +#define TERSECT_ERROR_H + +typedef enum error_t { + SUCCESS = 0, + FAILURE = 1, + E_ALLOC = 500, + E_NO_GENOME = 600, + E_NO_TSI_FILE = 700, + E_TSI_NOPEN = 701, + E_BUILD_NO_OUTNAME = 5000, + E_BUILD_NO_FILES = 5001, + E_BUILD_CREATE = 5002, + E_BUILD_DB_EXISTS = 5003, + E_BUILD_NO_WRITE = 5004, + E_BUILD_DUPSAMPLE = 5005, + E_PARSE_REGION = 6000, + E_PARSE_REGION_NO_CHROMOSOME = 6001, + E_PARSE_REGION_BAD_BOUNDS = 6002, + E_PARSE_ALLELE = 7000, + E_PARSE_ALLELE_NO_CHROMOSOME = 7001, + E_PARSE_ALLELE_BAD_POSITION = 7002, + E_PARSE_ALLELE_UNKNOWN = 7003, + E_VCF_PARSE_FILE = 7100, + E_VIEW_NO_QUERY = 8000, + E_RENAME_NOPEN = 9000, + E_RENAME_PARSE = 9001, + E_DIST_BIN_REGIONS = 10000, +} error_t; + +extern struct error_desc { + error_t code; + char *description; +} error_desc[]; + +void report_error(error_t code); + +#endif diff --git a/include/hashmap.h b/include/hashmap.h new file mode 100644 index 0000000..7c61a4d --- /dev/null +++ b/include/hashmap.h @@ -0,0 +1,71 @@ +/* hashmap.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef HASHMAP_H +#define HASHMAP_H + +#include + +// Forward declarations +typedef struct HashMap HashMap; +typedef struct Bucket Bucket; +typedef struct HashElement HashElement; +typedef struct HashIterator HashIterator; + +struct HashMap { + uint32_t count; + uint32_t bucket_count; + HashElement *elements; + Bucket *buckets; +}; + +struct Bucket { + uint32_t count; + HashElement *first; +}; + +struct HashElement { + char *key; + void *value; + HashElement *next; +}; + +struct HashIterator { + // Public + char *key; + void *value; + // Private + const HashMap *_hm; + uint32_t _nextindex; +}; + +HashIterator hashmap_iterator(const HashMap *hm); +void hashmap_iterator_next(HashIterator* it); + +HashMap *init_hashmap(uint32_t bucket_count); +void hashmap_insert(HashMap *hm, const char *key, void *value); +void *hashmap_get(const HashMap *hm, const char *key); +void free_hashmap(HashMap *hm); + +#endif diff --git a/include/heap.h b/include/heap.h new file mode 100644 index 0000000..c68d437 --- /dev/null +++ b/include/heap.h @@ -0,0 +1,46 @@ +/* heap.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef HEAP_H +#define HEAP_H + +typedef struct Heap Heap; + +struct Heap { + int size; + int max_size; + int(*comparator)(const void* a, const void* b); + void **array; +}; + +Heap *init_heap(int max_size, int(*comparator)(const void* a, const void* b)); +void heap_push(Heap *heap, void *value); +void *heap_pop(Heap *heap); +void *heap_peek(const Heap *heap); +void sift_down(Heap *heap); +void sift_up(Heap *heap); +void clear_heap(Heap *heap); +void free_heap(Heap *heap); + +#endif diff --git a/include/query.h b/include/query.h new file mode 100644 index 0000000..5593374 --- /dev/null +++ b/include/query.h @@ -0,0 +1,33 @@ +/* query.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef QUERY_H +#define QUERY_H + +#include "bitarray.h" +#include "tersect_db.h" + +struct ast_node *run_set_parser(const char *query, const tersect_db *tdb); + +#endif diff --git a/include/rename.h b/include/rename.h new file mode 100644 index 0000000..0ee9fdc --- /dev/null +++ b/include/rename.h @@ -0,0 +1,35 @@ +/* rename.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef RENAME_H +#define RENAME_H + +#include "errorc.h" +#include "tersect_db.h" + +error_t tersect_rename_sample(int argc, char **argv); + +error_t tersect_load_name_file(tersect_db *tdb, const char *filename); + +#endif diff --git a/include/samples.h b/include/samples.h new file mode 100644 index 0000000..a4a22d5 --- /dev/null +++ b/include/samples.h @@ -0,0 +1,32 @@ +/* samples.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef SAMPLES_H +#define SAMPLES_H + +#include "errorc.h" + +error_t tersect_print_samples(int argc, char **argv); + +#endif diff --git a/include/snv.h b/include/snv.h new file mode 100644 index 0000000..1a58c79 --- /dev/null +++ b/include/snv.h @@ -0,0 +1,56 @@ +/* snv.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef SNV_H +#define SNV_H + +#include + +/** + * Variant codes. + */ +#define V_INDEL 0 +#define SNV_A_C 1 +#define SNV_A_G 2 +#define SNV_A_T 3 +#define SNV_C_A 4 +#define SNV_C_G 5 +#define SNV_C_T 6 +#define SNV_G_A 7 +#define SNV_G_C 8 +#define SNV_G_T 9 +#define SNV_T_A 10 +#define SNV_T_C 11 +#define SNV_T_G 12 + +extern const char * const variant_format[]; + +/** + * @param ref Reference allele + * @param alt Alt allele + * @return int single nucleotide variant code + */ +uint8_t snv_type(char ref, char alt); + +#endif diff --git a/include/tersect.h b/include/tersect.h new file mode 100644 index 0000000..021b1c2 --- /dev/null +++ b/include/tersect.h @@ -0,0 +1,30 @@ +/* tersect.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef TERSECT_H +#define TERSECT_H + +#define DEFAULT_DB_FILENAME "tersect.tsi" + +#endif diff --git a/include/tersect_db.h b/include/tersect_db.h new file mode 100644 index 0000000..c9b623d --- /dev/null +++ b/include/tersect_db.h @@ -0,0 +1,139 @@ +/* tersect_db.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef TERSECT_DB +#define TERSECT_DB + +#include "alleles.h" +#include "bitarray.h" +#include "errorc.h" + +#include + +/* Database operation flags */ +#define TDB_FORCE 2 +#define TDB_VERBOSE 4 + +/* Opaque header handles */ +typedef struct chrom_hdr chrom_hdr; +typedef struct genome_hdr genome_hdr; +struct variant; +struct tersect_db_interval; + +struct chromosome { + char *name; + uint32_t length; + uint32_t variant_count; + struct variant *variants; + chrom_hdr *hdr; +}; + +struct genome { + char *name; + genome_hdr *hdr; +}; + +/** + * Stores a genomic interval as stored in the database - by the chromosome + * object and bit array indices it represents. + */ +struct tersect_db_interval { + struct chromosome chromosome; + size_t nvariants; + struct variant *variants; + struct bitarray_interval interval; +}; + +/** + * Stores a genomic interval in terms of the name of the chromosome and the + * start and end positions in bases (inclusive on both sides). + */ +struct genomic_interval { + char *chromosome; + uint32_t start_base; + uint32_t end_base; +}; + +typedef struct tersect_db tersect_db; + +error_t tersect_db_create(const char *filename, int flags, tersect_db **tdb); +tersect_db *tersect_db_open(const char *filename); +void tersect_db_close(tersect_db *tdb); +error_t tersect_db_insert_allele(tersect_db *tdb, const struct allele *allele, + struct variant *out); +void tersect_db_add_chromosome(tersect_db *tdb, + const char *chr_name, + const struct variant *variants, + uint32_t variant_num, + uint32_t length); +void tersect_db_add_genome(tersect_db *tdb, + const char *genome_name); +uint32_t tersect_db_get_genome_count(const tersect_db *tdb); +uint32_t tersect_db_get_chromosome_count(const tersect_db *tdb); +error_t tersect_db_get_genomes(const tersect_db *tdb, + size_t nmatch, char *const *matches, + size_t ncont, char *const *contains, + size_t *ngenomes, struct genome **genomes); +void tersect_db_add_bitarray(tersect_db *tdb, const char *genome, + const char *chromosome, const struct bitarray *ba); +void tersect_db_get_bitarray(const tersect_db *tdb, + const struct genome *gen, + const struct chromosome *chr, + struct bitarray *output); +void tersect_db_get_chromosomes(const tersect_db *tdb, + size_t *nchroms, struct chromosome **chroms); +void tersect_db_get_chromosome(const tersect_db *tdb, const char *name, + struct chromosome *chrom); +void tersect_db_get_interval(const tersect_db *tdb, + const struct genomic_interval *gi, + struct tersect_db_interval *ti); +void tersect_db_get_bin_intervals(const tersect_db *tdb, + const struct genomic_interval *gi, + uint32_t bin_size, + size_t *nbins, + struct tersect_db_interval **bins); +error_t tersect_db_rename_genome(tersect_db *tdb, const char *old_name, + const char *new_name); + +/** + * Extracts a genomic interval structure from a region string. + * Valid region strings take the following forms: + * + * chromosome:start-end (e.g. 'ch1:1-10000' for the interval from base 1 + * to base 10000 on chromosome 'ch1') + * + * chromosome (e.g. 'ch1' for an interval covering the + * entirety of chromosome 'ch1') + * + * The position bounds are inclusive on both sides. + */ +error_t tersect_db_parse_regions(const tersect_db *tdb, size_t nregions, + char **region_strings, + struct genomic_interval **output); + +error_t tersect_db_get_regions(const tersect_db *tdb, + size_t *nregions, + struct genomic_interval **output); + +#endif diff --git a/include/vcf_parser.h b/include/vcf_parser.h new file mode 100644 index 0000000..8dce5b1 --- /dev/null +++ b/include/vcf_parser.h @@ -0,0 +1,73 @@ +/* vcf_parser.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef VCF_PARSER_H +#define VCF_PARSER_H + +#include "alleles.h" + +#include + +#define ALLELE_FETCHED 1 +#define ALLELE_NOT_FETCHED 0 + +#define MAX_ALT_ALLELES 100 +#define MAX_CHROMOSOME_NAME_LENGTH 100 +#define MAX_FILENAME_LENGTH 500 +#define MAX_SAMPLE_NAME_LENGTH 250 + +/* Genotype codes */ +#define GENOTYPE_HOM_REF 0 +#define GENOTYPE_HOM_ALT 1 +#define GENOTYPE_HET 2 + +/* Parser flags */ +#define VCF_ONLY_HOMOZYGOUS 2 +#define VCF_ONLY_SNPS 4 +#define VCF_ONLY_INDELS 8 + +typedef struct ParserHandle_t { + char filename[MAX_FILENAME_LENGTH]; + int *genotypes; + char **samples; + size_t sample_num; + int flags; + FILE *file_handle; + char current_chromosome[MAX_CHROMOSOME_NAME_LENGTH]; + char *alt_alleles[MAX_ALT_ALLELES]; + int n_alts; + char *line_buffer; + size_t buffer_size; + struct allele current_allele; + char *allele_context; + int current_result; +} VCF_PARSER; + +VCF_PARSER *init_parser(const char *filename, int flags); +int fetch_next_allele(VCF_PARSER *parser); +void goto_next_chromosome(VCF_PARSER *parser); +int parser_allele_cmp(const void *a, const void *b); +void close_parser(VCF_PARSER *parser); + +#endif diff --git a/include/vcf_writer.h b/include/vcf_writer.h new file mode 100644 index 0000000..73d4cca --- /dev/null +++ b/include/vcf_writer.h @@ -0,0 +1,47 @@ +/* vcf_writer.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef VCF_WRITER_H +#define VCF_WRITER_H + +#include "bitarray.h" +#include "tersect_db.h" + +/** + * https://samtools.github.io/hts-specs/VCFv4.3.pdf + */ +#define VCF_FORMAT "VCFv4.3" + +/** + * Prints VCF lines from an interval based on a bit array index. + */ +void vcf_print_bitarray(const tersect_db *tdb, const struct bitarray *ba, + const struct tersect_db_interval *ti); + +/** + * Prints VCF metadata lines and header. + */ +void vcf_print_header(const char *command, size_t nregions, + char **region_strings); +#endif diff --git a/include/view.h b/include/view.h new file mode 100644 index 0000000..df2eeb8 --- /dev/null +++ b/include/view.h @@ -0,0 +1,32 @@ +/* view.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef VIEW_DB_H +#define VIEW_DB_H + +#include "errorc.h" + +error_t tersect_view_set(int argc, char **argv); + +#endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..f1a73c2 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,27 @@ +target_sources(tersect +PRIVATE + "${CMAKE_CURRENT_LIST_DIR}/tersect.c" + "${CMAKE_CURRENT_LIST_DIR}/tersect_db.c" + + "${CMAKE_CURRENT_LIST_DIR}/build.c" + "${CMAKE_CURRENT_LIST_DIR}/chroms.c" + "${CMAKE_CURRENT_LIST_DIR}/distance.c" + "${CMAKE_CURRENT_LIST_DIR}/rename.c" + "${CMAKE_CURRENT_LIST_DIR}/samples.c" + "${CMAKE_CURRENT_LIST_DIR}/view.c" + + "${CMAKE_CURRENT_LIST_DIR}/alleles.c" + "${CMAKE_CURRENT_LIST_DIR}/ast.c" + "${CMAKE_CURRENT_LIST_DIR}/bitarray.c" + "${CMAKE_CURRENT_LIST_DIR}/errorc.c" + "${CMAKE_CURRENT_LIST_DIR}/hashmap.c" + "${CMAKE_CURRENT_LIST_DIR}/heap.c" + "${CMAKE_CURRENT_LIST_DIR}/snv.c" + "${CMAKE_CURRENT_LIST_DIR}/vcf_parser.c" + "${CMAKE_CURRENT_LIST_DIR}/vcf_writer.c" + + "${FLEX_QueryScanner_OUTPUTS}" + "${BISON_QueryParser_OUTPUTS}" + + "${VERSION_FILE}" +) diff --git a/src/alleles.c b/src/alleles.c new file mode 100644 index 0000000..7687ea6 --- /dev/null +++ b/src/alleles.c @@ -0,0 +1,27 @@ +/* alleles.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "alleles.h" + +extern inline int allele_cmp(const struct allele *a, const struct allele *b); diff --git a/src/ast.c b/src/ast.c new file mode 100644 index 0000000..46bb657 --- /dev/null +++ b/src/ast.c @@ -0,0 +1,147 @@ +/* ast.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "ast.h" + +#include +#include + +static struct bitarray *eval_node(struct ast_node *node, + const tersect_db *tdb, + const struct tersect_db_interval *ti); + +/** + * Allocate and initialise abstract syntax tree node for a binary operation. + */ +struct ast_node *create_ast_node(int operation_type, struct ast_node *l, + struct ast_node *r) +{ + struct ast_node *node = malloc(sizeof *node); + node->type = operation_type; + node->l = l; + node->r = r; + return node; +} + +struct ast_node *create_genome_node(struct genome *genome) +{ + struct ast_node *node = malloc(sizeof *node); + node->type = AST_GENOME; + node->genome = malloc(sizeof *node->genome); + *(node->genome) = *genome; + return node; +} + +struct ast_node *create_subtree(int operation_type, size_t ngenomes, + struct genome *genomes) +{ + struct ast_node *root = create_genome_node(&genomes[0]); + for (size_t i = 1; i < ngenomes; ++i) { + root = create_ast_node(operation_type, root, + create_genome_node(&genomes[i])); + } + return root; +} + +/** + * Allocates memory for the container struct. + */ +static struct bitarray *load_bitarray(const tersect_db *tdb, struct genome *genome, + const struct tersect_db_interval *ti) +{ + struct bitarray ba; + tersect_db_get_bitarray(tdb, genome, &ti->chromosome, &ba); + struct bitarray *region_ba = malloc(sizeof *region_ba); + bitarray_extract_region(region_ba, &ba, &ti->interval); + return region_ba; +} + +static inline struct bitarray *ast_node_operation(struct ast_node *node, + void (*op)(const struct bitarray*, + const struct bitarray*, + struct bitarray**), + const tersect_db *tdb, + const struct tersect_db_interval *ti) +{ + struct bitarray *ba = eval_node(node->l, tdb, ti); + struct bitarray *bb = eval_node(node->r, tdb, ti); + struct bitarray *out; + op(ba, bb, &out); + if (node->l->type == AST_GENOME) { + free(ba); + } else { + free_bitarray(ba); + } + if (node->r->type == AST_GENOME) { + free(bb); + } else { + free_bitarray(bb); + } + return out; +} + +static struct bitarray *eval_node(struct ast_node *node, const tersect_db *tdb, + const struct tersect_db_interval *ti) +{ + switch (node->type) { + case AST_INTERSECTION: + return ast_node_operation(node, &bitarray_intersection, tdb, ti); + case AST_UNION: + return ast_node_operation(node, &bitarray_union, tdb, ti); + case AST_DIFFERENCE: + return ast_node_operation(node, &bitarray_difference, tdb, ti); + case AST_SYMMETRIC_DIFFERENCE: + return ast_node_operation(node, &bitarray_symmetric_difference, tdb, ti); + case AST_GENOME: + return load_bitarray(tdb, node->genome, ti); + } + return NULL; +} + +struct bitarray *eval_ast(struct ast_node *root, const tersect_db *tdb, + const struct tersect_db_interval *ti) +{ + if (root->type == AST_GENOME) { + struct bitarray *ba = load_bitarray(tdb, root->genome, ti); + struct bitarray *out = copy_bitarray(ba); + free(ba); + return out; + } else { + return eval_node(root, tdb, ti); + } +} + +/** + * Free the entire abstract syntax tree starting from the root. + */ +void free_ast(struct ast_node *root) +{ + if (root->type != AST_GENOME) { + free_ast(root->l); + free_ast(root->r); + } else { + free(root->genome); + } + free(root); +} diff --git a/src/bitarray.c b/src/bitarray.c new file mode 100644 index 0000000..4a78f3f --- /dev/null +++ b/src/bitarray.c @@ -0,0 +1,729 @@ +/* bitarray.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "bitarray.h" + +#include +#include +#include +#include + +#define GROWTH_FACTOR 1.5 + +static const bitarray_word WORD_MAX = (bitarray_word)~0; +static const bitarray_word MSB = (bitarray_word)1 + << (CHAR_BIT * sizeof(bitarray_word) - 1); + +// Number of boolean bits in the internal bit array storage type. +const uint16_t bitarray_word_capacity = CHAR_BIT * sizeof(bitarray_word) - 1; + +static inline size_t bit_to_word_size(size_t bit_size) +{ + return (bit_size + bitarray_word_capacity - 1) / bitarray_word_capacity; +} + +/* + * Allocate and initialise bit array. All the bits are unset (i.e. set to 0). + */ +struct bitarray *init_bitarray(uint64_t bit_size) +{ + // TODO: set masks to match how region extraction works + struct bitarray *ba = malloc(sizeof *ba); + // Rounding up + ba->size = bit_to_word_size(bit_size); + ba->last_word = 0; + ba->ncompressed = 0; + ba->array = calloc(ba->size, sizeof *(ba->array)); + ba->array[0] = ba->size - 1; + ba->start_mask = WORD_MAX; + ba->end_mask = WORD_MAX; + return ba; +} + +/* + * Duplicate bit array. Needs to be freed manually. + */ +struct bitarray *copy_bitarray(const struct bitarray *ba) +{ + struct bitarray *copied_ba = malloc(sizeof *ba); + size_t array_size = ba->size * sizeof *(ba->array); + *copied_ba = (struct bitarray) { + .size = ba->size, + .last_word = ba->last_word, + .ncompressed = ba->ncompressed, + .array = malloc(array_size), + .start_mask = ba->start_mask, + .end_mask = ba->end_mask + }; + memcpy(copied_ba->array, ba->array, array_size); + return copied_ba; +} + +/* + * Unsets (zeroes) entire bit array. + */ +void clear_bitarray(struct bitarray *ba) +{ + ba->last_word = 0; + ba->ncompressed = 0; + ba->start_mask = WORD_MAX; + ba->end_mask = WORD_MAX; + memset(ba->array, 0, ba->size * sizeof *(ba->array)); + ba->array[0] = ba->size - 1; +} + +/* + * Free bit array. + */ +void free_bitarray(struct bitarray *ba) +{ + free(ba->array); + free(ba); +} + +void print_bitarray_word(bitarray_word w) +{ + for (uint64_t i = 0; i < CHAR_BIT * sizeof(bitarray_word); ++i) { + printf("%"PRIu64, w >> i & 1); + } + printf(" (%"PRIu64")\n", w); +} + +/* + * Print bit array (as a series of 0s and 1s) in standard output. + */ +void print_bitarray(const struct bitarray *ba) +{ + for (size_t i = 0; i < ba->size; ++i) { + printf("%zu:\t", i); + print_bitarray_word(ba->array[i]); + } + printf("SM:\t"); + print_bitarray_word(ba->start_mask); + printf("EM:\t"); + print_bitarray_word(ba->end_mask); +} + +/* + * Print indices of set (i.e. true) bits in array. + */ +void print_set_indices(const struct bitarray *ba) +{ + size_t nset; + size_t *set_indices; + bitarray_get_set_indices(ba, &nset, &set_indices); + for (size_t i = 0; i < nset; ++i) { + printf("%lu,", set_indices[i]); + } + free(set_indices); + printf("\n"); +} + +/** + * Returns the number of words compressed in a zero fill word at a given + * position in a bit array. + */ +static inline size_t load_zerofill(const struct bitarray *ba, size_t pos) +{ + if (pos == 0) { + return ba->start_mask + 1; + } else if (pos + 1 == ba->size) { + return ba->end_mask + 1; + } else { + return ba->array[pos] + 1; + } +} + +static inline void load_masks(const struct bitarray *a, + const struct bitarray *b, + struct bitarray *out) +{ + if (a->array[0] & MSB) { + out->start_mask = a->start_mask; + } else if (b->array[0] & MSB) { + out->start_mask = b->start_mask; + } + if (a->array[a->size - 1] & MSB) { + out->end_mask = a->end_mask; + } else if (b->array[b->size - 1] & MSB) { + out->end_mask = b->end_mask; + } +} + +/** + * Either adds a zero fill word corresponding to zf_num compressed zero words + * at the specified position and increments the position by one, or increases + * the preceding zero fill word if one exists. + */ +static inline void append_zerofills(struct bitarray *ba, size_t *pos, + size_t zf_num) +{ + if (*pos && !(ba->array[(*pos) - 1] & MSB)) { + ba->array[(*pos) - 1] += zf_num; + } else { + ba->array[(*pos)++] = zf_num - 1; + } +} + +void bitarray_union(const struct bitarray *a, const struct bitarray *b, + struct bitarray **out) +{ + *out = init_bitarray((a->size + a->ncompressed + b->size + b->ncompressed) + * bitarray_word_capacity); + load_masks(a, b, *out); + + size_t a_pos = 0; + size_t a_ncomp = 0; // Number of compressed empty words in a + size_t b_pos = 0; + size_t b_ncomp = 0; // Number of compressed empty words in b + size_t out_pos = 0; + + while (a_pos < a->size || b_pos < b->size) { + if (!(a->array[a_pos] & MSB)) { + a_ncomp += load_zerofill(a, a_pos++); + } + if (!(b->array[b_pos] & MSB)) { + b_ncomp += load_zerofill(b, b_pos++); + } + if (a_ncomp) { + if (b_ncomp) { + // Skipping over the smaller set of compressed words + size_t to_skip = (a_ncomp > b_ncomp) ? b_ncomp : a_ncomp; + append_zerofills(*out, &out_pos, to_skip); + a_ncomp -= to_skip; + b_ncomp -= to_skip; + } else { + // Only b word set + (*out)->array[out_pos++] = b->array[b_pos++]; + --a_ncomp; + } + continue; + } else if (b_ncomp) { + // Only a word set + (*out)->array[out_pos++] = a->array[a_pos++]; + --b_ncomp; + continue; + } + (*out)->array[out_pos++] = a->array[a_pos++] | b->array[b_pos++]; + } + (*out)->last_word = out_pos - 1; + bitarray_shrinkwrap(*out); +} + +void bitarray_intersection(const struct bitarray *a, + const struct bitarray *b, + struct bitarray **out) +{ + *out = init_bitarray((a->size + a->ncompressed + b->size + b->ncompressed) + * bitarray_word_capacity); + load_masks(a, b, *out); + + size_t a_pos = 0; + size_t a_ncomp = 0; // Number of compressed empty words in a + size_t b_pos = 0; + size_t b_ncomp = 0; // Number of compressed empty words in b + size_t out_pos = 0; + + while (a_pos < a->size || b_pos < b->size) { + if (a_pos < a->size) { + if (!(a->array[a_pos] & MSB)) { + a_ncomp += load_zerofill(a, a_pos++); + } + } + if (b_pos < b->size) { + if (!(b->array[b_pos] & MSB)) { + b_ncomp += load_zerofill(b, b_pos++); + } + } + if (a_ncomp) { + if (b_ncomp) { + // Skipping over the smaller set of compressed words + size_t to_skip = (a_ncomp > b_ncomp) ? b_ncomp : a_ncomp; + append_zerofills(*out, &out_pos, to_skip); + a_ncomp -= to_skip; + b_ncomp -= to_skip; + } else { + // Only b word set + append_zerofills(*out, &out_pos, 1); + --a_ncomp; + ++b_pos; + } + continue; + } else if (b_ncomp) { + // Only a word set + append_zerofills(*out, &out_pos, 1); + --b_ncomp; + ++a_pos; + continue; + } + bitarray_word res = a->array[a_pos++] & b->array[b_pos++]; + if (res == MSB) { + // Only MSB set, no intersection + append_zerofills(*out, &out_pos, 1); + } else { + (*out)->array[out_pos++] = res; + } + } + (*out)->last_word = out_pos - 1; + bitarray_shrinkwrap(*out); +} + +void bitarray_difference(const struct bitarray *a, const struct bitarray *b, + struct bitarray **out) +{ + *out = init_bitarray((a->size + a->ncompressed + b->size + b->ncompressed) + * bitarray_word_capacity); + load_masks(a, b, *out); + + size_t a_pos = 0; + size_t a_ncomp = 0; // Number of compressed empty words in a + size_t b_pos = 0; + size_t b_ncomp = 0; // Number of compressed empty words in b + size_t out_pos = 0; + + while (a_pos < a->size || b_pos < b->size) { + if (!(a->array[a_pos] & MSB)) { + a_ncomp += load_zerofill(a, a_pos++); + } + if (!(b->array[b_pos] & MSB)) { + b_ncomp += load_zerofill(b, b_pos++); + } + if (a_ncomp) { + if (b_ncomp) { + // Skipping over the smaller set of compressed words + size_t to_skip = (a_ncomp > b_ncomp) ? b_ncomp : a_ncomp; + append_zerofills(*out, &out_pos, to_skip); + a_ncomp -= to_skip; + b_ncomp -= to_skip; + } else { + // Only b word set + append_zerofills(*out, &out_pos, 1); + --a_ncomp; + ++b_pos; + } + continue; + } else if (b_ncomp) { + // Only a word set + (*out)->array[out_pos++] = a->array[a_pos++]; + --b_ncomp; + continue; + } + bitarray_word res = a->array[a_pos++] & ~b->array[b_pos++]; + if (res == 0) { + // MSB is not set due to ~ + append_zerofills(*out, &out_pos, 1); + } else { + (*out)->array[out_pos++] = res | MSB; + } + } + (*out)->last_word = out_pos - 1; + bitarray_shrinkwrap(*out); +} + +void bitarray_symmetric_difference(const struct bitarray *a, + const struct bitarray *b, + struct bitarray **out) +{ + *out = init_bitarray((a->size + a->ncompressed + b->size + b->ncompressed) + * bitarray_word_capacity); + load_masks(a, b, *out); + + size_t a_pos = 0; + size_t a_ncomp = 0; // Number of compressed empty words in a + size_t b_pos = 0; + size_t b_ncomp = 0; // Number of compressed empty words in b + size_t out_pos = 0; + + while (a_pos < a->size || b_pos < b->size) { + if (!(a->array[a_pos] & MSB)) { + a_ncomp += load_zerofill(a, a_pos++); + } + if (!(b->array[b_pos] & MSB)) { + b_ncomp += load_zerofill(b, b_pos++); + } + if (a_ncomp) { + if (b_ncomp) { + // Skipping over the smaller set of compressed words + size_t to_skip = (a_ncomp > b_ncomp) ? b_ncomp : a_ncomp; + append_zerofills(*out, &out_pos, to_skip); + a_ncomp -= to_skip; + b_ncomp -= to_skip; + } else { + // Only b word set + (*out)->array[out_pos++] = b->array[b_pos++]; + --a_ncomp; + } + continue; + } else if (b_ncomp) { + // Only a word set + (*out)->array[out_pos++] = a->array[a_pos++]; + --b_ncomp; + continue; + } + bitarray_word res = a->array[a_pos++] ^ b->array[b_pos++]; + if (res == 0) { + append_zerofills(*out, &out_pos, 1); + } else { + (*out)->array[out_pos++] = res | MSB; + } + } + (*out)->last_word = out_pos - 1; + bitarray_shrinkwrap(*out); +} + +/** + * Calculate the Hamming distance (number of different bits) between two bit + * arrays. + */ +uint64_t bitarray_distance(const struct bitarray *a, const struct bitarray *b) +{ + uint64_t distance = 0; + + size_t a_pos = 0; + size_t a_ncomp = 0; // Number of compressed empty words in a + size_t b_pos = 0; + size_t b_ncomp = 0; // Number of compressed empty words in b + + while (a_pos < a->size || b_pos < b->size) { + if (!(a->array[a_pos] & MSB)) { + a_ncomp += load_zerofill(a, a_pos++); + } + if (!(b->array[b_pos] & MSB)) { + b_ncomp += load_zerofill(b, b_pos++); + } + if (a_ncomp) { + if (b_ncomp) { + // Skipping over the smaller set of compressed words + size_t to_skip = (a_ncomp > b_ncomp) ? b_ncomp : a_ncomp; + a_ncomp -= to_skip; + b_ncomp -= to_skip; + } else { + // Only b word set + distance += __builtin_popcountll(b->array[b_pos++]) - 1; + --a_ncomp; + } + continue; + } else if (b_ncomp) { + // Only a word set + distance += __builtin_popcountll(a->array[a_pos++]) - 1; + --b_ncomp; + continue; + } + distance += __builtin_popcountll(a->array[a_pos++] + ^ b->array[b_pos++]); + } + + // Removing bits masked by start + if ((a->array[0] & MSB) && (b->array[0] & MSB)) { + distance -= __builtin_popcountll((a->array[0] ^ b->array[0]) + & ~a->start_mask); + } else if (a->array[0] & MSB) { + distance -= __builtin_popcountll(a->array[0] & ~a->start_mask); + } else if (b->array[0] & MSB) { + distance -= __builtin_popcountll(b->array[0] & ~b->start_mask); + } + + // Removing bits masked by end + size_t a_last = a->size - 1; + size_t b_last = b->size - 1; + if ((a->array[a_last] & MSB) && (b->array[b_last] & MSB)) { + distance -= __builtin_popcountll((a->array[a_last] ^ b->array[b_last]) + & ~a->end_mask); + } else if (a->array[a_last] & MSB) { + distance -= __builtin_popcountll(a->array[a_last] & ~a->end_mask); + } else if (b->array[b_last] & MSB) { + distance -= __builtin_popcountll(b->array[b_last] & ~b->end_mask); + } + + return distance; +} + +/* + * Get the number of non-zero bits (Hamming weight) in the bit array. + */ +uint64_t bitarray_weight(const struct bitarray *ba) +{ + uint64_t weight = 0; + if (ba->size == 1 && (ba->array[0] & MSB)) { + return __builtin_popcountll(ba->array[0] + & ba->start_mask + & ba->end_mask) - 1; + } + if (ba->array[0] & MSB) { + weight += __builtin_popcountll(ba->array[0] & ba->start_mask) - 1; + } + if (ba->array[ba->size - 1] & MSB) { + weight += __builtin_popcountll(ba->array[ba->size - 1] + & ba->end_mask) - 1; + } + for (size_t i = 1; i < ba->size - 1; ++i) { + if (ba->array[i] & MSB) { + weight += __builtin_popcountll(ba->array[i]) - 1; + } + } + return weight; +} + +static inline void bitarray_resize_internal(struct bitarray *ba, + size_t new_size_words) +{ + size_t old_size = ba->size; + ba->size = new_size_words; + ba->array = realloc(ba->array, ba->size * sizeof *(ba->array)); + if (new_size_words > old_size) { + memset(&ba->array[old_size], 0, + (ba->size - old_size) * sizeof *(ba->array)); + ba->array[ba->last_word + 1] = ba->size - ba->last_word - 2; + } else { + // TODO: Handle shrinking (primarily adjusting last word and end mask) + } +} + +static inline void bitarray_grow(struct bitarray *ba) +{ + bitarray_resize_internal(ba, ba->size * GROWTH_FACTOR); +} + +void bitarray_resize(struct bitarray *ba, uint64_t new_size) +{ + size_t new_size_words = bit_to_word_size(new_size); + if (new_size_words != ba->size) { + bitarray_resize_internal(ba, new_size_words); + } else { + // TODO: Adjust end mask + } +} + +/* + * Set the bit at the specified position to 1. + * Returns 0 on success, -1 on failure. + */ +int bitarray_set_bit(struct bitarray *ba, size_t pos) +{ + // TODO: simplify this + size_t word_pos = pos / bitarray_word_capacity; + if (ba->last_word + ba->ncompressed > word_pos) { + // Cannot set bits in words prior to the last_word + return -1; + } + // TODO: may need to handle word_diff being > WORD_MAX/2 (unlikely) + size_t word_diff = word_pos - ba->last_word - ba->ncompressed; + if (word_diff == 1) { + // New bit is in the next word + if (ba->last_word + 1 >= ba->size) { + bitarray_grow(ba); + } + if (ba->last_word == 0 && !(ba->array[0] & MSB)) { + ba->array[0] = 0; + } + ba->last_word += 1; + } else if (word_diff > 1) { + // New bit is further than one word away from the previous bit + if (ba->last_word == 0 && !(ba->array[0] & MSB)) { + // Adding compressed gap in first word (edge case) only if empty + if (ba->last_word + 1 >= ba->size) { + bitarray_grow(ba); + } + ba->array[0] = word_diff - 1; + ba->ncompressed += word_diff - 1; + ba->last_word += 1; + } else { + // Adding compressed gap + if (ba->last_word + 2 >= ba->size) { + bitarray_grow(ba); + } + ba->array[ba->last_word + 1] = word_diff - 2; + ba->ncompressed += word_diff - 2; + ba->last_word += 2; + } + } + if (!(ba->array[ba->last_word] & MSB)) { + ba->array[ba->last_word] = MSB; + if (ba->last_word + 1 < ba->size) { + /* Except for the last word, store the number of remaining + empty words in the following word */ + ba->array[ba->last_word + 1] = ba->size + - ba->last_word - 2; + } + } + ba->array[ba->last_word] |= (bitarray_word)1 + << pos % bitarray_word_capacity; + return 0; +} + +/* + * Get the value (0 or 1) of the bit at a particular position. + */ +int bitarray_get_bit(const struct bitarray *ba, size_t pos) +{ + size_t word_index = pos / bitarray_word_capacity; + bitarray_word mask = ((bitarray_word)1 << pos % bitarray_word_capacity); + if (word_index == 0) { + if (!(ba->array[0] & MSB) || !(ba->array[0] & mask & ba->start_mask)) { + // Bit in first word and not set or not covered by start mask + return 0; + } else { + return 1; + } + } + if (word_index == ba->size + ba->ncompressed - 1) { + if (!(ba->array[ba->size - 1] & MSB) + || !(ba->array[ba->size - 1] & mask & ba->end_mask)) { + // Bit in last word and not set or not covered by end mask + return 0; + } else { + return 1; + } + } + size_t ncompressed = 0; + if (!(ba->array[0] & MSB)) { + // Include the compression at the start + // (or only part of it, depending on the start mask) + ncompressed += (ba->array[0] > ba->start_mask) ? ba->start_mask + : ba->array[0]; + } + for (size_t i = 1; i < ba->size - 1; ++i) { + if (i + ncompressed > word_index) { + // Index was in the compressed interval + return 0; + } + if (i + ncompressed == word_index) { + return (ba->array[i] & MSB) && (ba->array[i] & mask); + } + if (!(ba->array[i] & MSB)) { + ncompressed += ba->array[i]; + } + } + return 0; +} + +/* + * Get array of the indices of set (i.e. true) bits in the bit array. + * Allocates memory to the first argument; this needs to be freed independently. + */ +int bitarray_get_set_indices(const struct bitarray *ba, + size_t *nset_indices, size_t **set_indices) +{ + *set_indices = malloc(bitarray_word_capacity * ba->size + * sizeof **set_indices); + *nset_indices = 0; + uint64_t ncompressed = 0; + bitarray_word current_word; + for (uint64_t i = 0; i < ba->size; ++i) { + if (!(ba->array[i] & MSB)) { + // MSB is 0, fill word (run-length of zeroes) + ncompressed += ba->array[i]; + continue; + } + current_word = ba->array[i]; + if (!i) { + current_word &= ba->start_mask; + } + if (i + 1 == ba->size) { + current_word &= ba->end_mask; + } + for (uint16_t j = 0; j < bitarray_word_capacity; ++j) { + if (current_word & ((bitarray_word)1 << j)) { + (*set_indices)[(*nset_indices)++] = bitarray_word_capacity + * (i + ncompressed) + j; + } + } + } + // Truncate to final size + *set_indices = realloc(*set_indices, + (*nset_indices) * sizeof **set_indices); + return 0; +} + +void bitarray_shrinkwrap(struct bitarray *ba) +{ + ba->size = ba->last_word + 1; + ba->array = realloc(ba->array, ba->size * sizeof *(ba->array)); + if (!(ba->array[0] & MSB)) { + ba->start_mask = ba->array[0]; + if (ba->size == 1) { + ba->end_mask = ba->start_mask; + } + } + if (!(ba->array[ba->size - 1] & MSB)) { + ba->end_mask = ba->array[ba->size - 1]; + } +} + +/** + * Extract a bit array representing a region of a larger bit array. + * + * If an extracted region starts/ends on a literal word, the start/end mask is + * a normal mask on that word. + * If an extracted region starts/ends on a fill woed, the start/end mask is the + * number of words in that fill included in the region minus one. + * e.g. with a 64-bit word, if the first word is a zero-fill of ten words + * (i.e. 630 consecutive 0 bits) and the region includes all 630, the start_mask + * will be 9 (10 - 1). + */ +void bitarray_extract_region(struct bitarray *dest_ba, + const struct bitarray *src_ba, + const struct bitarray_interval *region) +{ + // The 'internal' indices are in terms of the storage word type + uint64_t internal_start_index = region->start_index + / bitarray_word_capacity; + uint64_t internal_end_index = region->end_index + / bitarray_word_capacity; + // Shifting the internal indices by the number of preceding fill words + size_t ncompressed = 0; + for (size_t i = 0; i <= internal_end_index; ++i) { + if (src_ba->array[i] & MSB) continue; // no compression + if (internal_start_index >= i) { + if (internal_start_index <= i + src_ba->array[i]) { + dest_ba->start_mask = src_ba->array[i] + - (internal_start_index - i); + internal_start_index = i; + } else { + internal_start_index -= src_ba->array[i]; + } + } + if (internal_end_index <= i + src_ba->array[i]) { + dest_ba->end_mask = internal_end_index - i; + internal_end_index = i; + } else { + internal_end_index -= src_ba->array[i]; + } + ncompressed += src_ba->array[i]; + } + dest_ba->size = 1 + internal_end_index - internal_start_index; + dest_ba->last_word = 0; + dest_ba->ncompressed = ncompressed; + dest_ba->array = &(src_ba->array[internal_start_index]); + if (src_ba->array[internal_start_index] & MSB) { + dest_ba->start_mask = WORD_MAX << region->start_index + % bitarray_word_capacity; + } + if (src_ba->array[internal_end_index] & MSB) { + dest_ba->end_mask = WORD_MAX >> (bitarray_word_capacity + - region->end_index + % bitarray_word_capacity) + | MSB; + } +} diff --git a/src/build.c b/src/build.c new file mode 100644 index 0000000..90e3944 --- /dev/null +++ b/src/build.c @@ -0,0 +1,367 @@ +/* build.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "build.h" + +#include "hashmap.h" +#include "heap.h" +#include "rename.h" +#include "snv.h" +#include "tersect.h" +#include "tersect_db.h" +#include "tersect_db_internal.h" +#include "vcf_parser.h" + +#include +#include +#include + +/** + * Maximum number of alleles per chromosome + * TODO: match this with INITIAL_ALLELE_NUM and expand var_container dynamically + */ +#define MAX_ALLELES 50000000 + +/** + * Maximum allele size in base pairs. Used to set up a parsing buffer. + */ +#define MAX_ALLELE_SIZE 20000 + +#define SAMPLES_PER_FILE 10000 + +/** + * Initial size of allele bitarray. + */ +#define INITIAL_ALLELE_NUM 10000 + +static int tdb_flags = 0; +static int parser_flags = 0; + +/** + * Wrapper for a parser and an associated bit array to record variants present + * in a specific genome file. + */ +struct parser_wrapper { + VCF_PARSER *parser; + struct bitarray **ba; +}; + +static void usage(FILE *stream) +{ + fprintf(stream, + "\n" + "Usage: tersect build [options] ...\n\n" + "Options:\n" + " -f, --force overwrite database file if necessary\n" + " -H, --homozygous include only homozygous variants\n" + " -h, --help print this help message\n" + " -n, --name-file tsv file containing sample names\n" + " -t, --types include snps, indels, or both (default)\n" + " -v, --verbose run in verbose mode\n" + "\n"); +} + +/** + * Comparator for wrapped parsers. + */ +int wrapped_parser_cmp(const void *a, const void *b) +{ + return parser_allele_cmp(((struct parser_wrapper *)a)->parser, + ((struct parser_wrapper *)b)->parser); +} + +static error_t import_files(tersect_db *tdb, + const int file_num, + char **filenames, + int parser_flags); +static inline int load_next_chromosome_queue(Heap *queue, + char *chromosome, + struct parser_wrapper *parsers, + int parser_num); +static inline uint32_t process_chromosome_queue(tersect_db *tdb, Heap *queue, + struct variant *var_container); + +error_t tersect_build_database(int argc, char **argv) +{ + error_t rc = SUCCESS; + char *db_filename = NULL; + char *name_filename = NULL; + static struct option loptions[] = { + {"force", no_argument, NULL, 'f'}, + {"help", no_argument, NULL, 'h'}, + {"homozygous", no_argument, NULL, 'H'}, + {"name-file", required_argument, NULL, 'n'}, + {"types", required_argument, NULL, 't'}, + {"verbose", no_argument, NULL, 'v'}, + {NULL, 0, NULL, 0} + }; + int c; + while ((c = getopt_long(argc, argv, ":fHhn:t:v", loptions, NULL)) != -1) { + switch(c) { + case 'f': + tdb_flags |= TDB_FORCE; + break; + case 'h': + usage(stdout); + return SUCCESS; + case 'H': + parser_flags |= VCF_ONLY_HOMOZYGOUS; + break; + case 'n': + name_filename = optarg; + break; + case 't': + if (!strcmp(optarg, "snps")) { + parser_flags |= VCF_ONLY_SNPS; + } else if (!strcmp(optarg, "indels")) { + parser_flags |= VCF_ONLY_INDELS; + } else if (!strcmp(optarg, "both")) { + // Default, no flag needed + } else { + usage(stderr); + return SUCCESS; + } + break; + case 'v': + tdb_flags |= TDB_VERBOSE; + break; + default: + usage(stderr); + return SUCCESS; + } + } + argc -= optind; + argv += optind; + if (argc) { + db_filename = argv[0]; + --argc; + ++argv; + } else { + // Missing output filename + usage(stderr); + return E_BUILD_NO_OUTNAME; + } + if (!argc) { + return E_BUILD_NO_FILES; + } + tersect_db *tdb; + rc = tersect_db_create(db_filename, tdb_flags, &tdb); + if (rc != SUCCESS) return rc; + rc = import_files(tdb, argc, argv, parser_flags); + tersect_db_close(tdb); + if (name_filename != NULL) { + tdb = tersect_db_open(db_filename); + if (tdb == NULL) return E_TSI_NOPEN; + rc = tersect_load_name_file(tdb, name_filename); + tersect_db_close(tdb); + } + return rc; +} + +/** + * Builds a database out of k files via a queue-based k-way merge. + */ +static error_t import_files(tersect_db *tdb, + const int file_num, + char **filenames, + int parser_flags) +{ + error_t rc = SUCCESS; + if (!file_num) return E_BUILD_NO_FILES; + struct variant *var_container = malloc(MAX_ALLELES * sizeof *var_container); + if (!var_container) return E_ALLOC; + struct parser_wrapper *parsers = malloc(file_num * sizeof *parsers); + if (!parsers) { + rc = E_ALLOC; + goto cleanup_1; + } + char current_chromosome[MAX_CHROMOSOME_NAME_LENGTH] = ""; + Heap *queue = init_heap(file_num, wrapped_parser_cmp); + if (!queue) { + rc = E_ALLOC; + goto cleanup_2; + } + // Open parsers & prepare bit arrays + HashMap *sample_names = init_hashmap(SAMPLES_PER_FILE * file_num); + for (int i = 0; i < file_num; ++i) { + parsers[i].parser = init_parser(filenames[i], parser_flags); + if (parsers[i].parser == NULL) { + rc = E_VCF_PARSE_FILE; + goto cleanup_3; + } + parsers[i].ba = malloc(parsers[i].parser->sample_num + * sizeof *parsers[i].ba); + for (size_t j = 0; j < parsers[i].parser->sample_num; ++j) { + if (hashmap_get(sample_names, + parsers[i].parser->samples[j]) != NULL) { + rc = E_BUILD_DUPSAMPLE; + goto cleanup_3; + } else { + hashmap_insert(sample_names, parsers[i].parser->samples[j], + parsers[i].parser->samples[j]); + } + parsers[i].ba[j] = init_bitarray(INITIAL_ALLELE_NUM); + tersect_db_add_genome(tdb, parsers[i].parser->samples[j]); + } + fetch_next_allele(parsers[i].parser); + } + while (load_next_chromosome_queue(queue, current_chromosome, + parsers, file_num)) { + uint32_t var_count = process_chromosome_queue(tdb, queue, + var_container); + // Taking the position of the last variant in the chromosome as proxy + // for the chromosome size + tersect_db_add_chromosome(tdb, current_chromosome, + var_container, var_count, + var_container[var_count - 1].position); + // The bit arrays stored in parsers are larger and get re-used for each + // chromosome. The interval is used to extract chromosome-specific + // bit arrays. + struct bitarray_interval chr_interval = { + .start_index = 0, + .end_index = var_count - 1 + }; + for (int i = 0; i < file_num; ++i) { + for (size_t j = 0; j < parsers[i].parser->sample_num; ++j) { + struct bitarray ba; + bitarray_resize(parsers[i].ba[j], var_count); + bitarray_extract_region(&ba, parsers[i].ba[j], &chr_interval); + tersect_db_add_bitarray(tdb, parsers[i].parser->samples[j], + current_chromosome, &ba); + clear_bitarray(parsers[i].ba[j]); + } + } + } + // Close parsers + for (int i = 0; i < file_num; ++i) { + for (size_t j = 0; j < parsers[i].parser->sample_num; ++j) { + free_bitarray(parsers[i].ba[j]); + } + close_parser(parsers[i].parser); + free(parsers[i].ba); + } + free_heap(queue); +cleanup_3: + free_hashmap(sample_names); +cleanup_2: + free(parsers); +cleanup_1: + free(var_container); + return rc; +} + +static inline int load_next_chromosome_queue(Heap *queue, + char *chromosome, + struct parser_wrapper *parsers, + int parser_num) +{ + // Save currently set chromosome in order to skip it + char previous_chromosome[MAX_CHROMOSOME_NAME_LENGTH]; + strcpy(previous_chromosome, chromosome); + strcpy(chromosome, ""); // Clear chromosome + clear_heap(queue); + for (int i = 0; i < parser_num; ++i) { + if (!strcmp(parsers[i].parser->current_chromosome, previous_chromosome)) { + goto_next_chromosome(parsers[i].parser); + } + if (parsers[i].parser->current_result == ALLELE_NOT_FETCHED) { + continue; // End of file reached, skip parser + } + if (!strlen(chromosome)) { + // Set next chromosome + strcpy(chromosome, parsers[i].parser->current_chromosome); + heap_push(queue, &parsers[i]); + continue; + } + // Go to set chromosome and put parser on heap if found + while (strcmp(parsers[i].parser->current_chromosome, chromosome) + && parsers[i].parser->current_result != ALLELE_NOT_FETCHED) { + goto_next_chromosome(parsers[i].parser); + } + if (!strcmp(parsers[i].parser->current_chromosome, chromosome)) { + heap_push(queue, &parsers[i]); + } + } + return queue->size; +} + +static inline uint32_t process_chromosome_queue(tersect_db *tdb, Heap *queue, + struct variant *var_container) +{ + if (!(queue->size)) return 0; + struct allele previous_allele = { + .position = 0, + .ref = calloc(MAX_ALLELE_SIZE + 1, 1), + .alt = calloc(MAX_ALLELE_SIZE + 1, 1) + }; + + char chromosome[MAX_CHROMOSOME_NAME_LENGTH]; + strcpy(chromosome, + ((struct parser_wrapper *)heap_peek(queue))->parser + ->current_chromosome); + uint32_t var_count = 0; + while (queue->size) { + // Alternatively could pop & push, but it is slower + struct parser_wrapper *pwr = heap_peek(queue); + if (allele_cmp(&previous_allele, &pwr->parser->current_allele)) { + if (tersect_db_insert_allele(tdb, &pwr->parser->current_allele, + &var_container[var_count]) == SUCCESS) { + previous_allele.position = pwr->parser->current_allele.position; + strcpy(previous_allele.ref, pwr->parser->current_allele.ref); + strcpy(previous_allele.alt, pwr->parser->current_allele.alt); + for (size_t i = 0; i < pwr->parser->sample_num; ++i) { + if (parser_flags & VCF_ONLY_HOMOZYGOUS) { + if (pwr->parser->genotypes[i] != GENOTYPE_HOM_ALT) { + continue; + } + } + if (pwr->parser->genotypes[i] != GENOTYPE_HOM_REF) { + bitarray_set_bit(pwr->ba[i], var_count); + } + } + var_count++; + } + } else { + // The same allele as previously + for (size_t i = 0; i < pwr->parser->sample_num; ++i) { + if (parser_flags & VCF_ONLY_HOMOZYGOUS) { + if (pwr->parser->genotypes[i] != GENOTYPE_HOM_ALT) continue; + } + if (pwr->parser->genotypes[i] != GENOTYPE_HOM_REF) { + bitarray_set_bit(pwr->ba[i], var_count - 1); + } + } + } + if ((fetch_next_allele(pwr->parser) == ALLELE_NOT_FETCHED) + || strcmp(chromosome, pwr->parser->current_chromosome)) { + // end of file or end of chromosome + heap_pop(queue); // Remove parser from queue + } else { + sift_down(queue); // Sift new allele down + } + } + free(previous_allele.ref); + free(previous_allele.alt); + return var_count; +} diff --git a/src/chroms.c b/src/chroms.c new file mode 100644 index 0000000..c15799e --- /dev/null +++ b/src/chroms.c @@ -0,0 +1,97 @@ +/* chroms.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "chroms.h" + +#include "tersect_db.h" + +#include +#include +#include + +/* Local flags for chroms */ +#define NO_HEADERS 2 +static int local_flags = 0; + +static void usage(FILE *stream) +{ + fprintf(stream, + "\n" + "Usage: tersect chroms [options] \n\n" + "Options:\n" + " -h, --help print this help message\n" + " -n, --no-headers skip column headers\n" + "\n"); +} + +error_t tersect_print_chromosomes(int argc, char **argv) +{ + char *db_filename = NULL; + static struct option loptions[] = { + {"help", no_argument, NULL, 'h'}, + {"no-headers", no_argument, NULL, 'n'}, + {NULL, 0, NULL, 0} + }; + int c; + while ((c = getopt_long(argc, argv, ":hn", loptions, NULL)) != -1) { + switch(c) { + case 'h': + usage(stdout); + return SUCCESS; + case 'n': + local_flags |= NO_HEADERS; + break; + default: + usage(stderr); + return SUCCESS; + } + } + argc -= optind; + argv += optind; + if (!argc) { + // Missing Tersect index file + usage(stderr); + return E_NO_TSI_FILE; + } else if (argc > 1) { + // Too many arguments + usage(stderr); + return SUCCESS; + } + db_filename = argv[0]; + tersect_db *tdb = tersect_db_open(db_filename); + if (tdb == NULL) return E_TSI_NOPEN; + size_t count; + struct chromosome *chroms; + tersect_db_get_chromosomes(tdb, &count, &chroms); + if (!(local_flags & NO_HEADERS)) { + printf("Chromosome\tLength\tVariants\n"); + } + for (size_t i = 0; i < count; ++i) { + printf("%s\t%u\t%u\n", chroms[i].name, + chroms[i].length, chroms[i].variant_count); + } + tersect_db_close(tdb); + free(chroms); + return SUCCESS; +} diff --git a/src/distance.c b/src/distance.c new file mode 100644 index 0000000..5cbeec1 --- /dev/null +++ b/src/distance.c @@ -0,0 +1,447 @@ +/* distance.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "distance.h" + +#include "bitarray.h" +#include "tersect_db.h" + +#include +#include +#include +#include +#include +#include +#include + +/* Local flags for distance */ +#define JSON_OUTPUT 2 +static int local_flags = 0; + +/* Argument options without a short equivalent */ +#define A_CONTAINS 1000 +#define B_CONTAINS 1001 + +struct distance_matrix { + char **row_samples; + char **col_samples; + size_t nrows; + size_t ncols; + size_t nmatrices; + uint64_t ***distance; + uint32_t bin_size; + bool symmetric; +}; + +static void usage(FILE *stream) +{ + fprintf(stream, + "\n" + "Usage: tersect dist [options] [region]...\n" + " tersect dist [options] [-a ] [-b ] [region]...\n" + " tersect dist [options] [--ac ] [--bc ] [region]...\n\n" + "Options:\n" + " -a, --a-match STR name pattern to be matched by samples in set A\n" + " -b, --b-match STR name pattern to be matched by samples in set B\n" + " --ac STR variants required for sample inclusion in set A\n" + " --ac STR variants required for sample inclusion in set B\n" + " -c, --contains STR variants required for sample inclusion in any set\n" + " -m, --match STR name pattern to be matched by samples in any set\n" + " -B, --bin-size INT size of bins into which the region is split\n" + " -h, --help print this help message\n" + " -j, --json output JSON; implied if match/contains settings for\n" + " set A and set B differ\n" + "\n"); +} + +static inline void init_distance_matrix(size_t nmatrices, + uint32_t bin_size, + size_t nrows, + const struct genome *row_samples, + size_t ncols, + const struct genome *col_samples, + struct distance_matrix *matrix) +{ + matrix->bin_size = bin_size; + // TODO: optimize for partial symmetry, i.e. some but not all shared genomes + matrix->symmetric = row_samples == col_samples; + matrix->nmatrices = nmatrices; + matrix->nrows = nrows; + matrix->ncols = ncols; + + matrix->row_samples = malloc(nrows * sizeof *matrix->row_samples); + for (size_t i = 0; i < nrows; ++i) { + matrix->row_samples[i] = strdup(row_samples[i].name); + } + if (matrix->symmetric) { + matrix->col_samples = matrix->row_samples; + } else { + matrix->col_samples = malloc(ncols * sizeof *matrix->col_samples); + for (size_t i = 0; i < ncols; ++i) { + matrix->col_samples[i] = strdup(col_samples[i].name); + } + } + + matrix->distance = malloc(nmatrices * sizeof *matrix->distance); + for (size_t i = 0; i < nmatrices; ++i) { + matrix->distance[i] = malloc(nrows * sizeof **matrix->distance); + for (size_t j = 0; j < nrows; ++j) { + matrix->distance[i][j] = calloc(ncols, sizeof ***matrix->distance); + } + } +} + +static inline void calculate_distance_matrix(size_t nrows, + const struct genome *row_samples, + const struct bitarray *row_bas, + size_t ncols, + const struct genome *col_samples, + const struct bitarray *col_bas, + bool symmetric, + uint64_t * const *output_dist) +{ + for (size_t j = 0; j < nrows; ++j) { + for (size_t k = symmetric ? j : 0; k < ncols; ++k) { + uint64_t dist = 0; + if (row_samples[j].hdr != col_samples[k].hdr) { + // Only calculating distance for distinct samples + dist = bitarray_distance(&row_bas[j], &col_bas[k]); + } + output_dist[j][k] += dist; + if (symmetric) { + output_dist[k][j] += dist; + } + } + } +} + +static inline void build_distance_matrix(const tersect_db *tdb, + size_t nrows, + const struct genome *row_samples, + size_t ncols, + const struct genome *col_samples, + uint32_t bin_size, + size_t nregions, + const struct genomic_interval *regions, + struct distance_matrix *matrix) +{ + size_t nintervals; + struct tersect_db_interval *intervals; + if (bin_size) { + tersect_db_get_bin_intervals(tdb, regions, bin_size, + &nintervals, &intervals); + init_distance_matrix(nintervals, bin_size, nrows, row_samples, + ncols, col_samples, matrix); + } else { + // No binning + nintervals = nregions; + intervals = malloc(nintervals * sizeof *intervals); + for (size_t i = 0; i < nregions; ++i) { + tersect_db_get_interval(tdb, ®ions[i], &intervals[i]); + } + init_distance_matrix(1, 0, nrows, row_samples, ncols, col_samples, + matrix); + } + + struct bitarray *row_bas = malloc(nrows * sizeof *row_bas); + struct bitarray *col_bas = malloc(ncols * sizeof *col_bas); + + for (size_t i = 0; i < nintervals; ++i) { + // Extracting region bitarrays for rows and cols + for (size_t j = 0; j < nrows; ++j) { + struct bitarray tmp; + tersect_db_get_bitarray(tdb, &row_samples[j], + &intervals[i].chromosome, &tmp); + bitarray_extract_region(&row_bas[j], &tmp, &intervals[i].interval); + } + for (size_t j = 0; j < ncols; ++j) { + struct bitarray tmp; + tersect_db_get_bitarray(tdb, &col_samples[j], + &intervals[i].chromosome, &tmp); + bitarray_extract_region(&col_bas[j], &tmp, &intervals[i].interval); + } + // Calculate distances + calculate_distance_matrix(nrows, row_samples, row_bas, + ncols, col_samples, col_bas, + matrix->symmetric, + matrix->distance[bin_size ? i : 0]); + } + free(intervals); + free(row_bas); + free(col_bas); +} + +static inline void print_distance_matrix_phylip(const struct distance_matrix *matrix) +{ + // TODO: truncate/pad sample name to ten characters + // Note: the format is only valid for symmetric matrices + // http://evolution.genetics.washington.edu/phylip/doc/distance.html + printf("%lu\n", matrix->nrows); + for (size_t i = 0; i < matrix->nrows; ++i) { + printf("%s ", matrix->row_samples[i]); + for (size_t j = 0; j < matrix->ncols; ++j) { + printf("%"PRIu64, matrix->distance[0][i][j]); + if (j + 1 < matrix->ncols) { + printf(" "); + } + } + printf("\n"); + } +} + +static inline void print_single_matrix_json(size_t nrows, + size_t ncols, + uint64_t * const *dist) +{ + printf("\t[\n"); + for (size_t i = 0; i < nrows; ++i) { + printf("\t\t["); + for (size_t j = 0; j < ncols; ++j) { + if (j + 1 < ncols) { + printf("%"PRIu64", ", dist[i][j]); + } else { + printf("%"PRIu64, dist[i][j]); + } + } + if (i + 1 < nrows) { + printf("],\n"); + } else { + printf("]\n"); + } + } + printf("\t]\n"); +} + +static inline void print_distance_matrix_json(const struct distance_matrix *matrix) +{ + printf("{\n"); + printf("\t\"rows\": [\n"); + for (size_t i = 0; i < matrix->nrows; ++i) { + if (i + 1 < matrix->nrows) { + printf("\t\t\"%s\",\n", matrix->row_samples[i]); + } else { + printf("\t\t\"%s\"\n", matrix->row_samples[i]); + } + } + printf("\t],\n"); + printf("\t\"columns\": [\n"); + for (size_t i = 0; i < matrix->ncols; ++i) { + if (i + 1 < matrix->ncols) { + printf("\t\t\"%s\",\n", matrix->col_samples[i]); + } else { + printf("\t\t\"%s\"\n", matrix->col_samples[i]); + } + } + printf("\t],\n"); + printf("\t\"matrix\":\n"); + if (matrix->nmatrices > 1) { + // Binning + printf("\t[\n"); + for (size_t i = 0; i < matrix->nmatrices; ++i) { + print_single_matrix_json(matrix->nrows, + matrix->ncols, + matrix->distance[i]); + if (i + 1 < matrix->nmatrices) { + printf(",\n"); + } + } + printf("\t]\n"); + } else { + print_single_matrix_json(matrix->nrows, + matrix->ncols, + matrix->distance[0]); + } + printf("}\n"); +} + +static inline void dealloc_distance_matrix(struct distance_matrix *matrix) +{ + for (size_t i = 0; i < matrix->nrows; ++i) { + free(matrix->row_samples[i]); + free(matrix->distance[i]); + } + free(matrix->row_samples); + if (!matrix->symmetric) { + for (size_t i = 0; i < matrix->ncols; ++i) { + free(matrix->col_samples[i]); + } + free(matrix->col_samples); + } + free(matrix->distance); +} + +error_t tersect_distance(int argc, char **argv) +{ + error_t rc = SUCCESS; + char *db_filename = NULL; + char **region_strings = NULL; + size_t nregions = 0; + char *a_match = NULL; + char *a_contains = NULL; + char *b_match = NULL; + char *b_contains = NULL; + char *contains = NULL; + char *match = NULL; + bool binning = false; + uint32_t bin_size = 0; + bool symmetric = true; + static struct option loptions[] = { + {"a-match", required_argument, NULL, 'a'}, + {"b-match", required_argument, NULL, 'b'}, + {"ac", required_argument, NULL, A_CONTAINS}, + {"bc", required_argument, NULL, B_CONTAINS}, + {"contains", required_argument, NULL, 'c'}, + {"match", required_argument, NULL, 'm'}, + {"help", no_argument, NULL, 'h'}, + {"json", no_argument, NULL, 'j'}, + {"bin-size", required_argument, NULL, 'B'}, + {NULL, 0, NULL, 0} + }; + int c; + while ((c = getopt_long(argc, argv, ":a:b:B:c:m:hj", loptions, NULL)) != -1) { + switch(c) { + case 'a': + a_match = optarg; + break; + case 'b': + b_match = optarg; + break; + case 'B': + binning = true; + bin_size = strtol(optarg, NULL, 10); + local_flags |= JSON_OUTPUT; + break; + case A_CONTAINS: + a_contains = optarg; + break; + case B_CONTAINS: + b_contains = optarg; + break; + case 'c': + contains = optarg; + break; + case 'm': + match = optarg; + break; + case 'h': + usage(stdout); + return SUCCESS; + case 'j': + local_flags |= JSON_OUTPUT; + break; + default: + usage(stderr); + return SUCCESS; + } + } + argc -= optind; + argv += optind; + if (!argc) { + // Missing Tersect index file + usage(stderr); + return E_NO_TSI_FILE; + } + db_filename = argv[0]; + argc -= 1; + argv += 1; + if (argc) { + region_strings = argv; + nregions = argc; + } + if ((a_match != b_match) || (a_contains != b_contains)) { + // Force JSON for non-symmetric distance matrices + local_flags |= JSON_OUTPUT; + symmetric = false; + } + // End parsing options + + tersect_db *tdb = tersect_db_open(db_filename); + if (tdb == NULL) return E_TSI_NOPEN; + + struct genomic_interval *regions; + if (nregions) { + rc = tersect_db_parse_regions(tdb, nregions, region_strings, ®ions); + } else { + rc = tersect_db_get_regions(tdb, &nregions, ®ions); + } + if (rc != SUCCESS) goto cleanup_1; + + if (binning && nregions > 1) { + rc = E_DIST_BIN_REGIONS; + goto cleanup_2; + } + + char *contains_queries_a[2] = { contains, a_contains }; + char *contains_queries_b[2] = { contains, b_contains }; + char *match_queries_a[2] = { match, a_match }; + char *match_queries_b[2] = { match, b_match }; + + size_t count_a; + size_t count_b; + struct genome *samples_a; + struct genome *samples_b; + + rc = tersect_db_get_genomes(tdb, 2, match_queries_a, 2, contains_queries_a, + &count_a, &samples_a); + if (rc != SUCCESS) goto cleanup_2; + + if (symmetric) { + samples_b = samples_a; + count_b = count_a; + } else { + rc = tersect_db_get_genomes(tdb, + 2, match_queries_b, 2, contains_queries_b, + &count_b, &samples_b); + if (rc != SUCCESS) goto cleanup_3; + } + + struct distance_matrix matrix; + + if (!binning) { + build_distance_matrix(tdb, count_a, samples_a, count_b, samples_b, + 0, nregions, regions, &matrix); + } else { + build_distance_matrix(tdb, count_a, samples_a, count_b, samples_b, + bin_size, 1, regions, &matrix); + } + + if (local_flags & JSON_OUTPUT) { + print_distance_matrix_json(&matrix); + } else { + print_distance_matrix_phylip(&matrix); + } + + dealloc_distance_matrix(&matrix); + + if (samples_b != samples_a) { + free(samples_b); + } +cleanup_3: + free(samples_a); +cleanup_2: + free(regions); +cleanup_1: + tersect_db_close(tdb); + return rc; +} diff --git a/src/errorc.c b/src/errorc.c new file mode 100644 index 0000000..4642ad0 --- /dev/null +++ b/src/errorc.c @@ -0,0 +1,64 @@ +/* errorc.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "errorc.h" + +#include + +struct error_desc error_desc[] = { + { SUCCESS, "No error" }, + { FAILURE, "General failure" }, + { E_ALLOC, "Memory allocation error"}, + { E_NO_GENOME, "Sample not found"}, + { E_NO_TSI_FILE, "No Tersect index (.tsi) file specified"}, + { E_TSI_NOPEN, "Could not open specified Tersect index (.tsi) file"}, + { E_BUILD_NO_OUTNAME, "Output filename missing"}, + { E_BUILD_NO_FILES, "No input files specified"}, + { E_BUILD_CREATE, "Tersect database file could not be created"}, + { E_BUILD_DB_EXISTS, "Output file already exists (use -f to overwrite)"}, + { E_BUILD_NO_WRITE, "No write permissions on specified output file"}, + { E_BUILD_DUPSAMPLE, "Duplicate sample in input data"}, + { E_PARSE_REGION, "Region could not be parsed"}, + { E_PARSE_REGION_NO_CHROMOSOME, "Requested chromosome is not in the database"}, + { E_PARSE_REGION_BAD_BOUNDS, "Incorrect region bounds specified"}, + { E_PARSE_ALLELE, "Allele could not be parsed"}, + { E_PARSE_ALLELE_NO_CHROMOSOME, "Requested chromosome is not in the database"}, + { E_PARSE_ALLELE_BAD_POSITION, "Incorrect position specified"}, + { E_VCF_PARSE_FILE, "Failed to parse VCF/VCF.GZ file"}, + { E_PARSE_ALLELE_UNKNOWN, "Allele not in database"}, + { E_VIEW_NO_QUERY, "No set query specified"}, + { E_RENAME_NOPEN, "Coult not open specified name file"}, + { E_RENAME_PARSE, "Name file could not be parsed"}, + { E_DIST_BIN_REGIONS, "Only one region allowed if binning is enabled"} +}; + +void report_error(error_t code) { + for (size_t i = 0; i < sizeof error_desc / sizeof error_desc[0]; ++i) { + if (error_desc[i].code == code) { + fprintf(stderr, "Error %d: %s\n", code, error_desc[i].description); + return; + } + } + fprintf(stderr, "Error %d: Unknown error code\n", code); +} diff --git a/src/hashmap.c b/src/hashmap.c new file mode 100644 index 0000000..95d43c8 --- /dev/null +++ b/src/hashmap.c @@ -0,0 +1,123 @@ +/* hashmap.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "hashmap.h" + +#include +#include + +HashMap *init_hashmap(const uint32_t bucket_count) +{ + HashMap *hm = malloc(sizeof *hm); + hm->bucket_count = bucket_count; + hm->count = 0; + hm->buckets = calloc(bucket_count, sizeof *(hm->buckets)); + hm->elements = calloc(bucket_count, sizeof *(hm->elements)); // this should be e.g. half of bucket count, to be doubled when load factor exceeds 0.5 + return hm; +} + +static uint32_t hash_func(const char *key) +{ + uint32_t hash = 0; + char c; + while ((c = (char)*key++) != 0) { + hash += c; + hash *= 0x9e3779b1; // Knuth + } + return hash; +} + +void hashmap_insert(HashMap *hm, const char *key, void *value) +{ + Bucket *bucket = &(hm->buckets[hash_func(key) % hm->bucket_count]); + HashElement **element = &(bucket->first); + for (uint32_t i = 0; i < bucket->count; ++i) { + if (!strcmp((*element)->key, key)) { + // Overwrite old value + (*element)->value = value; + return; + } + element = &((*element)->next); + } + // Adding new element at end of bucket chain (and to the list of keys) + (*element) = &(hm->elements[hm->count]); + // Make this some sort of string pool instead of an alloc & make sure it's freed + (*element)->key = malloc(strlen(key) + 1); + strcpy((*element)->key, key); + (*element)->value = value; + ++(hm->count); + ++(bucket->count); +} + +void *hashmap_get(const HashMap *hm, const char *key) +{ + Bucket *bucket = &(hm->buckets[hash_func(key) % hm->bucket_count]); + HashElement *element = bucket->first; + while (element) { + if (!strcmp(element->key, key)) { + return element->value; + } + element = element->next; + } + return NULL; +} + +HashIterator hashmap_iterator(const HashMap *hm) +{ + HashIterator it; + it._nextindex = 0; + it._hm = hm; + return it; +} + +void hashmap_iterator_next(HashIterator* it) +{ + HashElement *element; + if (it->_hm->count > it->_nextindex) { + element = &(it->_hm->elements[it->_nextindex++]); + it->key = element->key; + it->value = element->value; + } else { + it->key = NULL; + it->value = NULL; + } +} + +static inline void free_keys(HashMap *hm) +{ + HashIterator it = hashmap_iterator(hm); + hashmap_iterator_next(&it); + while (it.key) { + free(it.key); + hashmap_iterator_next(&it); + } +} + +void free_hashmap(HashMap *hm) +{ + free_keys(hm); + free(hm->buckets); + free(hm->elements); + free(hm); +} diff --git a/src/heap.c b/src/heap.c new file mode 100644 index 0000000..72fbb5f --- /dev/null +++ b/src/heap.c @@ -0,0 +1,136 @@ +/* heap.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "heap.h" + +#include + +// Get index of left/right child elements for a heap element +#define LEFT_CHILD(index) (index << 1) + 1 +#define RIGHT_CHILD(index) (index << 1) + 2 +#define PARENT(index) (index - 1) >> 1 + +/* Example comparator (for ints) */ +/* int intcomp(const void *a, const void *b) +{ + return *(const int *)a - *(const int *)b; +}*/ + +Heap *init_heap(int max_size, int(*comparator)(const void* a, const void* b)) +{ + Heap *heap = malloc(sizeof *heap); + heap->size = 0; + heap->max_size = max_size; + heap->array = malloc(heap->max_size * sizeof *(heap->array)); + heap->comparator = comparator; + return heap; +} + +void free_heap(Heap *heap) +{ + free(heap->array); + free(heap); +} + +static inline void _swap(void **a, void **b) +{ + int *tmp = *a; + *a = *b; + *b = tmp; +} + +static inline void _sift_up(Heap *heap) +{ + int position = heap->size - 1; + int parent_position = PARENT(position); + while (position && heap->comparator(heap->array[position], + heap->array[parent_position]) + < 0) { + _swap(&heap->array[position], &heap->array[parent_position]); + position = parent_position; + parent_position = PARENT(position); + } +} + +void heap_push(Heap *heap, void *value) +{ + heap->array[heap->size++] = value; + _sift_up(heap); +} + +static inline void _sift_down(Heap *heap) +{ + int position = 0; + int child_left; + int child_right; + int larger_child; + while ((child_left = LEFT_CHILD(position)) < heap->size) { // while left child exists + child_right = RIGHT_CHILD(position); + if (child_right >= heap->size // right child does not exist + || heap->comparator(heap->array[child_left], + heap->array[child_right]) + < 0) { + larger_child = child_left; + } else { + larger_child = child_right; + } + if (heap->comparator(heap->array[larger_child], + heap->array[position]) + < 0) { + _swap(&heap->array[position], &heap->array[larger_child]); + position = larger_child; + } else { + break; + } + } +} + +void sift_down(Heap *heap) +{ + _sift_down(heap); +} + +void sift_up(Heap *heap) +{ + _sift_up(heap); +} + +void clear_heap(Heap *heap) +{ + // No need to actually clear the array + heap->size = 0; +} + +void *heap_pop(Heap *heap) +{ + void *output = heap->array[0]; + heap->array[0] = heap->array[--(heap->size)]; + _sift_down(heap); + return output; +} + +void *heap_peek(const Heap *heap) +{ + return heap->array[0]; +} diff --git a/src/query.l b/src/query.l new file mode 100644 index 0000000..6de89ae --- /dev/null +++ b/src/query.l @@ -0,0 +1,57 @@ +%option noyywrap nodefault noinput nounput +%{ + #include "query.tab.h" + + #include "errorc.h" + #include "tersect_db.h" + + #include + + const tersect_db *PARSE_TERSECT_DB; + struct tersect_db_interval PARSE_TERSECT_REGION; + + static char *strip_single_quotes(char *str); +%} + +%% + +"union"|"u" { + return UNION; + } + +"intersect"|"i" { + return INTER; + } + +"symdiff"|"s" { + return SYMDIFF; + } + +([^-^&|()>,\\ \t\n']+)|('[^']+') { + yylval.name = strdup(strip_single_quotes(yytext)); + return IDENT; + } + +[-^&|()>,\\] return *yytext; + +[ \t\n] ; /* skip whitespace */ + +. { + yyerror("Unknown character in query string: %c", *yytext); + } + +%% + +/** + * Used to strip single quotations marks from the name of a GENOME token. + * NOTE: This modifies the original string. + */ +static char *strip_single_quotes(char *str) +{ + size_t last = strlen(str) - 1; + if (str[0] == '\'' && str[last] == '\'') { + str[last] = '\0'; + return str + 1; + } + return str; +} diff --git a/src/query.y b/src/query.y new file mode 100644 index 0000000..876e720 --- /dev/null +++ b/src/query.y @@ -0,0 +1,281 @@ +%{ + #include "query.h" + + #include "ast.h" + + #include + #include + #include + + struct id_list { + char **ids; + size_t count; + }; + + struct gen_list { + struct genome *genomes; + size_t count; + }; + + int yylex(void); + void yy_scan_string(const char *); + int yylex_destroy(void); + struct id_list *merge_id_lists(struct id_list *list_a, + struct id_list *list_b); + void load_genomes(const struct id_list *gen_id_list, + const struct id_list *var_id_list, + struct gen_list *gen_list); + struct gen_list *diff_gen_lists(struct gen_list *list_a, + struct gen_list *list_b); + void free_id_list(struct id_list *id_list); + void free_gen_list(struct gen_list *gen_list); + const tersect_db *PARSE_TERSECT_DB; + struct ast_node *PARSE_OUTPUT; +%} + +%code provides { + void yyerror(const char *, ...); +} + +%union { + char *name; + struct ast_node *ast; + struct id_list *id_list; + struct gen_list *gen_list; +} + +%token IDENT +%token UNION +%token INTER +%token SYMDIFF +%type expr +%type list +%type genlist; +%left '&' '|' '-' '^' '\\' +%right '>' +%left ',' + +// Needed to handle parantheses for list / genlist / expr +%expect 2 + +%% + +program: + program expr { + PARSE_OUTPUT = $2; + } + | %empty + ; + +list: + IDENT { + struct id_list *list = malloc(sizeof *list); + list->ids = malloc(sizeof *list->ids); + list->ids[0] = $1; + list->count = 1; + $$ = list; + } + | list ',' list { + $$ = merge_id_lists($1, $3); + } + | '(' list ')' { + $$ = $2; + } + ; + +genlist: + list { + struct id_list *gen_id_list = $1; + struct gen_list *gen_list = malloc(sizeof *gen_list); + load_genomes(gen_id_list, NULL, gen_list); + free_id_list(gen_id_list); + $$ = gen_list; + } + | list '>' list { + struct id_list *gen_id_list = $1; + struct id_list *var_id_list = $3; + struct gen_list *gen_list = malloc(sizeof *gen_list); + load_genomes(gen_id_list, + var_id_list, + gen_list); + free_id_list(gen_id_list); + free_id_list(var_id_list); + $$ = gen_list; + } + | genlist '-' genlist { + $$ = diff_gen_lists($1, $3); + } + | '(' genlist ')' { + $$ = $2; + } + ; + +expr: + genlist { + struct gen_list *gen_list = $1; + if (gen_list->count > 1) { + // yyerror("Genome lists are only allowed within functions"); + for (size_t i = 0; i < gen_list->count; ++i) { + printf("%s\n", gen_list->genomes[i].name); + } + $$ = NULL; + } else if (gen_list->count == 0) { + yyerror("Empty genome list"); + $$ = NULL; + } else { + $$ = create_genome_node(&gen_list->genomes[0]); + } + free_gen_list(gen_list); + } + | expr '&' expr { + if ($1 == NULL || $3 == NULL) { + yyerror("Invalid operand in intersection"); + } + $$ = create_ast_node(AST_INTERSECTION, + $1, $3); + } + | expr '|' expr { + if ($1 == NULL || $3 == NULL) { + yyerror("Invalid operand in union"); + } + $$ = create_ast_node(AST_UNION, $1, $3); + } + | expr '\\' expr { + if ($1 == NULL || $3 == NULL) { + yyerror("Invalid operand in difference"); + } + $$ = create_ast_node(AST_DIFFERENCE, $1, $3); + } + | expr '^' expr { + if ($1 == NULL || $3 == NULL) { + yyerror("Invalid operand in symmetric difference"); + } + $$ = create_ast_node(AST_SYMMETRIC_DIFFERENCE, + $1, $3); + } + | '(' expr ')' { + $$ = $2; + } + | UNION '(' genlist ')' { + struct gen_list *gen_list = $3; + $$ = create_subtree(AST_UNION, + gen_list->count, + gen_list->genomes); + free_gen_list(gen_list); + } + | INTER '(' genlist ')' { + struct gen_list *gen_list = $3; + $$ = create_subtree(AST_INTERSECTION, + gen_list->count, + gen_list->genomes); + free_gen_list(gen_list); + } + | SYMDIFF '(' genlist ')' { + struct gen_list *gen_list = $3; + $$ = create_subtree(AST_SYMMETRIC_DIFFERENCE, + gen_list->count, + gen_list->genomes); + free_gen_list(gen_list); + } + ; + +%% + +void yyerror(const char *s, ...) +{ + va_list ap; + va_start(ap, s); + fprintf(stderr, "Error: "); + vfprintf(stderr, s, ap); + fprintf(stderr, "\n"); + exit(1); +} + +/** + * Naive implementation, + */ +struct gen_list *diff_gen_lists(struct gen_list *list_a, + struct gen_list *list_b) +{ + for (size_t i = 0; i < list_a->count; ++i) { + for (size_t j = 0; j < list_b->count; ++j) { + if (list_a->genomes[i].hdr == list_b->genomes[j].hdr) { + --list_a->count; + j = 0; + if (i < list_a->count) { + memmove(&list_a->genomes[i], &list_a->genomes[i + 1], + (list_a->count - i) * sizeof *list_a->genomes); + } + --i; + } + } + } + list_a->genomes = realloc(list_a->genomes, + list_a->count * sizeof *list_a->genomes); + free(list_b->genomes); + free(list_b); + return list_a; +} + +/** + * Merges two id lists. Saves the result in list_a and frees list_b. + * Returns list_a. + */ +struct id_list *merge_id_lists(struct id_list *list_a, struct id_list *list_b) +{ + size_t old_count = list_a->count; + list_a->count += list_b->count; + list_a->ids = realloc(list_a->ids, list_a->count * sizeof *list_a->ids); + memcpy(&list_a->ids[old_count], list_b->ids, + list_b->count * sizeof *list_b->ids); + free(list_b->ids); + free(list_b); + return list_a; +} + +void load_genomes(const struct id_list *gen_id_list, + const struct id_list *var_id_list, + struct gen_list *gen_list) +{ + error_t rc; + if (var_id_list == NULL) { + rc = tersect_db_get_genomes(PARSE_TERSECT_DB, + gen_id_list->count, gen_id_list->ids, + 0, NULL, + &gen_list->count, &gen_list->genomes); + } else { + rc = tersect_db_get_genomes(PARSE_TERSECT_DB, + gen_id_list->count, gen_id_list->ids, + var_id_list->count, var_id_list->ids, + &gen_list->count, &gen_list->genomes); + } + if (rc != SUCCESS || gen_list->count == 0) { + /* TODO: may need to make "tersect_db_get_genomes" verbose here to + print specific missing name(s) */ + yyerror("Could not find specified genome(s)"); + } +} + +void free_id_list(struct id_list *id_list) +{ + for (size_t i = 0; i < id_list->count; ++i) { + free(id_list->ids[i]); + } + free(id_list->ids); + free(id_list); +} + +void free_gen_list(struct gen_list *gen_list) +{ + free(gen_list->genomes); + free(gen_list); +} + +struct ast_node *run_set_parser(const char *query, const tersect_db *tdb) +{ + PARSE_TERSECT_DB = tdb; + yy_scan_string(query); + yyparse(); + yylex_destroy(); + return PARSE_OUTPUT; +} diff --git a/src/rename.c b/src/rename.c new file mode 100644 index 0000000..c2ff4d3 --- /dev/null +++ b/src/rename.c @@ -0,0 +1,130 @@ +/* rename.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "rename.h" + +#include +#include +#include +#include + +static void usage(FILE *stream) +{ + fprintf(stream, + "\n" + "Usage: tersect rename [options] \n" + " tersect rename [options] -n \n\n" + "Options:\n" + " -h, --help print this help message\n" + " -n, --name-file tsv file containing sample names\n" + "\n"); +} + +error_t tersect_rename_sample(int argc, char **argv) +{ + error_t rc = SUCCESS; + char *db_filename = NULL; + char *oldname = NULL; + char *newname = NULL; + char *name_filename = NULL; + static struct option loptions[] = { + {"help", no_argument, NULL, 'h'}, + {"name-file", required_argument, NULL, 'n'}, + {NULL, 0, NULL, 0} + }; + int c; + while ((c = getopt_long(argc, argv, ":hn:", loptions, NULL)) != -1) { + switch(c) { + case 'h': + usage(stdout); + return SUCCESS; + case 'n': + name_filename = optarg; + break; + default: + usage(stderr); + return SUCCESS; + } + } + argc -= optind; + argv += optind; + if (!argc) { + // Missing Tersect index file + usage(stderr); + return E_NO_TSI_FILE; + } + db_filename = argv[0]; + tersect_db *tdb = tersect_db_open(db_filename); + if (tdb == NULL) return E_TSI_NOPEN; + + if (name_filename != NULL) { + rc = tersect_load_name_file(tdb, name_filename); + } else { + if (argc != 3) { + // Missing old name/new name/too many arguments + usage(stderr); + return SUCCESS; + } + oldname = argv[1]; + newname = argv[2]; + rc = tersect_db_rename_genome(tdb, oldname, newname); + } + + tersect_db_close(tdb); + return rc; +} + +error_t tersect_load_name_file(tersect_db *tdb, const char *filename) +{ + error_t rc = SUCCESS; + FILE *fh = fopen(filename, "r"); + if (fh == NULL) return E_RENAME_NOPEN; + char *line_buffer = NULL; + size_t buffer_size = 0; + char *line_context = NULL; + while (getline(&line_buffer, &buffer_size, fh) != -1) { + char *old_name = strtok_r(line_buffer, "\t\n", &line_context); + if (old_name == NULL) { + rc = E_RENAME_PARSE; + goto cleanup; + } + char *new_name = strtok_r(NULL, "\t\n", &line_context); + if (new_name == NULL) { + rc = E_RENAME_PARSE; + goto cleanup; + } + rc = tersect_db_rename_genome(tdb, old_name, new_name); + if (rc != SUCCESS && rc != E_NO_GENOME) { + goto cleanup; + } else { + rc = SUCCESS; // considering E_NO_GENOME a success as well + } + } +cleanup: + if (line_buffer != NULL) { + free(line_buffer); + } + fclose(fh); + return rc; +} diff --git a/src/samples.c b/src/samples.c new file mode 100644 index 0000000..0fc6c81 --- /dev/null +++ b/src/samples.c @@ -0,0 +1,129 @@ +/* samples.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "samples.h" + +#include "tersect_db.h" + +#include +#include +#include + +/* Local flags for samples */ +#define NO_HEADERS 2 +static int local_flags = 0; + +static void usage(FILE *stream) +{ + fprintf(stream, + "\n" + "Usage: tersect samples [options] \n\n" + "Options:\n" + " -c, --contains STR print only samples containing each variant from an\n" + " input list (e.g. \"ch02:100:A:G,ch05:4031:C:T\")\n" + " -h, --help print this help message\n" + " -m, --match STR print only samples matching a wildcard pattern\n" + " (e.g. \"S.chi*\" to match all samples beginning\n" + " with \"S.chi\")\n" + " -n, --no-headers skip column headers\n" + "\n"); +} + +error_t tersect_print_samples(int argc, char **argv) +{ + error_t rc = SUCCESS; + char *db_filename = NULL; + char *contains = NULL; + char *pattern = NULL; + static struct option loptions[] = { + {"contains", required_argument, NULL, 'c'}, + {"help", no_argument, NULL, 'h'}, + {"match", required_argument, NULL, 'm'}, + {"no-headers", no_argument, NULL, 'n'}, + {NULL, 0, NULL, 0} + }; + int c; + while ((c = getopt_long(argc, argv, ":c:hm:n", loptions, NULL)) != -1) { + switch(c) { + case 'c': + contains = optarg; + break; + case 'h': + usage(stdout); + return SUCCESS; + case 'm': + pattern = optarg; + break; + case 'n': + local_flags |= NO_HEADERS; + break; + default: + usage(stderr); + return SUCCESS; + } + } + argc -= optind; + argv += optind; + if (!argc) { + // Missing Tersect index file + usage(stderr); + return E_NO_TSI_FILE; + } else if (argc > 1) { + // Too many arguments + usage(stderr); + return SUCCESS; + } + db_filename = argv[0]; + tersect_db *tdb = tersect_db_open(db_filename); + if (tdb == NULL) return E_TSI_NOPEN; + size_t count; + struct genome *samples = NULL; + if (pattern == NULL && contains == NULL) { + rc = tersect_db_get_genomes(tdb, 0, NULL, 0, NULL, + &count, &samples); + } else if (pattern == NULL) { + rc = tersect_db_get_genomes(tdb, 0, NULL, 1, &contains, + &count, &samples); + } else if (contains == NULL) { + rc = tersect_db_get_genomes(tdb, 1, &pattern, 0, NULL, + &count, &samples); + } else { + rc = tersect_db_get_genomes(tdb, 1, &pattern, 1, &contains, + &count, &samples); + } + if (rc != SUCCESS) goto cleanup; + + if (!(local_flags & NO_HEADERS)) { + printf("Sample\n"); + } + for (uint32_t i = 0; i < count; ++i) { + printf("%s\n", samples[i].name); + } + if (count) { + free(samples); + } +cleanup: + tersect_db_close(tdb); + return rc; +} diff --git a/src/snv.c b/src/snv.c new file mode 100644 index 0000000..a88fdb7 --- /dev/null +++ b/src/snv.c @@ -0,0 +1,62 @@ +/* snv.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "snv.h" + +uint8_t snv_type(char ref, char alt) { + if (ref == 'A' || ref == 'a') { + if (alt == 'C' || alt == 'c') { + return SNV_A_C; + } else if (alt == 'G' || alt == 'g') { + return SNV_A_G; + } else if (alt == 'T' || alt == 't') { + return SNV_A_T; + } + } else if (ref == 'C' || ref == 'c') { + if (alt == 'A' || alt == 'a') { + return SNV_C_A; + } else if (alt == 'G' || alt == 'g') { + return SNV_C_G; + } else if (alt == 'T' || alt == 't') { + return SNV_C_T; + } + } else if (ref == 'G' || ref == 'g') { + if (alt == 'A' || alt == 'a') { + return SNV_G_A; + } else if (alt == 'C' || alt == 'c') { + return SNV_G_C; + } else if (alt == 'T' || alt == 't') { + return SNV_G_T; + } + } else if (ref == 'T' || ref == 't') { + if (alt == 'A' || alt == 'a') { + return SNV_T_A; + } else if (alt == 'C' || alt == 'c') { + return SNV_T_C; + } else if (alt == 'G' || alt == 'g') { + return SNV_T_G; + } + } + return V_INDEL; +} diff --git a/src/tersect.c b/src/tersect.c new file mode 100644 index 0000000..7cda9e2 --- /dev/null +++ b/src/tersect.c @@ -0,0 +1,91 @@ +/* tersect.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "tersect.h" + +#include "build.h" +#include "errorc.h" +#include "view.h" +#include "chroms.h" +#include "samples.h" +#include "distance.h" +#include "rename.h" +#include "version.h" + +#include +#include +#include + +#include "alleles.h" + +static void usage(FILE *stream) +{ + fprintf(stream, + "\n" + "Version: "TERSECT_VERSION"\n" + "Usage: tersect [options]\n\n" + "Commands:\n" + " build build new VCF database\n" + " chroms list chromosomes in the database\n" + " dist calculate distance matrix for samples\n" + " help print this help message\n" + " rename rename sample\n" + " samples list samples in the database\n" + " view display variants belonging to a sample\n" + "\n"); +} + +int main(int argc, char *argv[]) +{ + error_t rc = SUCCESS; + char *command = "help"; + if (argc > 1) { + command = argv[1]; + --argc; + ++argv; + } + if (!strcmp(command, "build")) { + rc = tersect_build_database(argc, argv); + } else if (!strcmp(command, "view")) { + rc = tersect_view_set(argc, argv); + if (rc != SUCCESS) goto cleanup; + } else if (!strcmp(command, "chroms")) { + rc = tersect_print_chromosomes(argc, argv); + } else if (!strcmp(command, "rename")) { + rc = tersect_rename_sample(argc, argv); + } else if (!strcmp(command, "samples")) { + rc = tersect_print_samples(argc, argv); + } else if (!strcmp(command, "dist")) { + rc = tersect_distance(argc, argv); + } else if (!strcmp(command, "help")) { + usage(stdout); + } else { + usage(stderr); + } +cleanup: + if (rc != SUCCESS) { + report_error(rc); + } + return rc; +} diff --git a/src/tersect_db.c b/src/tersect_db.c new file mode 100644 index 0000000..926f40f --- /dev/null +++ b/src/tersect_db.c @@ -0,0 +1,914 @@ +/* tersect_db.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "tersect_db.h" +#include "tersect_db_internal.h" + +#include "snv.h" +#include "version.h" +#include "vcf_writer.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#define PAGE_SIZE 4096 +#define INITIAL_DB_SIZE 4096 + +/* Types used for queries */ +#define QUERY_SINGLE_MATCH 0 // Selecting only a single element (exact match) +#define QUERY_WILDCARD_MATCH 1 // Selecting multiple elements +#define QUERY_ALL_MATCH 2 // Selecting all elements + +// Capacity of sequence hashmap used during database build +#define SEQUENCE_MAP_CAPACITY 50000000 + +/** + * Initialize header in new file. Note that the file size was already set by + * tersect_db_resize_file. + * + * Returns 0 success, 1 on failure. + */ +static error_t tersect_db_init_header(tersect_db *tdb) +{ + strcpy(tdb->hdr->format, TERSECT_FORMAT_VERSION); + tdb->hdr->free_head = sizeof *tdb->hdr; + tdb->hdr->chromosome_count = 0; + tdb->hdr->chromosomes = 0; + tdb->hdr->genome_count = 0; + tdb->hdr->genomes = 0; + tdb->hdr->word_size = CHAR_BIT * sizeof(bitarray_word); + return SUCCESS; +} + +error_t tersect_db_resize_file(tersect_db *tdb, off_t new_size) +{ + if ((void *)tdb->mapping != NULL) { + // Removing previous mapping + munmap((void *)tdb->mapping, tdb->hdr->db_size); + } + int fd; + if ((fd = open(tdb->filename, O_RDWR | O_CREAT, 0664)) == -1) return FAILURE; + if (ftruncate(fd, new_size)) return FAILURE; + tdb->mapping = (uintptr_t)mmap(NULL, new_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if ((void *)tdb->mapping == MAP_FAILED) return FAILURE; + if (close(fd) == -1) { + munmap((void *)tdb->mapping, new_size); + return FAILURE; + } + tdb->hdr = (struct tersect_db_hdr *)tdb->mapping; + tdb->hdr->db_size = new_size; + return SUCCESS; +} + +static tdb_offset tersect_db_malloc(tersect_db *tdb, size_t size) +{ + if (tdb->hdr->free_head + size >= tdb->hdr->db_size) { + // Rounded up + size_t page_num = (tdb->hdr->free_head + size + PAGE_SIZE - 1) + / PAGE_SIZE; + if (tersect_db_resize_file(tdb, page_num * PAGE_SIZE)) return 0; + } + tdb_offset offset = tdb->hdr->free_head; + tdb->hdr->free_head += size; + return offset; +} + +/** + * Verifies file existence and write permissions. Adds .tsi extension if not + * present. Allocates memory for the output. + */ +static inline error_t validate_filename(const char *filename, int flags, + char **output_filename) +{ + error_t rc; + size_t original_length = strlen(filename); + if (!strcmp(&filename[original_length - 4], ".tsi")) { + // Name already has the extension + *output_filename = malloc(original_length + 1); + if (*output_filename == NULL) return E_ALLOC; + strcpy(*output_filename, filename); + } else { + // Adding extension + *output_filename = malloc(original_length + 5); + if (*output_filename == NULL) return E_ALLOC; + sprintf(*output_filename, "%s.tsi", filename); + } + if (access(*output_filename, F_OK) == 0) { + if (flags & TDB_FORCE) { + if (access(*output_filename, W_OK) == -1) { + rc = E_BUILD_NO_WRITE; + goto cleanup; + } + // TODO: Warning: overwriting existing file + } else { + rc = E_BUILD_DB_EXISTS; + goto cleanup; + } + } + return SUCCESS; +cleanup: + free(*output_filename); + return rc; +} + +error_t tersect_db_create(const char *filename, int flags, tersect_db **tdb) +{ + error_t rc = SUCCESS; + size_t size = INITIAL_DB_SIZE; + if (size <= sizeof(struct tersect_db_hdr)) { + size = sizeof(struct tersect_db_hdr) + 1; + } + *tdb = malloc(sizeof **tdb); + if (!(*tdb)) return E_BUILD_CREATE; + **tdb = (tersect_db) { + .mapping = 0, + .sequences = init_hashmap(SEQUENCE_MAP_CAPACITY) + }; + rc = validate_filename(filename, flags, &(*tdb)->filename); + if (rc != SUCCESS) { + goto cleanup_1; + } + if (tersect_db_resize_file(*tdb, size)) goto cleanup_2; + if (tersect_db_init_header(*tdb)) goto cleanup_2; + return rc; +cleanup_2: + free((*tdb)->filename); +cleanup_1: + free_hashmap((*tdb)->sequences); + free(*tdb); + return rc; +} + +tersect_db *tersect_db_open(const char *filename) +{ + tersect_db *tdb = malloc(sizeof *tdb); + if (!tdb) return NULL; + *tdb = (tersect_db) { + .mapping = 0, + .sequences = NULL + }; + if (validate_filename(filename, TDB_FORCE, &tdb->filename) != SUCCESS) { + goto cleanup_1; + } + int fd; + if ((fd = open(tdb->filename, O_RDWR, 0664)) == -1) goto cleanup_2; + struct stat st; + if (fstat(fd, &st) == -1) goto cleanup_3; + tdb->mapping = (uintptr_t)mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if ((void *)tdb->mapping == MAP_FAILED) goto cleanup_3; + if (close(fd) == -1) goto cleanup_4; + tdb->hdr = (struct tersect_db_hdr *)tdb->mapping; + return tdb; +cleanup_4: + munmap((void *)tdb->mapping, st.st_size); +cleanup_3: + close(fd); +cleanup_2: + free(tdb->filename); +cleanup_1: + free(tdb); + return NULL; +} + +void tersect_db_close(tersect_db *tdb) +{ + munmap((void *)tdb->mapping, tdb->hdr->db_size); + if (tdb->sequences != NULL) { + // Free allelic sequences + HashIterator it = hashmap_iterator(tdb->sequences); + hashmap_iterator_next(&it); + while (it.value) { + free(it.value); + hashmap_iterator_next(&it); + } + free_hashmap(tdb->sequences); + } + free(tdb->filename); + free(tdb); +} + +static tdb_offset tersect_db_add_string(tersect_db *tdb, + const char *string) +{ + size_t len = strlen(string) + 1; + tdb_offset offset = tersect_db_malloc(tdb, len); + strcpy((char *)(tdb->mapping + offset), string); + return offset; +} + +static tdb_offset tersect_db_add_variants(tersect_db *tdb, + uint32_t variant_count, + const struct variant *variants) +{ + size_t size = variant_count * sizeof *variants; + tdb_offset offset = tersect_db_malloc(tdb, size); + memcpy((void *)(tdb->mapping + offset), variants, size); + return offset; +} + +/** + * Add the raw content of a bit array to the database. + */ +static tdb_offset tersect_db_add_raw_bitarray(tersect_db *tdb, + const struct bitarray *ba) +{ + size_t size = ba->size * sizeof *ba->array; + tdb_offset offset = tersect_db_malloc(tdb, size); + memcpy((void *)(tdb->mapping + offset), ba->array, size); + return offset; +} + +/** + * Finds genome header by name. Returns NULL if not found. + */ +static struct genome_hdr *tersect_db_find_genome(const tersect_db *tdb, + const char *genome) +{ + tdb_offset offset = tdb->hdr->genomes; + while (offset) { + struct genome_hdr *gen_hdr = (struct genome_hdr *)(tdb->mapping + + offset); + if (!strcmp((char *)(tdb->mapping + gen_hdr->name), genome)) { + return (struct genome_hdr *)(tdb->mapping + offset); + } + offset = gen_hdr->next; + } + return NULL; +} + +/** + * Finds chromosome header by name. Returns NULL if not found. + */ +static struct chrom_hdr *tersect_db_find_chromosome(const tersect_db *tdb, + const char *chromosome) +{ + tdb_offset offset = tdb->hdr->chromosomes; + while (offset) { + struct chrom_hdr *chr_hdr = (struct chrom_hdr *)(tdb->mapping + offset); + if (!strcmp((char *)(tdb->mapping + chr_hdr->name), chromosome)) { + return (struct chrom_hdr *)(tdb->mapping + offset); + } + offset = chr_hdr->next; + } + return NULL; +} + +/** + * Finds bitarray by genome name and chromosome. Returns NULL if not found. + */ +static struct bitarray_hdr *tersect_db_find_bitarray(const tersect_db *tdb, + const struct genome *gen, + const struct chromosome *chr) +{ + tdb_offset genome_offset = (uintptr_t)tersect_db_find_genome(tdb, gen->name) + - tdb->mapping; + tdb_offset offset = chr->hdr->bitarrays; + while (offset) { + struct bitarray_hdr *ba_hdr = (struct bitarray_hdr *)(tdb->mapping + + offset); + if (ba_hdr->genome_offset == genome_offset) { + return ba_hdr; + } + offset = ba_hdr->next; + } + return NULL; +} + +error_t tersect_db_insert_allele(tersect_db *tdb, const struct allele *allele, + struct variant *out) +{ + size_t ref_len = strlen(allele->ref); + size_t alt_len = strlen(allele->alt); + if (ref_len > 1 || alt_len > 1) { + // Indel + char *allele_string = malloc(ref_len + alt_len + 2); + if (allele_string == NULL) return E_ALLOC; + sprintf(allele_string, "%s\t%s", allele->ref, allele->alt); + tdb_offset *allele_offset = (tdb_offset *)hashmap_get(tdb->sequences, + allele_string); + if (allele_offset == NULL) { + // New allelic sequence + allele_offset = malloc(sizeof *allele_offset); + *allele_offset = tersect_db_add_string(tdb, allele_string); + hashmap_insert(tdb->sequences, allele_string, allele_offset); + } + *out = (struct variant) { + .position = allele->position, + .type = 0, + .allele = *allele_offset + }; + free(allele_string); + return SUCCESS; + } + // SNV + *out = (struct variant) { + .position = allele->position, + .type = snv_type(allele->ref[0], allele->alt[0]), + .allele = 0 + }; + return SUCCESS; +} + +void tersect_db_add_bitarray(tersect_db *tdb, const char *genome, + const char *chromosome, + const struct bitarray *ba) +{ + tdb_offset array_offset = tersect_db_add_raw_bitarray(tdb, ba); + tdb_offset ba_offset = tersect_db_malloc(tdb, sizeof(struct bitarray_hdr)); + tdb_offset genome_offset = (uintptr_t)tersect_db_find_genome(tdb, genome) + - tdb->mapping; + struct bitarray_hdr *ba_hdr = (struct bitarray_hdr *)(tdb->mapping + + ba_offset); + struct chrom_hdr *chr_hdr = tersect_db_find_chromosome(tdb, chromosome); + *ba_hdr = (struct bitarray_hdr) { + .genome_offset = genome_offset, + .size = ba->size, + .array = array_offset, + .start_mask = ba->start_mask, + .end_mask = ba->end_mask, + .next = chr_hdr->bitarrays + }; + chr_hdr->bitarrays = ba_offset; +} + +void tersect_db_get_bitarray(const tersect_db *tdb, + const struct genome *gen, + const struct chromosome *chr, + struct bitarray *output) +{ + struct bitarray_hdr *ba_hdr = tersect_db_find_bitarray(tdb, gen, chr); + *output = (struct bitarray) { + .size = ba_hdr->size, + .array = (bitarray_word *)(tdb->mapping + ba_hdr->array), + .start_mask = ba_hdr->start_mask, + .end_mask = ba_hdr->end_mask + }; +} + +void tersect_db_add_chromosome(tersect_db *tdb, + const char *chr_name, + const struct variant *variants, + uint32_t variant_count, + uint32_t length) +{ + tdb_offset name_offset = tersect_db_add_string(tdb, chr_name); + tdb_offset var_offset = tersect_db_add_variants(tdb, variant_count, + variants); + tdb_offset chr_offset = tersect_db_malloc(tdb, sizeof(struct chrom_hdr)); + struct chrom_hdr *chr_hdr = (struct chrom_hdr *)(tdb->mapping + chr_offset); + *chr_hdr = (struct chrom_hdr) { + .name = name_offset, + .variants = var_offset, + .variant_count = variant_count, + .length = length ? length : variants[variant_count - 1].position, + .next = tdb->hdr->chromosomes + }; + tdb->hdr->chromosomes = chr_offset; + ++tdb->hdr->chromosome_count; +} + +void tersect_db_add_genome(tersect_db *tdb, const char *genome_name) +{ + tdb_offset name_offset = tersect_db_add_string(tdb, genome_name); + tdb_offset genome_offset = tersect_db_malloc(tdb, sizeof(struct genome_hdr)); + struct genome_hdr *gen_hdr = (struct genome_hdr *)(tdb->mapping + + genome_offset); + *gen_hdr = (struct genome_hdr) { + .name = name_offset, + .next = tdb->hdr->genomes + }; + tdb->hdr->genomes = genome_offset; + ++tdb->hdr->genome_count; +} + +uint32_t tersect_db_get_genome_count(const tersect_db *tdb) +{ + return tdb->hdr->genome_count; +} + +uint32_t tersect_db_get_chromosome_count(const tersect_db *tdb) +{ + return tdb->hdr->chromosome_count; +} + +static inline bool wildcard_match(const char *query, const char *pattern) +{ + char *pattern_copy = malloc(strlen(pattern) + 1); + strcpy(pattern_copy, pattern); + char *context; + char *token = strtok_r(pattern_copy, "*", &context); + while (token != NULL) { + query = strstr(query, token); + if (query == NULL) { + free(pattern_copy); + return false; + } + query += strlen(token); + token = strtok_r(NULL, "*", &context); + } + free(pattern_copy); + // False if there is trailing text and the last pattern character is not * + if (strlen(query) && pattern[strlen(pattern) - 1] != '*') return false; + return true; +} + +/** + * Finds variant by string (e.g. ch02:204:A:G) + */ +error_t parse_variant(struct genomic_variant *output, + const tersect_db *tdb, + const char *variant) +{ + error_t rc = SUCCESS; + char *context; + char *variant_temp = malloc(strlen(variant) + 1); + if (variant_temp == NULL) { + rc = E_ALLOC; + goto cleanup; + } + strcpy(variant_temp, variant); + char *chr_name = strtok_r(variant_temp, ":", &context); + char *position = strtok_r(NULL, ":", &context); + char *ref = strtok_r(NULL, ":", &context); + char *alt = strtok_r(NULL, ":", &context); + if (chr_name == NULL || position == NULL || ref == NULL || alt == NULL) { + rc = E_PARSE_ALLELE; + goto cleanup; + } + struct chrom_hdr *chr_hdr = tersect_db_find_chromosome(tdb, chr_name); + if (chr_hdr == NULL) { + rc = E_PARSE_ALLELE_NO_CHROMOSOME; + goto cleanup; + } + output->chromosome = (char *)(tdb->mapping + chr_hdr->name); + char *endptr; + output->position = strtol(position, &endptr, 10); + if (endptr == position || *endptr != '\0') { + rc = E_PARSE_ALLELE; + goto cleanup; + } + output->ref = ref; + output->alt = alt; + output->type = snv_type(ref[0], alt[0]); +cleanup: + if (variant_temp != NULL) { + free(variant_temp); + } + return rc; +} + +/** + * Converts an array of variant strings (each of which may contain several + * comma-separated variants, e.g. "ch02:100:A:G,ch05:4031:C:T") into an + * array of strings representing each individual variant. + */ +void flatten_contains_queries(char *const *contains, size_t ncont, + char ***out, size_t *nout) +{ + *nout = 0; + *out = malloc(ncont * sizeof **out); + size_t out_capacity = ncont; + for (size_t i = 0; i < ncont; ++i) { + if (contains[i] == NULL) continue; + char *variant; + char *context; + char *copy = malloc(strlen(contains[i]) + 1); + strcpy(copy, contains[i]); + variant = strtok_r(copy, ",", &context); + while (variant != NULL) { + (*out)[*nout] = malloc(strlen(variant) + 1); + strcpy((*out)[*nout], variant); + ++(*nout); + if (*nout >= out_capacity) { + // Ran out of space, expanding + out_capacity *= 2; + *out = realloc(*out, out_capacity * sizeof **out); + } + variant = strtok_r(NULL, ",", &context); + } + free(copy); + } + // Clean-up / shrinkwrapping + if (!(*nout)) { + free(*out); + *out = NULL; + } else { + *out = realloc(*out, *nout * sizeof **out); + } +} + +/** + * variant_index and intervals are parallel arrays of size nvars + * the former represents the position of each respective variant in the latter + */ +error_t parse_contains_queries(const tersect_db *tdb, + char *const *contains, size_t ncont, + uint64_t **variant_index, + struct tersect_db_interval **out_intervals, + size_t *nvars) +{ + // TODO: error handling + error_t rc = SUCCESS; + char **required_var_strings; + size_t n_required_vars; + flatten_contains_queries(contains, ncont, &required_var_strings, + &n_required_vars); + *nvars = 0; + *variant_index = malloc(n_required_vars * sizeof **variant_index); + *out_intervals = malloc(n_required_vars * sizeof **out_intervals); + for (size_t i = 0; i < n_required_vars; ++i) { + struct genomic_variant variant; + rc = parse_variant(&variant, tdb, required_var_strings[i]); + if (rc != SUCCESS) { + goto cleanup; + } + struct genomic_interval gi = { + .chromosome = variant.chromosome, + .start_base = variant.position, + .end_base = variant.position + }; + struct tersect_db_interval ti; + tersect_db_get_interval(tdb, &gi, &ti); + if (ti.interval.start_index > ti.interval.end_index) { + // site not found + rc = E_PARSE_ALLELE_UNKNOWN; + goto cleanup; + } + // Checking all variants on the same position + for (size_t j = ti.interval.start_index; + j <= ti.interval.end_index; ++j) { + if (variant.type == ti.chromosome.variants[j].type) { + // Found variant + (*variant_index)[*nvars] = j; + (*out_intervals)[*nvars] = ti; + ++(*nvars); + break; + } + if (j == ti.interval.end_index) { + // Variant not in database, so no sample can contain it + rc = E_PARSE_ALLELE_UNKNOWN; + goto cleanup; + } + } + } + // Clean-up / shrinkwrapping +cleanup: + if (!(*nvars)) { + free(*variant_index); + free(*out_intervals); + *variant_index = NULL; + *out_intervals = NULL; + } else { + *variant_index = realloc(*variant_index, + *nvars * sizeof **variant_index); + *out_intervals = realloc(*out_intervals, + *nvars * sizeof **out_intervals); + } + return rc; +} + +/** + * Returns true if str matches at least one pattern, no patterns are provided, + * or all patterns are NULL. + */ +bool matches_any_pattern(const char *str, + char *const *patterns, + size_t npatterns) +{ + if (!npatterns) return true; + size_t null_patterns = 0; + for (size_t i = 0; i < npatterns; ++i) { + if (patterns[i] == NULL) { + ++null_patterns; + continue; + } + if (wildcard_match(str, patterns[i])) { + return true; + } + } + if (null_patterns == npatterns) return true; + return false; +} + +/** + * variant_index and intervals are parallel arrays of size nvars + * the former represents the position of each respective variant in the latter + */ +bool contains_all_variants(const tersect_db *tdb, const struct genome *gen, + const uint64_t *variant_index, + const struct tersect_db_interval *intervals, + size_t nvars) +{ + struct bitarray ba; + for (size_t i = 0; i < nvars; ++i) { + tersect_db_get_bitarray(tdb, gen, &intervals[i].chromosome, &ba); + if (!bitarray_get_bit(&ba, variant_index[i])) { + return false; + } + } + return true; +} + +error_t tersect_db_get_genomes(const tersect_db *tdb, + size_t nmatch, char *const *matches, + size_t ncont, char *const *contains, + size_t *ngenomes, struct genome **genomes) +{ + error_t rc = SUCCESS; + struct genome_hdr *gen_hdr = (struct genome_hdr *)(tdb->mapping + + tdb->hdr->genomes); + uint64_t *vars_index; + struct tersect_db_interval *vars_intervals; + size_t n_contains_vars; + rc = parse_contains_queries(tdb, contains, ncont, &vars_index, + &vars_intervals, &n_contains_vars); + if (rc != SUCCESS) { + if (rc == E_PARSE_ALLELE_UNKNOWN) { + // Variant not in database, so no sample can contain it + // TODO: perhaps print a warning? + *ngenomes = 0; + *genomes = NULL; + return SUCCESS; + } else { + return rc; + } + } + *ngenomes = 0; + *genomes = malloc(tdb->hdr->genome_count * sizeof **genomes); + for (uint32_t i = 0; i < tdb->hdr->genome_count; ++i) { + char *genome_name = (char *)(tdb->mapping + gen_hdr->name); + (*genomes)[*ngenomes] = (struct genome) { + .name = genome_name, + .hdr = gen_hdr + }; + if ((nmatch == 0 || matches_any_pattern(genome_name, matches, nmatch)) + && contains_all_variants(tdb, &(*genomes)[*ngenomes], vars_index, + vars_intervals, n_contains_vars)) { + ++(*ngenomes); + } + if (gen_hdr->next) { + gen_hdr = (struct genome_hdr *)(tdb->mapping + gen_hdr->next); + } + } + if (!(*ngenomes)) { + // No genomes found + free(*genomes); + *genomes = NULL; + } else { + *genomes = realloc(*genomes, *ngenomes * sizeof **genomes); + } + return rc; +} + +/** + * Loads chromosome structure based on a chromosome header. + */ +static inline void load_chromosome(const tersect_db *tdb, + struct chrom_hdr *chr_hdr, + struct chromosome *chrom) +{ + *chrom = (struct chromosome) { + .name = (char *)(tdb->mapping + chr_hdr->name), + .length = chr_hdr->length, + .variant_count = chr_hdr->variant_count, + .variants = (struct variant *)(tdb->mapping + chr_hdr->variants), + .hdr = chr_hdr + }; +} + +void tersect_db_get_chromosomes(const tersect_db *tdb, + size_t *nchroms, struct chromosome **chroms) +{ + struct chrom_hdr *chr_hdr = (struct chrom_hdr *)(tdb->mapping + + tdb->hdr->chromosomes); + *nchroms = tdb->hdr->chromosome_count; + *chroms = malloc(tdb->hdr->chromosome_count * sizeof **chroms); + for (uint32_t i = tdb->hdr->chromosome_count; i; --i) { + load_chromosome(tdb, chr_hdr, &(*chroms)[i - 1]); + if (chr_hdr->next) { + chr_hdr = (struct chrom_hdr *)(tdb->mapping + chr_hdr->next); + } + } +} + +void tersect_db_get_chromosome(const tersect_db *tdb, const char *name, + struct chromosome *chrom) +{ + struct chrom_hdr *chr_hdr = tersect_db_find_chromosome(tdb, name); + load_chromosome(tdb, chr_hdr, chrom); +} + +void tersect_db_get_interval(const tersect_db *tdb, + const struct genomic_interval *gi, + struct tersect_db_interval *ti) +{ + tersect_db_get_chromosome(tdb, gi->chromosome, &ti->chromosome); + // TODO: binary search instead of linear search for interval boundaries + for (uint32_t i = 0; i < ti->chromosome.variant_count; ++i) { + if (ti->chromosome.variants[i].position >= gi->start_base) { + ti->interval.start_index = i; + break; + } + } + for (uint32_t i = ti->chromosome.variant_count; i > 0; --i) { + if (ti->chromosome.variants[i - 1].position <= gi->end_base) { + ti->interval.end_index = i - 1; + break; + } + } + ti->nvariants = 1 + ti->interval.end_index - ti->interval.start_index; + // Rounding down to word index + ti->variants = &ti->chromosome.variants[(ti->interval.start_index + / bitarray_word_capacity) + * bitarray_word_capacity]; +} + +void tersect_db_get_bin_intervals(const tersect_db *tdb, + const struct genomic_interval *gi, + uint32_t bin_size, + size_t *nbins, + struct tersect_db_interval **bins) +{ + struct chromosome chrom; + tersect_db_get_chromosome(tdb, gi->chromosome, &chrom); + + uint32_t region_size = gi->end_base - gi->start_base + 1; + *nbins = (region_size + bin_size - 1) / bin_size; + *bins = calloc(*nbins, sizeof **bins); + for (size_t i = 0; i < *nbins; ++i) { + (*bins)[i].chromosome = chrom; + } + + // Finding the first bin + for (uint32_t i = 0; i < chrom.variant_count; ++i) { + if (chrom.variants[i].position < gi->start_base) continue; + (*bins)[0].interval.start_index = i; + (*bins)[0].variants = &chrom.variants[(i / bitarray_word_capacity) + * bitarray_word_capacity]; + (*bins)[0].nvariants = 1; + break; + } + size_t prev_bin = 0; + + // Went through all variants without finding bin + if (!(*bins)[0].nvariants) return; + + // Finding further bins + for (uint32_t i = (*bins)[0].interval.start_index + 1; + i < chrom.variant_count; ++i) { + if (chrom.variants[i].position > gi->end_base) { + // Close last bin + (*bins)[prev_bin].interval.end_index = i - 1; + break; + } + size_t bin = (chrom.variants[i].position - gi->start_base) / bin_size; + if (bin != prev_bin) { + // Close previous bin + (*bins)[prev_bin].interval.end_index = i - 1; + if (prev_bin + 1 == *nbins) { + // Closed last bin, we're done here + break; + } + // Start next bin + (*bins)[bin].interval.start_index = i; + (*bins)[bin].variants = &chrom.variants[(i / bitarray_word_capacity) + * bitarray_word_capacity]; + prev_bin = bin; + } + ++((*bins)[bin].nvariants); + } + if ((*bins)[prev_bin].interval.end_index == 0) { + // Last bin was not closed since the last variant was still within it + (*bins)[prev_bin].interval.end_index = chrom.variant_count - 1; + } +} + +error_t tersect_db_rename_genome(tersect_db *tdb, const char *old_name, + const char *new_name) +{ + struct genome_hdr *gen_hdr = tersect_db_find_genome(tdb, old_name); + if (gen_hdr == NULL) { + return E_NO_GENOME; + } + tdb_offset new_name_offset = tersect_db_add_string(tdb, new_name); + // Have to find genome header again as adding the string can update mapping + gen_hdr = tersect_db_find_genome(tdb, old_name); + gen_hdr->name = new_name_offset; + return SUCCESS; +} + +error_t parse_region(struct genomic_interval *output, const tersect_db *tdb, + const char *region) +{ + error_t rc = SUCCESS; + char *context; + char *region_temp = malloc(strlen(region) + 1); + if (region_temp == NULL) { + rc = E_ALLOC; + goto cleanup; + } + strcpy(region_temp, region); + char *chr_name = strtok_r(region_temp, ":", &context); + char *bounds = strtok_r(NULL, ":", &context); + struct chrom_hdr *chr_hdr = tersect_db_find_chromosome(tdb, chr_name); + if (chr_hdr == NULL) { + rc = E_PARSE_REGION_NO_CHROMOSOME; + goto cleanup; + } + output->chromosome = (char *)(tdb->mapping + chr_hdr->name); + if (bounds == NULL) { + // No bounds, interval covers entire chromosome + output->start_base = 1; + output->end_base = chr_hdr->length; + } else { + char *start_base = strtok_r(bounds, "-", &context); + char *end_base = strtok_r(NULL, "-", &context); + if (start_base == NULL || end_base == NULL) { + rc = E_PARSE_REGION_BAD_BOUNDS; + goto cleanup; + } + char *endptr; + output->start_base = strtol(start_base, &endptr, 10); + if (endptr == start_base || *endptr != '\0') { + rc = E_PARSE_REGION_BAD_BOUNDS; + goto cleanup; + } + output->end_base = strtol(end_base, &endptr, 10); + if (endptr == end_base || *endptr != '\0') { + rc = E_PARSE_REGION_BAD_BOUNDS; + goto cleanup; + } + } +cleanup: + if (region_temp != NULL) { + free(region_temp); + } + return rc; +} + +error_t tersect_db_parse_regions(const tersect_db *tdb, size_t nregions, + char **region_strings, + struct genomic_interval **output) +{ + error_t rc = SUCCESS; + *output = malloc(nregions * sizeof **output); + for (size_t i = 0; i < nregions; ++i) { + rc = parse_region(&(*output)[i], tdb, region_strings[i]); + if (rc != SUCCESS) { + free(*output); + return rc; + } + } + return rc; +} + +/** + * Returns all regions (covering entire chromosomes) in the database. + */ +error_t tersect_db_get_regions(const tersect_db *tdb, + size_t *nregions, + struct genomic_interval **output) +{ + error_t rc = SUCCESS; + struct chromosome *chroms; + tersect_db_get_chromosomes(tdb, nregions, &chroms); + *output = malloc(*nregions * sizeof **output); + for (size_t i = 0; i < *nregions; ++i) { + (*output)[i] = (struct genomic_interval) { + .chromosome = chroms[i].name, + .start_base = 1, + .end_base = chroms[i].length + }; + } + free(chroms); + return rc; +} diff --git a/src/tersect_db_internal.h b/src/tersect_db_internal.h new file mode 100644 index 0000000..8f8bc09 --- /dev/null +++ b/src/tersect_db_internal.h @@ -0,0 +1,89 @@ +/* tersect_db_internal.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef TERSECT_DB_INTERNAL +#define TERSECT_DB_INTERNAL + +#include "hashmap.h" + +#include + +typedef uint64_t tdb_offset; + +struct tersect_db { + char *filename; + HashMap *sequences; + uintptr_t mapping; + struct tersect_db_hdr *hdr; +}; + +struct chrom_hdr { + tdb_offset name; + tdb_offset variants; + tdb_offset bitarrays; + uint32_t variant_count; + uint32_t length; + tdb_offset next; +}; + +struct genome_hdr { + tdb_offset name; + tdb_offset next; +}; + +struct tersect_db_hdr { + char format[14]; // TERSECT_FORMAT_VERSION + uint64_t db_size; + uint16_t word_size; + tdb_offset chromosomes; + uint32_t chromosome_count; + tdb_offset genomes; + uint32_t genome_count; + tdb_offset free_head; +}; + +struct bitarray_hdr { + tdb_offset genome_offset; + size_t size; + tdb_offset array; + bitarray_word start_mask; + bitarray_word end_mask; + tdb_offset next; +}; + +struct variant { + uint32_t position; + uint8_t type; + tdb_offset allele; +}; + +struct genomic_variant { + char *chromosome; + uint32_t position; + char *ref; + char *alt; + int type; +}; + +#endif diff --git a/src/vcf_parser.c b/src/vcf_parser.c new file mode 100644 index 0000000..97838e6 --- /dev/null +++ b/src/vcf_parser.c @@ -0,0 +1,221 @@ +/* vcf_parser.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "vcf_parser.h" + +#include +#include +#include + +#define VCF_NUM_COLUMNS 9 + +#define CHROM_COLUMN 0 +#define POS_COLUMN 1 +#define ID_COLUMN 2 +#define REF_COLUMN 3 +#define ALT_COLUMN 4 +#define QUAL_COLUMN 5 +#define FILTER_COLUMN 6 +#define INFO_COLUMN 7 +#define FORMAT_COLUMN 8 + +/** + * Size of the header line prior to the genotype list (46 characters). + * Sample names start past this position. + * + * #CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t + * + */ +#define HEADER_LINE_SIZE 46 + +/** + * Parsers metadata lines until the one starting with #CHROM + */ +static inline void load_metadata(VCF_PARSER *parser) +{ + parser->samples = malloc(sizeof *parser->samples); + char *line_context; + while (getline(&parser->line_buffer, &parser->buffer_size, + parser->file_handle) != -1) { + if (!strncmp(parser->line_buffer, "#CHROM", 6)) { + char *sample_columns = &parser->line_buffer[HEADER_LINE_SIZE]; + char *sample_name = strtok_r(sample_columns, "\t\n", &line_context); + parser->samples[0] = strdup(sample_name); + parser->sample_num = 1; + while ((sample_name = strtok_r(NULL, "\t\n", &line_context)) + != NULL) { + ++parser->sample_num; + parser->samples = realloc(parser->samples, + parser->sample_num + * sizeof *parser->samples); + parser->samples[parser->sample_num - 1] = strdup(sample_name); + } + break; + } + } +} + +static inline int get_genotype_code(char *gt) +{ + if (gt[0] == '0' && gt[2] == '0') { + return GENOTYPE_HOM_REF; + } else if (gt[0] == gt[2]) { + return GENOTYPE_HOM_ALT; + } else { + return GENOTYPE_HET; + } +} + +int is_gzipped(const char *filename) +{ + FILE *fh = fopen(filename, "rb"); + char byte1 = getc(fh); + char byte2 = getc(fh); + fclose(fh); + return byte1 == (char)0x1f && byte2 == (char)0x8b; +} + +VCF_PARSER *init_parser(const char *filename, int flags) +{ + VCF_PARSER *parser = malloc(sizeof *parser); + if (access(filename, F_OK) != 0) { + return NULL; + } + if (is_gzipped(filename)) { + char *cmd = malloc(strlen(filename) + 8); // strlen of "zcat \'%s\'" + sprintf(cmd, "zcat \'%s\'", filename); + parser->file_handle = popen(cmd, "r"); + free(cmd); + } else { + parser->file_handle = fopen(filename, "r"); + } + if (parser->file_handle == NULL) { + return NULL; + } + strcpy(parser->filename, filename); + parser->line_buffer = NULL; + parser->buffer_size = 0; + load_metadata(parser); + parser->genotypes = malloc(parser->sample_num * sizeof *parser->genotypes); + parser->n_alts = 0; + parser->flags = flags; + parser->current_result = ALLELE_NOT_FETCHED; + return parser; +} + +/* Compare most recent alleles from two parsers */ +int parser_allele_cmp(const void *a, const void *b) +{ + VCF_PARSER *parser_a = (VCF_PARSER *)a; + VCF_PARSER *parser_b = (VCF_PARSER *)b; + return allele_cmp(&parser_a->current_allele, &parser_b->current_allele); +} + +/* Compare alleles (for sorting them alphabetically) */ +static inline int alt_comp(const void *a, const void *b) +{ + return strcmp(*(char *const *)b, *(char *const *)a); +} + +int fetch_next_allele(VCF_PARSER *parser) +{ + // TODO: add error handling in case of incorrect file contents + // Fetching successive ALT alleles at the same position + while (parser->n_alts) { + // Pop ALT allele (LIFO) + parser->current_allele.alt = parser->alt_alleles[--(parser->n_alts)]; + if (parser->flags & VCF_ONLY_SNPS) { + if (parser->current_allele.alt[1] != '\0') continue; + } else if (parser->flags & VCF_ONLY_INDELS) { + if (parser->current_allele.alt[1] == '\0' + && parser->current_allele.ref[1] == '\0') continue; + } + return parser->current_result = ALLELE_FETCHED; + } + char *columns[VCF_NUM_COLUMNS]; + char *line_context; + while (getline(&parser->line_buffer, + &parser->buffer_size, + parser->file_handle) != -1) { + if (parser->line_buffer[0] != '#') { + columns[CHROM_COLUMN] = strtok_r(parser->line_buffer, "\t", + &line_context); + for (int i = 1; i < VCF_NUM_COLUMNS; ++i) { + columns[i] = strtok_r(NULL, "\t", &line_context); + } + strcpy(parser->current_chromosome, columns[CHROM_COLUMN]); + parser->current_allele.position = atoi(columns[POS_COLUMN]); + parser->current_allele.ref = columns[REF_COLUMN]; + + if (parser->flags & VCF_ONLY_SNPS) { + if (parser->current_allele.ref[1] != '\0') continue; + } + + for (size_t i = 0; i < parser->sample_num; ++i) { + char *gt = strtok_r(NULL, "\t", &line_context); + parser->genotypes[i] = get_genotype_code(gt); + } + parser->alt_alleles[0] = strtok_r(columns[ALT_COLUMN], ",", + &parser->allele_context); + parser->n_alts = 1; + while ((parser->alt_alleles[parser->n_alts] + = strtok_r(NULL, ",", &(parser->allele_context))) + != NULL) { + ++(parser->n_alts); + } + + qsort(parser->alt_alleles, parser->n_alts, + sizeof(char *), alt_comp); + + return fetch_next_allele(parser); + } + } + return parser->current_result = ALLELE_NOT_FETCHED; +} + +void goto_next_chromosome(VCF_PARSER *parser) +{ + char previous_chromosome[MAX_CHROMOSOME_NAME_LENGTH]; + strcpy(previous_chromosome, parser->current_chromosome); + while (fetch_next_allele(parser) != ALLELE_NOT_FETCHED) { + if (strcmp(parser->current_chromosome, previous_chromosome)) { + // Reached next chromosome + return; + } + } +} + +void close_parser(VCF_PARSER *parser) +{ + if (parser->line_buffer != NULL) { + free(parser->line_buffer); + } + free(parser->genotypes); + for (size_t i = 0; i < parser->sample_num; ++i) { + free(parser->samples[i]); + } + free(parser->samples); + fclose(parser->file_handle); + free(parser); +} diff --git a/src/vcf_writer.c b/src/vcf_writer.c new file mode 100644 index 0000000..2a1b1af --- /dev/null +++ b/src/vcf_writer.c @@ -0,0 +1,102 @@ +/* vcf_writer.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "vcf_writer.h" + +#include "tersect_db_internal.h" +#include "version.h" + +#include + +/** + * Format strings for printing out variants as VCF lines. + * The positions in the array correspond to codes defined in snv.h. + * + * These are the 8 fixed fields required by the VCF format: + * CHROM (string) chromosome name + * POS (integer) reference position + * ID (string) identifier, not used by tersect (".") + * REF (string) reference base, only one character for SNVs + * ALT (string) alternate base, only one character for SNVs + * QUAL (float) quality, not used by tersect (".") + * FILTER (string) filter status, not used by tersect (".") + * INFO (string) additional information, not used by tersect (".") + */ +const char * const variant_format[] = { + "%s\t%u\t.\t%s\t.\t.\t.\n", // 0 non-SNV variant + "%s\t%u\t.\tA\tC\t.\t.\t.\n", // 1 SNV_A_C + "%s\t%u\t.\tA\tG\t.\t.\t.\n", // 2 SNV_A_G + "%s\t%u\t.\tA\tT\t.\t.\t.\n", // 3 SNV_A_T + "%s\t%u\t.\tC\tA\t.\t.\t.\n", // 4 SNV_C_A + "%s\t%u\t.\tC\tG\t.\t.\t.\n", // 5 SNV_C_G + "%s\t%u\t.\tC\tT\t.\t.\t.\n", // 6 SNV_C_T + "%s\t%u\t.\tG\tA\t.\t.\t.\n", // 7 SNV_G_A + "%s\t%u\t.\tG\tC\t.\t.\t.\n", // 8 SNV_G_C + "%s\t%u\t.\tG\tT\t.\t.\t.\n", // 9 SNV_G_T + "%s\t%u\t.\tT\tA\t.\t.\t.\n", // 10 SNV_T_A + "%s\t%u\t.\tT\tC\t.\t.\t.\n", // 11 SNV_T_C + "%s\t%u\t.\tT\tG\t.\t.\t.\n", // 12 SNV_T_G +}; + +void vcf_print_header(const char *command, size_t nregions, + char **region_strings) +{ + printf("##fileformat="VCF_FORMAT"\n"); + printf("##tersectVersion="TERSECT_VERSION"\n"); + printf("##tersectCommand=%s\n", command); + if (region_strings != NULL) { + printf("##tersectRegion="); + for (size_t i = 0; i < nregions; ++i) { + printf("%s", region_strings[i]); + if (i + 1 < nregions) printf(" "); + } + printf("\n"); + } + printf("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"); +} + +static inline void print_snv(const tersect_db *tdb, const struct variant v, + const char *chr_name) +{ + if (v.type) { + // SNV + printf(variant_format[v.type], chr_name, v.position); + } else { + // InDel + printf(variant_format[0], chr_name, v.position, + (char *)(tdb->mapping + v.allele)); + } +} + +void vcf_print_bitarray(const tersect_db *tdb, const struct bitarray *ba, + const struct tersect_db_interval *ti) +{ + size_t allele_num; + uint64_t *allele_indices; + bitarray_get_set_indices(ba, &allele_num, &allele_indices); + for (size_t i = 0; i < allele_num; ++i) { + print_snv(tdb, ti->variants[allele_indices[i]], ti->chromosome.name); + } + free(allele_indices); // Allocated by bitarray_get_set_indices +} diff --git a/src/version.h.in b/src/version.h.in new file mode 100644 index 0000000..a45a0cc --- /dev/null +++ b/src/version.h.in @@ -0,0 +1,33 @@ +/* version.h + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#ifndef VERSION_H +#define VERSION_H + +#define TERSECT_VERSION "@TERSECT_VERSION_TAG@" + +/* Has to be 13 characters long */ +#define TERSECT_FORMAT_VERSION "TersectDB 0.2" + +#endif diff --git a/src/view.c b/src/view.c new file mode 100644 index 0000000..0bb4141 --- /dev/null +++ b/src/view.c @@ -0,0 +1,124 @@ +/* view.c + + Copyright (C) 2019 Cranfield University + + Author: Tomasz Kurowski + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. */ + +#include "view.h" + +#include "ast.h" +#include "query.h" +#include "tersect_db.h" +#include "vcf_writer.h" + +#include +#include + +/* Local flags for view */ +#define NO_HEADERS 2 +static int local_flags = 0; + +static void usage(FILE *stream) +{ + fprintf(stream, + "\n" + "Usage: tersect view [options] [region]...\n\n" + "Options:\n" + " -h, --help print this help message\n" + " -n, --no-header skip VCF header\n" + "\n"); +} + +error_t tersect_view_set(int argc, char **argv) +{ + error_t rc = SUCCESS; + char *db_filename = NULL; + char *query = NULL; + char **region_strings = NULL; + size_t nregions = 0; + static struct option loptions[] = { + {"help", no_argument, NULL, 'h'}, + {"no-headers", no_argument, NULL, 'n'}, + {NULL, 0, NULL, 0} + }; + int c; + while ((c = getopt_long(argc, argv, ":hn", loptions, NULL)) != -1) { + switch(c) { + case 'h': + usage(stdout); + return SUCCESS; + case 'n': + local_flags |= NO_HEADERS; + break; + default: + usage(stderr); + return SUCCESS; + } + } + argc -= optind; + argv += optind; + if (!argc) { + // Missing Tersect index file + usage(stderr); + return E_NO_TSI_FILE; + } + db_filename = argv[0]; + if (argc == 1) { + // Missing set query + return E_VIEW_NO_QUERY; + } + query = argv[1]; + argc -= 2; + argv += 2; + if (argc) { + region_strings = argv; + nregions = argc; + } + tersect_db *tdb = tersect_db_open(db_filename); + if (tdb == NULL) return E_TSI_NOPEN; + struct genomic_interval *regions; + if (nregions) { + rc = tersect_db_parse_regions(tdb, nregions, region_strings, ®ions); + } else { + rc = tersect_db_get_regions(tdb, &nregions, ®ions); + } + if (rc != SUCCESS) goto cleanup_1; + struct ast_node *command = run_set_parser(query, tdb); + if (command == NULL) goto cleanup_2; + if (!(local_flags & NO_HEADERS)) { + vcf_print_header(query, nregions, region_strings); + } + for (size_t i = 0; i < nregions; ++i) { + struct tersect_db_interval ti; + tersect_db_get_interval(tdb, ®ions[i], &ti); + struct bitarray *result = eval_ast(command, tdb, &ti); + if (result == NULL) goto cleanup_3; + vcf_print_bitarray(tdb, result, &ti); + free_bitarray(result); + } +cleanup_3: + free_ast(command); +cleanup_2: + free(regions); +cleanup_1: + tersect_db_close(tdb); + return rc; +}