Skip to content

Commit

Permalink
Release 1.6: sort command; filtering and other improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
daviesrob committed Sep 28, 2017
2 parents 04608ff + d564233 commit 7b813be
Show file tree
Hide file tree
Showing 52 changed files with 2,245 additions and 504 deletions.
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ OBJS = main.o vcfindex.o tabix.o \
vcfcnv.o HMM.o vcfplugin.o consensus.o ploidy.o bin.o hclust.o version.o \
regidx.o smpl_ilist.o csq.o vcfbuf.o \
mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \
vcfsort.o \
ccall.o em.o prob1.o kmin.o # the original samtools calling

prefix = /usr/local
Expand Down Expand Up @@ -92,7 +93,7 @@ endif

include config.mk

PACKAGE_VERSION = 1.5
PACKAGE_VERSION = 1.6

# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
# description of the working tree: either a release tag with the same value
Expand Down Expand Up @@ -202,6 +203,7 @@ vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vc
vcfroh.o: vcfroh.c $(roh_h)
vcfcnv.o: vcfcnv.c $(cnv_h)
vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h)
vcfsort.o: vcfsort.c $(htslib_vcf_h) $(bcftools_h)
vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) $(bin_h)
vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h)
reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_kseq_h) $(bcftools_h)
Expand Down
33 changes: 33 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,36 @@
## Release 1.6 (September 2017)

* New `sort` command.

* New options added to the `consensus` command. Note that the `-i, --iupac`
option has been renamed to `-I, --iupac`, in favor of the standard
`-i, --include`.

* Filtering expressions (`-i/-e`): support for `GT=<type>` expressions and
for lists and ranges (#639) - see the man page for details.

* `csq`: relax some GFF3 parsing restrictions to enable using Ensembl
GFF3 files for plants (#667)

* `stats`: add further documentation to output stats files (#316) and
include haploid counts in per-sample output (#671).

* `plot-vcfstats`: further fixes for Python3 (@nsoranzo, #645, #666).

* `query` bugfix (#632)

* `+setGT` plugin: new option to set genotypes based on a two-tailed binomial
distribution test. Also, allow combining `-i/-e` with `-t q`.

* `mpileup`: fix typo (#636)

* `convert --gvcf2vcf` bugfix (#641)

* `+mendelian`: recognize some mendelian inconsistencies that were
being missed (@oronnavon, #660), also add support for multiallelic
sites and sex chromosomes.


## Release 1.5 (June 2017)

* Added autoconf support to bcftools. See `INSTALL` for more details.
Expand Down
1 change: 1 addition & 0 deletions config.mk.in
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
prefix = @prefix@
exec_prefix = @exec_prefix@
bindir = @bindir@
libexecdir = @libexecdir@
datarootdir = @datarootdir@
mandir = @mandir@

Expand Down
129 changes: 104 additions & 25 deletions consensus.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* The MIT License
Copyright (c) 2014 Genome Research Ltd.
Copyright (c) 2014-2017 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
Expand Down Expand Up @@ -39,6 +39,16 @@
#include "regidx.h"
#include "bcftools.h"
#include "rbuf.h"
#include "filter.h"

// Logic of the filters: include or exclude sites which match the filters?
#define FLT_INCLUDE 1
#define FLT_EXCLUDE 2

#define PICK_REF 1
#define PICK_ALT 2
#define PICK_LONG 4
#define PICK_SHORT 8

typedef struct
{
Expand Down Expand Up @@ -75,12 +85,16 @@ typedef struct
chain_t *chain; // chain structure to store the sequence of ungapped blocks between the ref and alt sequences
// Note that the chain is re-initialised for each chromosome/seq_region

filter_t *filter;
char *filter_str;
int filter_logic; // include or exclude sites which match the filters? One of FLT_INCLUDE/FLT_EXCLUDE

bcf_srs_t *files;
bcf_hdr_t *hdr;
FILE *fp_out;
FILE *fp_chain;
char **argv;
int argc, output_iupac, haplotype, isample;
int argc, output_iupac, haplotype, allele, isample;
char *fname, *ref_fname, *sample, *output_fname, *mask_fname, *chain_fname;
}
args_t;
Expand Down Expand Up @@ -195,7 +209,7 @@ static void init_data(args_t *args)
args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
}
if ( args->haplotype && args->isample<0 )
if ( (args->haplotype || args->allele) && args->isample<0 )
{
if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
args->isample = 0;
Expand All @@ -220,10 +234,14 @@ static void init_data(args_t *args)
if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
}
else args->fp_out = stdout;
if ( args->isample<0 ) fprintf(stderr,"Note: the --sample option not given, applying all records\n");
if ( args->filter_str )
args->filter = filter_init(args->hdr, args->filter_str);
}

static void destroy_data(args_t *args)
{
if (args->filter) filter_destroy(args->filter);
bcf_sr_destroy(args->files);
int i;
for (i=0; i<args->vcf_rbuf.m; i++)
Expand Down Expand Up @@ -287,9 +305,16 @@ static bcf1_t **next_vcf_line(args_t *args)
int i = rbuf_shift(&args->vcf_rbuf);
return &args->vcf_buf[i];
}
else if ( bcf_sr_next_line(args->files) )
while ( bcf_sr_next_line(args->files) )
{
if ( args->filter )
{
int is_ok = filter_test(args->filter, bcf_sr_get_line(args->files,0), NULL);
if ( args->filter_logic & FLT_EXCLUDE ) is_ok = is_ok ? 0 : 1;
if ( !is_ok ) continue;
}
return &args->files->readers[0].buffer[0];

}
return NULL;
}
static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
Expand Down Expand Up @@ -358,33 +383,36 @@ static void apply_variant(args_t *args, bcf1_t *rec)
int i, ialt = 1;
if ( args->isample >= 0 )
{
bcf_unpack(rec, BCF_UN_FMT);
bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
if ( !fmt ) return;

if ( fmt->type!=BCF_BT_INT8 )
error("Todo: GT field represented with BCF_BT_INT8, too many alleles at %s:%d?\n",bcf_seqname(args->hdr,rec),rec->pos+1);
uint8_t *ptr = fmt->p + fmt->size*args->isample;

if ( args->haplotype )
{
if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
ialt = ptr[args->haplotype-1];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);
}
else if ( args->output_iupac )
{
uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
ialt = ptr[0];
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);

int jalt;
if ( fmt->n>1 )
{
ptr = fmt->p + fmt->size*args->isample + 1;
jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
jalt = ptr[1];
if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
else jalt = bcf_gt_allele(jalt);
}
else jalt = ialt;
if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Invalid VCF, too few ALT alleles at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
{
char ial = rec->d.allele[ialt][0];
Expand All @@ -394,13 +422,40 @@ static void apply_variant(args_t *args, bcf1_t *rec)
}
else
{
int is_hom = 1;
for (i=0; i<fmt->n; i++)
{
uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
ialt = bcf_gt_allele(ialt);
if ( ialt ) break;
if ( bcf_gt_is_missing(ptr[i]) ) return; // ignore missing or half-missing genotypes
if ( ptr[i]==bcf_int32_vector_end ) break;
ialt = bcf_gt_allele(ptr[i]);
if ( i>0 && ialt!=bcf_gt_allele(ptr[i-1]) ) { is_hom = 0; break; }
}
if ( !is_hom )
{
int prev_len = 0, jalt;
for (i=0; i<fmt->n; i++)
{
if ( ptr[i]==bcf_int32_vector_end ) break;
jalt = bcf_gt_allele(ptr[i]);
if ( rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
if ( args->allele & (PICK_LONG|PICK_SHORT) )
{
int len = jalt==0 ? rec->rlen : strlen(rec->d.allele[jalt]);
if ( i==0 ) ialt = jalt, prev_len = len;
else if ( len == prev_len )
{
if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt, prev_len = len;
else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt, prev_len = len;
}
else if ( args->allele & PICK_LONG && len > prev_len ) ialt = jalt, prev_len = len;
else if ( args->allele & PICK_SHORT && len < prev_len ) ialt = jalt, prev_len = len;
}
else
{
if ( args->allele & PICK_REF && jalt==0 ) ialt = jalt;
else if ( args->allele & PICK_ALT && ialt==0 ) ialt = jalt;
}
}
}
}
if ( !ialt ) return; // ref allele
Expand Down Expand Up @@ -623,12 +678,21 @@ static void usage(args_t *args)
fprintf(stderr, " information, such as INFO/AD or FORMAT/AD.\n");
fprintf(stderr, "Usage: bcftools consensus [OPTIONS] <file.vcf>\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -c, --chain <file> write a chain file for liftover\n");
fprintf(stderr, " -e, --exclude <expr> exclude sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -f, --fasta-ref <file> reference sequence in fasta format\n");
fprintf(stderr, " -H, --haplotype <1|2> apply variants for the given haplotype\n");
fprintf(stderr, " -i, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
fprintf(stderr, " -H, --haplotype <which> choose which allele to use from the FORMAT/GT field, note\n");
fprintf(stderr, " the codes are case-insensitive:\n");
fprintf(stderr, " 1: first allele from GT\n");
fprintf(stderr, " 2: second allele\n");
fprintf(stderr, " R: REF allele in het genotypes\n");
fprintf(stderr, " A: ALT allele\n");
fprintf(stderr, " LR,LA: longer allele and REF/ALT if equal length\n");
fprintf(stderr, " SR,SA: shorter allele and REF/ALT if equal length\n");
fprintf(stderr, " -i, --include <expr> select sites for which the expression is true (see man page for details)\n");
fprintf(stderr, " -I, --iupac-codes output variants in the form of IUPAC ambiguity codes\n");
fprintf(stderr, " -m, --mask <file> replace regions with N\n");
fprintf(stderr, " -o, --output <file> write output to a file [standard output]\n");
fprintf(stderr, " -c, --chain <file> write a chain file for liftover\n");
fprintf(stderr, " -s, --sample <name> apply variants of the given sample\n");
fprintf(stderr, "Examples:\n");
fprintf(stderr, " # Get the consensus for one region. The fasta header lines are then expected\n");
Expand All @@ -645,8 +709,10 @@ int main_consensus(int argc, char *argv[])

static struct option loptions[] =
{
{"exclude",required_argument,NULL,'e'},
{"include",required_argument,NULL,'i'},
{"sample",1,0,'s'},
{"iupac-codes",0,0,'i'},
{"iupac-codes",0,0,'I'},
{"haplotype",1,0,'H'},
{"output",1,0,'o'},
{"fasta-ref",1,0,'f'},
Expand All @@ -655,19 +721,32 @@ int main_consensus(int argc, char *argv[])
{0,0,0,0}
};
int c;
while ((c = getopt_long(argc, argv, "h?s:1iH:f:o:m:c:",loptions,NULL)) >= 0)
while ((c = getopt_long(argc, argv, "h?s:1Ii:e:H:f:o:m:c:",loptions,NULL)) >= 0)
{
switch (c)
{
case 's': args->sample = optarg; break;
case 'o': args->output_fname = optarg; break;
case 'i': args->output_iupac = 1; break;
case 'I': args->output_iupac = 1; break;
case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
case 'f': args->ref_fname = optarg; break;
case 'm': args->mask_fname = optarg; break;
case 'c': args->chain_fname = optarg; break;
case 'H':
args->haplotype = optarg[0] - '0';
if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
if ( !strcasecmp(optarg,"R") ) args->allele |= PICK_REF;
else if ( !strcasecmp(optarg,"A") ) args->allele |= PICK_ALT;
else if ( !strcasecmp(optarg,"L") ) args->allele |= PICK_LONG|PICK_REF;
else if ( !strcasecmp(optarg,"S") ) args->allele |= PICK_SHORT|PICK_REF;
else if ( !strcasecmp(optarg,"LR") ) args->allele |= PICK_LONG|PICK_REF;
else if ( !strcasecmp(optarg,"LA") ) args->allele |= PICK_LONG|PICK_ALT;
else if ( !strcasecmp(optarg,"SR") ) args->allele |= PICK_SHORT|PICK_REF;
else if ( !strcasecmp(optarg,"SA") ) args->allele |= PICK_SHORT|PICK_ALT;
else
{
args->haplotype = optarg[0] - '0';
if ( args->haplotype <=0 ) error("Expected positive integer with --haplotype\n");
}
break;
default: usage(args); break;
}
Expand Down
Loading

0 comments on commit 7b813be

Please sign in to comment.