Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Version 0.9.2 #175

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
## Unreleased


## Beta v0.9.2

Bug Fixes:
- Avoid holding alignment file handles open #173


## Beta v0.9.1

New Features:
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ RUN apt-get update \

RUN git clone https://github.com/PlantandFoodResearch/MCHap.git \
&& cd MCHap \
&& git checkout v0.9.1 \
&& git checkout v0.9.2 \
&& pip install -r requirements.txt \
&& python3 setup.py sdist \
&& python3 -m pip install dist/mchap-*tar.gz
Expand Down
2 changes: 1 addition & 1 deletion docs/assemble.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ MCHap assemble

De novo assembly of micro-haplotypes.

*(Last updated for MCHap version 0.9.1)*
*(Last updated for MCHap version 0.9.2)*

Background
----------
Expand Down
2 changes: 1 addition & 1 deletion docs/call.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ MCHap call

Calling genotypes from known haplotypes.

*(Last updated for MCHap version 0.9.1)*
*(Last updated for MCHap version 0.9.2)*

Background
----------
Expand Down
19 changes: 14 additions & 5 deletions mchap/application/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,7 +717,9 @@ def parse_sample_pools(samples, sample_bams, sample_pool_argument):
return pools, pool_bams


def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field):
def parse_sample_bam_paths(
bam_argument, sample_pool_argument, read_group_field, reference_path
):
"""Combine arguments relating to sample bam file specification.

Parameters
Expand All @@ -738,7 +740,7 @@ def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field)
textfile = False
if len(bam_argument) == 1:
try:
pysam.AlignmentFile(bam_argument[0])
pysam.AlignmentFile(bam_argument[0], reference_filename=reference_path)
except ValueError:
# not a bam
textfile = True
Expand All @@ -747,7 +749,9 @@ def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field)
else:
bams = bam_argument
if not textfile:
sample_bams = extract_sample_ids(bams, id=read_group_field)
sample_bams = extract_sample_ids(
bams, id=read_group_field, reference_path=reference_path
)
samples = list(sample_bams)

# case of plain-text filepath
Expand All @@ -761,7 +765,9 @@ def parse_sample_bam_paths(bam_argument, sample_pool_argument, read_group_field)
if n_fields == 1:
# list of bam paths
bams = [line[0] for line in lines]
sample_bams = extract_sample_ids(bams, id=read_group_field)
sample_bams = extract_sample_ids(
bams, id=read_group_field, reference_path=reference_path
)
samples = list(sample_bams)
elif n_fields == 2:
# list of sample-bam pairs
Expand Down Expand Up @@ -871,7 +877,10 @@ def collect_default_program_arguments(arguments):
)
# merge sample specific data with defaults
samples, sample_bams = parse_sample_bam_paths(
arguments.bam, arguments.sample_pool[0], arguments.read_group_field[0]
arguments.bam,
arguments.sample_pool[0],
arguments.read_group_field[0],
reference_path=arguments.reference[0],
)
sample_ploidy = parse_sample_value_map(
arguments.ploidy[0],
Expand Down
51 changes: 21 additions & 30 deletions mchap/application/baseclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,16 +99,6 @@ def format_fields(self):
def loci(self):
raise NotImplementedError()

def alignment_files(self):
out = {}
for pool, pairs in self.sample_bams.items():
pairs = [
(sample, pysam.AlignmentFile(path, reference_filename=self.ref))
for sample, path in pairs
]
out[pool] = pairs
return out

def header_contigs(self):
with pysam.VariantFile(self.vcf) as f:
contigs = f.header.contigs.values()
Expand All @@ -135,7 +125,7 @@ def header(self):
header = meta_fields + contigs + filters + info_fields + format_fields + columns
return [str(line) for line in header]

def _locus_data(self, locus, alignment_files):
def _locus_data(self, locus, sample_bams):
"""Generate a LocusAssemblyData object for a given locus
to be populated with data relating to a single vcf record.
"""
Expand All @@ -144,7 +134,7 @@ def _locus_data(self, locus, alignment_files):
return LocusAssemblyData(
locus=locus,
samples=self.samples,
sample_bams=alignment_files,
sample_bams=sample_bams,
sample_ploidy=self.sample_ploidy,
sample_inbreeding=self.sample_inbreeding,
infofields=infofields,
Expand Down Expand Up @@ -186,19 +176,22 @@ def encode_sample_reads(self, data):
# of bam sample-path pairs.
pairs = data.sample_bams[sample]
read_chars, read_quals = [], []
for name, alignment_file in pairs:
chars, quals = extract_read_variants(
data.locus,
alignment_file=alignment_file,
samples=name,
id=self.read_group_field,
min_quality=self.mapping_quality,
skip_duplicates=self.skip_duplicates,
skip_qcfail=self.skip_qcfail,
skip_supplementary=self.skip_supplementary,
)[name]
read_chars.append(chars)
read_quals.append(quals)
for name, path in pairs:
with pysam.AlignmentFile(
path, reference_filename=self.ref
) as alignment_file:
chars, quals = extract_read_variants(
data.locus,
alignment_file=alignment_file,
samples=name,
id=self.read_group_field,
min_quality=self.mapping_quality,
skip_duplicates=self.skip_duplicates,
skip_qcfail=self.skip_qcfail,
skip_supplementary=self.skip_supplementary,
)[name]
read_chars.append(chars)
read_quals.append(quals)
read_chars = np.concatenate(read_chars)
read_quals = np.concatenate(read_quals)

Expand Down Expand Up @@ -343,7 +336,7 @@ def sumarise_vcf_record(self, data):
data.infodata["AOP"] = prob_occurring.round(self.precision)
return data

def call_locus(self, locus, alignment_files):
def call_locus(self, locus, sample_bams):
"""Call samples at a locus and formats resulting data
into a VCF record line.

Expand All @@ -366,19 +359,17 @@ def call_locus(self, locus, alignment_files):
VCF variant line.

"""
data = self._locus_data(locus, alignment_files)
data = self._locus_data(locus, sample_bams)
self.encode_sample_reads(data)
self.call_sample_genotypes(data)
self.sumarise_sample_genotypes(data)
self.sumarise_vcf_record(data)
return data.format_vcf_record()

def _assemble_loci_wrapped(self, loci):
# create single set of alignment files to use for every locus in the job
alignment_files = self.alignment_files()
for locus in loci:
try:
result = self.call_locus(locus, alignment_files)
result = self.call_locus(locus, self.sample_bams)
except Exception as e:
message = LOCUS_ASSEMBLY_ERROR.format(
name=locus.name,
Expand Down
105 changes: 53 additions & 52 deletions mchap/application/find_snvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,65 +215,69 @@ def _count_alleles(zeros, alleles):
return


def bam_samples(alignment_files, tag="SM"):
out = [None] * len(alignment_files)
for i, bam in enumerate(alignment_files):
read_groups = bam.header["RG"]
sample_id = read_groups[0][tag]
if len(read_groups) > 1:
for rg in read_groups:
if rg[tag] != sample_id:
raise ValueError(
"Expected one sample per bam but found {} and {} in {}".format(
sample_id, rg[tag], bam.filename.decode()
def bam_samples(bam_paths, reference_path, tag="SM"):
out = [None] * len(bam_paths)
for i, path in enumerate(bam_paths):
with pysam.AlignmentFile(path, reference_filename=reference_path) as bam:
read_groups = bam.header["RG"]
sample_id = read_groups[0][tag]
if len(read_groups) > 1:
for rg in read_groups:
if rg[tag] != sample_id:
raise ValueError(
"Expected one sample per bam but found {} and {} in {}".format(
sample_id, rg[tag], bam.filename.decode()
)
)
)
out[i] = sample_id
out[i] = sample_id
return np.array(out)


def bam_region_depths(
alignment_files,
bam_paths,
reference_path,
contig,
start,
stop,
dtype=np.int64,
**kwargs,
):
n_samples = len(alignment_files)
n_samples = len(bam_paths)
n_pos = stop - start
shape = (n_pos, n_samples, 4)
depths = np.zeros(shape, dtype=dtype)
for j, bam in enumerate(alignment_files):
for column in bam.pileup(
contig=contig,
start=start,
stop=stop,
truncate=True,
multiple_iterators=False,
**kwargs,
):
# if start <= column.pos < stop:
i = column.pos - start
alleles = column.get_query_sequences()
if isinstance(alleles, list):
alleles = bases_to_indices(alleles)
_count_alleles(depths[i, j], alleles)
for j, path in enumerate(bam_paths):
with pysam.AlignmentFile(path, reference_filename=reference_path) as bam:
for column in bam.pileup(
contig=contig,
start=start,
stop=stop,
truncate=True,
multiple_iterators=False,
**kwargs,
):
# if start <= column.pos < stop:
i = column.pos - start
alleles = column.get_query_sequences()
if isinstance(alleles, list):
alleles = bases_to_indices(alleles)
_count_alleles(depths[i, j], alleles)
return depths


def write_vcf_header(
command, reference, info_fields=None, format_fields=None, samples=None
command, reference_path, info_fields=None, format_fields=None, samples=None
):
vcfversion_header = str(headermeta.fileformat("v4.3"))
date_header = str(headermeta.filedate())
source_header = str(headermeta.source())
command_header = str(headermeta.commandline(command))
reference_header = str(headermeta.reference(reference.filename.decode()))
contig_header = "\n".join(
str(headermeta.ContigHeader(s, i))
for s, i in zip(reference.references, reference.lengths)
)
with pysam.FastaFile(reference_path) as reference:
reference_header = str(headermeta.reference(reference.filename.decode()))
contig_header = "\n".join(
str(headermeta.ContigHeader(s, i))
for s, i in zip(reference.references, reference.lengths)
)
components = [
vcfversion_header,
date_header,
Expand Down Expand Up @@ -400,8 +404,8 @@ def write_vcf_block(
contig,
start,
stop,
reference,
alignment_files,
reference_path,
bam_paths,
# sample_ploidy,
# sample_inbreeding,
# base_error_rate,
Expand All @@ -420,10 +424,12 @@ def write_vcf_block(
assert start < stop
variant_position = np.arange(start, stop)
variant_contig = np.full(len(variant_position), contig)
variant_reference = np.array(list(reference.fetch(contig, start, stop).upper()))
with pysam.FastaFile(reference_path) as reference:
variant_reference = np.array(list(reference.fetch(contig, start, stop).upper()))
variant_reference_index = bases_to_indices(variant_reference)
allele_depth = bam_region_depths(
alignment_files,
bam_paths,
reference_path,
contig,
start,
stop,
Expand Down Expand Up @@ -587,9 +593,8 @@ def main(command):
bed = pd.read_table(bed_path, header=None)[[0, 1, 2]]
bed.columns = ["contig", "start", "stop"]
reference_path = args.reference[0]
reference = pysam.Fastafile(reference_path)
samples, sample_bams = arguments.parse_sample_bam_paths(
args.bam, None, args.read_group_field[0]
args.bam, None, args.read_group_field[0], reference_path=reference_path
)
# sample_ploidy = arguments.parse_sample_value_map(
# args.ploidy[0],
Expand All @@ -611,13 +616,9 @@ def main(command):
# create alignment file objects and reuse them throughout
# this is important for cram performance!
# also pass reference name explicitly for robustness
alignment_files = [
pysam.AlignmentFile(path, reference_filename=reference_path)
for path in bam_paths
]
samples_found = bam_samples(alignment_files, tag=args.read_group_field[0]).astype(
"U"
)
samples_found = bam_samples(
bam_paths, reference_path, tag=args.read_group_field[0]
).astype("U")
mismatch = samples_found != samples
if np.any(mismatch):
raise IOError(
Expand All @@ -630,7 +631,7 @@ def main(command):
format_fields = [formatfields.GT, formatfields.AD]
write_vcf_header(
command,
reference,
reference_path,
samples=samples,
info_fields=info_fields,
format_fields=format_fields,
Expand All @@ -641,8 +642,8 @@ def main(command):
interval.contig,
interval.start,
interval.stop,
reference,
alignment_files,
reference_path,
bam_paths,
# sample_ploidy,
# sample_inbreeding,
# base_error_rate=args.base_error_rate[0],
Expand Down
4 changes: 2 additions & 2 deletions mchap/io/bam.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
ID_TAGS = {"ID", "SM"}


def extract_sample_ids(bam_paths, id="SM"):
def extract_sample_ids(bam_paths, id="SM", reference_path=None):
"""Extract sample id's from a list of bam files.

Parameters
Expand All @@ -38,7 +38,7 @@ def extract_sample_ids(bam_paths, id="SM"):
assert id in ID_TAGS
data = {}
for path in bam_paths:
bam = pysam.AlignmentFile(path)
bam = pysam.AlignmentFile(path, reference_filename=reference_path)
# allow multiple read groups for a single sample within a bam file
# but guard against duplicate sample identifier across multiple bams
bam_data = {read_group[id]: path for read_group in bam.header["RG"]}
Expand Down
Loading
Loading