#! /usr/bin/env python
# 2014-2015 Markus Hsi-Yang Fritz
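#
# Example usage (file names below are placeholders):
#   ./suave_bam_to_h5.py -c gzip -s sample1 sample1.bam
# To run in parallel (assumes mpi4py plus an MPI-enabled h5py/HDF5 build):
#   mpiexec -n 4 ./suave_bam_to_h5.py sample1.bam
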
from __future__ import division
import pysam
import numpy as np
import h5py
import click
import re
import os.path
import math
import sys

try:
    import mpi4py
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    mpi_rank = comm.rank
    mpi_size = comm.size
except ImportError:
    mpi_rank = 0
    mpi_size = 1

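# Coverage is binned into non-overlapping windows of CHUNK_SIZE bp; each read
# start position is assigned to a bin by integer division below.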
CHUNK_SIZE = 100


def gen_outfile_name(out, bam):
    if out:
        return out
    bam = os.path.basename(bam)
    if bam.endswith('.bam'):
        return re.sub(r'bam$', 'h5', bam)
    return bam + '.h5'


def gen_sample_name(smpl, bam):
    if smpl:
        return smpl
    bam = os.path.basename(bam)
    return re.sub(r'\.bam$', '', bam)

def bam_to_h5(bamfile,
              sample=None,
              compression=None,
              outfile=None,
              force=False):
    out_fn = gen_outfile_name(outfile, bamfile)
    sample_name = gen_sample_name(sample, bamfile)

    if os.path.exists(out_fn) and not force:
        click.echo('error: file "{}" exists; use --force to overwrite'
                   .format(out_fn), err=True)
        sys.exit(1)

    if mpi_size == 1:
        f_h5 = h5py.File(out_fn, 'w')
    else:
        # collective parallel writes need an MPI-enabled h5py/HDF5 build
        f_h5 = h5py.File(out_fn, 'w', driver='mpio', comm=comm)

    if mpi_rank == 0:
        f_h5.attrs['sample'] = sample_name

    f_bam = pysam.Samfile(bamfile)

    kwargs = {}
    if compression:
        kwargs['compression'] = compression

    # each MPI rank handles its own slice of the chromosome list
    # (a single slice covering everything when running serially)
    chroms = [(sq['SN'], sq['LN']) for sq in f_bam.header['SQ']]
    chroms_chunks = np.array_split(chroms, mpi_size)
    mpi_rank_chroms_chunk = chroms_chunks[mpi_rank]

    for chrom, chrom_len in mpi_rank_chroms_chunk:
        chrom = str(chrom)
        chrom_len = int(chrom_len)
        click.echo('[t{}] processing {}'.format(mpi_rank, chrom), err=True)
        f_h5.create_group(chrom)
        f_h5[chrom].attrs['length'] = np.uint32(chrom_len)
        nbins = int(math.ceil(chrom_len/CHUNK_SIZE))
        cnts = np.zeros(nbins, dtype='uint32')
        for read in f_bam.fetch(chrom):
            # skip secondary, duplicate and QC-fail alignments
            if read.is_secondary or read.is_duplicate or read.is_qcfail:
                continue
            cnts[read.pos // CHUNK_SIZE] += 1
        kwargs['data'] = cnts
        f_h5.create_dataset('{}/read_counts'.format(chrom), **kwargs)
        dset = f_h5['{}/read_counts'.format(chrom)]
        dset.attrs['min'] = np.min(cnts)
        dset.attrs['max'] = np.max(cnts)
        dset.attrs['mean'] = np.mean(cnts)
        dset.attrs['median'] = np.median(cnts)
        dset.attrs['sum'] = np.sum(cnts)

    # TODO actually, don't store this as it's easy/fast to compute on the fly
    # store genome-wide read count for normalization
    read_cnt_tot = 0
    for chrom in f_h5:
        read_cnt_tot += f_h5['{}/read_counts'.format(chrom)].attrs['sum']
    f_h5.attrs['read_counts_sum'] = np.array(read_cnt_tot, dtype=np.uint64)

    f_bam.close()

@click.command()
@click.option('-s', '--sample', help='sample name')
@click.option('-c', '--compression', type=click.Choice(['gzip', 'lzf']),
              help='HDF5 file compression')
@click.option('-o', '--outfile', help='output HDF5 file name')
@click.option('-f', '--force/--no-force', default=False,
              help='overwrite existing file')
@click.argument('bamfile')
def cli(bamfile, sample, compression, outfile, force):
    """Convert BAM to coverage HDF5."""
    bam_to_h5(bamfile, sample, compression, outfile, force)


if __name__ == '__main__':
    cli()