Skip to content

Commit

Permalink
woltka_merge biom
Browse files Browse the repository at this point in the history
  • Loading branch information
antgonza committed Sep 17, 2024
1 parent 118729c commit 2e2df4b
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 28 deletions.
45 changes: 29 additions & 16 deletions qp_woltka/tests/test_woltka.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,8 @@ def test_woltka_to_array_rep82(self):
'#SBATCH --mail-user "qiita.help@gmail.com"\n',
f'#SBATCH --job-name merge-{job_id}\n',
'#SBATCH -N 1\n',
'#SBATCH -n 1\n',
'#SBATCH --time 30:00:00\n',
'#SBATCH -n 12\n',
'#SBATCH --time 20:00:00\n',
'#SBATCH --mem 140g\n',
f'#SBATCH --output {out_dir}/merge-{job_id}.log\n',
f'#SBATCH --error {out_dir}/merge-{job_id}.err\n',
Expand All @@ -180,9 +180,14 @@ def test_woltka_to_array_rep82(self):
"sruns=`grep 'overall alignment rate' *.err | wc -l`\n",
'if [[ ! -f "errors.log" && $sruns -eq "1" ]]; then\n',
f'woltka_merge mxdx --base {out_dir}\n',
f'woltka classify -i {out_dir}/alignments -o {out_dir}/woltka '
f'--no-demux --lineage {database}.tax '
'--rank free,none --outcov coverages/\n',
f'mkdir -p {out_dir}/bioms\n',
f'for f in `ls {out_dir}/alignments/*.sam.xz`; do bname=`basename '
'${f/.sam.xz/}`; echo woltka classify -i $f -o '
f'{out_dir}/bioms/'
'${bname} --no-demux --lineage '
f'{database}.tax --rank none --outcov {out_dir}/coverages/; '
'done | parallel -j 12\n',
f'woltka_merge biom --base {out_dir}\n',
f'cd {out_dir};\n',
'\n',
'cd alignments; tar -cvf ../alignment.tar *.sam.xz; cd ..; '
Expand All @@ -196,7 +201,7 @@ def test_woltka_to_array_rep82(self):
# now let's test that it finished correctly
sdir = 'qp_woltka/support_files/'
mkdir(f'{out_dir}/woltka')
copyfile(f'{sdir}/none.biom', f'{out_dir}/woltka/none.biom')
copyfile(f'{sdir}/none.biom', f'{out_dir}/none.biom')
copyfile(f'{sdir}/alignment.tar', f'{out_dir}/alignment.tar')
copyfile(f'{sdir}/coverages.tgz', f'{out_dir}/coverages.tgz')

Expand All @@ -208,7 +213,7 @@ def test_woltka_to_array_rep82(self):

exp = [
ArtifactInfo('Per genome Predictions', 'BIOM',
[(f'{out_dir}/woltka/none.biom', 'biom'),
[(f'{out_dir}/none.biom', 'biom'),
(f'{out_dir}/alignment.tar', 'log'),
(f'{out_dir}/none/coverages.tgz', 'plain_text')])]
self.assertCountEqual(ainfo, exp)
Expand Down Expand Up @@ -285,8 +290,8 @@ def test_woltka_to_array_wol(self):
'#SBATCH --mail-user "qiita.help@gmail.com"\n',
f'#SBATCH --job-name merge-{job_id}\n',
'#SBATCH -N 1\n',
'#SBATCH -n 1\n',
'#SBATCH --time 30:00:00\n',
'#SBATCH -n 12\n',
'#SBATCH --time 20:00:00\n',
'#SBATCH --mem 140g\n',
f'#SBATCH --output {out_dir}/merge-{job_id}.log\n',
f'#SBATCH --error {out_dir}/merge-{job_id}.err\n',
Expand All @@ -299,11 +304,19 @@ def test_woltka_to_array_wol(self):
"sruns=`grep 'overall alignment rate' *.err | wc -l`\n",
'if [[ ! -f "errors.log" && $sruns -eq "1" ]]; then\n',
f'woltka_merge mxdx --base {out_dir}\n',
f'woltka classify -i {out_dir}/alignments -o {out_dir}/woltka '
f'--no-demux --lineage {database}.tax '
'--rank free,none --outcov coverages/\n',
f'woltka classify -i {out_dir}/alignments --no-demux -c '
f'{database}.coords -o per-gene.biom\n',
f'mkdir -p {out_dir}/bioms\n',
f'for f in `ls {out_dir}/alignments/*.sam.xz`; do bname=`basename '
'${f/.sam.xz/}`; echo woltka classify -i $f -o '
f'{out_dir}/bioms/'
'${bname} --no-demux --lineage '
f'{database}.tax --rank none --outcov {out_dir}/coverages/; '
'done | parallel -j 12\n',
f'for f in `ls {out_dir}/alignments/*.sam.xz`; do bname=`basename '
'${f/.sam.xz/}`; echo woltka classify -i $f -o '
f'{out_dir}/bioms/'
'${bname}/per-gene.biom --no-demux -c '
f'{database}.coords; done | parallel -j 12\n',
f'woltka_merge biom --base {out_dir}\n',
f'cd {out_dir};\n',
'\n',
'cd alignments; tar -cvf ../alignment.tar *.sam.xz; cd ..; '
Expand All @@ -317,7 +330,7 @@ def test_woltka_to_array_wol(self):
# now let's test that it finished correctly
sdir = 'qp_woltka/support_files/'
mkdir(f'{out_dir}/woltka')
copyfile(f'{sdir}/none.biom', f'{out_dir}/woltka/none.biom')
copyfile(f'{sdir}/none.biom', f'{out_dir}/none.biom')
copyfile(f'{sdir}/per-gene.biom', f'{out_dir}/per-gene.biom')
copyfile(f'{sdir}/alignment.tar', f'{out_dir}/alignment.tar')
copyfile(f'{sdir}/coverages.tgz', f'{out_dir}/coverages.tgz')
Expand All @@ -329,7 +342,7 @@ def test_woltka_to_array_wol(self):

exp = [
ArtifactInfo('Per genome Predictions', 'BIOM',
[(f'{out_dir}/woltka/none.biom', 'biom'),
[(f'{out_dir}/none.biom', 'biom'),
(f'{out_dir}/alignment.tar', 'log'),
(f'{out_dir}/none/coverages.tgz', 'plain_text')]),
ArtifactInfo('Per gene Predictions', 'BIOM',
Expand Down
30 changes: 21 additions & 9 deletions qp_woltka/woltka.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
BATCHSIZE = 50000000

WALLTIME = '40:00:00'
MERGE_WALLTIME = '30:00:00'
MERGE_WALLTIME = '20:00:00'
SYNDNA_WALLTIME = '8:00:00'


Expand Down Expand Up @@ -131,15 +131,25 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
"coverage_percentage(['artifact.cov'], '"
f'{db_files["length.map"]}' "')))\"")

ranks = ','.join(["free", "none"])
ranks = ','.join(["none"])
woltka_cmds = [
f'woltka classify -i {output}/alignments -o {output}/woltka '
f'--no-demux --lineage {db_files["taxonomy"]} --rank {ranks} '
'--outcov coverages/']
# creating the output folder
f'mkdir -p {output}/bioms',
# executing the parallel classify
f'for f in `ls {output}/alignments/*.sam.xz`; '
'do bname=`basename ${f/.sam.xz/}`; echo woltka classify -i $f '
f'-o {output}/bioms/' '${bname} --no-demux --lineage '
f'{db_files["taxonomy"]} --rank {ranks} --outcov {output}/coverages/; '
'done | parallel -j 12']

if db_files['gene_coordinates']:
woltka_cmds.append(
f'woltka classify -i {output}/alignments '
f'--no-demux -c {db_files["gene_coordinates"]} -o per-gene.biom')
f'for f in `ls {output}/alignments/*.sam.xz`; '
'do bname=`basename ${f/.sam.xz/}`; echo woltka classify -i $f '
f'-o {output}/'
'bioms/${bname}/per-gene.biom --no-demux -c '
f'{db_files["gene_coordinates"]}; done | parallel -j 12')
woltka_cmds.append(f'woltka_merge biom --base {output}')

wcdm = 'woltka tools collapse -i '
dbfk = db_files['kegg']
Expand All @@ -161,13 +171,15 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
woltka_cmds.append(
f'{wcdm} module.biom -m '
f'{dbfk["module-to-pathway.map"]} -o pathway.biom')
else:
woltka_cmds.append(f'woltka_merge biom --base {output}')

lines = ['#!/bin/bash',
'#SBATCH -p qiita',
'#SBATCH --mail-user "qiita.help@gmail.com"',
f'#SBATCH --job-name merge-{name}',
'#SBATCH -N 1',
'#SBATCH -n 1',
'#SBATCH -n 12',
f'#SBATCH --time {MERGE_WALLTIME}',
f'#SBATCH --mem {MERGE_MEMORY}',
f'#SBATCH --output {output}/merge-{name}.log',
Expand Down Expand Up @@ -289,7 +301,7 @@ def _coverage_copy(dest):
errors = []
ainfo = []

fp_biom = f'{out_dir}/woltka/none.biom'
fp_biom = f'{out_dir}/none.biom'
fp_alng = f'{out_dir}/alignment.tar'
if exists(fp_biom) and exists(fp_alng):
ainfo.append(ArtifactInfo('Per genome Predictions', 'BIOM', [
Expand Down
6 changes: 3 additions & 3 deletions scripts/woltka_merge
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def biom(base):
# this is the size that was used in the tests that kept a small ~2.5G
# memory footprint
chunk_size = 30
for rank in ('free.biom', 'none.biom', 'per-gene.biom'):
for rank in ('none.biom', 'per-gene.biom'):
tables = glob(f'{base}/bioms/*/{rank}')

if not tables:
Expand Down Expand Up @@ -81,8 +81,8 @@ def biom(base):
else:
full = full.concat(loaded)

with h5py.File(f'{base}/{rank}', 'w') as out:
full.to_hdf5(out, 'fast-merge')
with h5py.File(f'{base}/{rank}', 'w') as out:
full.to_hdf5(out, 'fast-merge')


cli.add_command(mxdx)
Expand Down

0 comments on commit 2e2df4b

Please sign in to comment.