woltka_merge biom

qiita-spots · Sep 17, 2024 · 2e2df4b · 2e2df4b
1 parent 118729c
commit 2e2df4b
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 28 deletions.
diff --git a/qp_woltka/tests/test_woltka.py b/qp_woltka/tests/test_woltka.py
@@ -166,8 +166,8 @@ def test_woltka_to_array_rep82(self):
             '#SBATCH --mail-user "qiita.help@gmail.com"\n',
             f'#SBATCH --job-name merge-{job_id}\n',
             '#SBATCH -N 1\n',
-            '#SBATCH -n 1\n',
-            '#SBATCH --time 30:00:00\n',
+            '#SBATCH -n 12\n',
+            '#SBATCH --time 20:00:00\n',
             '#SBATCH --mem 140g\n',
             f'#SBATCH --output {out_dir}/merge-{job_id}.log\n',
             f'#SBATCH --error {out_dir}/merge-{job_id}.err\n',
@@ -180,9 +180,14 @@ def test_woltka_to_array_rep82(self):
             "sruns=`grep 'overall alignment rate' *.err | wc -l`\n",
             'if [[ ! -f "errors.log" && $sruns -eq "1" ]]; then\n',
             f'woltka_merge mxdx --base {out_dir}\n',
-            f'woltka classify -i {out_dir}/alignments -o {out_dir}/woltka '
-            f'--no-demux --lineage {database}.tax '
-            '--rank free,none --outcov coverages/\n',
+            f'mkdir -p {out_dir}/bioms\n',
+            f'for f in `ls {out_dir}/alignments/*.sam.xz`; do bname=`basename '
+            '${f/.sam.xz/}`; echo woltka classify -i $f -o '
+            f'{out_dir}/bioms/'
+            '${bname} --no-demux --lineage '
+            f'{database}.tax --rank none --outcov {out_dir}/coverages/; '
+            'done | parallel -j 12\n',
+            f'woltka_merge biom --base {out_dir}\n',
             f'cd {out_dir};\n',
             '\n',
             'cd alignments; tar -cvf ../alignment.tar *.sam.xz; cd ..; '
@@ -196,7 +201,7 @@ def test_woltka_to_array_rep82(self):
         # now let's test that it finished correctly
         sdir = 'qp_woltka/support_files/'
         mkdir(f'{out_dir}/woltka')
-        copyfile(f'{sdir}/none.biom', f'{out_dir}/woltka/none.biom')
+        copyfile(f'{sdir}/none.biom', f'{out_dir}/none.biom')
         copyfile(f'{sdir}/alignment.tar', f'{out_dir}/alignment.tar')
         copyfile(f'{sdir}/coverages.tgz', f'{out_dir}/coverages.tgz')
 
@@ -208,7 +213,7 @@ def test_woltka_to_array_rep82(self):
 
         exp = [
             ArtifactInfo('Per genome Predictions', 'BIOM',
-                         [(f'{out_dir}/woltka/none.biom', 'biom'),
+                         [(f'{out_dir}/none.biom', 'biom'),
                           (f'{out_dir}/alignment.tar', 'log'),
                           (f'{out_dir}/none/coverages.tgz', 'plain_text')])]
         self.assertCountEqual(ainfo, exp)
@@ -285,8 +290,8 @@ def test_woltka_to_array_wol(self):
             '#SBATCH --mail-user "qiita.help@gmail.com"\n',
             f'#SBATCH --job-name merge-{job_id}\n',
             '#SBATCH -N 1\n',
-            '#SBATCH -n 1\n',
-            '#SBATCH --time 30:00:00\n',
+            '#SBATCH -n 12\n',
+            '#SBATCH --time 20:00:00\n',
             '#SBATCH --mem 140g\n',
             f'#SBATCH --output {out_dir}/merge-{job_id}.log\n',
             f'#SBATCH --error {out_dir}/merge-{job_id}.err\n',
@@ -299,11 +304,19 @@ def test_woltka_to_array_wol(self):
             "sruns=`grep 'overall alignment rate' *.err | wc -l`\n",
             'if [[ ! -f "errors.log" && $sruns -eq "1" ]]; then\n',
             f'woltka_merge mxdx --base {out_dir}\n',
-            f'woltka classify -i {out_dir}/alignments -o {out_dir}/woltka '
-            f'--no-demux --lineage {database}.tax '
-            '--rank free,none --outcov coverages/\n',
-            f'woltka classify -i {out_dir}/alignments --no-demux -c '
-            f'{database}.coords -o per-gene.biom\n',
+            f'mkdir -p {out_dir}/bioms\n',
+            f'for f in `ls {out_dir}/alignments/*.sam.xz`; do bname=`basename '
+            '${f/.sam.xz/}`; echo woltka classify -i $f -o '
+            f'{out_dir}/bioms/'
+            '${bname} --no-demux --lineage '
+            f'{database}.tax --rank none --outcov {out_dir}/coverages/; '
+            'done | parallel -j 12\n',
+            f'for f in `ls {out_dir}/alignments/*.sam.xz`; do bname=`basename '
+            '${f/.sam.xz/}`; echo woltka classify -i $f -o '
+            f'{out_dir}/bioms/'
+            '${bname}/per-gene.biom --no-demux -c '
+            f'{database}.coords; done | parallel -j 12\n',
+            f'woltka_merge biom --base {out_dir}\n',
             f'cd {out_dir};\n',
             '\n',
             'cd alignments; tar -cvf ../alignment.tar *.sam.xz; cd ..; '
@@ -317,7 +330,7 @@ def test_woltka_to_array_wol(self):
         # now let's test that it finished correctly
         sdir = 'qp_woltka/support_files/'
         mkdir(f'{out_dir}/woltka')
-        copyfile(f'{sdir}/none.biom', f'{out_dir}/woltka/none.biom')
+        copyfile(f'{sdir}/none.biom', f'{out_dir}/none.biom')
         copyfile(f'{sdir}/per-gene.biom', f'{out_dir}/per-gene.biom')
         copyfile(f'{sdir}/alignment.tar', f'{out_dir}/alignment.tar')
         copyfile(f'{sdir}/coverages.tgz', f'{out_dir}/coverages.tgz')
@@ -329,7 +342,7 @@ def test_woltka_to_array_wol(self):
 
         exp = [
             ArtifactInfo('Per genome Predictions', 'BIOM',
-                         [(f'{out_dir}/woltka/none.biom', 'biom'),
+                         [(f'{out_dir}/none.biom', 'biom'),
                           (f'{out_dir}/alignment.tar', 'log'),
                           (f'{out_dir}/none/coverages.tgz', 'plain_text')]),
             ArtifactInfo('Per gene Predictions', 'BIOM',

diff --git a/qp_woltka/woltka.py b/qp_woltka/woltka.py
@@ -36,7 +36,7 @@
 BATCHSIZE = 50000000
 
 WALLTIME = '40:00:00'
-MERGE_WALLTIME = '30:00:00'
+MERGE_WALLTIME = '20:00:00'
 SYNDNA_WALLTIME = '8:00:00'
 
 
@@ -131,15 +131,25 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
             "coverage_percentage(['artifact.cov'], '"
             f'{db_files["length.map"]}' "')))\"")
 
-    ranks = ','.join(["free", "none"])
+    ranks = ','.join(["none"])
     woltka_cmds = [
-        f'woltka classify -i {output}/alignments -o {output}/woltka '
-        f'--no-demux --lineage {db_files["taxonomy"]} --rank {ranks} '
-        '--outcov coverages/']
+        # creating the output folder
+        f'mkdir -p {output}/bioms',
+        # executing the parallel classify
+        f'for f in `ls {output}/alignments/*.sam.xz`; '
+        'do bname=`basename ${f/.sam.xz/}`; echo woltka classify -i $f '
+        f'-o {output}/bioms/' '${bname} --no-demux --lineage '
+        f'{db_files["taxonomy"]} --rank {ranks} --outcov {output}/coverages/; '
+        'done | parallel -j 12']
+
     if db_files['gene_coordinates']:
         woltka_cmds.append(
-            f'woltka classify -i {output}/alignments '
-            f'--no-demux -c {db_files["gene_coordinates"]} -o per-gene.biom')
+            f'for f in `ls {output}/alignments/*.sam.xz`; '
+            'do bname=`basename ${f/.sam.xz/}`; echo woltka classify -i $f '
+            f'-o {output}/'
+            'bioms/${bname}/per-gene.biom --no-demux -c '
+            f'{db_files["gene_coordinates"]}; done | parallel -j 12')
+        woltka_cmds.append(f'woltka_merge biom --base {output}')
 
         wcdm = 'woltka tools collapse -i '
         dbfk = db_files['kegg']
@@ -161,13 +171,15 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
             woltka_cmds.append(
                 f'{wcdm} module.biom -m '
                 f'{dbfk["module-to-pathway.map"]} -o pathway.biom')
+    else:
+        woltka_cmds.append(f'woltka_merge biom --base {output}')
 
     lines = ['#!/bin/bash',
              '#SBATCH -p qiita',
              '#SBATCH --mail-user "qiita.help@gmail.com"',
              f'#SBATCH --job-name merge-{name}',
              '#SBATCH -N 1',
-             '#SBATCH -n 1',
+             '#SBATCH -n 12',
              f'#SBATCH --time {MERGE_WALLTIME}',
              f'#SBATCH --mem {MERGE_MEMORY}',
              f'#SBATCH --output {output}/merge-{name}.log',
@@ -289,7 +301,7 @@ def _coverage_copy(dest):
     errors = []
     ainfo = []
 
-    fp_biom = f'{out_dir}/woltka/none.biom'
+    fp_biom = f'{out_dir}/none.biom'
     fp_alng = f'{out_dir}/alignment.tar'
     if exists(fp_biom) and exists(fp_alng):
         ainfo.append(ArtifactInfo('Per genome Predictions', 'BIOM', [

diff --git a/scripts/woltka_merge b/scripts/woltka_merge
@@ -48,7 +48,7 @@ def biom(base):
     # this is the size that was used in the tests that kept a small ~2.5G
     # memory footprint
     chunk_size = 30
-    for rank in ('free.biom', 'none.biom', 'per-gene.biom'):
+    for rank in ('none.biom', 'per-gene.biom'):
         tables = glob(f'{base}/bioms/*/{rank}')
 
         if not tables:
@@ -81,8 +81,8 @@ def biom(base):
             else:
                 full = full.concat(loaded)
 
-        with h5py.File(f'{base}/{rank}', 'w') as out:
-            full.to_hdf5(out, 'fast-merge')
+            with h5py.File(f'{base}/{rank}', 'w') as out:
+                full.to_hdf5(out, 'fast-merge')
 
 
 cli.add_command(mxdx)