diff --git a/.github/workflows/qiita-plugin-ci.yml b/.github/workflows/qiita-plugin-ci.yml
index a9ffe28..7d4a8ab 100644
--- a/.github/workflows/qiita-plugin-ci.yml
+++ b/.github/workflows/qiita-plugin-ci.yml
@@ -76,8 +76,8 @@ jobs:
shell: bash -l {0}
run: |
conda config --add channels bioconda
- conda create --yes -n qp-woltka-0.1.4 python=3.9 biom-format bowtie2
- conda activate qp-woltka-0.1.4
+ conda create --yes -n qp-woltka python=3.9 biom-format bowtie2
+ conda activate qp-woltka
# bowtie2==2.5.0 is what's installed in main qiita so making sure we keep
# using the same version for consistency; note that we will need to force
conda install --yes -c bioconda bowtie2==2.5.0
@@ -87,7 +87,7 @@ jobs:
pip --quiet install -U pip pip nose flake8
pip --quiet install https://github.com/qiita-spots/qiita_client/archive/master.zip
- export QIITA_SERVER_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
+ export QIITA_ROOTCA_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg
export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/
export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA
@@ -96,12 +96,12 @@ jobs:
woltka_version=`woltka --version`
bowtie2_version=`bowtie2 --version`
- if [[ $woltka_version != *"0.1.4"* ]]; then echo "wrong woltka version", $woltka_version; exit 1; fi
+ if [[ $woltka_version != *"0.1.6"* ]]; then echo "wrong woltka version", $woltka_version; exit 1; fi
if [[ $bowtie2_version != *"2.5.0"* ]]; then echo "wrong bowtie2 version", $bowtie2_version; exit 1; fi
export ENVIRONMENT="source /home/runner/.profile; conda activate qp-woltka; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA"
- configure_woltka --env-script 'source /home/runner/.profile; conda activate qp-woltka-0.1.4; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA; export ENVIRONMENT="source /home/runner/.profile; conda activate qp-woltka-0.1.4; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA"' --server-cert $QIITA_SERVER_CERT
+ configure_woltka --env-script 'source /home/runner/.profile; conda activate qp-woltka; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA; export ENVIRONMENT="source /home/runner/.profile; conda activate qp-woltka; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA"' --ca-cert $QIITA_ROOTCA_CERT
echo "Available Qiita plugins"
ls ~/.qiita_plugins/
@@ -110,7 +110,7 @@ jobs:
shell: bash -l {0}
run: |
conda activate qiita
- export QIITA_SERVER_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
+ export QIITA_ROOTCA_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg
sed "s#/home/runner/work/qiita/qiita#${PWD}/qiita-dev/#g" `pwd`/qiita-dev/qiita_core/support_files/config_test.cfg > ${QIITA_CONFIG_FP}
@@ -141,10 +141,10 @@ jobs:
env:
COVER_PACKAGE: ${{ matrix.cover_package }}
run: |
- conda activate qp-woltka-0.1.4
- export QIITA_SERVER_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
+ conda activate qp-woltka
+ export QIITA_ROOTCA_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg
- export ENVIRONMENT="source /home/runner/.profile; conda activate qp-woltka-0.1.4; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA"
+ export ENVIRONMENT="source /home/runner/.profile; conda activate qp-woltka; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA"
export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/
export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA
export PYTHONWARNINGS="ignore:Certificate for localhost has no \`subjectAltName\`"
diff --git a/qp_woltka/__init__.py b/qp_woltka/__init__.py
index cc43f06..516a3bf 100644
--- a/qp_woltka/__init__.py
+++ b/qp_woltka/__init__.py
@@ -27,7 +27,6 @@
}
outputs = {
# taxonomic
- 'Alignment Profile': 'BIOM',
'Per genome Predictions': 'BIOM',
'Per gene Predictions': 'BIOM',
# functional
@@ -37,7 +36,8 @@
}
dflt_param_set = generate_woltka_dflt_params()
woltka_cmd = QiitaCommand(
- 'Woltka v0.1.4', "Functional and Taxonomic Predictions", woltka,
+ 'Woltka v0.1.6, paired-end',
+ "Functional and Taxonomic Predictions", woltka,
req_params, opt_params, outputs, dflt_param_set)
plugin.register_command(woltka_cmd)
diff --git a/qp_woltka/support_files/files_list.tsv b/qp_woltka/support_files/files_list.tsv
new file mode 100644
index 0000000..2d896f8
--- /dev/null
+++ b/qp_woltka/support_files/files_list.tsv
@@ -0,0 +1,3 @@
+filename_1 filename_2 record_count
+/tmp/folder/S22205_S104_L001_R1_001.fastq.gz /tmp/folder/S22205_S104_L001_R2_001.fastq.gz 1000000
+/tmp/folder/S22282_S102_L001_R1_001.fastq.gz /tmp/folder/S22282_S102_L001_R2_001.fastq.gz 1000000
diff --git a/qp_woltka/support_files/free.biom b/qp_woltka/support_files/free.biom
deleted file mode 100644
index 14bb00a..0000000
Binary files a/qp_woltka/support_files/free.biom and /dev/null differ
diff --git a/qp_woltka/support_files/summary.html b/qp_woltka/support_files/summary.html
new file mode 100644
index 0000000..0e267df
--- /dev/null
+++ b/qp_woltka/support_files/summary.html
@@ -0,0 +1,36 @@
+
+
+
+ filename |
+ md5 |
+ file_type |
+ reads |
+
+
+
+
+ S22205_S104_L001_R1_001.fastq.gz |
+ 9dcfb0c77674fdada176262963196db0 |
+ raw_forward_seqs |
+ 1000000 |
+
+
+ S22282_S102_L001_R1_001.fastq.gz |
+ 9dcfb0c77674fdada176262963196db0 |
+ raw_forward_seqs |
+ 1000000 |
+
+
+ S22205_S104_L001_R2_001.fastq.gz |
+ 9dcfb0c77674fdada176262963196db0 |
+ raw_reverse_seqs |
+ 1000000 |
+
+
+ S22282_S102_L001_R2_001.fastq.gz |
+ 9dcfb0c77674fdada176262963196db0 |
+ raw_reverse_seqs |
+ 1000000 |
+
+
+
diff --git a/qp_woltka/tests/test_util.py b/qp_woltka/tests/test_util.py
index acc8285..88daea0 100644
--- a/qp_woltka/tests/test_util.py
+++ b/qp_woltka/tests/test_util.py
@@ -9,13 +9,9 @@
from unittest import main, TestCase
from os import environ
from os.path import join
-from tempfile import TemporaryDirectory
-import gzip
-import io
-import pandas as pd
from qp_woltka.util import (
- get_dbs, generate_woltka_dflt_params, mux, demux, search_by_filename,
+ get_dbs, generate_woltka_dflt_params, search_by_filename,
merge_ranges, coverage_percentage)
@@ -38,41 +34,6 @@ def test_generate_woltka_dflt_params(self):
self.assertDictEqual(obs, exp)
- def test_mux(self):
- f1 = b"@foo\nATGC\n+\nIIII\n"
- f2 = b"@bar\nAAAA\n+\nIIII\n"
- exp = b"@foo@@@foofile\nATGC\n+\nIIII\n@bar@@@barfile\nAAAA\n+\nIIII\n"
- with TemporaryDirectory() as d:
- f1fp = join(d, 'foofile.fastq')
- f2fp = join(d, 'barfile.fastq')
- ofp = join(d, 'output')
- with gzip.open(f1fp, 'wb') as fp:
- fp.write(f1)
- with gzip.open(f2fp, 'wb') as fp:
- fp.write(f2)
- with open(ofp, 'wb') as output:
- mux([f1fp, f2fp], output)
- with open(ofp, 'rb') as result:
- obs = result.read()
-
- self.assertEqual(obs, exp)
-
- def test_demux(self):
- prep = pd.DataFrame([['sample_foo', 'foofile'],
- ['sample_bar', 'barfile']],
- columns=['sample_name', 'run_prefix'])
- input_ = io.BytesIO(b"foo@@@foofile_R1\tATGC\t+\tIIII\nbar@@@"
- b"barfile_R2\tAAAA\t+\tIIII\n")
- expfoo = b"foo\tATGC\t+\tIIII\n"
- expbar = b"bar\tAAAA\t+\tIIII\n"
- with TemporaryDirectory() as d:
- demux(input_, d.encode('ascii'), prep)
- foo = open(join(d, 'sample_foo.sam'), 'rb').read()
- bar = open(join(d, 'sample_bar.sam'), 'rb').read()
-
- self.assertEqual(foo, expfoo)
- self.assertEqual(bar, expbar)
-
def test_search_by_filename(self):
lookup = {'foo_bar': 'baz',
'foo': 'bar'}
diff --git a/qp_woltka/tests/test_woltka.py b/qp_woltka/tests/test_woltka.py
index 6f5383b..f4f69a4 100644
--- a/qp_woltka/tests/test_woltka.py
+++ b/qp_woltka/tests/test_woltka.py
@@ -59,18 +59,21 @@ def _helper_woltka_bowtie(self, prep_info_dict, database=None):
fp1_2 = join(in_dir, 'S22205_S104_L001_R2_001.fastq.gz')
fp2_1 = join(in_dir, 'S22282_S102_L001_R1_001.fastq.gz')
fp2_2 = join(in_dir, 'S22282_S102_L001_R2_001.fastq.gz')
+ fp_summary = join(in_dir, 'summary.html')
source_dir = 'qp_woltka/support_files'
copyfile(f'{source_dir}/S22205_S104_L001_R1_001.fastq.gz', fp1_1)
copyfile(f'{source_dir}/S22205_S104_L001_R2_001.fastq.gz', fp1_2)
copyfile(f'{source_dir}/S22282_S102_L001_R1_001.fastq.gz', fp2_1)
copyfile(f'{source_dir}/S22282_S102_L001_R2_001.fastq.gz', fp2_2)
+ copyfile(f'{source_dir}/summary.html', fp_summary)
data = {
'filepaths': dumps([
(fp1_1, 'raw_forward_seqs'),
(fp1_2, 'raw_reverse_seqs'),
(fp2_1, 'raw_forward_seqs'),
- (fp2_2, 'raw_reverse_seqs')]),
+ (fp2_2, 'raw_reverse_seqs'),
+ (fp_summary, 'html_summary')]),
'type': "per_sample_FASTQ",
'name': "Test Woltka artifact",
'prep': pid}
@@ -82,7 +85,9 @@ def _helper_woltka_bowtie(self, prep_info_dict, database=None):
self.params['Database'] = database
data = {'user': 'demo@microbio.me',
- 'command': dumps(['qp-woltka', '2023.11', 'Woltka v0.1.4']),
+ 'command': dumps(
+ ['qp-woltka', '2024.09',
+ 'Woltka v0.1.6, paired-end']),
'status': 'running',
'parameters': dumps(self.params)}
job_id = self.qclient.post(
@@ -101,6 +106,8 @@ def test_woltka_to_array_rep82(self):
self._clean_up_files.append(out_dir)
files, prep = self.qclient.artifact_and_preparation_files(aid)
+ html_summary = self.qclient.get_artifact_html_summary(aid)
+ files['html_summary'] = html_summary
url = 'this-is-my-url'
database = self.params['Database']
@@ -120,14 +127,15 @@ def test_woltka_to_array_rep82(self):
'#!/bin/bash\n',
'#SBATCH -p qiita\n',
'#SBATCH --mail-user "qiita.help@gmail.com"\n',
+ '#SBATCH --mail-type=FAIL,TIME_LIMIT_80,INVALID_DEPEND\n',
f'#SBATCH --job-name {job_id}\n',
'#SBATCH -N 1\n',
'#SBATCH -n 8\n',
'#SBATCH --time 40:00:00\n',
- '#SBATCH --mem 90g\n',
+ '#SBATCH --mem 70g\n',
f'#SBATCH --output {out_dir}/{job_id}_%a.log\n',
f'#SBATCH --error {out_dir}/{job_id}_%a.err\n',
- '#SBATCH --array 1-1%8\n',
+ '#SBATCH --array 0-0%8\n',
f'cd {out_dir}\n',
f'prep_full_path={prep_file}\n',
f'{self.environment}\n',
@@ -137,27 +145,18 @@ def test_woltka_to_array_rep82(self):
f'dbbase={dirname(database)}\n',
f'dbname={basename(database)}\n',
f'output={out_dir}\n',
- 'files=`cat sample_details_${SLURM_ARRAY_TASK_ID}.txt`\n',
- 'mux ${files} | bowtie2 -p 8 -x '
- f'{database} -q - --seed 42 '
- '--very-sensitive -k 16 --np 1 --mp "1,1" --rdg "0,1" --rfg "0,1" '
- '--score-min "L,0,-0.05" --no-head --no-unal | cut -f1-9 | '
- 'sed \'s/$/\t*\t*/\' | demux ${output} '
- f'{prep_file} | sort | uniq > '
- 'sample_processing_${SLURM_ARRAY_TASK_ID}.log\n',
- '# for each one of our input files, form woltka commands, \n',
- '# and farm off to gnu parallel\n',
- 'for f in `cat sample_processing_${SLURM_ARRAY_TASK_ID}.log`\n',
- 'do\n',
- ' echo "woltka classify -i ${f} -o ${f}.woltka-taxa --no-demux '
- f'--lineage {database}.tax --rank free,none --outcov '
- 'coverages/"\n',
- 'done | parallel -j 8\n',
- 'for f in `cat sample_processing_${SLURM_ARRAY_TASK_ID}.log`\n',
- 'do\n',
- ' # compress it\n',
- ' echo "xz -1 -T1 ${f}"\n',
- 'done | parallel -j 8\n',
+ 'bt2_cores=6\n',
+ f'mxdx mux --file-map {out_dir}/files_list.tsv --batch '
+ '${SLURM_ARRAY_TASK_ID} --batch-size 50000000 '
+ '--paired-handling interleave | '
+ 'bowtie2 -p ${bt2_cores} -x '
+ f'{database} --interleaved - --seed 42 --very-sensitive -k 16 '
+ '--np 1 --mp "1,1" --rdg "0,1" --rfg "0,1" '
+ '--score-min "L,0,-0.05" --no-head --no-unal --no-exact-upfront '
+ "--no-1mm-upfront | cut -f1-9 | sed \'s/$/\t*\t*/' | mxdx demux "
+ f'--file-map {out_dir}/files_list.tsv '
+ '--batch ${SLURM_ARRAY_TASK_ID} --batch-size 50000000 '
+ f'--output-base {out_dir}/alignments --extension sam.xz\n',
'date\n']
self.assertEqual(main, exp_main)
@@ -167,7 +166,7 @@ def test_woltka_to_array_rep82(self):
'#SBATCH --mail-user "qiita.help@gmail.com"\n',
f'#SBATCH --job-name merge-{job_id}\n',
'#SBATCH -N 1\n',
- '#SBATCH -n 2\n',
+ '#SBATCH -n 1\n',
'#SBATCH --time 30:00:00\n',
'#SBATCH --mem 140g\n',
f'#SBATCH --output {out_dir}/merge-{job_id}.log\n',
@@ -179,15 +178,14 @@ def test_woltka_to_array_rep82(self):
'echo $SLURM_JOBID\n',
'set -e\n',
"sruns=`grep 'overall alignment rate' *.err | wc -l`\n",
- "sjobs=`ls sample_details_* | wc -l`\n",
- 'if [[ ! -f "errors.log" && $sruns -eq $sjobs ]]; then\n',
- f'woltka_merge --base {out_dir} --name '
- 'free --glob "*.woltka-taxa/free.biom" &\n',
- f'woltka_merge --base {out_dir} --name '
- 'none --glob "*.woltka-taxa/none.biom" --rename &\n',
- 'wait\n',
+ 'if [[ ! -f "errors.log" && $sruns -eq "1" ]]; then\n',
+ f'woltka_merge --base {out_dir}\n',
+ f'woltka classify -i {out_dir}/alignments -o {out_dir}/woltka '
+ f'--no-demux --lineage {database}.tax '
+ '--rank free,none --outcov coverages/\n',
+ f'cd {out_dir};\n',
'\n',
- f'cd {out_dir}; tar -cvf alignment.tar *.sam.xz; '
+ 'cd alignments; tar -cvf ../alignment.tar *.sam.xz; cd ..; '
'tar zcvf coverages.tgz coverage_percentage.txt artifact.cov '
'coverages\n',
'fi\n',
@@ -197,8 +195,8 @@ def test_woltka_to_array_rep82(self):
# now let's test that if finished correctly
sdir = 'qp_woltka/support_files/'
- copyfile(f'{sdir}/none.biom', f'{out_dir}/none.biom')
- copyfile(f'{sdir}/free.biom', f'{out_dir}/free.biom')
+ mkdir(f'{out_dir}/woltka')
+ copyfile(f'{sdir}/none.biom', f'{out_dir}/woltka/none.biom')
copyfile(f'{sdir}/alignment.tar', f'{out_dir}/alignment.tar')
copyfile(f'{sdir}/coverages.tgz', f'{out_dir}/coverages.tgz')
@@ -209,13 +207,9 @@ def test_woltka_to_array_rep82(self):
self.assertTrue(success)
exp = [
- ArtifactInfo('Alignment Profile', 'BIOM',
- [(f'{out_dir}/free.biom', 'biom'),
- (f'{out_dir}/alignment.tar', 'log'),
- (f'{out_dir}/alignment/coverages.tgz',
- 'plain_text')]),
ArtifactInfo('Per genome Predictions', 'BIOM',
- [(f'{out_dir}/none.biom', 'biom'),
+ [(f'{out_dir}/woltka/none.biom', 'biom'),
+ (f'{out_dir}/alignment.tar', 'log'),
(f'{out_dir}/none/coverages.tgz', 'plain_text')])]
self.assertCountEqual(ainfo, exp)
@@ -232,6 +226,8 @@ def test_woltka_to_array_wol(self):
self._clean_up_files.append(out_dir)
files, prep = self.qclient.artifact_and_preparation_files(aid)
+ html_summary = self.qclient.get_artifact_html_summary(aid)
+ files['html_summary'] = html_summary
url = 'this-is-my-url'
main_fp, merge_fp = woltka_to_array(
@@ -250,14 +246,15 @@ def test_woltka_to_array_wol(self):
'#!/bin/bash\n',
'#SBATCH -p qiita\n',
'#SBATCH --mail-user "qiita.help@gmail.com"\n',
+ '#SBATCH --mail-type=FAIL,TIME_LIMIT_80,INVALID_DEPEND\n',
f'#SBATCH --job-name {job_id}\n',
'#SBATCH -N 1\n',
'#SBATCH -n 8\n',
'#SBATCH --time 40:00:00\n',
- '#SBATCH --mem 90g\n',
+ '#SBATCH --mem 70g\n',
f'#SBATCH --output {out_dir}/{job_id}_%a.log\n',
f'#SBATCH --error {out_dir}/{job_id}_%a.err\n',
- '#SBATCH --array 1-1%8\n',
+ '#SBATCH --array 0-0%8\n',
f'cd {out_dir}\n',
f'prep_full_path={prep_file}\n',
f'{self.environment}\n',
@@ -267,29 +264,18 @@ def test_woltka_to_array_wol(self):
f'dbbase={dirname(database)}\n',
f'dbname={basename(database)}\n',
f'output={out_dir}\n',
- 'files=`cat sample_details_${SLURM_ARRAY_TASK_ID}.txt`\n',
- 'mux ${files} | bowtie2 -p 8 -x '
- f'{database} -q - --seed 42 '
- '--very-sensitive -k 16 --np 1 --mp "1,1" --rdg "0,1" --rfg "0,1" '
- '--score-min "L,0,-0.05" --no-head --no-unal | cut -f1-9 | '
- 'sed \'s/$/\t*\t*/\' | demux ${output} '
- f'{prep_file} | sort | uniq > '
- 'sample_processing_${SLURM_ARRAY_TASK_ID}.log\n',
- '# for each one of our input files, form woltka commands, \n',
- '# and farm off to gnu parallel\n',
- 'for f in `cat sample_processing_${SLURM_ARRAY_TASK_ID}.log`\n',
- 'do\n',
- ' echo "woltka classify -i ${f} -o ${f}.woltka-taxa --no-demux '
- f'--lineage {database}.tax --rank free,none --outcov '
- 'coverages/"\n',
- f' echo "woltka classify -i ${{f}} -c {database}.coords '
- '-o ${f}.woltka-per-gene --no-demux"\n',
- 'done | parallel -j 8\n',
- 'for f in `cat sample_processing_${SLURM_ARRAY_TASK_ID}.log`\n',
- 'do\n',
- ' # compress it\n',
- ' echo "xz -1 -T1 ${f}"\n',
- 'done | parallel -j 8\n',
+ 'bt2_cores=6\n',
+ f'mxdx mux --file-map {out_dir}/files_list.tsv --batch '
+ '${SLURM_ARRAY_TASK_ID} --batch-size 50000000 '
+ '--paired-handling interleave | '
+ 'bowtie2 -p ${bt2_cores} -x '
+ f'{database} --interleaved - --seed 42 --very-sensitive -k 16 '
+ '--np 1 --mp "1,1" --rdg "0,1" --rfg "0,1" '
+ '--score-min "L,0,-0.05" --no-head --no-unal --no-exact-upfront '
+ "--no-1mm-upfront | cut -f1-9 | sed \'s/$/\t*\t*/' | mxdx demux "
+ f'--file-map {out_dir}/files_list.tsv '
+ '--batch ${SLURM_ARRAY_TASK_ID} --batch-size 50000000 '
+ f'--output-base {out_dir}/alignments --extension sam.xz\n',
'date\n']
self.assertEqual(main, exp_main)
@@ -299,7 +285,7 @@ def test_woltka_to_array_wol(self):
'#SBATCH --mail-user "qiita.help@gmail.com"\n',
f'#SBATCH --job-name merge-{job_id}\n',
'#SBATCH -N 1\n',
- '#SBATCH -n 3\n',
+ '#SBATCH -n 1\n',
'#SBATCH --time 30:00:00\n',
'#SBATCH --mem 140g\n',
f'#SBATCH --output {out_dir}/merge-{job_id}.log\n',
@@ -311,17 +297,16 @@ def test_woltka_to_array_wol(self):
'echo $SLURM_JOBID\n',
'set -e\n',
"sruns=`grep 'overall alignment rate' *.err | wc -l`\n",
- "sjobs=`ls sample_details_* | wc -l`\n",
- 'if [[ ! -f "errors.log" && $sruns -eq $sjobs ]]; then\n',
- f'woltka_merge --base {out_dir} --name '
- 'free --glob "*.woltka-taxa/free.biom" &\n',
- f'woltka_merge --base {out_dir} --name '
- 'none --glob "*.woltka-taxa/none.biom" &\n',
- f'woltka_merge --base {out_dir} --name '
- 'per-gene --glob "*.woltka-per-gene" --rename &\n',
- 'wait\n',
+ 'if [[ ! -f "errors.log" && $sruns -eq "1" ]]; then\n',
+ f'woltka_merge --base {out_dir}\n',
+ f'woltka classify -i {out_dir}/alignments -o {out_dir}/woltka '
+ f'--no-demux --lineage {database}.tax '
+ '--rank free,none --outcov coverages/\n',
+ f'woltka classify -i {out_dir}/alignments --no-demux -c '
+ f'{database}.coords -o per-gene.biom\n',
+ f'cd {out_dir};\n',
'\n',
- f'cd {out_dir}; tar -cvf alignment.tar *.sam.xz; '
+ 'cd alignments; tar -cvf ../alignment.tar *.sam.xz; cd ..; '
'tar zcvf coverages.tgz coverage_percentage.txt artifact.cov '
'coverages\n',
'fi\n',
@@ -331,9 +316,9 @@ def test_woltka_to_array_wol(self):
# now let's test that if finished correctly
sdir = 'qp_woltka/support_files/'
- copyfile(f'{sdir}/none.biom', f'{out_dir}/none.biom')
+ mkdir(f'{out_dir}/woltka')
+ copyfile(f'{sdir}/none.biom', f'{out_dir}/woltka/none.biom')
copyfile(f'{sdir}/per-gene.biom', f'{out_dir}/per-gene.biom')
- copyfile(f'{sdir}/free.biom', f'{out_dir}/free.biom')
copyfile(f'{sdir}/alignment.tar', f'{out_dir}/alignment.tar')
copyfile(f'{sdir}/coverages.tgz', f'{out_dir}/coverages.tgz')
success, ainfo, msg = woltka(
@@ -343,13 +328,9 @@ def test_woltka_to_array_wol(self):
self.assertTrue(success)
exp = [
- ArtifactInfo('Alignment Profile', 'BIOM',
- [(f'{out_dir}/free.biom', 'biom'),
- (f'{out_dir}/alignment.tar', 'log'),
- (f'{out_dir}/alignment/coverages.tgz',
- 'plain_text')]),
ArtifactInfo('Per genome Predictions', 'BIOM',
- [(f'{out_dir}/none.biom', 'biom'),
+ [(f'{out_dir}/woltka/none.biom', 'biom'),
+ (f'{out_dir}/alignment.tar', 'log'),
(f'{out_dir}/none/coverages.tgz', 'plain_text')]),
ArtifactInfo('Per gene Predictions', 'BIOM',
[(f'{out_dir}/per-gene.biom', 'biom'),
@@ -373,8 +354,6 @@ def test_woltka_to_array_error(self):
self.qclient, job_id, self.params, out_dir)
exp_msg = '\n'.join([
- 'Missing files from the "Alignment Profile"; please contact '
- 'qiita.help@gmail.com for more information',
'Table none/per-genome was not created, please contact '
'qiita.help@gmail.com for more information',
'Table per-gene was not created, please contact '
diff --git a/qp_woltka/util.py b/qp_woltka/util.py
index 3d87f06..96e56d3 100644
--- a/qp_woltka/util.py
+++ b/qp_woltka/util.py
@@ -10,14 +10,11 @@
from configparser import ConfigParser
from collections import defaultdict
-import gzip
-from signal import signal, SIGPIPE, SIG_DFL
-
from qiita_client import QiitaClient
plugin_details = {'name': 'qp-woltka',
- 'version': '2023.11',
+ 'version': '2024.09',
'description': 'Woltka'}
@@ -62,49 +59,11 @@ def client_connect(url):
config.readfp(conf_file)
qclient = QiitaClient(url, config.get('oauth2', 'CLIENT_ID'),
config.get('oauth2', 'CLIENT_SECRET'),
- server_cert=config.get('oauth2', 'SERVER_CERT'))
+ config.get('oauth2', 'SERVER_CERT'))
return qclient
-def mux(files, output):
- # https://linuxpip.org/broken-pipe-python-error/
- # Ignore SIG_PIPE and don't throw exceptions on it
- # http://docs.python.org/library/signal.html
- signal(SIGPIPE, SIG_DFL)
-
- delimiter = b'@@@'
- newline = b'\n'
-
- errors = []
- # the name used here is the filename, it is not read orientation agnostic,
- # should it be?
- for f in files:
- name = f.split('/')[-1]
- name = name.split('.fastq')[0].encode('ascii')
-
- try:
- fp = gzip.open(f)
- id_ = iter(fp)
- seq = iter(fp)
- dumb = iter(fp)
- qual = iter(fp)
- for i, s, d, q in zip(id_, seq, dumb, qual):
- base_i = i.strip().split(b' ', 1)[0]
- new_i = base_i + delimiter + name + newline
-
- output.write(new_i)
- output.write(s)
- output.write(d)
- output.write(q)
- except Exception as e:
- errors.append(f'{f}\t{e}')
-
- if errors:
- with open('errors.log', 'w') as error_log:
- error_log.write('\n'.join(errors))
-
-
def search_by_filename(fname, lookup):
if fname in lookup:
return lookup[fname]
@@ -128,44 +87,6 @@ def search_by_filename(fname, lookup):
raise KeyError("Cannot determine run_prefix for %s" % original)
-def demux(input_, output, prep):
- lookup = prep.set_index('run_prefix')['sample_name'].to_dict()
- delimiter = b'@@@'
- tab = b'\t'
- mode = 'ab' # IMPORTANT: we are opening in append not write
- ext = b'.sam'
- sep = b'/'
-
- # read each record
- # parse the filename out
- # if the file is not open, open it
- # we are assuming the records are coming in grouped by file
- # however the method will work if records are for whatever
- # reason interleaved
- current_fname = None
- current_fp = None
-
- for line in input_:
- id_, remainder = line.split(tab, 1)
- id_, fname = id_.rsplit(delimiter, 1)
- fname = fname.strip()
-
- if fname != current_fname:
- if current_fp is not None:
- current_fp.close()
-
- sample = search_by_filename(
- fname.decode('utf8'), lookup).encode('ascii')
- fullname = output + sep + sample + ext
- current_fp = open(fullname, mode)
- current_fname = fname
- print(fullname.decode('ascii'))
-
- current_fp.write(id_)
- current_fp.write(tab)
- current_fp.write(remainder)
-
-
def _merge_ranges(files):
# the lines below are borrowed from zebra filter but they are sligthly
# modified; mainly the autocompress parameter was deleted so it always
diff --git a/qp_woltka/woltka.py b/qp_woltka/woltka.py
index 2815b30..77e0fbc 100644
--- a/qp_woltka/woltka.py
+++ b/qp_woltka/woltka.py
@@ -21,16 +21,19 @@
from qp_woltka.util import search_by_filename
from qiita_client import ArtifactInfo
+from qiita_client.util import system_call
# resources per job
PPN = 8
MAX_RUNNING = 8
TASKS_IN_SCRIPT = 10
-MEMORY = '90g'
+MEMORY = '70g'
LARGE_MEMORY = '150g'
MERGE_MEMORY = '140g'
SYNDNA_MEMORY = '190g'
+# setting so an iSeq run generates 2 jobs
+BATCHSIZE = 50000000
WALLTIME = '40:00:00'
MERGE_WALLTIME = '30:00:00'
@@ -77,69 +80,94 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
"""
environment = environ["ENVIRONMENT"]
+ # processing html_summary
+ html_summary = files.pop('html_summary')
+ df = pd.read_html(html_summary)[0]
+ dname = dirname(html_summary)
+ fwd = dict(df[df.file_type == 'raw_forward_seqs'].apply(
+ lambda x: (x.filename.rsplit('_R1')[0],
+ (x.filename, x.reads)), axis=1).values)
+ rev = dict(df[df.file_type == 'raw_reverse_seqs'].apply(
+ lambda x: (x.filename.rsplit('_R2')[0],
+ (x.filename, x.reads)), axis=1).values)
+ lines = ['filename_1\trecord_count']
+ if rev:
+ lines = ['filename_1\tfilename_2\trecord_count']
+ for k, (fn, reads) in fwd.items():
+ line = f'{dname}/{fn}\t'
+ if k in rev:
+ rfn = rev.pop(k)[0]
+ line += f'{dname}/{rfn}\t'
+ line += f'{reads}'
+ lines.append(line)
+ files_list_fp = f'{output}/files_list.tsv'
+ with open(files_list_fp, 'w') as fp:
+ fp.write('\n'.join(lines))
+
+ cmd = (f'mxdx get-max-batch-number --file-map {files_list_fp} '
+ f'--batch-size {BATCHSIZE}')
+ n_files, stderr, return_value = system_call(cmd)
+ if return_value != 0 or stderr:
+ raise ValueError('`mxdx get-max-batch-number` failed '
+ f'{return_value}: {stderr}')
+ # just making sure that n_files is an int
+ n_files = int(n_files)
+
db_files = _process_database_files(database_bowtie2)
db_folder = dirname(database_bowtie2)
db_name = basename(database_bowtie2)
- n_files = 1
- for i, (k, (f, r)) in enumerate(files.items()):
- if i >= n_files*TASKS_IN_SCRIPT:
- n_files += 1
- with open(join(output, f'sample_details_{n_files}.txt'), 'a+') as fh:
- fh.write(f'{f["filepath"]}\n')
- if r is not None:
- fh.write(f'{r["filepath"]}\n')
-
- ranks = ["free", "none"]
- # now, let's establish the merge script.
- merges = []
- merge_inv = f'woltka_merge --base {output} '
- fcmds = []
- for r in ranks:
- cmd = [merge_inv, f'--name {r}', f'--glob "*.woltka-taxa/{r}.biom"']
- if r == 'free' and 'length.map' in db_files:
- cmd.append(f'--length_map {db_files["length.map"]}')
- cmd.append('&')
- merges.append(" ".join(cmd))
- if db_files['gene_coordinates'] is not None:
- merges.append(" ".join([merge_inv, '--name per-gene',
- '--glob "*.woltka-per-gene"',
- '--rename &'])) # run all at once
+ woltka_merge = f'woltka_merge --base {output}'
+ extra_commands = ''
+ if 'length.map' in db_files:
+ woltka_merge += f' --length_map {db_files["length.map"]}'
+ extra_commands = (
+ 'python -c "from glob import glob; from qp_woltka.util import '
+ "merge_ranges; coverages = glob('coverages/*.cov'); "
+ "open('artifact.cov', 'w').write('\\n'.join("
+ 'merge_ranges(coverages)))"\n'
+ 'python -c "from qp_woltka.util import coverage_percentage; '
+ "open('coverage_percentage.txt', 'w').write('\\n'.join("
+ "coverage_percentage(['artifact.cov'], '"
+ f'{db_files["length.map"]}' "')))\"")
+
+ ranks = ','.join(["free", "none"])
+ woltka_cmds = [
+ f'woltka classify -i {output}/alignments -o {output}/woltka '
+ f'--no-demux --lineage {db_files["taxonomy"]} --rank {ranks} '
+ '--outcov coverages/']
+ if db_files['gene_coordinates']:
+ woltka_cmds.append(
+ f'woltka classify -i {output}/alignments '
+ f'--no-demux -c {db_files["gene_coordinates"]} -o per-gene.biom')
+
wcdm = 'woltka tools collapse -i '
dbfk = db_files['kegg']
if dbfk["orf-to-ko.map.xz"] is not None:
- fcmds.append(f'{wcdm} per-gene.biom -m {dbfk["orf-to-ko.map.xz"]} '
- '-o ko.biom')
+ woltka_cmds.append(f'{wcdm} per-gene.biom -m '
+ f'{dbfk["orf-to-ko.map.xz"]} -o ko.biom')
if dbfk["ko-to-ec.map"] is not None:
- fcmds.append(f'{wcdm} ko.biom -m {dbfk["ko-to-ec.map"]} '
- '-o ec.biom')
+ woltka_cmds.append(f'{wcdm} ko.biom -m {dbfk["ko-to-ec.map"]} '
+ '-o ec.biom')
if dbfk["ko-to-reaction.map"] is not None and \
dbfk["reaction-to-module.map"] is not None and \
dbfk["module-to-pathway.map"] is not None:
- fcmds.append(f'{wcdm} ko.biom -m {dbfk["ko-to-reaction.map"]} '
- '-o reaction.biom')
- fcmds.append(f'{wcdm} reaction.biom -m '
- f'{dbfk["reaction-to-module.map"]} -o module.biom')
- fcmds.append(f'{wcdm} module.biom -m '
- f'{dbfk["module-to-pathway.map"]} -o pathway.biom')
- else:
- # for "simplicity" we will inject the `--rename` flag to the last
- # merge command (between all the parameters and the last &)
- m = merges[-1].split(' ')
- merges[-1] = " ".join(m[:-1] + ['--rename'] + [m[-1]])
-
- # The merge for a HiSeq 2000 lane was 40 seconds and ~150MB of memory.
- # But, let's over request just in case (and this is a very small request
- # relative to the rest of the work).
- n_merges = len(merges)
- assert n_merges < 32 # 32 merges would be crazy...
+ woltka_cmds.append(
+ f'{wcdm} ko.biom -m {dbfk["ko-to-reaction.map"]} '
+ '-o reaction.biom')
+ woltka_cmds.append(
+ f'{wcdm} reaction.biom -m '
+ f'{dbfk["reaction-to-module.map"]} -o module.biom')
+ woltka_cmds.append(
+ f'{wcdm} module.biom -m '
+ f'{dbfk["module-to-pathway.map"]} -o pathway.biom')
lines = ['#!/bin/bash',
'#SBATCH -p qiita',
'#SBATCH --mail-user "qiita.help@gmail.com"',
f'#SBATCH --job-name merge-{name}',
'#SBATCH -N 1',
- f'#SBATCH -n {n_merges}',
+ '#SBATCH -n 1',
f'#SBATCH --time {MERGE_WALLTIME}',
f'#SBATCH --mem {MERGE_MEMORY}',
f'#SBATCH --output {output}/merge-{name}.log',
@@ -154,12 +182,12 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
# reports is equal to the numbers of jobs that started : process
# the bioms
"sruns=`grep 'overall alignment rate' *.err | wc -l`",
- 'sjobs=`ls sample_details_* | wc -l`',
- 'if [[ ! -f "errors.log" && $sruns -eq $sjobs ]]; then',
- '\n'.join(merges),
- "wait",
- '\n'.join(fcmds),
- f'cd {output}; tar -cvf alignment.tar *.sam.xz; '
+ f'if [[ ! -f "errors.log" && $sruns -eq "{n_files + 1}" ]]; then',
+ woltka_merge,
+ '\n'.join(woltka_cmds),
+ f'cd {output};',
+ extra_commands,
+ 'cd alignments; tar -cvf ../alignment.tar *.sam.xz; cd ..; '
'tar zcvf coverages.tgz coverage_percentage.txt artifact.cov '
'coverages\n'
'fi',
@@ -175,20 +203,19 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
# https://github.com/BenLangmead/bowtie2/issues/311
preparation_information = join(output, 'prep_info.tsv')
prep.set_index('sample_name').to_csv(preparation_information, sep='\t')
- bowtie2 = 'mux ${files} | ' + \
- f'bowtie2 -p {PPN} -x {database_bowtie2} ' + \
- '-q - --seed 42 ' + \
- '--very-sensitive -k 16 --np 1 --mp "1,1" ' + \
- '--rdg "0,1" --rfg "0,1" --score-min ' + \
- '"L,0,-0.05" --no-head --no-unal' + \
- " | cut -f1-9 | sed 's/$/\t*\t*/'" + \
- ' | demux ${output} ' + preparation_information + \
- ' | sort | uniq > sample_processing_${SLURM_ARRAY_TASK_ID}.log'
- woltka = 'woltka classify -i ${f} ' + \
- '-o ${f}.woltka-taxa ' + \
- '--no-demux ' + \
- f'--lineage {db_files["taxonomy"]} ' + \
- f'--rank {",".join(ranks)} --outcov coverages/'
+ bowtie2 = (
+ f'mxdx mux --file-map {files_list_fp} --batch '
+ '${SLURM_ARRAY_TASK_ID} '
+ f'--batch-size {BATCHSIZE} --paired-handling interleave | '
+ 'bowtie2 -p ${bt2_cores} '
+ f'-x {database_bowtie2} --interleaved - --seed 42 '
+ '--very-sensitive -k 16 --np 1 --mp "1,1" --rdg "0,1" --rfg "0,1" '
+ '--score-min "L,0,-0.05" --no-head --no-unal --no-exact-upfront '
+ "--no-1mm-upfront | cut -f1-9 | sed 's/$/\t*\t*/' | "
+ f'mxdx demux --file-map {files_list_fp} '
+ '--batch ${SLURM_ARRAY_TASK_ID} '
+ f'--batch-size {BATCHSIZE} --output-base {output}/alignments '
+ '--extension sam.xz')
memory = MEMORY
if 'RS210' in database_bowtie2:
@@ -198,6 +225,7 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
lines = ['#!/bin/bash',
'#SBATCH -p qiita',
'#SBATCH --mail-user "qiita.help@gmail.com"',
+ '#SBATCH --mail-type=FAIL,TIME_LIMIT_80,INVALID_DEPEND',
f'#SBATCH --job-name {name}',
'#SBATCH -N 1',
f'#SBATCH -n {PPN}',
@@ -205,7 +233,7 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
f'#SBATCH --mem {memory}',
f'#SBATCH --output {output}/{name}_%a.log',
f'#SBATCH --error {output}/{name}_%a.err',
- f'#SBATCH --array 1-{n_files}%{MAX_RUNNING}',
+ f'#SBATCH --array 0-{n_files}%{MAX_RUNNING}',
f'cd {output}',
f'prep_full_path={preparation_information}',
f'{environment}',
@@ -215,27 +243,8 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
f'dbbase={db_folder}',
f'dbname={db_name}',
f'output={output}',
- 'files=`cat sample_details_${SLURM_ARRAY_TASK_ID}.txt`',
- bowtie2,
- '# for each one of our input files, form woltka commands, ',
- '# and farm off to gnu parallel',
- 'for f in `cat sample_processing_${SLURM_ARRAY_TASK_ID}.log`',
- 'do',
- f' echo "{woltka}"']
-
- if db_files['gene_coordinates'] is not None:
- lines.append(' echo "woltka classify -i ${f} '
- f'-c {db_files["gene_coordinates"]} '
- '-o ${f}.woltka-per-gene --no-demux"')
- lines.append('done | parallel -j 8')
-
- # finally, compress each one of our sam files
- lines.extend([
- 'for f in `cat sample_processing_${SLURM_ARRAY_TASK_ID}.log`',
- 'do',
- ' # compress it',
- ' echo "xz -1 -T1 ${f}"',
- 'done | parallel -j 8'])
+ f'bt2_cores={PPN - 2}',
+ bowtie2]
lines.append('date') # end time
@@ -279,21 +288,12 @@ def _coverage_copy(dest):
errors = []
ainfo = []
- fp_biom = f'{out_dir}/free.biom'
+
+ fp_biom = f'{out_dir}/woltka/none.biom'
fp_alng = f'{out_dir}/alignment.tar'
if exists(fp_biom) and exists(fp_alng):
- ainfo = [ArtifactInfo('Alignment Profile', 'BIOM', [
- (fp_biom, 'biom'), (fp_alng, 'log'),
- (_coverage_copy(f'{out_dir}/alignment/'), 'plain_text')])]
- else:
- ainfo = []
- errors.append('Missing files from the "Alignment Profile"; please '
- 'contact qiita.help@gmail.com for more information')
-
- fp_biom = f'{out_dir}/none.biom'
- if exists(fp_biom):
ainfo.append(ArtifactInfo('Per genome Predictions', 'BIOM', [
- (fp_biom, 'biom'),
+ (fp_biom, 'biom'), (fp_alng, 'log'),
(_coverage_copy(f'{out_dir}/none/'), 'plain_text')]))
else:
errors.append('Table none/per-genome was not created, please contact '
diff --git a/scripts/configure_woltka b/scripts/configure_woltka
index 25bde97..426cf37 100755
--- a/scripts/configure_woltka
+++ b/scripts/configure_woltka
@@ -16,13 +16,12 @@ from qp_woltka import plugin
@click.command()
@click.option('--env-script', prompt='Environment script',
default='source activate qp-woltka')
-@click.option('--server-cert', prompt='Server certificate', default='None')
-def config(env_script, server_cert):
+@click.option('--ca-cert', prompt='CA certificate', default='None')
+def config(env_script, ca_cert):
"""Generates the Qiita configuration files"""
- if server_cert == 'None':
- server_cert = None
- plugin.generate_config(env_script, 'start_woltka',
- server_cert=server_cert)
+ if ca_cert == 'None':
+ ca_cert = None
+ plugin.generate_config(env_script, 'start_woltka', ca_cert)
if __name__ == '__main__':
diff --git a/scripts/demux b/scripts/demux
deleted file mode 100755
index f85f46a..0000000
--- a/scripts/demux
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/env python
-
-# -----------------------------------------------------------------------------
-# Copyright (c) 2020--, The Qiita Development Team.
-#
-# Distributed under the terms of the BSD 3-clause License.
-#
-# The full license is in the file LICENSE, distributed with this software.
-# -----------------------------------------------------------------------------
-
-import sys
-from qp_woltka.util import demux
-import pandas as pd
-
-input_ = sys.stdin.buffer
-output = sys.argv[1].encode('ascii')
-prep_filepath = sys.argv[2]
-prep = pd.read_csv(prep_filepath, sep='\t', dtype=str)
-
-demux(input_, output, prep)
diff --git a/scripts/mux b/scripts/mux
deleted file mode 100755
index 3b1b565..0000000
--- a/scripts/mux
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env python
-
-# -----------------------------------------------------------------------------
-# Copyright (c) 2020--, The Qiita Development Team.
-#
-# Distributed under the terms of the BSD 3-clause License.
-#
-# The full license is in the file LICENSE, distributed with this software.
-# -----------------------------------------------------------------------------
-
-import sys
-from qp_woltka.util import mux
-
-
-files = sys.argv[1:]
-output = sys.stdout.buffer
-mux(files, output)
diff --git a/scripts/start_woltka b/scripts/start_woltka
index 878df2e..df9f4f0 100755
--- a/scripts/start_woltka
+++ b/scripts/start_woltka
@@ -35,7 +35,8 @@ def execute(url, job_id, out_dir):
# these were defined in qp_woltka/__init.py__ while defining the
# available commands for this plugin
valid_commands = [
- 'Woltka v0.1.4', 'SynDNA Woltka', 'Calculate Cell Counts',
+ 'Woltka v0.1.6, paired-end',
+ 'SynDNA Woltka', 'Calculate Cell Counts',
'Calculate RNA Copy Counts']
# this if/elif is the current solution for
@@ -62,7 +63,14 @@ def execute(url, job_id, out_dir):
else:
directory = directory.pop()
- if command == 'Woltka v0.1.4':
+ if command.startswith('Woltka v0.1.6'):
+ html_summary = qclient.get_artifact_html_summary(artifact_id)
+ if html_summary is None:
+ raise ValueError(
+ f'`{command}` relies on the artifact html_summary and '
+ f'artifact: {artifact_id} does not have one. Please send '
+                'this message to the help account.')
+ files['html_summary'] = html_summary
main_fp, merge_fp = woltka_to_array(
files, out_dir, parameters['Database'], prep, url, job_id)
else:
diff --git a/scripts/woltka_merge b/scripts/woltka_merge
index e863e9a..0480cf8 100755
--- a/scripts/woltka_merge
+++ b/scripts/woltka_merge
@@ -1,66 +1,37 @@
#!/usr/bin/env python
import os
-import biom
-import glob as glob_
-import h5py
import click
-from qp_woltka.util import merge_ranges, coverage_percentage
+import pandas as pd
+
+from qiita_client.util import system_call
+from qp_woltka.util import (
+ # merge_ranges, coverage_percentage,
+ search_by_filename)
@click.command()
@click.option('--base', type=click.Path(exists=True), required=True)
-@click.option('--glob', type=str, required=True)
-@click.option('--name', type=str, required=True)
-@click.option('--rename/--no-rename', type=bool, default=False)
@click.option('--length_map', type=click.Path(exists=True), required=False)
-def merge(base, glob, name, rename, length_map):
- # this is the size that was used in the tests that kept a small ~2.5G
- # memory footprint
- chunk_size = 30
- full = None
-
- search = os.path.join(base, glob)
- tables = glob_.glob(search)
- for block in range(0, len(tables), chunk_size):
- chunk = tables[block:block + chunk_size]
-
- loaded = []
- for c in chunk:
- skip = True
- if biom.util.is_hdf5_file(c):
- skip = False
- else:
- with open(c) as fh:
- for i, l in enumerate(fh):
- if i >= 1 and l:
- skip = False
- break
- if not skip:
- temp = biom.load_table(c)
- if temp.shape != (0, 0):
- loaded.append(temp)
-
- if full is None:
- if len(loaded) == 1:
- full = loaded[0]
- else:
- full = loaded[0].concat(loaded[1:])
- else:
- full = full.concat(loaded)
-
- with h5py.File(f'{base}/{name}.biom', 'w') as out:
- full.to_hdf5(out, 'fast-merge')
-
- if name == 'free' and length_map is not None:
- coverages = glob_.glob('coverages/*.cov')
- artifact_cov = f'{base}/artifact.cov'
- with open(artifact_cov, 'w') as out:
- out.write('\n'.join(merge_ranges(coverages)))
-
- with open(f'{base}/coverage_percentage.txt', 'w') as out:
- out.write('\n'.join(
- coverage_percentage([artifact_cov], length_map)))
+def merge(base, length_map):
+ aln_fp = f'{base}/alignments'
+ cmd = (f'mxdx consolidate-partials --output-base {aln_fp} '
+ '--extension sam.xz')
+ stdout, stderr, return_value = system_call(cmd)
+ if return_value != 0 or stderr:
+ raise ValueError('`mxdx consolidate-partials` failed '
+ f'{return_value}: {stderr}')
+
+ prep = pd.read_csv(f'{base}/prep_info.tsv', dtype=str, sep='\t')
+ lookup = prep.set_index('run_prefix')['sample_name'].to_dict()
+ ignore_fp = f'{base}/ignore'
+ os.mkdir(ignore_fp)
+ for fname in os.listdir(aln_fp):
+ if fname.startswith('dx-partial.'):
+ os.rename(f'{aln_fp}/{fname}', f'{ignore_fp}/{fname}')
+ continue
+ nfname = search_by_filename(fname, lookup)
+ os.rename(f'{aln_fp}/{fname}', f'{aln_fp}/{nfname}.sam.xz')
if __name__ == '__main__':
diff --git a/setup.py b/setup.py
index 4f15419..67aba66 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
from setuptools import setup
-__version__ = "2023.11"
+__version__ = "2024.09"
classes = """
@@ -48,15 +48,16 @@
'support_files/*',
'databases/*']},
scripts=['scripts/configure_woltka', 'scripts/start_woltka',
- 'scripts/finish_woltka', 'scripts/woltka_merge',
- 'scripts/demux', 'scripts/mux'],
+ 'scripts/finish_woltka', 'scripts/woltka_merge'],
extras_require={'test': ["nose >= 0.10.1", "pep8"]},
install_requires=['click >= 3.3', 'future', 'pandas >= 0.15',
- 'h5py >= 2.3.1', 'biom-format',
- # forcing 0.1.4 because Qiita uses this version
- 'woltka @ https://github.com/'
- 'qiyunzhu/woltka/archive/refs/tags/v0.1.4.zip',
+ 'h5py >= 2.3.1', 'biom-format', 'lxml',
+ # forcing 0.1.6 because Qiita uses this version
+ 'polars-lts-cpu', 'woltka @ https://github.com/'
+ 'qiyunzhu/woltka/archive/refs/tags/v0.1.6.zip',
'pysyndna @ https://github.com/AmandaBirmingham/'
- 'pysyndna/archive/refs/heads/main.zip'],
+ 'pysyndna/archive/refs/heads/main.zip',
+ 'mxdx @ https://github.com/wasade/mxdx/archive/'
+ 'refs/heads/main.zip'],
classifiers=classifiers
)