diff --git a/.github/workflows/qiita-plugin-ci.yml b/.github/workflows/qiita-plugin-ci.yml
index a9ffe28..7d4a8ab 100644
--- a/.github/workflows/qiita-plugin-ci.yml
+++ b/.github/workflows/qiita-plugin-ci.yml
@@ -76,8 +76,8 @@ jobs:
       shell: bash -l {0}
       run: |
         conda config --add channels bioconda
-        conda create --yes -n qp-woltka-0.1.4 python=3.9 biom-format bowtie2
-        conda activate qp-woltka-0.1.4
+        conda create --yes -n qp-woltka python=3.9 biom-format bowtie2
+        conda activate qp-woltka
        # bowtie2==2.5.0 is what's installed in main qiita so making sure we keep
        # using the same version for consistency; note that we will need to force
        conda install --yes -c bioconda bowtie2==2.5.0
@@ -87,7 +87,7 @@ jobs:
        pip --quiet install -U pip pip nose flake8
        pip --quiet install https://github.com/qiita-spots/qiita_client/archive/master.zip

-        export QIITA_SERVER_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
+        export QIITA_ROOTCA_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
        export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg
        export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/
        export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA
@@ -96,12 +96,12 @@ jobs:
        woltka_version=`woltka --version`
        bowtie2_version=`bowtie2 --version`
-        if [[ $woltka_version != *"0.1.4"* ]]; then echo "wrong woltka version", $woltka_version; exit 1; fi
+        if [[ $woltka_version != *"0.1.6"* ]]; then echo "wrong woltka version", $woltka_version; exit 1; fi
        if [[ $bowtie2_version != *"2.5.0"* ]]; then echo "wrong bowtie2 version", $bowtie2_version; exit 1; fi

        export ENVIRONMENT="source /home/runner/.profile; conda activate qp-woltka; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA"
-        configure_woltka --env-script 'source /home/runner/.profile; conda activate qp-woltka-0.1.4; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA; export ENVIRONMENT="source /home/runner/.profile; conda activate qp-woltka-0.1.4; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA"' --server-cert $QIITA_SERVER_CERT
+        configure_woltka --env-script 'source /home/runner/.profile; conda activate qp-woltka; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA; export ENVIRONMENT="source /home/runner/.profile; conda activate qp-woltka; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA"' --ca-cert $QIITA_ROOTCA_CERT

        echo "Available Qiita plugins"
        ls ~/.qiita_plugins/
@@ -110,7 +110,7 @@ jobs:
      shell: bash -l {0}
      run: |
        conda activate qiita
-        export QIITA_SERVER_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
+        export QIITA_ROOTCA_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
        export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg
        sed "s#/home/runner/work/qiita/qiita#${PWD}/qiita-dev/#g" `pwd`/qiita-dev/qiita_core/support_files/config_test.cfg > ${QIITA_CONFIG_FP}
@@ -141,10 +141,10 @@ jobs:
      env:
        COVER_PACKAGE: ${{ matrix.cover_package }}
      run: |
-        conda activate qp-woltka-0.1.4
-        export QIITA_SERVER_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
+        conda activate qp-woltka
+        export QIITA_ROOTCA_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
        export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg
-        export ENVIRONMENT="source /home/runner/.profile; conda activate qp-woltka-0.1.4; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA"
+        export ENVIRONMENT="source /home/runner/.profile; conda activate qp-woltka; export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/; export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA"
        export QC_WOLTKA_DB_DP=$PWD/qp_woltka/databases/woltka/
        export QC_WOLTKA_SYNDNA_DB_DP=$PWD/qp_woltka/databases/synDNA
        export PYTHONWARNINGS="ignore:Certificate for localhost has no \`subjectAltName\`"
diff --git a/qp_woltka/__init__.py b/qp_woltka/__init__.py
index cc43f06..516a3bf 100644
--- a/qp_woltka/__init__.py
+++ b/qp_woltka/__init__.py
@@ -27,7 +27,6 @@
 }
 outputs = {
     # taxonomic
-    'Alignment Profile': 'BIOM',
     'Per genome Predictions': 'BIOM',
     'Per gene Predictions': 'BIOM',
     # functional
@@ -37,7 +36,8 @@
 }
 dflt_param_set = generate_woltka_dflt_params()
 woltka_cmd = QiitaCommand(
-    'Woltka v0.1.4', "Functional and Taxonomic Predictions", woltka,
+    'Woltka v0.1.6, paired-end',
+    "Functional and Taxonomic Predictions", woltka,
     req_params, opt_params, outputs, dflt_param_set)

 plugin.register_command(woltka_cmd)
diff --git a/qp_woltka/support_files/files_list.tsv b/qp_woltka/support_files/files_list.tsv
new file mode 100644
index 0000000..2d896f8
--- /dev/null
+++ b/qp_woltka/support_files/files_list.tsv
@@ -0,0 +1,3 @@
+filename_1	filename_2	record_count
+/tmp/folder/S22205_S104_L001_R1_001.fastq.gz	/tmp/folder/S22205_S104_L001_R2_001.fastq.gz	1000000
+/tmp/folder/S22282_S102_L001_R1_001.fastq.gz	/tmp/folder/S22282_S102_L001_R2_001.fastq.gz	1000000
diff --git a/qp_woltka/support_files/free.biom b/qp_woltka/support_files/free.biom
deleted file mode 100644
index 14bb00a..0000000
Binary files a/qp_woltka/support_files/free.biom and /dev/null differ
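The new files_list.tsv fixture is the "file map" that mxdx consumes: one row per forward/reverse pair plus its record count, from which batches are derived. As a rough illustration of how woltka_to_array turns this map into a SLURM array range (the plugin actually shells out to `mxdx get-max-batch-number`; the cumulative-count arithmetic below is an assumption, chosen because it matches the `--array 0-0%8` range the tests expect):

from math import ceil

import pandas as pd

BATCH_SIZE = 50_000_000  # BATCHSIZE in qp_woltka/woltka.py


def max_batch_number(file_map_fp, batch_size=BATCH_SIZE):
    # hypothetical helper: batches are assumed to fill by cumulative
    # record_count and are 0-indexed, so the last batch is ceil(n/size) - 1
    fm = pd.read_csv(file_map_fp, sep='\t')
    return ceil(fm['record_count'].sum() / batch_size) - 1


# the two pairs above hold 2M records total, so everything fits in batch 0
print(max_batch_number('qp_woltka/support_files/files_list.tsv'))  # -> 0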
diff --git a/qp_woltka/support_files/summary.html b/qp_woltka/support_files/summary.html
new file mode 100644
index 0000000..0e267df
--- /dev/null
+++ b/qp_woltka/support_files/summary.html
@@ -0,0 +1,36 @@
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th>filename</th>
+      <th>md5</th>
+      <th>file_type</th>
+      <th>reads</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>S22205_S104_L001_R1_001.fastq.gz</td>
+      <td>9dcfb0c77674fdada176262963196db0</td>
+      <td>raw_forward_seqs</td>
+      <td>1000000</td>
+    </tr>
+    <tr>
+      <td>S22282_S102_L001_R1_001.fastq.gz</td>
+      <td>9dcfb0c77674fdada176262963196db0</td>
+      <td>raw_forward_seqs</td>
+      <td>1000000</td>
+    </tr>
+    <tr>
+      <td>S22205_S104_L001_R2_001.fastq.gz</td>
+      <td>9dcfb0c77674fdada176262963196db0</td>
+      <td>raw_reverse_seqs</td>
+      <td>1000000</td>
+    </tr>
+    <tr>
+      <td>S22282_S102_L001_R2_001.fastq.gz</td>
+      <td>9dcfb0c77674fdada176262963196db0</td>
+      <td>raw_reverse_seqs</td>
+      <td>1000000</td>
+    </tr>
+  </tbody>
+</table>
+
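This summary.html fixture is the artifact HTML summary that the reworked woltka_to_array parses (hence the new lxml dependency in setup.py): pandas reads the table, forward and reverse files are paired on their _R1/_R2 prefixes, and the pairs become the files_list.tsv file map. A condensed sketch of that pairing using this fixture's values (the /tmp/folder prefix mirrors the files_list.tsv fixture above; in production the directory comes from the summary's location):

import pandas as pd  # pd.read_html() needs lxml installed

df = pd.read_html('qp_woltka/support_files/summary.html')[0]
fwd = {f.rsplit('_R1')[0]: (f, n) for f, n in
       df[df.file_type == 'raw_forward_seqs'][['filename', 'reads']].values}
rev = {f.rsplit('_R2')[0]: (f, n) for f, n in
       df[df.file_type == 'raw_reverse_seqs'][['filename', 'reads']].values}
lines = ['filename_1\tfilename_2\trecord_count']
for prefix, (fn, reads) in fwd.items():
    # assumes every R1 has a matching R2, as in this fixture
    lines.append(f'/tmp/folder/{fn}\t/tmp/folder/{rev[prefix][0]}\t{reads}')
print('\n'.join(lines))  # reproduces files_list.tsv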
diff --git a/qp_woltka/tests/test_util.py b/qp_woltka/tests/test_util.py
index acc8285..88daea0 100644
--- a/qp_woltka/tests/test_util.py
+++ b/qp_woltka/tests/test_util.py
@@ -9,13 +9,9 @@
 from unittest import main, TestCase
 from os import environ
 from os.path import join
-from tempfile import TemporaryDirectory
-import gzip
-import io
-import pandas as pd

 from qp_woltka.util import (
-    get_dbs, generate_woltka_dflt_params, mux, demux, search_by_filename,
+    get_dbs, generate_woltka_dflt_params, search_by_filename,
     merge_ranges, coverage_percentage)


@@ -38,41 +34,6 @@ def test_generate_woltka_dflt_params(self):

         self.assertDictEqual(obs, exp)

-    def test_mux(self):
-        f1 = b"@foo\nATGC\n+\nIIII\n"
-        f2 = b"@bar\nAAAA\n+\nIIII\n"
-        exp = b"@foo@@@foofile\nATGC\n+\nIIII\n@bar@@@barfile\nAAAA\n+\nIIII\n"
-        with TemporaryDirectory() as d:
-            f1fp = join(d, 'foofile.fastq')
-            f2fp = join(d, 'barfile.fastq')
-            ofp = join(d, 'output')
-            with gzip.open(f1fp, 'wb') as fp:
-                fp.write(f1)
-            with gzip.open(f2fp, 'wb') as fp:
-                fp.write(f2)
-            with open(ofp, 'wb') as output:
-                mux([f1fp, f2fp], output)
-            with open(ofp, 'rb') as result:
-                obs = result.read()
-
-        self.assertEqual(obs, exp)
-
-    def test_demux(self):
-        prep = pd.DataFrame([['sample_foo', 'foofile'],
-                             ['sample_bar', 'barfile']],
-                            columns=['sample_name', 'run_prefix'])
-        input_ = io.BytesIO(b"foo@@@foofile_R1\tATGC\t+\tIIII\nbar@@@"
-                            b"barfile_R2\tAAAA\t+\tIIII\n")
-        expfoo = b"foo\tATGC\t+\tIIII\n"
-        expbar = b"bar\tAAAA\t+\tIIII\n"
-        with TemporaryDirectory() as d:
-            demux(input_, d.encode('ascii'), prep)
-            foo = open(join(d, 'sample_foo.sam'), 'rb').read()
-            bar = open(join(d, 'sample_bar.sam'), 'rb').read()
-
-        self.assertEqual(foo, expfoo)
-        self.assertEqual(bar, expbar)
-
     def test_search_by_filename(self):
         lookup = {'foo_bar': 'baz',
                   'foo': 'bar'}
diff --git a/qp_woltka/tests/test_woltka.py b/qp_woltka/tests/test_woltka.py
index 6f5383b..f4f69a4 100644
--- a/qp_woltka/tests/test_woltka.py
+++ b/qp_woltka/tests/test_woltka.py
@@ -59,18 +59,21 @@ def _helper_woltka_bowtie(self, prep_info_dict, database=None):
         fp1_2 = join(in_dir, 'S22205_S104_L001_R2_001.fastq.gz')
         fp2_1 = join(in_dir, 'S22282_S102_L001_R1_001.fastq.gz')
         fp2_2 = join(in_dir, 'S22282_S102_L001_R2_001.fastq.gz')
+        fp_summary = join(in_dir, 'summary.html')
         source_dir = 'qp_woltka/support_files'
         copyfile(f'{source_dir}/S22205_S104_L001_R1_001.fastq.gz', fp1_1)
         copyfile(f'{source_dir}/S22205_S104_L001_R2_001.fastq.gz', fp1_2)
         copyfile(f'{source_dir}/S22282_S102_L001_R1_001.fastq.gz', fp2_1)
         copyfile(f'{source_dir}/S22282_S102_L001_R2_001.fastq.gz', fp2_2)
+        copyfile(f'{source_dir}/summary.html', fp_summary)

         data = {
             'filepaths': dumps([
                 (fp1_1, 'raw_forward_seqs'),
                 (fp1_2, 'raw_reverse_seqs'),
                 (fp2_1, 'raw_forward_seqs'),
-                (fp2_2, 'raw_reverse_seqs')]),
+                (fp2_2, 'raw_reverse_seqs'),
+                (fp_summary, 'html_summary')]),
             'type': "per_sample_FASTQ",
             'name': "Test Woltka artifact",
             'prep': pid}
@@ -82,7 +85,9 @@ def _helper_woltka_bowtie(self, prep_info_dict, database=None):
             self.params['Database'] = database

         data = {'user': 'demo@microbio.me',
-                'command': dumps(['qp-woltka', '2023.11', 'Woltka v0.1.4']),
+                'command': dumps(
+                    ['qp-woltka', '2024.09',
+                     'Woltka v0.1.6, paired-end']),
                 'status': 'running',
                 'parameters': dumps(self.params)}
         job_id = self.qclient.post(
@@ -101,6 +106,8 @@ def test_woltka_to_array_rep82(self):
         self._clean_up_files.append(out_dir)

         files, prep = self.qclient.artifact_and_preparation_files(aid)
+        html_summary = self.qclient.get_artifact_html_summary(aid)
+        files['html_summary'] = html_summary

         url = 'this-is-my-url'
         database = self.params['Database']
@@ -120,14 +127,15 @@ def test_woltka_to_array_rep82(self):
             '#!/bin/bash\n',
             '#SBATCH -p qiita\n',
             '#SBATCH --mail-user "qiita.help@gmail.com"\n',
+            '#SBATCH --mail-type=FAIL,TIME_LIMIT_80,INVALID_DEPEND\n',
             f'#SBATCH --job-name {job_id}\n',
             '#SBATCH -N 1\n',
             '#SBATCH -n 8\n',
             '#SBATCH --time 40:00:00\n',
-            '#SBATCH --mem 90g\n',
+            '#SBATCH --mem 70g\n',
             f'#SBATCH --output {out_dir}/{job_id}_%a.log\n',
             f'#SBATCH --error {out_dir}/{job_id}_%a.err\n',
-            '#SBATCH --array 1-1%8\n',
+            '#SBATCH --array 0-0%8\n',
             f'cd {out_dir}\n',
             f'prep_full_path={prep_file}\n',
             f'{self.environment}\n',
             f'dbbase={dirname(database)}\n',
             f'dbname={basename(database)}\n',
             f'output={out_dir}\n',
-            'files=`cat sample_details_${SLURM_ARRAY_TASK_ID}.txt`\n',
-            'mux ${files} | bowtie2 -p 8 -x '
-            f'{database} -q - --seed 42 '
-            '--very-sensitive -k 16 --np 1 --mp "1,1" --rdg "0,1" --rfg "0,1" '
-            '--score-min "L,0,-0.05" --no-head --no-unal | cut -f1-9 | '
-            'sed \'s/$/\t*\t*/\' | demux ${output} '
-            f'{prep_file} | sort | uniq > '
-            'sample_processing_${SLURM_ARRAY_TASK_ID}.log\n',
-            '# for each one of our input files, form woltka commands, \n',
-            '# and farm off to gnu parallel\n',
-            'for f in `cat sample_processing_${SLURM_ARRAY_TASK_ID}.log`\n',
-            'do\n',
-            '  echo "woltka classify -i ${f} -o ${f}.woltka-taxa --no-demux '
-            f'--lineage {database}.tax --rank free,none --outcov '
-            'coverages/"\n',
-            'done | parallel -j 8\n',
-            'for f in `cat sample_processing_${SLURM_ARRAY_TASK_ID}.log`\n',
-            'do\n',
-            '  # compress it\n',
-            '  echo "xz -1 -T1 ${f}"\n',
-            'done | parallel -j 8\n',
+            'bt2_cores=6\n',
+            f'mxdx mux --file-map {out_dir}/files_list.tsv --batch '
+            '${SLURM_ARRAY_TASK_ID} --batch-size 50000000 '
+            '--paired-handling interleave | '
+            'bowtie2 -p ${bt2_cores} -x '
+            f'{database} --interleaved - --seed 42 --very-sensitive -k 16 '
+            '--np 1 --mp "1,1" --rdg "0,1" --rfg "0,1" '
+            '--score-min "L,0,-0.05" --no-head --no-unal --no-exact-upfront '
+            "--no-1mm-upfront | cut -f1-9 | sed \'s/$/\t*\t*/' | mxdx demux "
+            f'--file-map {out_dir}/files_list.tsv '
+            '--batch ${SLURM_ARRAY_TASK_ID} --batch-size 50000000 '
+            f'--output-base {out_dir}/alignments --extension sam.xz\n',
             'date\n']

         self.assertEqual(main, exp_main)
@@ -167,7 +166,7 @@ def test_woltka_to_array_rep82(self):
             '#SBATCH --mail-user "qiita.help@gmail.com"\n',
             f'#SBATCH --job-name merge-{job_id}\n',
             '#SBATCH -N 1\n',
-            '#SBATCH -n 2\n',
+            '#SBATCH -n 1\n',
             '#SBATCH --time 30:00:00\n',
             '#SBATCH --mem 140g\n',
             f'#SBATCH --output {out_dir}/merge-{job_id}.log\n',
@@ -179,15 +178,14 @@ def test_woltka_to_array_rep82(self):
             'echo $SLURM_JOBID\n',
             'set -e\n',
             "sruns=`grep 'overall alignment rate' *.err | wc -l`\n",
-            "sjobs=`ls sample_details_* | wc -l`\n",
-            'if [[ ! -f "errors.log" && $sruns -eq $sjobs ]]; then\n',
-            f'woltka_merge --base {out_dir} --name '
-            'free --glob "*.woltka-taxa/free.biom" &\n',
-            f'woltka_merge --base {out_dir} --name '
-            'none --glob "*.woltka-taxa/none.biom" --rename &\n',
-            'wait\n',
+            'if [[ ! -f "errors.log" && $sruns -eq "1" ]]; then\n',
+            f'woltka_merge --base {out_dir}\n',
+            f'woltka classify -i {out_dir}/alignments -o {out_dir}/woltka '
+            f'--no-demux --lineage {database}.tax '
+            '--rank free,none --outcov coverages/\n',
+            f'cd {out_dir};\n',
             '\n',
-            f'cd {out_dir}; tar -cvf alignment.tar *.sam.xz; '
+            'cd alignments; tar -cvf ../alignment.tar *.sam.xz; cd ..; '
             'tar zcvf coverages.tgz coverage_percentage.txt artifact.cov '
             'coverages\n',
             'fi\n',
@@ -197,8 +195,8 @@ def test_woltka_to_array_rep82(self):

         # now let's test that if finished correctly
         sdir = 'qp_woltka/support_files/'
-        copyfile(f'{sdir}/none.biom', f'{out_dir}/none.biom')
-        copyfile(f'{sdir}/free.biom', f'{out_dir}/free.biom')
+        mkdir(f'{out_dir}/woltka')
+        copyfile(f'{sdir}/none.biom', f'{out_dir}/woltka/none.biom')
         copyfile(f'{sdir}/alignment.tar', f'{out_dir}/alignment.tar')
         copyfile(f'{sdir}/coverages.tgz', f'{out_dir}/coverages.tgz')
@@ -209,13 +207,9 @@ def test_woltka_to_array_rep82(self):
         self.assertTrue(success)

         exp = [
-            ArtifactInfo('Alignment Profile', 'BIOM',
-                         [(f'{out_dir}/free.biom', 'biom'),
-                          (f'{out_dir}/alignment.tar', 'log'),
-                          (f'{out_dir}/alignment/coverages.tgz',
-                           'plain_text')]),
             ArtifactInfo('Per genome Predictions', 'BIOM',
-                         [(f'{out_dir}/none.biom', 'biom'),
+                         [(f'{out_dir}/woltka/none.biom', 'biom'),
+                          (f'{out_dir}/alignment.tar', 'log'),
                           (f'{out_dir}/none/coverages.tgz', 'plain_text')])]
         self.assertCountEqual(ainfo, exp)
@@ -232,6 +226,8 @@ def test_woltka_to_array_wol(self):
         self._clean_up_files.append(out_dir)

         files, prep = self.qclient.artifact_and_preparation_files(aid)
+        html_summary = self.qclient.get_artifact_html_summary(aid)
+        files['html_summary'] = html_summary

         url = 'this-is-my-url'

         main_fp, merge_fp = woltka_to_array(
@@ -250,14 +246,15 @@ def test_woltka_to_array_wol(self):
             '#!/bin/bash\n',
             '#SBATCH -p qiita\n',
             '#SBATCH --mail-user "qiita.help@gmail.com"\n',
+            '#SBATCH --mail-type=FAIL,TIME_LIMIT_80,INVALID_DEPEND\n',
             f'#SBATCH --job-name {job_id}\n',
             '#SBATCH -N 1\n',
             '#SBATCH -n 8\n',
             '#SBATCH --time 40:00:00\n',
-            '#SBATCH --mem 90g\n',
+            '#SBATCH --mem 70g\n',
             f'#SBATCH --output {out_dir}/{job_id}_%a.log\n',
             f'#SBATCH --error {out_dir}/{job_id}_%a.err\n',
-            '#SBATCH --array 1-1%8\n',
+            '#SBATCH --array 0-0%8\n',
             f'cd {out_dir}\n',
             f'prep_full_path={prep_file}\n',
             f'{self.environment}\n',
@@ -267,29 +264,18 @@ def test_woltka_to_array_wol(self):
             f'dbbase={dirname(database)}\n',
             f'dbname={basename(database)}\n',
             f'output={out_dir}\n',
-            'files=`cat sample_details_${SLURM_ARRAY_TASK_ID}.txt`\n',
-            'mux ${files} | bowtie2 -p 8 -x '
-            f'{database} -q - --seed 42 '
-            '--very-sensitive -k 16 --np 1 --mp "1,1" --rdg "0,1" --rfg "0,1" '
-            '--score-min "L,0,-0.05" --no-head --no-unal | cut -f1-9 | '
-            'sed \'s/$/\t*\t*/\' | demux ${output} '
-            f'{prep_file} | sort | uniq > '
-            'sample_processing_${SLURM_ARRAY_TASK_ID}.log\n',
-            '# for each one of our input files, form woltka commands, \n',
-            '# and farm off to gnu parallel\n',
-            'for f in `cat sample_processing_${SLURM_ARRAY_TASK_ID}.log`\n',
-            'do\n',
-            '  echo "woltka classify -i ${f} -o ${f}.woltka-taxa --no-demux '
-            f'--lineage {database}.tax --rank free,none --outcov '
-            'coverages/"\n',
-            f'  echo "woltka classify -i ${{f}} -c {database}.coords '
-            '-o ${f}.woltka-per-gene --no-demux"\n',
-            'done | parallel -j 8\n',
-            'for f in `cat sample_processing_${SLURM_ARRAY_TASK_ID}.log`\n',
-            'do\n',
-            '  # compress it\n',
-            '  echo "xz -1 -T1 ${f}"\n',
-            'done | parallel -j 8\n',
+            'bt2_cores=6\n',
+            f'mxdx mux --file-map {out_dir}/files_list.tsv --batch '
+            '${SLURM_ARRAY_TASK_ID} --batch-size 50000000 '
+            '--paired-handling interleave | '
+            'bowtie2 -p ${bt2_cores} -x '
+            f'{database} --interleaved - --seed 42 --very-sensitive -k 16 '
+            '--np 1 --mp "1,1" --rdg "0,1" --rfg "0,1" '
+            '--score-min "L,0,-0.05" --no-head --no-unal --no-exact-upfront '
+            "--no-1mm-upfront | cut -f1-9 | sed \'s/$/\t*\t*/' | mxdx demux "
+            f'--file-map {out_dir}/files_list.tsv '
+            '--batch ${SLURM_ARRAY_TASK_ID} --batch-size 50000000 '
+            f'--output-base {out_dir}/alignments --extension sam.xz\n',
             'date\n']

         self.assertEqual(main, exp_main)
@@ -299,7 +285,7 @@ def test_woltka_to_array_wol(self):
             '#SBATCH --mail-user "qiita.help@gmail.com"\n',
             f'#SBATCH --job-name merge-{job_id}\n',
             '#SBATCH -N 1\n',
-            '#SBATCH -n 3\n',
+            '#SBATCH -n 1\n',
             '#SBATCH --time 30:00:00\n',
             '#SBATCH --mem 140g\n',
             f'#SBATCH --output {out_dir}/merge-{job_id}.log\n',
@@ -311,17 +297,16 @@ def test_woltka_to_array_wol(self):
             'echo $SLURM_JOBID\n',
             'set -e\n',
             "sruns=`grep 'overall alignment rate' *.err | wc -l`\n",
-            "sjobs=`ls sample_details_* | wc -l`\n",
-            'if [[ ! -f "errors.log" && $sruns -eq $sjobs ]]; then\n',
-            f'woltka_merge --base {out_dir} --name '
-            'free --glob "*.woltka-taxa/free.biom" &\n',
-            f'woltka_merge --base {out_dir} --name '
-            'none --glob "*.woltka-taxa/none.biom" &\n',
-            f'woltka_merge --base {out_dir} --name '
-            'per-gene --glob "*.woltka-per-gene" --rename &\n',
-            'wait\n',
+            'if [[ ! -f "errors.log" && $sruns -eq "1" ]]; then\n',
+            f'woltka_merge --base {out_dir}\n',
+            f'woltka classify -i {out_dir}/alignments -o {out_dir}/woltka '
+            f'--no-demux --lineage {database}.tax '
+            '--rank free,none --outcov coverages/\n',
+            f'woltka classify -i {out_dir}/alignments --no-demux -c '
+            f'{database}.coords -o per-gene.biom\n',
+            f'cd {out_dir};\n',
             '\n',
-            f'cd {out_dir}; tar -cvf alignment.tar *.sam.xz; '
+            'cd alignments; tar -cvf ../alignment.tar *.sam.xz; cd ..; '
             'tar zcvf coverages.tgz coverage_percentage.txt artifact.cov '
             'coverages\n',
             'fi\n',
@@ -331,9 +316,9 @@ def test_woltka_to_array_wol(self):

         # now let's test that if finished correctly
         sdir = 'qp_woltka/support_files/'
-        copyfile(f'{sdir}/none.biom', f'{out_dir}/none.biom')
+        mkdir(f'{out_dir}/woltka')
+        copyfile(f'{sdir}/none.biom', f'{out_dir}/woltka/none.biom')
         copyfile(f'{sdir}/per-gene.biom', f'{out_dir}/per-gene.biom')
-        copyfile(f'{sdir}/free.biom', f'{out_dir}/free.biom')
         copyfile(f'{sdir}/alignment.tar', f'{out_dir}/alignment.tar')
         copyfile(f'{sdir}/coverages.tgz', f'{out_dir}/coverages.tgz')
         success, ainfo, msg = woltka(
@@ -343,13 +328,9 @@ def test_woltka_to_array_wol(self):
         self.assertTrue(success)

         exp = [
-            ArtifactInfo('Alignment Profile', 'BIOM',
-                         [(f'{out_dir}/free.biom', 'biom'),
-                          (f'{out_dir}/alignment.tar', 'log'),
-                          (f'{out_dir}/alignment/coverages.tgz',
-                           'plain_text')]),
             ArtifactInfo('Per genome Predictions', 'BIOM',
-                         [(f'{out_dir}/none.biom', 'biom'),
+                         [(f'{out_dir}/woltka/none.biom', 'biom'),
+                          (f'{out_dir}/alignment.tar', 'log'),
                           (f'{out_dir}/none/coverages.tgz', 'plain_text')]),
             ArtifactInfo('Per gene Predictions', 'BIOM',
                          [(f'{out_dir}/per-gene.biom', 'biom'),
@@ -373,8 +354,6 @@ def test_woltka_to_array_error(self):
             self.qclient, job_id, self.params, out_dir)

         exp_msg = '\n'.join([
-            'Missing files from the "Alignment Profile"; please contact '
-            'qiita.help@gmail.com for more information',
             'Table none/per-genome was not created, please contact '
             'qiita.help@gmail.com for more information',
             'Table per-gene was not created, please contact '
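Both tests now assert the same per-batch pipeline (mxdx mux | bowtie2 --interleaved | mxdx demux) and a merge script gated on `$sruns -eq "1"`: bowtie2 prints one 'overall alignment rate' line per array task, so counting those lines across the .err files tells the merge job whether every batch finished before classification starts. A sketch of that check in Python (paths and the helper name are illustrative; the real check is the grep/wc pair in the script):

from glob import glob


def all_batches_finished(out_dir, n_batches):
    # mirrors: sruns=`grep 'overall alignment rate' *.err | wc -l`
    sruns = 0
    for err_fp in glob(f'{out_dir}/*.err'):
        with open(err_fp) as fh:
            sruns += sum('overall alignment rate' in line for line in fh)
    return sruns == n_batches  # n_files + 1, since batches are 0-indexed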
diff --git a/qp_woltka/util.py b/qp_woltka/util.py
index 3d87f06..96e56d3 100644
--- a/qp_woltka/util.py
+++ b/qp_woltka/util.py
@@ -10,14 +10,11 @@
 from configparser import ConfigParser
 from collections import defaultdict

-import gzip
-from signal import signal, SIGPIPE, SIG_DFL
-
 from qiita_client import QiitaClient


 plugin_details = {'name': 'qp-woltka',
-                  'version': '2023.11',
+                  'version': '2024.09',
                   'description': 'Woltka'}


@@ -62,49 +59,11 @@ def client_connect(url):
     config.readfp(conf_file)
     qclient = QiitaClient(url, config.get('oauth2', 'CLIENT_ID'),
                           config.get('oauth2', 'CLIENT_SECRET'),
-                          server_cert=config.get('oauth2', 'SERVER_CERT'))
+                          config.get('oauth2', 'SERVER_CERT'))

     return qclient


-def mux(files, output):
-    # https://linuxpip.org/broken-pipe-python-error/
-    # Ignore SIG_PIPE and don't throw exceptions on it
-    # http://docs.python.org/library/signal.html
-    signal(SIGPIPE, SIG_DFL)
-
-    delimiter = b'@@@'
-    newline = b'\n'
-
-    errors = []
-    # the name used here is the filename, it is not read orientation agnostic,
-    # should it be?
-    for f in files:
-        name = f.split('/')[-1]
-        name = name.split('.fastq')[0].encode('ascii')
-
-        try:
-            fp = gzip.open(f)
-            id_ = iter(fp)
-            seq = iter(fp)
-            dumb = iter(fp)
-            qual = iter(fp)
-            for i, s, d, q in zip(id_, seq, dumb, qual):
-                base_i = i.strip().split(b' ', 1)[0]
-                new_i = base_i + delimiter + name + newline
-
-                output.write(new_i)
-                output.write(s)
-                output.write(d)
-                output.write(q)
-        except Exception as e:
-            errors.append(f'{f}\t{e}')
-
-    if errors:
-        with open('errors.log', 'w') as error_log:
-            error_log.write('\n'.join(errors))
-
-
 def search_by_filename(fname, lookup):
     if fname in lookup:
         return lookup[fname]
@@ -128,44 +87,6 @@ def search_by_filename(fname, lookup):
     raise KeyError("Cannot determine run_prefix for %s" % original)


-def demux(input_, output, prep):
-    lookup = prep.set_index('run_prefix')['sample_name'].to_dict()
-    delimiter = b'@@@'
-    tab = b'\t'
-    mode = 'ab'  # IMPORTANT: we are opening in append not write
-    ext = b'.sam'
-    sep = b'/'
-
-    # read each record
-    # parse the filename out
-    # if the file is not open, open it
-    # we are assuming the records are coming in grouped by file
-    # however the method will work if records are for whatever
-    # reason interleaved
-    current_fname = None
-    current_fp = None
-
-    for line in input_:
-        id_, remainder = line.split(tab, 1)
-        id_, fname = id_.rsplit(delimiter, 1)
-        fname = fname.strip()
-
-        if fname != current_fname:
-            if current_fp is not None:
-                current_fp.close()
-
-            sample = search_by_filename(
-                fname.decode('utf8'), lookup).encode('ascii')
-            fullname = output + sep + sample + ext
-            current_fp = open(fullname, mode)
-            current_fname = fname
-            print(fullname.decode('ascii'))
-
-        current_fp.write(id_)
-        current_fp.write(tab)
-        current_fp.write(remainder)
-
-
 def _merge_ranges(files):
     # the lines below are borrowed from zebra filter but they are slightly
     # modified; mainly the autocompress parameter was deleted so it always
diff --git a/qp_woltka/woltka.py b/qp_woltka/woltka.py
index 2815b30..77e0fbc 100644
--- a/qp_woltka/woltka.py
+++ b/qp_woltka/woltka.py
@@ -21,16 +21,19 @@
 from qp_woltka.util import search_by_filename

 from qiita_client import ArtifactInfo
+from qiita_client.util import system_call

 # resources per job
 PPN = 8
 MAX_RUNNING = 8
 TASKS_IN_SCRIPT = 10
-MEMORY = '90g'
+MEMORY = '70g'
 LARGE_MEMORY = '150g'
 MERGE_MEMORY = '140g'
 SYNDNA_MEMORY = '190g'
+# set so that an iSeq run generates 2 jobs
+BATCHSIZE = 50000000
 WALLTIME = '40:00:00'
 MERGE_WALLTIME = '30:00:00'
@@ -77,69 +80,94 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
     """
     environment = environ["ENVIRONMENT"]

+    # processing html_summary
+    html_summary = files.pop('html_summary')
+    df = pd.read_html(html_summary)[0]
+    dname = dirname(html_summary)
+    fwd = dict(df[df.file_type == 'raw_forward_seqs'].apply(
+        lambda x: (x.filename.rsplit('_R1')[0],
+                   (x.filename, x.reads)), axis=1).values)
+    rev = dict(df[df.file_type == 'raw_reverse_seqs'].apply(
+        lambda x: (x.filename.rsplit('_R2')[0],
+                   (x.filename, x.reads)), axis=1).values)
+    lines = ['filename_1\trecord_count']
+    if rev:
+        lines = ['filename_1\tfilename_2\trecord_count']
+    for k, (fn, reads) in fwd.items():
+        line = f'{dname}/{fn}\t'
+        if k in rev:
+            rfn = rev.pop(k)[0]
+            line += f'{dname}/{rfn}\t'
+        line += f'{reads}'
+        lines.append(line)
+    files_list_fp = f'{output}/files_list.tsv'
+    with open(files_list_fp, 'w') as fp:
+        fp.write('\n'.join(lines))
+
+    cmd = (f'mxdx get-max-batch-number --file-map {files_list_fp} '
+           f'--batch-size {BATCHSIZE}')
+    n_files, stderr, return_value = system_call(cmd)
+    if return_value != 0 or stderr:
+        raise ValueError('`mxdx get-max-batch-number` failed '
+                         f'{return_value}: {stderr}')
+    # just making sure that n_files is an int
+    n_files = int(n_files)
+
     db_files = _process_database_files(database_bowtie2)

     db_folder = dirname(database_bowtie2)
     db_name = basename(database_bowtie2)
-    n_files = 1
-    for i, (k, (f, r)) in enumerate(files.items()):
-        if i >= n_files*TASKS_IN_SCRIPT:
-            n_files += 1
-        with open(join(output, f'sample_details_{n_files}.txt'), 'a+') as fh:
-            fh.write(f'{f["filepath"]}\n')
-            if r is not None:
-                fh.write(f'{r["filepath"]}\n')
-
-    ranks = ["free", "none"]
-    # now, let's establish the merge script.
-    merges = []
-    merge_inv = f'woltka_merge --base {output} '
-    fcmds = []
-    for r in ranks:
-        cmd = [merge_inv, f'--name {r}', f'--glob "*.woltka-taxa/{r}.biom"']
-        if r == 'free' and 'length.map' in db_files:
-            cmd.append(f'--length_map {db_files["length.map"]}')
-        cmd.append('&')
-        merges.append(" ".join(cmd))
-    if db_files['gene_coordinates'] is not None:
-        merges.append(" ".join([merge_inv, '--name per-gene',
-                                '--glob "*.woltka-per-gene"',
-                                '--rename &']))  # run all at once
+    woltka_merge = f'woltka_merge --base {output}'
+    extra_commands = ''
+    if 'length.map' in db_files:
+        woltka_merge += f' --length_map {db_files["length.map"]}'
+        extra_commands = (
+            'python -c "from glob import glob; from qp_woltka.util import '
+            "merge_ranges; coverages = glob('coverages/*.cov'); "
+            "open('artifact.cov', 'w').write('\\n'.join("
+            'merge_ranges(coverages)))"\n'
+            'python -c "from qp_woltka.util import coverage_percentage; '
+            "open('coverage_percentage.txt', 'w').write('\\n'.join("
+            "coverage_percentage(['artifact.cov'], '"
+            f'{db_files["length.map"]}' "')))\"")
+
+    ranks = ','.join(["free", "none"])
+    woltka_cmds = [
+        f'woltka classify -i {output}/alignments -o {output}/woltka '
+        f'--no-demux --lineage {db_files["taxonomy"]} --rank {ranks} '
+        '--outcov coverages/']
+    if db_files['gene_coordinates']:
+        woltka_cmds.append(
+            f'woltka classify -i {output}/alignments '
+            f'--no-demux -c {db_files["gene_coordinates"]} -o per-gene.biom')

     wcdm = 'woltka tools collapse -i '
     dbfk = db_files['kegg']
     if dbfk["orf-to-ko.map.xz"] is not None:
-        fcmds.append(f'{wcdm} per-gene.biom -m {dbfk["orf-to-ko.map.xz"]} '
-                     '-o ko.biom')
+        woltka_cmds.append(f'{wcdm} per-gene.biom -m '
+                           f'{dbfk["orf-to-ko.map.xz"]} -o ko.biom')
     if dbfk["ko-to-ec.map"] is not None:
-        fcmds.append(f'{wcdm} ko.biom -m {dbfk["ko-to-ec.map"]} '
-                     '-o ec.biom')
+        woltka_cmds.append(f'{wcdm} ko.biom -m {dbfk["ko-to-ec.map"]} '
+                           '-o ec.biom')
     if dbfk["ko-to-reaction.map"] is not None and \
             dbfk["reaction-to-module.map"] is not None and \
             dbfk["module-to-pathway.map"] is not None:
-        fcmds.append(f'{wcdm} ko.biom -m {dbfk["ko-to-reaction.map"]} '
-                     '-o reaction.biom')
-        fcmds.append(f'{wcdm} reaction.biom -m '
-                     f'{dbfk["reaction-to-module.map"]} -o module.biom')
-        fcmds.append(f'{wcdm} module.biom -m '
-                     f'{dbfk["module-to-pathway.map"]} -o pathway.biom')
-    else:
-        # for "simplicity" we will inject the `--rename` flag to the last
-        # merge command (between all the parameters and the last &)
-        m = merges[-1].split(' ')
-        merges[-1] = " ".join(m[:-1] + ['--rename'] + [m[-1]])
-
-    # The merge for a HiSeq 2000 lane was 40 seconds and ~150MB of memory.
-    # But, let's over request just in case (and this is a very small request
-    # relative to the rest of the work).
-    n_merges = len(merges)
-    assert n_merges < 32  # 32 merges would be crazy...
+        woltka_cmds.append(
+            f'{wcdm} ko.biom -m {dbfk["ko-to-reaction.map"]} '
+            '-o reaction.biom')
+        woltka_cmds.append(
+            f'{wcdm} reaction.biom -m '
+            f'{dbfk["reaction-to-module.map"]} -o module.biom')
+        woltka_cmds.append(
+            f'{wcdm} module.biom -m '
+            f'{dbfk["module-to-pathway.map"]} -o pathway.biom')

     lines = ['#!/bin/bash',
              '#SBATCH -p qiita',
              '#SBATCH --mail-user "qiita.help@gmail.com"',
              f'#SBATCH --job-name merge-{name}',
              '#SBATCH -N 1',
-             f'#SBATCH -n {n_merges}',
+             '#SBATCH -n 1',
              f'#SBATCH --time {MERGE_WALLTIME}',
              f'#SBATCH --mem {MERGE_MEMORY}',
              f'#SBATCH --output {output}/merge-{name}.log',
@@ -154,12 +182,12 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
              # reports is equal to the numbers of jobs that started : process
              # the bioms
              "sruns=`grep 'overall alignment rate' *.err | wc -l`",
-             'sjobs=`ls sample_details_* | wc -l`',
-             'if [[ ! -f "errors.log" && $sruns -eq $sjobs ]]; then',
-             '\n'.join(merges),
-             "wait",
-             '\n'.join(fcmds),
-             f'cd {output}; tar -cvf alignment.tar *.sam.xz; '
+             f'if [[ ! -f "errors.log" && $sruns -eq "{n_files + 1}" ]]; then',
+             woltka_merge,
+             '\n'.join(woltka_cmds),
+             f'cd {output};',
+             extra_commands,
+             'cd alignments; tar -cvf ../alignment.tar *.sam.xz; cd ..; '
              'tar zcvf coverages.tgz coverage_percentage.txt artifact.cov '
              'coverages\n'
              'fi',
@@ -175,20 +203,19 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
     # https://github.com/BenLangmead/bowtie2/issues/311
     preparation_information = join(output, 'prep_info.tsv')
     prep.set_index('sample_name').to_csv(preparation_information, sep='\t')
-    bowtie2 = 'mux ${files} | ' + \
-              f'bowtie2 -p {PPN} -x {database_bowtie2} ' + \
-              '-q - --seed 42 ' + \
-              '--very-sensitive -k 16 --np 1 --mp "1,1" ' + \
-              '--rdg "0,1" --rfg "0,1" --score-min ' + \
-              '"L,0,-0.05" --no-head --no-unal' + \
-              " | cut -f1-9 | sed 's/$/\t*\t*/'" + \
-              ' | demux ${output} ' + preparation_information + \
-              ' | sort | uniq > sample_processing_${SLURM_ARRAY_TASK_ID}.log'
-    woltka = 'woltka classify -i ${f} ' + \
-             '-o ${f}.woltka-taxa ' + \
-             '--no-demux ' + \
-             f'--lineage {db_files["taxonomy"]} ' + \
-             f'--rank {",".join(ranks)} --outcov coverages/'
+    bowtie2 = (
+        f'mxdx mux --file-map {files_list_fp} --batch '
+        '${SLURM_ARRAY_TASK_ID} '
+        f'--batch-size {BATCHSIZE} --paired-handling interleave | '
+        'bowtie2 -p ${bt2_cores} '
+        f'-x {database_bowtie2} --interleaved - --seed 42 '
+        '--very-sensitive -k 16 --np 1 --mp "1,1" --rdg "0,1" --rfg "0,1" '
+        '--score-min "L,0,-0.05" --no-head --no-unal --no-exact-upfront '
+        "--no-1mm-upfront | cut -f1-9 | sed 's/$/\t*\t*/' | "
+        f'mxdx demux --file-map {files_list_fp} '
+        '--batch ${SLURM_ARRAY_TASK_ID} '
+        f'--batch-size {BATCHSIZE} --output-base {output}/alignments '
+        '--extension sam.xz')

     memory = MEMORY
     if 'RS210' in database_bowtie2:
@@ -198,6 +225,7 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
     lines = ['#!/bin/bash',
              '#SBATCH -p qiita',
              '#SBATCH --mail-user "qiita.help@gmail.com"',
+             '#SBATCH --mail-type=FAIL,TIME_LIMIT_80,INVALID_DEPEND',
              f'#SBATCH --job-name {name}',
              '#SBATCH -N 1',
              f'#SBATCH -n {PPN}',
@@ -205,7 +233,7 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
              f'#SBATCH --mem {memory}',
              f'#SBATCH --output {output}/{name}_%a.log',
              f'#SBATCH --error {output}/{name}_%a.err',
-             f'#SBATCH --array 1-{n_files}%{MAX_RUNNING}',
+             f'#SBATCH --array 0-{n_files}%{MAX_RUNNING}',
              f'cd {output}',
              f'prep_full_path={preparation_information}',
              f'{environment}',
@@ -215,27 +243,8 @@ def woltka_to_array(files, output, database_bowtie2, prep, url, name):
              f'dbbase={db_folder}',
              f'dbname={db_name}',
              f'output={output}',
-             'files=`cat sample_details_${SLURM_ARRAY_TASK_ID}.txt`',
-             bowtie2,
-             '# for each one of our input files, form woltka commands, ',
-             '# and farm off to gnu parallel',
-             'for f in `cat sample_processing_${SLURM_ARRAY_TASK_ID}.log`',
-             'do',
-             f'  echo "{woltka}"']
-
-    if db_files['gene_coordinates'] is not None:
-        lines.append('  echo "woltka classify -i ${f} '
-                     f'-c {db_files["gene_coordinates"]} '
-                     '-o ${f}.woltka-per-gene --no-demux"')
-    lines.append('done | parallel -j 8')
-
-    # finally, compress each one of our sam files
-    lines.extend([
-        'for f in `cat sample_processing_${SLURM_ARRAY_TASK_ID}.log`',
-        'do',
-        '  # compress it',
-        '  echo "xz -1 -T1 ${f}"',
-        'done | parallel -j 8'])
+             f'bt2_cores={PPN - 2}',
+             bowtie2]

     lines.append('date')  # end time
@@ -279,21 +288,12 @@ def _coverage_copy(dest):
     errors = []
     ainfo = []

-    fp_biom = f'{out_dir}/free.biom'
+
+    fp_biom = f'{out_dir}/woltka/none.biom'
     fp_alng = f'{out_dir}/alignment.tar'
     if exists(fp_biom) and exists(fp_alng):
-        ainfo = [ArtifactInfo('Alignment Profile', 'BIOM', [
-            (fp_biom, 'biom'), (fp_alng, 'log'),
-            (_coverage_copy(f'{out_dir}/alignment/'), 'plain_text')])]
-    else:
-        ainfo = []
-        errors.append('Missing files from the "Alignment Profile"; please '
-                      'contact qiita.help@gmail.com for more information')
-
-    fp_biom = f'{out_dir}/none.biom'
-    if exists(fp_biom):
         ainfo.append(ArtifactInfo('Per genome Predictions', 'BIOM', [
-            (fp_biom, 'biom'),
+            (fp_biom, 'biom'), (fp_alng, 'log'),
             (_coverage_copy(f'{out_dir}/none/'), 'plain_text')]))
     else:
         errors.append('Table none/per-genome was not created, please contact '
diff --git a/scripts/configure_woltka b/scripts/configure_woltka
index 25bde97..426cf37 100755
--- a/scripts/configure_woltka
+++ b/scripts/configure_woltka
@@ -16,13 +16,12 @@ from qp_woltka import plugin
 @click.command()
 @click.option('--env-script', prompt='Environment script',
               default='source activate qp-woltka')
-@click.option('--server-cert', prompt='Server certificate', default='None')
-def config(env_script, server_cert):
+@click.option('--ca-cert', prompt='Server certificate', default='None')
+def config(env_script, ca_cert):
     """Generates the Qiita configuration files"""
-    if server_cert == 'None':
-        server_cert = None
-    plugin.generate_config(env_script, 'start_woltka',
-                           server_cert=server_cert)
+    if ca_cert == 'None':
+        ca_cert = None
+    plugin.generate_config(env_script, 'start_woltka', ca_cert)


 if __name__ == '__main__':
diff --git a/scripts/demux b/scripts/demux
deleted file mode 100755
index f85f46a..0000000
--- a/scripts/demux
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/env python
-
-# -----------------------------------------------------------------------------
-# Copyright (c) 2020--, The Qiita Development Team.
-#
-# Distributed under the terms of the BSD 3-clause License.
-#
-# The full license is in the file LICENSE, distributed with this software.
-# -----------------------------------------------------------------------------
-
-import sys
-from qp_woltka.util import demux
-import pandas as pd
-
-input_ = sys.stdin.buffer
-output = sys.argv[1].encode('ascii')
-prep_filepath = sys.argv[2]
-prep = pd.read_csv(prep_filepath, sep='\t', dtype=str)
-
-demux(input_, output, prep)
diff --git a/scripts/mux b/scripts/mux
deleted file mode 100755
index 3b1b565..0000000
--- a/scripts/mux
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env python
-
-# -----------------------------------------------------------------------------
-# Copyright (c) 2020--, The Qiita Development Team.
-#
-# Distributed under the terms of the BSD 3-clause License.
-#
-# The full license is in the file LICENSE, distributed with this software.
-# -----------------------------------------------------------------------------
-
-import sys
-from qp_woltka.util import mux
-
-
-files = sys.argv[1:]
-output = sys.stdout.buffer
-mux(files, output)
diff --git a/scripts/start_woltka b/scripts/start_woltka
index 878df2e..df9f4f0 100755
--- a/scripts/start_woltka
+++ b/scripts/start_woltka
@@ -35,7 +35,8 @@ def execute(url, job_id, out_dir):
     # these were defined in qp_woltka/__init__.py while defining the
     # available commands for this plugin
     valid_commands = [
-        'Woltka v0.1.4', 'SynDNA Woltka', 'Calculate Cell Counts',
+        'Woltka v0.1.6, paired-end',
+        'SynDNA Woltka', 'Calculate Cell Counts',
         'Calculate RNA Copy Counts']

     # this if/elif is the current solution for
@@ -62,7 +63,14 @@ def execute(url, job_id, out_dir):
     else:
         directory = directory.pop()

-    if command == 'Woltka v0.1.4':
+    if command.startswith('Woltka v0.1.6'):
+        html_summary = qclient.get_artifact_html_summary(artifact_id)
+        if html_summary is None:
+            raise ValueError(
+                f'`{command}` relies on the artifact html_summary and '
+                f'artifact: {artifact_id} does not have one. Please send '
+                'this message to the help account.')
+        files['html_summary'] = html_summary
         main_fp, merge_fp = woltka_to_array(
             files, out_dir, parameters['Database'], prep, url, job_id)
     else:
diff --git a/scripts/woltka_merge b/scripts/woltka_merge
index e863e9a..0480cf8 100755
--- a/scripts/woltka_merge
+++ b/scripts/woltka_merge
@@ -1,66 +1,37 @@
 #!/usr/bin/env python
 import os
-import biom
-import glob as glob_
-import h5py
 import click
-from qp_woltka.util import merge_ranges, coverage_percentage
+import pandas as pd
+
+from qiita_client.util import system_call
+from qp_woltka.util import (
+    # merge_ranges, coverage_percentage,
+    search_by_filename)


 @click.command()
 @click.option('--base', type=click.Path(exists=True), required=True)
-@click.option('--glob', type=str, required=True)
-@click.option('--name', type=str, required=True)
-@click.option('--rename/--no-rename', type=bool, default=False)
 @click.option('--length_map', type=click.Path(exists=True), required=False)
-def merge(base, glob, name, rename, length_map):
-    # this is the size that was used in the tests that kept a small ~2.5G
-    # memory footprint
-    chunk_size = 30
-    full = None
-
-    search = os.path.join(base, glob)
-    tables = glob_.glob(search)
-    for block in range(0, len(tables), chunk_size):
-        chunk = tables[block:block + chunk_size]
-
-        loaded = []
-        for c in chunk:
-            skip = True
-            if biom.util.is_hdf5_file(c):
-                skip = False
-            else:
-                with open(c) as fh:
-                    for i, l in enumerate(fh):
-                        if i >= 1 and l:
-                            skip = False
-                            break
-            if not skip:
-                temp = biom.load_table(c)
-                if temp.shape != (0, 0):
-                    loaded.append(temp)
-
-        if full is None:
-            if len(loaded) == 1:
-                full = loaded[0]
-            else:
-                full = loaded[0].concat(loaded[1:])
-        else:
-            full = full.concat(loaded)
-
-    with h5py.File(f'{base}/{name}.biom', 'w') as out:
-        full.to_hdf5(out, 'fast-merge')
-
-    if name == 'free' and length_map is not None:
-        coverages = glob_.glob('coverages/*.cov')
-        artifact_cov = f'{base}/artifact.cov'
-        with open(artifact_cov, 'w') as out:
-            out.write('\n'.join(merge_ranges(coverages)))
-
-        with open(f'{base}/coverage_percentage.txt', 'w') as out:
-            out.write('\n'.join(
-                coverage_percentage([artifact_cov], length_map)))
+def merge(base, length_map):
+    aln_fp = f'{base}/alignments'
+    cmd = (f'mxdx consolidate-partials --output-base {aln_fp} '
+           '--extension sam.xz')
+    stdout, stderr, return_value = system_call(cmd)
+    if return_value != 0 or stderr:
+        raise ValueError('`mxdx consolidate-partials` failed '
+                         f'{return_value}: {stderr}')
+
+    prep = pd.read_csv(f'{base}/prep_info.tsv', dtype=str, sep='\t')
+    lookup = prep.set_index('run_prefix')['sample_name'].to_dict()
+    ignore_fp = f'{base}/ignore'
+    os.mkdir(ignore_fp)
+    for fname in os.listdir(aln_fp):
+        if fname.startswith('dx-partial.'):
+            os.rename(f'{aln_fp}/{fname}', f'{ignore_fp}/{fname}')
+            continue
+        nfname = search_by_filename(fname, lookup)
+        os.rename(f'{aln_fp}/{fname}', f'{aln_fp}/{nfname}.sam.xz')


 if __name__ == '__main__':
diff --git a/setup.py b/setup.py
index 4f15419..67aba66 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@

 from setuptools import setup

-__version__ = "2023.11"
+__version__ = "2024.09"


 classes = """
@@ -48,15 +48,16 @@
                            'support_files/*', 'databases/*']},
     scripts=['scripts/configure_woltka', 'scripts/start_woltka',
-             'scripts/finish_woltka', 'scripts/woltka_merge',
-             'scripts/demux', 'scripts/mux'],
+             'scripts/finish_woltka', 'scripts/woltka_merge'],
     extras_require={'test': ["nose >= 0.10.1", "pep8"]},
     install_requires=['click >= 3.3', 'future', 'pandas >= 0.15',
-                      'h5py >= 2.3.1', 'biom-format',
-                      # forcing 0.1.4 because Qiita uses this version
-                      'woltka @ https://github.com/'
-                      'qiyunzhu/woltka/archive/refs/tags/v0.1.4.zip',
+                      'h5py >= 2.3.1', 'biom-format', 'lxml',
+                      # forcing 0.1.6 because Qiita uses this version
+                      'polars-lts-cpu', 'woltka @ https://github.com/'
+                      'qiyunzhu/woltka/archive/refs/tags/v0.1.6.zip',
                       'pysyndna @ https://github.com/AmandaBirmingham/'
-                      'pysyndna/archive/refs/heads/main.zip'],
+                      'pysyndna/archive/refs/heads/main.zip',
+                      'mxdx @ https://github.com/wasade/mxdx/archive/'
+                      'refs/heads/main.zip'],
     classifiers=classifiers
 )
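For reference, the two `python -c` one-liners that woltka_to_array now embeds in the merge script (the `extra_commands` string above) unroll to the following; the `length.map` path is database-specific and illustrative here:

from glob import glob
from qp_woltka.util import merge_ranges, coverage_percentage

# run from the job's output directory, as the merge script does
coverages = glob('coverages/*.cov')
with open('artifact.cov', 'w') as out:
    out.write('\n'.join(merge_ranges(coverages)))
with open('coverage_percentage.txt', 'w') as out:
    out.write('\n'.join(
        coverage_percentage(['artifact.cov'], 'databases/wol/length.map')))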