diff --git a/atlas/atlas.py b/atlas/atlas.py
index 3e7ee05c..330d1487 100644
--- a/atlas/atlas.py
+++ b/atlas/atlas.py
@@ -121,7 +121,7 @@ def get_snakefile(file="workflow/Snakefile"):
type=int,
default=multiprocessing.cpu_count(),
show_default=True,
- help="use at most this many jobs in parallel (see cluster submission for mor details).",
+ help="use at most this many jobs in parallel (see cluster submission for more details).",
)
@click.option(
"--max-mem",
diff --git a/atlas/init/create_sample_table.py b/atlas/init/create_sample_table.py
index 8c9ee2f2..734c6cab 100644
--- a/atlas/init/create_sample_table.py
+++ b/atlas/init/create_sample_table.py
@@ -158,7 +158,7 @@ def get_samples_from_fastq(path, fraction_split_character=split_character):
# parse subfolder
if len(subfolders) > 0:
logger.info(
- f"Found {len(subfolders)} subfolders. Check if I find fastq files inside. Use the the subfolder as sample_names "
+ f"Found {len(subfolders)} subfolders. Check if I find fastq files inside. Use the subfolder as sample_names "
)
for subf in subfolders:
diff --git a/atlas/init/parse_sra.py b/atlas/init/parse_sra.py
index 953c03b6..0c2c41ff 100644
--- a/atlas/init/parse_sra.py
+++ b/atlas/init/parse_sra.py
@@ -67,7 +67,7 @@ def filter_runinfo(RunTable, ignore_paired=False):
if Difference > 0:
logger.info(
- f"Runs have the folowing values for {key}: {', '.join(All_values)}\n"
+ f"Runs have the following values for {key}: {', '.join(All_values)}\n"
f"Select only runs {key} == {Expected_library_values[key]}, "
f"Filtered out {Difference} runs"
)
@@ -77,7 +77,7 @@ def filter_runinfo(RunTable, ignore_paired=False):
All_values = RunTable[key].unique()
if any(RunTable[key] != Expected_library_values[key]):
logger.warning(
- f"Runs have the folowing values for {key}: {', '.join(All_values)}\n"
+ f"Runs have the following values for {key}: {', '.join(All_values)}\n"
f"Usually I expect {key} == {Expected_library_values[key]} "
)
@@ -141,7 +141,7 @@ def validate_merging_runinfo(path):
logger.error(
f"You attemt to merge runs from the same sample. "
f"But for {len(problematic_samples)} samples the runs are sequenced with different platforms and should't be merged.\n"
- f"Please resolve the the abiguity in the table {path} and rerun the command.\n"
+ f"Please resolve the ambiguity in the table {path} and rerun the command.\n"
)
exit(1)
diff --git a/docs/reports/QC_report.html b/docs/reports/QC_report.html
index ea2e14a0..a95ed956 100644
--- a/docs/reports/QC_report.html
+++ b/docs/reports/QC_report.html
@@ -92,7 +92,7 @@
Number of reads troughout th quality control process
- Total number of reads/bases ater the QC
+ Total number of reads/bases after the QC
@@ -107,16 +107,16 @@
Total number of reads/bases ater the QC
- Quality values along the read
+ Quality values along the read
- Read length
+ Read length
- Insert size
+ Insert size
The size of the reads + the space between. Ideally the paired-end reads don't overlap.
diff --git a/docs/reports/assembly_report.html b/docs/reports/assembly_report.html
index 33cb12a7..b9a60466 100644
--- a/docs/reports/assembly_report.html
+++ b/docs/reports/assembly_report.html
@@ -66,7 +66,7 @@ Total assembly length
- Fragmentation
+ Fragmentation
N50/N90 is a measure of how fractionated assemblies are:
diff --git a/docs/reports/bin_report_DASTool.html b/docs/reports/bin_report_DASTool.html
index a45aa818..e74a54a6 100644
--- a/docs/reports/bin_report_DASTool.html
+++ b/docs/reports/bin_report_DASTool.html
@@ -65,7 +65,7 @@
Bin Report for Binner DASTool
Quality score is calculated as: Completeness - 5 x Contamination.
For all the information see the file Binning/DASTool/bin_info.tsv and Binning/DASTool/bins2species.tsv
- Number of genomes
+ Number of genomes
@@ -100,16 +100,16 @@ Number of genomes
"Good quality" refers to the standard of Completeness > 90% and Contamination < 5%. Also called high-quality or near-complete. But t-RNA/r-RNA presence is not evaluated. It is less stingent than Quality Score > 90.
- Quality for all bins
+ Quality for all bins
- Quality for Species representatives
+ Quality for Species representatives
- Quality score by Sample
+ Quality score by Sample
diff --git a/docs/reports/bin_report_SemiBin.html b/docs/reports/bin_report_SemiBin.html
index f742cd89..064959fa 100644
--- a/docs/reports/bin_report_SemiBin.html
+++ b/docs/reports/bin_report_SemiBin.html
@@ -65,7 +65,7 @@ Bin Report for Binner SemiBin
Quality score is calculated as: Completeness - 5 x Contamination.
For all the information see the file Binning/SemiBin/bin_info.tsv and Binning/SemiBin/bins2species.tsv
- Number of genomes
+ Number of genomes
@@ -100,16 +100,16 @@ Number of genomes
"Good quality" refers to the standard of Completeness > 90% and Contamination < 5%. Also called high-quality or near-complete. But t-RNA/r-RNA presence is not evaluated. It is less stingent than Quality Score > 90.
- Quality for all bins
+ Quality for all bins
- Quality for Species representatives
+ Quality for Species representatives
- Quality score by Sample
+ Quality score by Sample
diff --git a/docs/reports/bin_report_vamb.html b/docs/reports/bin_report_vamb.html
index 16c31e1c..6357e9cd 100644
--- a/docs/reports/bin_report_vamb.html
+++ b/docs/reports/bin_report_vamb.html
@@ -65,7 +65,7 @@ Bin Report for Binner vamb
Quality score is calculated as: Completeness - 5 x Contamination.
For all the information see the file Binning/vamb/bin_info.tsv and Binning/vamb/bins2species.tsv
- Number of genomes
+ Number of genomes
@@ -100,16 +100,16 @@ Number of genomes
"Good quality" refers to the standard of Completeness > 90% and Contamination < 5%. Also called high-quality or near-complete. But t-RNA/r-RNA presence is not evaluated. It is less stingent than Quality Score > 90.
- Quality for all bins
+ Quality for all bins
- Quality for Species representatives
+ Quality for Species representatives
- Quality score by Sample
+ Quality score by Sample
diff --git a/docs/usage/configuration.rst b/docs/usage/configuration.rst
index 836a3456..4864fc6a 100644
--- a/docs/usage/configuration.rst
+++ b/docs/usage/configuration.rst
@@ -11,10 +11,10 @@ _contaminants:
Remove reads from Host
======================
-One of the most important steps in the Quality control is to remove host genome.
+One of the most important steps in the Quality control is to remove reads from the host's genome.
You can add any number of genomes to be removed.
-We recommend you to use genomes where repetitive sequences are masked.
+We recommend using genomes where repetitive sequences are masked.
See here for more details `human genome `_.
@@ -36,7 +36,7 @@ There are two primary strategies for co-abundance binning:
The samples to be binned together are specified using the `BinGroup` in the `sample.tsv` file.
The size of the BinGroup should be selected based on the binner and the co-binning strategy in use.
-Cross mapping complexity scales quadratically with the size of the BinGroup since each sample's reads are mapped to each other.
+Cross-mapping complexity scales quadratically with the size of the BinGroup since each sample's reads are mapped to each other.
This might yield better results for complex metagenomes, although no definitive benchmark is known.
On the other hand, co-binning is more efficient, as it maps a sample's reads only once to a potentially large assembly.
@@ -88,12 +88,12 @@ Long reads
==========
Limitation: Hybrid assembly of long and short reads is supported with spades and metaSpades.
-However metaSpades needs a paired-end short-read library.
+However, metaSpades needs a paired-end short-read library.
The path of the (preprocessed) long reads should be added manually to the
-the sample table under a new column heading 'longreads'.
+sample table under a new column heading 'longreads'.
-In addition the type of the long reads should be defined in the config file:
+In addition, the type of the long reads should be defined in the config file:
``longread_type`` one of ["pacbio", "nanopore", "sanger", "trusted-contigs", "untrusted-contigs"]
@@ -101,8 +101,7 @@ Example config file
===================
-..
-include:: ../../workflow/../config/template_config.yaml
+.. include:: ../../config/template_config.yaml
:code:
diff --git a/docs/usage/getting_started.rst b/docs/usage/getting_started.rst
index d8fbd56f..3c53f42b 100644
--- a/docs/usage/getting_started.rst
+++ b/docs/usage/getting_started.rst
@@ -11,7 +11,7 @@ Conda package manager
---------------------
Atlas has **one dependency**: conda_. All databases and other dependencies are installed **on the fly**.
-Atlas is based on snakemake which allows to run steps of the workflow in parallel on a cluster.
+Atlas is based on snakemake, which allows you to run steps of the workflow in parallel on a cluster.
If you want to try atlas and have a linux computer (OSX may also work), you can use our `example data`_ for testing.
@@ -20,7 +20,7 @@ For real metagenomic data atlas should be run on a _linux_ sytem, with enough me
You need to install `anaconda `_ or miniconda.
-If you haven't done it already you need to configure conda with the bioconda-channel and the conda-forge channel. This are sources for packages beyond the default one.
+If you haven't done it already, you need to configure conda with the bioconda-channel and the conda-forge channel. These are sources for packages beyond the default one.
Setting strict channel priority can prevent quite some annoyances.
.. code-block:: bash
@@ -38,12 +38,12 @@ Conda can be a bit slow because there are so many packages. A good way around th
conda install mamba
-From now on you can replace ``conda install`` with ``mamba install`` and see how much faster this snake is.
+From now on, you can replace ``conda install`` with ``mamba install`` and see how much faster this snake is.
Install metagenome-atlas
------------------------
-We recommend you to install metagenome-atlas into a conda environment e.g. named ``atlasenv``
+We recommend installing metagenome-atlas into a conda environment e.g. named ``atlasenv``.
We also recommend to specify the latest version of metagenome-atlas.
.. code-block:: bash
@@ -62,7 +62,7 @@ where `{latest_version}` should be replaced by
Install metagenome-atlas from GitHub
------------------------------------
-Alternatively you can install metagenome Atlas directly form GitHub. This allows you to access versions that are not yet in the conda release, e.g. versions that are still in development.
+Alternatively, you can install metagenome Atlas directly from GitHub. This allows you to access versions that are not yet in the conda release, e.g. versions that are still in development.
.. code-block:: bash
@@ -76,7 +76,7 @@ Alternatively you can install metagenome Atlas directly form GitHub. This allows
mamba env create -n atlas-dev --file atlasenv.yml
conda activate atlas-dev
- # install atlas version. Changes in this files are directly available in the atlas dev version
+ # install atlas version. Changes in the files are directly available in the atlas dev version
pip install --editable .
cd ..
@@ -89,9 +89,9 @@ Alternatively you can install metagenome Atlas directly form GitHub. This allows
Example Data
============
-If you want to test atlas on a small example data here is a two sample, three genome minimal metagenome dataset,
+If you want to test atlas on a small example data, here is a two sample, three genome minimal metagenome dataset,
to test atlas. Even when atlas will run faster on the test data,
-it will anyway download all the databases and requirements, for the a complete run,
+it will anyway download all the databases and requirements, for a complete run,
which can take a certain amount of time and especially disk space (>100Gb).
The database dir of the test run should be the same as for the later atlas executions.
@@ -119,13 +119,13 @@ This command parses the folder for fastq files (extension ``.fastq(.gz)`` or ``.
The command creates a ``samples.tsv`` and a ``config.yaml`` in the working directory.
-Have a look at them with a normal text editor and check if the samples names are inferred correctly. The sample names are used for the naming of contigs, genes, and genomes. Therefore, the sample names should consist only form digits and letters and start with a letter (Even though one ``-`` is allowed). Atlas tries to simplify the file name to obtain unique sample names, if it doesn't succeed it simply puts S1, S2, ... as sample names.
+Have a look at them with a normal text editor and check if the sample names are inferred correctly. The sample names are used for the naming of contigs, genes, and genomes. Therefore, the sample names should consist only of digits and letters and start with a letter (Even though one ``-`` is allowed). Atlas tries to simplify the file name to obtain unique sample names, if it doesn't succeed it simply puts S1, S2, ... as sample names.
See the :download:`example sample table <../reports/samples.tsv>`
The ``BinGroup`` parameter is used during the genomic binning.
-In short: If you have between 5 and 150 samples the default (puting everithing in one group) is fine.
+In short: If you have between 5 and 150 samples the default (putting everything in one group) is fine.
If you have less than 5 samples, put every sample in an individual BinGroup and use `metabat` as final binner.
If you have more samples see the :ref:`cobinning` section for more details.
@@ -180,11 +180,11 @@ Since v2.9 atlas has possibility to start a new project from public data stored
You can run ``atlas init-public `` and specify any ids, like bioprojects, or other SRA ids.
-Atlas does the folowing steps:
+Atlas does the following steps:
- 1. Search SRA for the corresponding sequences (Runs) and save them in the file ``SRA/RunInfo_original.tsv``. For example if you specify a Bioproject, it fetches the information for all runs of this project.
+ 1. Search SRA for the corresponding sequences (Runs) and save them in the file ``SRA/RunInfo_original.tsv``. For example, if you specify a Bioproject, it fetches the information for all runs of this project.
2. Atlas filters the runs to contain only valid metagenome sequences. E.g. exclude singleton reads, 16S. The output will be saved in ``RunInfo.tsv``
- 3. Sometimes the same Sample is sequenced on different laines, which will result into multipe runs from the same sample. Atlas will **merge** runs from the same biosample.
+ 3. Sometimes the same Sample is sequenced on different lanes, which will result in multiple runs from the same sample. Atlas will **merge** runs from the same biosample.
4. Prepare a sample table and a config.yaml similar to the ``atlas init`` command.
@@ -196,10 +196,10 @@ Limitations: For now atlas, cannot handle a mixture of paired and single end rea
If you have longreads for your project, you would need to specify them yourself in the sample.tsv.
During the run, the reads are downloaded from SRA in the likely most efficient way using prefetch and parallel, fastq.gz generation.
-The download step has checkpoints, so if the pipline gets interupted, you can restart where you left off.
-Using the comand line arguments ``--restart-times 3 and --keep-going`` You can even ask atlas to do multiple restarts befor stoping.
+The download step has checkpoints, so if the pipeline gets interrupted, you can restart where you left off.
+Using the command line arguments ``--restart-times 3 and --keep-going`` You can even ask atlas to do multiple restarts before stopping.
-The downloaded reads, are directly processed. If you however want only to doenload the reads you can use.::
+The downloaded reads are directly processed. However, if you only want to download the reads you can use::
atlas run None download_sra
@@ -247,7 +247,7 @@ We recommend to use atlas on a :ref:`cluster` system, which can be set up in a v
Usage: atlas run [OPTIONS] [qc|assembly|binning|genomes|genecatalog|None|all]
[SNAKEMAKE_ARGS]...
- Runs the ATLAS pipline
+ Runs the ATLAS pipeline
By default all steps are executed but a sub-workflow can be specified.
Needs a config-file and expects to find a sample table in the working-
@@ -262,7 +262,7 @@ We recommend to use atlas on a :ref:`cluster` system, which can be set up in a v
-w, --working-dir PATH location to run atlas.
-c, --config-file PATH config-file generated with 'atlas init'
-j, --jobs INTEGER use at most this many jobs in parallel (see cluster
- submission for mor details).
+ submission for more details).
--profile TEXT snakemake profile e.g. for cluster execution.
-n, --dryrun Test execution. [default: False]
@@ -282,7 +282,7 @@ Automatic submitting to cluster systems
---------------------------------------
Thanks to the underlying snakemake Atlas can submit parts of the pipeline automatically to a cluster system and define the appropriate resources. If one job has finished it launches the next one.
-This allows you use the full capacity of your cluster system. You even need to pay attention not to spam the other users of the cluster.
+This allows you to use the full capacity of your cluster system. You even need to pay attention not to spam the other users of the cluster.
@@ -303,7 +303,7 @@ Then run::
cookiecutter --output-dir ~/.config/snakemake https://github.com/metagenome-atlas/clusterprofile.git
-This opens a interactive shell dialog and ask you for the name of the profile and your cluster system.
+This opens an interactive shell dialog and asks you for the name of the profile and your cluster system.
We recommend you keep the default name ``cluster``. The profile was tested on ``slurm``, ``lsf`` and ``pbs``.
The resources (threads, memory and time) are defined in the atlas config file (hours and GB).
@@ -352,11 +352,11 @@ The atlas argument ``--jobs`` now becomes the number of jobs simultaneously subm
Single machine execution
========================
-If you dont want to use the :ref:`automatic scheduling ` you can use atlas on a single machine (local execution) with a lot of memory and threads ideally. In this case I recommend you the following options. The same applies if you submit a single job to a cluster running atlas.
+If you don't want to use the :ref:`automatic scheduling ` you can use atlas on a single machine (local execution) with a lot of memory and threads ideally. In this case I recommend you the following options. The same applies if you submit a single job to a cluster running atlas.
-Atlas detects how many CPUs and how much memory is available on your system and it will schedule as many jobs in paralell as possible. If you have less resources available than specified in the config file, the jobs are downscaled.
+Atlas detects how many CPUs and how much memory is available on your system and it will schedule as many jobs in parallel as possible. If you have less resources available than specified in the config file, the jobs are downscaled.
-By default atlas will use all cpus and 95% of all the available memory. If you are not happy with that, or you need to specify an exact ammount of memory/ cpus you can use the comand line arguments ``--jobs`` and ``--max-mem`` to do so.
+By default atlas will use all cpus and 95% of all the available memory. If you are not happy with that, or you need to specify an exact amount of memory/ cpus you can use the command line arguments ``--jobs`` and ``--max-mem`` to do so.
Cloud execution
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 6ec4ed5c..2c12c552 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -224,19 +224,13 @@ onerror:
for r in workflow.rules:
if not "mem_mb" in r.resources:
- if "mem" in r.resources:
- r.resources["mem_mb"] = r.resources["mem"] * 1000
- else:
- # default
- r.resources["mem_mb"] = config["mem"] * 1000
+ # default
+ r.resources["mem_mb"] = config["mem"] * 1000
# add time if ot present. Simple jobs use simple time
if "time_min" not in r.resources:
- if "time" in r.resources:
- r.resources["time_min"] = r.resources["time"] * 60
- else:
- r.resources["time_min"] = config["runtime"]["default"] * 60
+ r.resources["time_min"] = config["runtime"]["default"] * 60
if not "runtime" in r.resources:
r.resources["runtime"] = r.resources["time_min"]
diff --git a/workflow/envs/spades.yaml b/workflow/envs/spades.yaml
index c660afab..be85bd0e 100644
--- a/workflow/envs/spades.yaml
+++ b/workflow/envs/spades.yaml
@@ -3,4 +3,4 @@ channels:
- bioconda
- defaults
dependencies:
- - spades>=3.15.3
+ - spades>=4.0
diff --git a/workflow/report/template_QC_report.html b/workflow/report/template_QC_report.html
index 97a9ebf0..2bd2bb2b 100644
--- a/workflow/report/template_QC_report.html
+++ b/workflow/report/template_QC_report.html
@@ -19,7 +19,7 @@
Quality Control Report
- Number of reads troughout th quality control process
+ Number of reads that went through the quality control process.
{div[Reads]}
@@ -50,7 +50,7 @@ Number of reads troughout th quality control process
- Total number of reads/bases ater the QC
+ Total number of reads/bases after QC
@@ -65,17 +65,17 @@
Total number of reads/bases ater the QC
- Quality values along the read
+ Base quality values along reads
{div[quality_QC]}
- Read length
+ Read length
{div[Length]}
- Insert size
- The size of the reads + the space between. Ideally the paired-end reads don't overlap.
+ Insert size
+ The size of the reads + the space between. Ideally, the paired-end reads don't overlap.
{div[Insert]}
diff --git a/workflow/report/template_assembly_report.html b/workflow/report/template_assembly_report.html
index 215a960d..d3960bab 100644
--- a/workflow/report/template_assembly_report.html
+++ b/workflow/report/template_assembly_report.html
@@ -24,11 +24,11 @@ Total assembly length
{div[Total]}
- Fragmentation
+ Fragmentation
N50/N90 is a measure of how fractionated assemblies are:
- 50%/ 90% of the assembly is made up of contigs of Length N50/N90-length or longer.
+ 50%/90% of the assembly consists of contigs of length N50/N90 or longer.
You need N50/N90-number contigs to get 50%/90% of the total assembly length.
diff --git a/workflow/report/template_bin_report.html b/workflow/report/template_bin_report.html
index 800222ab..eed781d3 100644
--- a/workflow/report/template_bin_report.html
+++ b/workflow/report/template_bin_report.html
@@ -23,21 +23,21 @@ Bin Report for Binner {binner}
{div[QualityScore]}
For all the information see the file {div[input_file]}
- Number of genomes
+ Number of genomes
{div[table]}
"Good quality" refers to the standard of Completeness > 90% and Contamination < 5%. Also called high-quality or near-complete. But t-RNA/r-RNA presence is not evaluated. It is less stingent than Quality Score > 90.
- Quality for all bins
+ Quality for all bins
{div[2D]}
- Quality for Species representatives
+ Quality for Species representatives
{div[2Dsp]}
- Quality score by Sample
+ Quality score by Sample
@@ -49,4 +49,4 @@ Quality score by Sample