Skip to content

Commit

Permalink
Merge pull request #730 from metagenome-atlas/gtdb9-atlas2
Browse files Browse the repository at this point in the history
feat: GTDB v9 Refseq 220
  • Loading branch information
SilasK authored Jun 30, 2024
2 parents adc5a59 + 3b9aa77 commit 22d78b9
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 138 deletions.
2 changes: 1 addition & 1 deletion atlas/atlas.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ def run_download(db_dir, jobs, snakemake_args):
"""

cmd = (
"snakemake --snakefile {snakefile} "
"snakemake --snakefile {snakefile} download "
"--jobs {jobs} --rerun-incomplete "
"--conda-frontend mamba --scheduler greedy "
"--nolock --use-conda --conda-prefix {conda_prefix} "
Expand Down
2 changes: 1 addition & 1 deletion workflow/envs/gtdbtk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ channels:
- bioconda
- defaults
dependencies:
- gtdbtk =2.3
- gtdbtk =2.4
209 changes: 73 additions & 136 deletions workflow/rules/download.smk
Original file line number Diff line number Diff line change
@@ -1,25 +1,76 @@
import hashlib
import os

from pathlib import Path

# These values are included in the snakefile
DBDIR = os.path.realpath(config["database_dir"])
CHECKMDIR = os.path.join(DBDIR, "checkm")
CHECKM_ARCHIVE = "checkm_data_v1.0.9.tar.gz"
CAT_DIR = os.path.join(DBDIR, "CAT")
CAT_flag_downloaded = os.path.join(CAT_DIR, "downloaded")
GUNCDIR = os.path.join(DBDIR, "gunc_database")
BUSCODIR = os.path.join(DBDIR, "busco_lineages")
DBDIR = Path(config["database_dir"]).resolve()

GUNCDIR = DBDIR / "gunc_database"
BUSCODIR = DBDIR / "busco_lineages"

ZENODO_ARCHIVE = "1134890"
EGGNOG_VERSION = "5"
EGGNOG_DIR = os.path.join(DBDIR, "EggNOG_V" + EGGNOG_VERSION)
EGGNOG_DIR = DBDIR / ("EggNOG_V" + EGGNOG_VERSION)

CONDAENV = "../envs"

GTDB_VERSION = "V08_R214"
GTDB_DATA_URL = "https://data.gtdb.ecogenomic.org/releases/release214/214.0/auxillary_files/gtdbtk_r214_data.tar.gz"
GTDBTK_DATA_PATH = os.path.join(DBDIR, "GTDB_" + GTDB_VERSION)

## GTDBTk

# Label used to build the name of the local GTDB-Tk data directory (see GTDBTK_DATA_PATH).
# NOTE(review): the label reads "R200" while GTDB_DATA_URL points at release220 —
# confirm whether "V09_R220" was intended. Changing it would change the data directory name.
GTDB_VERSION = "V09_R200"
# Base URL of the GTDB-Tk package; download_partial_gtdb appends "/split_package/<file name>" to it.
GTDB_DATA_URL = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package"
# Directory below DBDIR that receives the GTDB-Tk data, named "GTDB_" + GTDB_VERSION.
GTDBTK_DATA_PATH = DBDIR / ("GTDB_" + GTDB_VERSION)


def all_partial_gtdb_tarbals(
    wildcards,
    GTDB_REFSEQ_VERSION=220,
    GTDB_PATIAL_SUFFIXES=["a" + i for i in "abcdefghijk"],
):
    """Input function listing every part of the split GTDB-Tk data archive.

    Args:
        wildcards: Passed in by Snakemake when the function is used as a rule
            input; it is not read here.
        GTDB_REFSEQ_VERSION: Value substituted for ``{gtdb_refseq_version}``
            in the part file name.
        GTDB_PATIAL_SUFFIXES: Part suffixes substituted for ``{suffix}``;
            the default is the eleven suffixes ``aa`` through ``ak``.

    Returns:
        The result of ``expand`` over the part-file pattern located in
        ``GTDBTK_DATA_PATH``, one entry per suffix in the order given.
    """
    # Keep the suffix order: extract_gtdb concatenates its inputs with `cat`,
    # so the parts have to be listed in sequence.
    part_pattern = (
        GTDBTK_DATA_PATH / "gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}"
    )
    part_files = expand(
        part_pattern,
        gtdb_refseq_version=GTDB_REFSEQ_VERSION,
        suffix=GTDB_PATIAL_SUFFIXES,
    )
    return part_files


# Declare the GTDB download and extraction rules as Snakemake local rules, i.e.
# they are executed on the host running Snakemake instead of being submitted as
# cluster/cloud jobs.
localrules:
download_partial_gtdb,
extract_gtdb,


# Download one part of the split GTDB-Tk data archive with wget.
# {gtdb_refseq_version} and {suffix} are wildcards filled in from the requested
# output file name.
rule download_partial_gtdb:
# The part is flagged temp(), so Snakemake may delete it once no pending job needs it.
output:
temp(
GTDBTK_DATA_PATH
/ "gtdbtk_r{gtdb_refseq_version}_data.tar.gz.part_{suffix}"
),
threads: 1
# The remote file name is the base name of the local output file, fetched from
# "<GTDB_DATA_URL>/split_package/".
params:
url=lambda wc, output: f"{GTDB_DATA_URL}/split_package/{ Path(output[0]).name}",
# 60 * config["runtime"]["long"]; the fallback value 10 applies only when the
# config has no "runtime" entry at all.
# NOTE(review): presumably "long" is in hours and time_min in minutes — confirm.
resources:
time_min=60 * int(config.get("runtime", {"long": 10})["long"]),
log:
"logs/download/gtdbtk_r{gtdb_refseq_version}_part_{suffix}.log",
# stdout and stderr of wget both go to the log file.
# NOTE(review): --no-check-certificate turns off TLS certificate verification —
# confirm it is still required for this server.
shell:
" wget --no-check-certificate {params.url} -O {output} &> {log} "

# Reassemble the split GTDB-Tk archive and unpack it into GTDBTK_DATA_PATH.
rule extract_gtdb:
# Input function returning all archive parts, in suffix order.
input:
all_partial_gtdb_tarbals,
# Flag file created via Snakemake's touch() once the shell command has succeeded;
# this is the path the `download` target rule asks for.
output:
touch(os.path.join(GTDBTK_DATA_PATH, "downloaded_success")),
threads: 1
# 60 * config["runtime"]["long"]; the fallback value 10 applies only when the
# config has no "runtime" entry at all.
resources:
time_min=60 * int(config.get("runtime", {"long": 10})["long"]),
log:
"logs/download/gtdbtk_untar.log",
# The parts are concatenated with cat and streamed into tar on stdin ("-f -"),
# so the joined archive is never written to disk. "--strip 1" drops the leading
# path component of each archive member. Only stderr is redirected to the log;
# the verbose file listing from "-v" goes to stdout.
shell:
'( cat {input} | tar -xzvf - -C "{GTDBTK_DATA_PATH}" --strip 1 ) 2> {log} '


### end GTDBTk


def md5(fname):
Expand All @@ -37,52 +88,10 @@ def md5(fname):
FILES = {
"adapters.fa": "ae839dc79cfb855a1b750a0d593fe01e",
"phiX174_virus.fa": "82516880142e8c89b466bc6118696c47",
"refseq.db": "42b8976656f2cfd661b8a299d6e24c19",
"refseq.dmnd": "c01facc7e397270ccb796ea799a09108",
"refseq.tree": "469fcbeb15dd0d4bf8f1677682bde157",
"silva_rfam_all_rRNAs.fa": "f102e35d9f48eabeb0efe9058559bc66",
"eggnog.db": "7923d3bb7eca8e0e8f122be4b5ca6997",
"eggnog_proteins.dmnd": "64fefa838833a6f3e220a06fb9d403cd",
CHECKM_ARCHIVE: "631012fa598c43fdeb88c619ad282c4d",
}


CHECKMFILES = [
"%s/taxon_marker_sets.tsv" % CHECKMDIR,
"%s/selected_marker_sets.tsv" % CHECKMDIR,
"%s/pfam/tigrfam2pfam.tsv" % CHECKMDIR,
"%s/pfam/Pfam-A.hmm.dat" % CHECKMDIR,
"%s/img/img_metadata.tsv" % CHECKMDIR,
"%s/hmms_ssu/SSU_euk.hmm" % CHECKMDIR,
"%s/hmms_ssu/SSU_bacteria.hmm" % CHECKMDIR,
"%s/hmms_ssu/SSU_archaea.hmm" % CHECKMDIR,
"%s/hmms_ssu/createHMMs.py" % CHECKMDIR,
"%s/hmms/phylo.hmm.ssi" % CHECKMDIR,
"%s/hmms/phylo.hmm" % CHECKMDIR,
"%s/hmms/checkm.hmm.ssi" % CHECKMDIR,
"%s/hmms/checkm.hmm" % CHECKMDIR,
"%s/genome_tree/missing_duplicate_genes_97.tsv" % CHECKMDIR,
"%s/genome_tree/missing_duplicate_genes_50.tsv" % CHECKMDIR,
"%s/genome_tree/genome_tree.taxonomy.tsv" % CHECKMDIR,
"%s/genome_tree/genome_tree_reduced.refpkg/phylo_modelJqWx6_.json" % CHECKMDIR,
"%s/genome_tree/genome_tree_reduced.refpkg/genome_tree.tre" % CHECKMDIR,
"%s/genome_tree/genome_tree_reduced.refpkg/genome_tree.log" % CHECKMDIR,
"%s/genome_tree/genome_tree_reduced.refpkg/genome_tree.fasta" % CHECKMDIR,
"%s/genome_tree/genome_tree_reduced.refpkg/CONTENTS.json" % CHECKMDIR,
"%s/genome_tree/genome_tree.metadata.tsv" % CHECKMDIR,
"%s/genome_tree/genome_tree_full.refpkg/phylo_modelEcOyPk.json" % CHECKMDIR,
"%s/genome_tree/genome_tree_full.refpkg/genome_tree.tre" % CHECKMDIR,
"%s/genome_tree/genome_tree_full.refpkg/genome_tree.log" % CHECKMDIR,
"%s/genome_tree/genome_tree_full.refpkg/genome_tree.fasta" % CHECKMDIR,
"%s/genome_tree/genome_tree_full.refpkg/CONTENTS.json" % CHECKMDIR,
"%s/genome_tree/genome_tree.derep.txt" % CHECKMDIR,
"%s/.dmanifest" % CHECKMDIR,
"%s/distributions/td_dist.txt" % CHECKMDIR,
"%s/distributions/gc_dist.txt" % CHECKMDIR,
"%s/distributions/cd_dist.txt" % CHECKMDIR,
]


def get_eggnog_db_file():
return ancient(
expand(
Expand All @@ -97,7 +106,6 @@ localrules:
download,
download_eggNOG_files,
download_atlas_files,
download_checkm_data,
download_gunc,


Expand All @@ -111,7 +119,7 @@ rule download:
),
get_eggnog_db_file(),
f"{DBDIR}/CheckM2",
os.path.join(GTDBTK_DATA_PATH, "downloaded_success"),
GTDBTK_DATA_PATH / "downloaded_success",


rule download_eggNOG_files:
Expand Down Expand Up @@ -139,75 +147,6 @@ rule download_atlas_files:
raise OSError(2, "Invalid checksum", output[0])


rule download_checkm_data:
output:
tar=temp(CHECKM_ARCHIVE),
files=CHECKMFILES,
params:
path=CHECKMDIR,
run:
shell(
"wget -O {output.tar} 'https://zenodo.org/record/{ZENODO_ARCHIVE}/files/{CHECKM_ARCHIVE}' "
)
if not FILES[CHECKM_ARCHIVE] == md5(output.tar):
raise OSError(2, "Invalid checksum", CHECKM_ARCHIVE)

shell("tar -zxf {output.tar} --directory {params.path}")


localrules:
initialize_checkm,


rule initialize_checkm:
input:
ancient(CHECKMFILES),
output:
touched_output=touch("logs/checkm_init.txt"),
params:
database_dir=CHECKMDIR,
conda:
"%s/checkm.yaml" % CONDAENV
log:
"logs/initialize_checkm.log",
shell:
"checkm data setRoot {params.database_dir} &> {log} "


localrules:
download_gtdb,


rule download_gtdb:
output:
temp(f"{GTDBTK_DATA_PATH}/gtdb_data.tar.gz"),
conda:
"../envs/gtdbtk.yaml"
threads: 1
resources:
time_min=60 * int(config.get("runtime", {"long": 10})["long"]),
log:
"logs/download/gtdbtk.log",
shell:
" wget --no-check-certificate {GTDB_DATA_URL} -O {output} &> {log} "


rule extract_gtdb:
input:
rules.download_gtdb.output,
output:
touch(os.path.join(GTDBTK_DATA_PATH, "downloaded_success")),
conda:
"../envs/gtdbtk.yaml"
threads: 1
resources:
time_min=60 * int(config.get("runtime", {"long": 10})["long"]),
log:
"logs/download/gtdbtk_untar.log",
shell:
'tar -xzvf {input} -C "{GTDBTK_DATA_PATH}" --strip 1 2> {log}; '


rule checkm2_download_db:
output:
directory(f"{DBDIR}/CheckM2"),
Expand Down Expand Up @@ -261,14 +200,12 @@ onsuccess:

onerror:
print("An error occurred while downloading reference databases.")
print(
"ATLAS databases can be manually downloaded from: https://zenodo.org/record/%s"
% ZENODO_ARCHIVE
)
print(
"eggNOG databases can be manually downloaded from: http://eggnogdb.embl.de/download/emapperdb-%s"
% EGGNOG_VERSION
)
print(
"CAT databases can be manually downloaded from: https://github.com/dutilh/CAT"
)
# print(
# "ATLAS databases can be manually downloaded from: https://zenodo.org/record/%s"
# % ZENODO_ARCHIVE
# )
# print(
# "eggNOG databases can be manually downloaded from: http://eggnogdb.embl.de/download/emapperdb-%s"
# % EGGNOG_VERSION
# )

0 comments on commit 22d78b9

Please sign in to comment.