Review notes. Text ahead of the first "diff --git" header is skipped by
`git apply` and `patch`, so this preamble does not affect the patch itself.

Layout: the patch had been flattened onto a few long lines; it is restored
here to one patch line per row. Blank context lines and leading indentation
were rebuilt from the hunk line counts and are UNVERIFIED. Regenerate the
patch from the repository, or apply it with whitespace-tolerant options,
before relying on the context lines.

Fixes, made only to added ("+") lines. Hunk line counts are unchanged; the
blob ids on the "index" lines are therefore stale.
  * interpro-97.0.eb: version '89.0' -> '97.0' so it agrees with the file name.
  * uniprot-2023_05.eb: version '2023_04' -> '2023_05' so it agrees with the
    file name.
  * interpro_extract.sh, templates/extract.sh: `tar ... -C dirname $f` passed
    the literal word "dirname" as the target directory; now
    -C "$(dirname "$f")", with "$f" quoted.
  * gtdb_delete.sh, templates/extract.sh: `exit $?` directly after `echo`
    returned echo's status (0) on failure; now `exit 1`.
  * gtdb_delete.sh: `find ... -exec rm -f {} \;` -> `-exec rm -f -- {} +` so a
    failing rm is reflected in find's exit status, which the next line tests.
  * templates/globus.sh: ${PBIOTRANSFER_UUID} is never assigned; the variable
    defined above it is BIOTRANSFER_UUID.

NOTE(review): gtdb-207.eb sets GTDBTK_DATA_PATH to
  auxillary_files/release214_v2 -- confirm that directory exists under the
  207 mirror (it names release 214 inside the 207 easyconfig).
NOTE(review): gtdb_extract.sh tests $? after `find ... -execdir tar ... \;`;
  with the `\;` form find's exit status may not reflect a failing tar --
  confirm and, if so, loop over the archives instead.

diff --git a/easyconfigs/c/checkm2-db/checkm2-db-20230511.eb b/easyconfigs/c/checkm2-db/checkm2-db-20230511.eb
new file mode 100644
index 0000000..eb6b783
--- /dev/null
+++ b/easyconfigs/c/checkm2-db/checkm2-db-20230511.eb
@@ -0,0 +1,31 @@
+easyblock = 'Bundle'
+
+name = 'checkm2-db'
+version = '20230511'
+
+homepage = 'https://github.com/chklovski/CheckM2'
+description = """CheckM2 Database"""
+
+toolchain = SYSTEM
+
+#source_urls = []
+#sources = []
+
+dependencies = []
+
+sanity_check_paths = {
+    'files': ['CheckM2_database/uniref100.KO.1.dmnd'],
+    'dirs': [],
+}
+
+modextrapaths = {
+    'BIODB':'',
+    'CHECKM2DB': 'CheckM2_database/uniref100.KO.1.dmnd'
+
+}
+
+keeppreviousinstall = True
+moduleclass = 'data'
+
+modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"
+
diff --git a/easyconfigs/e/ena/ena-20230511.eb b/easyconfigs/e/ena/ena-20230511.eb
new file mode 100644
index 0000000..f3c0727
--- /dev/null
+++ b/easyconfigs/e/ena/ena-20230511.eb
@@ -0,0 +1,28 @@
+easyblock = 'Bundle'
+
+name = 'ena'
+version = '20230511'
+
+homepage = 'https://www.ebi.ac.uk/ena'
+description = """The European Nucleotide Archive (ENA) captures and presents information relating to experimental workflows that are based around nucleotide sequencing. """
+
+toolchain = SYSTEM
+
+source_urls = ['']
+sources = []
+
+dependencies = []
+
+sanity_check_paths = {
+    'files': [],
+    'dirs': ['sequence','wgs'],
+}
+
+modextrapaths = {'BIODB':''
+}
+
+keeppreviousinstall = True
+moduleclass = 'data'
+
+modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"
+
diff --git a/easyconfigs/e/ena/ena-20230925.eb b/easyconfigs/e/ena/ena-20230925.eb
new file mode 100644
index 0000000..3764718
--- /dev/null
+++ b/easyconfigs/e/ena/ena-20230925.eb
@@ -0,0 +1,28 @@
+easyblock = 'Bundle'
+
+name = 'ena'
+version = '20230925'
+
+homepage = 'https://www.ebi.ac.uk/ena'
+description = """The European Nucleotide Archive (ENA) captures and presents information relating to experimental workflows that are based around nucleotide sequencing. """
+
+toolchain = SYSTEM
+
+source_urls = ['']
+sources = []
+
+dependencies = []
+
+sanity_check_paths = {
+    'files': [],
+    'dirs': ['sequence','wgs'],
+}
+
+modextrapaths = {'BIODB':''
+}
+
+keeppreviousinstall = True
+moduleclass = 'data'
+
+modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"
+
diff --git a/easyconfigs/e/ena/ena_download.sh b/easyconfigs/e/ena/ena_download.sh
index 1f45c11..c832644 100755
--- a/easyconfigs/e/ena/ena_download.sh
+++ b/easyconfigs/e/ena/ena_download.sh
@@ -1,5 +1,5 @@
 # ----------------Load Modules--------------------
-module load globus-cli/3.10.1-IGB-gcc-8.2.0-Python-3.7.2
+module load globus-cli/3.18.0-IGB-gcc-8.2.0-Python-3.7.2
 
 # ----------------Commands------------------------
 
@@ -13,12 +13,12 @@ VERSION=$1
 MIRROR_DIR=/private_stores/mirror/ena/${VERSION}
 
 echo "Downloading Files: `date "+%Y-%m-%d %k:%M:%S"`"
-mkdir -p ${MIRROR_DIR}
-mkdir ${MIRROR_DIR}/wgs
-mkdir ${MIRROR_DIR}/sequence
+#mkdir -p ${MIRROR_DIR}
+#mkdir ${MIRROR_DIR}/wgs
+#mkdir ${MIRROR_DIR}/sequence
 
-globus transfer --preserve-timestamp --skip-source-errors -r "fd9c190c-b824-11e9-98d7-0a63aa6b37da:/gridftp/pub/databases/ena/wgs/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${MIRROR_DIR}/wgs/"
-globus transfer --preserve-timestamp --skip-source-errors -r "fd9c190c-b824-11e9-98d7-0a63aa6b37da:/gridftp/pub/databases/ena/sequence/update/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${MIRROR_DIR}/sequence/"
+globus transfer --preserve-timestamp --skip-source-errors --delete -r "47772002-3e5b-4fd3-b97c-18cee38d6df2:/pub/databases/ena/wgs/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${MIRROR_DIR}/wgs/"
+globus transfer --preserve-timestamp --skip-source-errors --delete -r "47772002-3e5b-4fd3-b97c-18cee38d6df2:/pub/databases/ena/sequence/snapshot_latest/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${MIRROR_DIR}/sequence/"
 
 
 
diff --git a/easyconfigs/f/foldseek-db/foldseek-db-20230921.eb b/easyconfigs/f/foldseek-db/foldseek-db-20230921.eb
new file mode 100644
index 0000000..bd6db3d
--- /dev/null
+++ b/easyconfigs/f/foldseek-db/foldseek-db-20230921.eb
@@ -0,0 +1,29 @@
+easyblock = 'Bundle'
+
+name = 'foldseek-db'
+version = '20230921'
+
+homepage = 'https://github.com/steineggerlab/foldseek'
+description = """Foldseek computes for each match a simple estimate for the probability that the match is a true positive match given its structural bit score. Here, hits within the same superfamily are TP, hits to another fold are FP, and hits to the same family or to another superfamily are ignored."""
+
+toolchain = SYSTEM
+
+source_urls = ['']
+sources = []
+
+dependencies = []
+
+sanity_check_paths = {
+    'files': ['afdb/afdb','pdb/pdb'],
+    'dirs': ['afdb','pdb'],
+}
+
+modextrapaths = {
+    'BIODB':''
+}
+
+keeppreviousinstall = True
+moduleclass = 'data'
+
+modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"
+
diff --git a/easyconfigs/g/gtdb/gtdb-207.eb b/easyconfigs/g/gtdb/gtdb-207.eb
index 0bea71f..06dd155 100644
--- a/easyconfigs/g/gtdb/gtdb-207.eb
+++ b/easyconfigs/g/gtdb/gtdb-207.eb
@@ -19,7 +19,8 @@ sanity_check_paths = {
 }
 
 modextrapaths = {
-    'BIODB':''
+    'BIODB':'',
+    'GTDBTK_DATA_PATH': 'auxillary_files/release214_v2'
 }
 
 keeppreviousinstall = True
diff --git a/easyconfigs/g/gtdb/gtdb-214.eb b/easyconfigs/g/gtdb/gtdb-214.eb
new file mode 100644
index 0000000..a98b7e1
--- /dev/null
+++ b/easyconfigs/g/gtdb/gtdb-214.eb
@@ -0,0 +1,30 @@
+easyblock = 'Bundle'
+
+name = 'gtdb'
+version = '214'
+
+homepage = 'https://gtdb.ecogenomic.org/'
+description = """GENOME TAXONOMY DATABASE"""
+
+toolchain = SYSTEM
+
+source_urls = ['']
+sources = []
+
+dependencies = []
+
+sanity_check_paths = {
+    'files': [],
+    'dirs': ['auxillary_files','genomic_files_all','genomic_files_reps'],
+}
+
+modextrapaths = {
+    'BIODB':'',
+    'GTDBTK_DATA_PATH': 'auxillary_files/release214'
+}
+
+keeppreviousinstall = True
+moduleclass = 'data'
+
+modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"
+
diff --git a/easyconfigs/g/gtdb/gtdb_delete.sh b/easyconfigs/g/gtdb/gtdb_delete.sh
new file mode 100755
index 0000000..c445bf7
--- /dev/null
+++ b/easyconfigs/g/gtdb/gtdb_delete.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# ----------------SLURM Parameters----------------
+#SBATCH -p admin
+#SBATCH -n 4
+#SBATCH --mem=20g
+#SBATCH -N 1
+#SBATCH --mail-user=datamover@igb.illinois.edu
+#SBATCH --mail-type=ALL
+#SBATCH -J gtdb_delete
+#SBATCH -D /home/a-m/datamover/jobs
+#SBATCH -o %x-%j.out
+# ----------------Load Modules--------------------
+module load pigz/2.4-IGB-gcc-8.2.0
+# ----------------Commands------------------------
+#
+# Replace DATABASE with name of database you are downloading
+# Replace WEBSITE with remote location of database#
+#
+
+DATABASE="gtdb"
+
+if [ -z "$1" ];
+then
+    echo "Please specify ${DATABASE} version number";
+    exit 1;
+fi
+
+VERSION=$1
+MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}
+
+
+echo "`date "+%Y-%m-%d %k:%M:%S"` Deleting tar.gz Files"
+
+find "${MIRROR_DIR}" -type f -name '*.tar.gz' -exec rm -f -- {} +
+if [ $? -ne 0 ]
+then
+    echo "`date "+%Y-%m-%d %k:%M:%S"` Delete files Failed"
+    exit 1
+else
+    echo "`date "+%Y-%m-%d %k:%M:%S"` Delete Files Complete"
+fi
+
+
diff --git a/easyconfigs/g/gtdb/gtdb_extract.sh b/easyconfigs/g/gtdb/gtdb_extract.sh
index a0aa449..b849611 100755
--- a/easyconfigs/g/gtdb/gtdb_extract.sh
+++ b/easyconfigs/g/gtdb/gtdb_extract.sh
@@ -31,7 +31,7 @@ MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}
 
 echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting Files"
 
-find ${MIRROR_DIR} -type f -name '*.tar' -exec tar -xf {} \;
+find ${MIRROR_DIR} -type f -name '*.tar.gz' -execdir tar -I pigz -xvf {} \;
 if [ $? -ne 0 ]
 then
     echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting files Failed"
diff --git a/easyconfigs/i/interpro/interpro-97.0.eb b/easyconfigs/i/interpro/interpro-97.0.eb
new file mode 100644
index 0000000..a67f73a
--- /dev/null
+++ b/easyconfigs/i/interpro/interpro-97.0.eb
@@ -0,0 +1,28 @@
+easyblock = 'Bundle'
+
+name = 'interpro'
+version = '97.0'
+
+homepage = 'https://www.ebi.ac.uk/interpro/'
+description = """InterPro provides functional analysis of proteins by classifying them into families and predicting domains and important sites."""
+
+toolchain = SYSTEM
+
+source_urls = ['']
+sources = []
+
+dependencies = []
+
+sanity_check_paths = {
+    'files': ['interpro.xml'],
+    'dirs': [],
+}
+
+modextrapaths = {'BIODB':''
+}
+
+keeppreviousinstall = True
+moduleclass = 'data'
+
+modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"
+
diff --git a/easyconfigs/i/interpro/interpro_download.sh b/easyconfigs/i/interpro/interpro_download.sh
index 060af96..f913836 100755
--- a/easyconfigs/i/interpro/interpro_download.sh
+++ b/easyconfigs/i/interpro/interpro_download.sh
@@ -1,20 +1,5 @@
-#!/bin/bash
-# ----------------SLURM Parameters----------------
-#SBATCH -p admin
-#SBATCH -n 1
-#SBATCH --mem=20g
-#SBATCH -N 1
-#SBATCH --mail-user=datamover@igb.illinois.edu
-#SBATCH --mail-type=ALL
-#SBATCH -J interpro_download
-#SBATCH -D /home/a-m/datamover/jobs
-#SBATCH -o %x-%j.out
 # ----------------Load Modules--------------------
-# ----------------Commands------------------------
-#
-# Replace DATABASE with name of database you are downloading
-# Replace WEBSITE with remote location of database
-#
+module load globus-cli/3.18.0-IGB-gcc-8.2.0-Python-3.7.2
 
 DATABASE="interpro"
 
@@ -29,14 +14,6 @@ MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}
 
 echo "`date "+%Y-%m-%d %k:%M:%S"` Downloading Files"
 mkdir -p ${MIRROR_DIR}
-rsync -av rsync://ftp.ebi.ac.uk/pub/databases/interpro/${VERSION}/ ${MIRROR_DIR}/
-if [ $? -ne 0 ]
-then
-    echo "`date "+%Y-%m-%d %k:%M:%S"` Downloading Files Failed"
-    exit $?
-else
-    echo "`date "+%Y-%m-%d %k:%M:%S"` Downloading Files Complete"
-fi
-
+globus transfer -r --exclude "reference_proteomes" --exclude "taxonomic_divisions" "47772002-3e5b-4fd3-b97c-18cee38d6df2:/pub/databases/interpro/releases/$VERSION/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${MIRROR_DIR}/"
 
 
diff --git a/easyconfigs/i/interpro/interpro_extract.sh b/easyconfigs/i/interpro/interpro_extract.sh
index e68cc44..5476c88 100755
--- a/easyconfigs/i/interpro/interpro_extract.sh
+++ b/easyconfigs/i/interpro/interpro_extract.sh
@@ -6,7 +6,7 @@
 #SBATCH -N 1
 #SBATCH --mail-user=datamover@igb.illinois.edu
 #SBATCH --mail-type=ALL
-#SBATCH -J DATABASE_extract
+#SBATCH -J interpro_extract
 #SBATCH -D /home/a-m/datamover/jobs
 #SBATCH -o %x-%j.out
 # ----------------Load Modules--------------------
@@ -17,7 +17,7 @@ module load pigz/2.4-IGB-gcc-8.2.0
 # Replace WEBSITE with remote location of database#
 #
 
-DATABSE="interpro"
+DATABASE="interpro"
 
 if [ -z "$1" ];
 then
@@ -30,8 +30,18 @@ MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}
 
 
 echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting Files"
-
-pigz -p ${SLURM_NTASKS} -dr ${MIRROR_DIR}
+for f in $(find ${MIRROR_DIR} -name '*.tar.gz');
+do
+    tar -xvzf "$f" -C "$(dirname "$f")"
+    if [ $? -ne 0 ]; then
+        echo "`date "+%Y-%m-%d %k:%M:%S"` Error extracting file: $f"
+        exit 1
+    else
+        echo "`date "+%Y-%m-%d %k:%M:%S"` Done extracting file: $f"
+    fi
+done
+
+pigz -p $SLURM_NTASKS -dr $MIRROR_DIR
 if [ $? -ne 0 ]
 then
     echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting files Failed"
diff --git a/easyconfigs/r/R/R_mirror_update.pl b/easyconfigs/r/R/R_mirror_update.pl
index 6054110..d0f4213 100755
--- a/easyconfigs/r/R/R_mirror_update.pl
+++ b/easyconfigs/r/R/R_mirror_update.pl
@@ -7,12 +7,12 @@
 my $cran_dir="/private_stores/mirror/R/cran";
 my $cran_exclude="--exclude='bin'";
 
-my @bioc_versions = ('3.16');
+my @bioc_versions = ('3.18');
 
 foreach my $version (@bioc_versions) {
 
     #Rsync the files
-    my $command ="rsync -e 'ssh -i ~/.ssh/id_rsa' -zrtlv --delete $bioc_exclude bioc-rsync\@master.bioconductor.org:$version/ $bioc_dir/packages/$version/";
+    my $command ="rsync -e 'ssh -i ~/.ssh/id_rsa' -zrtlv $bioc_exclude bioc-rsync\@master.bioconductor.org:$version/ $bioc_dir/packages/$version/";
     print $command . "\n";
     system($command);
 
diff --git a/easyconfigs/templates/extract.sh b/easyconfigs/templates/extract.sh
index 1a963f5..45b9901 100755
--- a/easyconfigs/templates/extract.sh
+++ b/easyconfigs/templates/extract.sh
@@ -29,8 +29,19 @@ VERSION=$1
 MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}
 
 
-echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting Files"
-
+echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting tar.gz Files"
+for f in $(find ${MIRROR_DIR} -name '*.tar.gz');
+do
+    tar -xvzf "$f" -C "$(dirname "$f")"
+    if [ $? -ne 0 ]; then
+        echo "`date "+%Y-%m-%d %k:%M:%S"` Error extracting file: $f"
+        exit 1
+    else
+        echo "`date "+%Y-%m-%d %k:%M:%S"` Done extracting file: $f"
+    fi
+done
+
+echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting .gz Files with Pigz"
 pigz -p ${SLURM_NTASKS} -dr ${MIRROR_DIR}
 if [ $? -ne 0 ]
 then
diff --git a/easyconfigs/templates/globus.sh b/easyconfigs/templates/globus.sh
new file mode 100755
index 0000000..ab6c37c
--- /dev/null
+++ b/easyconfigs/templates/globus.sh
@@ -0,0 +1,24 @@
+# ----------------Load Modules--------------------
+module load globus-cli/3.18.0-IGB-gcc-8.2.0-Python-3.7.2
+
+# ----------------Commands------------------------
+
+if [ -z "$1" ];
+then
+    echo "Please specify version number";
+    exit 1;
+fi
+
+VERSION=$1
+DATABASE=""
+MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}
+FASTA_DIR=${MIRROR_DIR}/db
+BIOTRANSFER_UUID="1ccc563b-0542-44e5-a13c-fc4b00281b72"
+SOURCE_UUID=""
+
+echo "Downloading Files: `date "+%Y-%m-%d %k:%M:%S"`"
+mkdir -p ${FASTA_DIR}
+globus transfer -r "${SOURCE_UUID}:/pub/databases/uniprot/current_release/" "${BIOTRANSFER_UUID}:${FASTA_DIR}/"
+
+
+
diff --git a/easyconfigs/u/uniprot/uniprot-2023_04.eb b/easyconfigs/u/uniprot/uniprot-2023_04.eb
new file mode 100644
index 0000000..9082308
--- /dev/null
+++ b/easyconfigs/u/uniprot/uniprot-2023_04.eb
@@ -0,0 +1,36 @@
+easyblock = 'Bundle'
+
+name = 'uniprot'
+version = '2023_04'
+
+
+homepage = 'https://www.uniprot.org/'
+description = """The mission of UniProt is to provide the scientific community with a comprehensive, high-quality and freely accessible resource of protein sequence and functional information. """
+
+toolchain = SYSTEM
+
+dependencies = []
+
+
+sanity_check_paths = {
+    'files': ['db/knowledgebase/complete/uniprot_sprot.fasta',
+        'db/knowledgebase/complete/uniprot_trembl.fasta',
+        'db/knowledgebase/idmapping/idmapping.dat',
+        'db/uniref/uniref100/uniref100.fasta',
+        'db/uniref/uniref90/uniref90.fasta',
+        'db/uniref/uniref50/uniref50.fasta'
+    ],
+    'dirs': ['db','blastdb_v4','blastdb_v5','diamond']
+}
+
+modextrapaths = {'BIODB':'db',
+    'BLASTDB': ['blastdb_v4','blastdb_v5'],
+    'DIAMONDDB': 'diamond',
+}
+
+keeppreviousinstall = True
+
+moduleclass = 'data'
+
+modloadmsg = "%(name)s/%(version)s database and indexes are located at %(installdir)s/\n"
+
diff --git a/easyconfigs/u/uniprot/uniprot-2023_05.eb b/easyconfigs/u/uniprot/uniprot-2023_05.eb
new file mode 100644
index 0000000..b0a0db3
--- /dev/null
+++ b/easyconfigs/u/uniprot/uniprot-2023_05.eb
@@ -0,0 +1,38 @@
+easyblock = 'Bundle'
+
+name = 'uniprot'
+version = '2023_05'
+
+
+homepage = 'https://www.uniprot.org/'
+description = """The mission of UniProt is to provide the scientific community with a comprehensive, high-quality and freely accessible resource of protein sequence and functional information. """
+
+toolchain = SYSTEM
+
+dependencies = []
+
+
+sanity_check_paths = {
+    'files': ['db/knowledgebase/complete/uniprot_sprot.fasta',
+        'db/knowledgebase/complete/uniprot_trembl.fasta',
+        'db/knowledgebase/idmapping/idmapping.dat',
+        'db/uniref/uniref100/uniref100.fasta',
+        'db/uniref/uniref90/uniref90.fasta',
+        'db/uniref/uniref50/uniref50.fasta'
+    ],
+    'dirs': ['db','blastdb_v4','blastdb_v5','diamond']
+}
+
+modextrapaths = {'BIODB':'db',
+    'BLASTDB': ['blastdb_v4','blastdb_v5'],
+    'DIAMONDDB': 'diamond',
+}
+
+keeppreviousinstall = True
+
+moduleclass = 'data'
+
+modloadmsg = "%(name)s/%(version)s database and indexes are located at %(installdir)s/\n"
+
+##Helpful documentation
+#https://embl.service-now.com/kb?id=kb_article_view&sysparm_article=KB0011060
diff --git a/easyconfigs/u/uniprot/uniprot_download.sh b/easyconfigs/u/uniprot/uniprot_download.sh
index e3698c8..d0d2e3b 100755
--- a/easyconfigs/u/uniprot/uniprot_download.sh
+++ b/easyconfigs/u/uniprot/uniprot_download.sh
@@ -1,5 +1,5 @@
 # ----------------Load Modules--------------------
-module load globus-cli/3.10.1-IGB-gcc-8.2.0-Python-3.7.2
+module load globus-cli/3.18.0-IGB-gcc-8.2.0-Python-3.7.2
 
 # ----------------Commands------------------------
 
@@ -15,7 +15,7 @@ FASTA_DIR=${MIRROR_DIR}/db
 
 echo "Downloading Files: `date "+%Y-%m-%d %k:%M:%S"`"
 mkdir -p ${FASTA_DIR}
-globus transfer -r --exclude 'knowledgebase/reference_proteomes' --exclude 'knowledgebase/taxonomic_divisions' 'fd9c190c-b824-11e9-98d7-0a63aa6b37da:/gridftp/pub/databases/uniprot/current_release/' '1ccc563b-0542-44e5-a13c-fc4b00281b72:/private_stores/mirror/uniprot/${FASTA_DIR}/
+globus transfer -r --exclude "reference_proteomes" --exclude "taxonomic_divisions" "47772002-3e5b-4fd3-b97c-18cee38d6df2:/pub/databases/uniprot/current_release/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${FASTA_DIR}/"
 
 
 
diff --git a/easyconfigs/u/uniprot/uniprot_extract.sh b/easyconfigs/u/uniprot/uniprot_extract.sh
index 57ec288..a060e07 100755
--- a/easyconfigs/u/uniprot/uniprot_extract.sh
+++ b/easyconfigs/u/uniprot/uniprot_extract.sh
@@ -29,6 +29,7 @@ VERSION=$1
 MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}
 FASTA_DIR=${MIRROR_DIR}/db
 
+echo "`date "+%Y-%m-%d %k:%M:%S"` Directory: ${FASTA_DIR}"
 echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting Files"
 
 pigz -p ${SLURM_NTASKS} -dr ${FASTA_DIR}