Skip to content

Commit

Permalink
Added many databases
Browse files Browse the repository at this point in the history
Added globus template
  • Loading branch information
Data Mover committed Nov 17, 2023
1 parent a80344f commit 83e8ba0
Show file tree
Hide file tree
Showing 19 changed files with 358 additions and 43 deletions.
31 changes: 31 additions & 0 deletions easyconfigs/c/checkm2-db/checkm2-db-20230511.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# EasyBuild easyconfig that publishes the CheckM2 reference database as a module.
# NOTE(review): no sources are declared, so the data is presumably staged into
# the install directory outside of EasyBuild -- confirm.
easyblock = 'Bundle'

name = 'checkm2-db'
version = '20230511'

homepage = 'https://github.com/chklovski/CheckM2'
description = "CheckM2 Database"

toolchain = SYSTEM

# Nothing is downloaded by this easyconfig.
#source_urls = []
#sources = []

dependencies = []

# The install is only considered valid when this database file exists.
sanity_check_paths = {
    'files': ['CheckM2_database/uniref100.KO.1.dmnd'],
    'dirs': [],
}

# Values are relative to the install directory; '' is the install directory itself.
modextrapaths = {
    'BIODB': '',
    'CHECKM2DB': 'CheckM2_database/uniref100.KO.1.dmnd',
}

# Keep an already existing install directory in place when building.
keeppreviousinstall = True
moduleclass = 'data'

# Printed when the module is loaded.
modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"

28 changes: 28 additions & 0 deletions easyconfigs/e/ena/ena-20230511.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# EasyBuild easyconfig that publishes the 20230511 ENA mirror as a module.
easyblock = 'Bundle'

name = 'ena'
version = '20230511'

homepage = 'https://www.ebi.ac.uk/ena'
description = """The European Nucleotide Archive (ENA) captures and presents information relating to experimental workflows that are based around nucleotide sequencing. """

toolchain = SYSTEM

# No sources are fetched by this easyconfig.
source_urls = ['']
sources = []

dependencies = []

# Both mirror sub-directories must exist for the install to pass the sanity check.
sanity_check_paths = {
    'files': [],
    'dirs': ['sequence', 'wgs'],
}

# '' is the install directory itself.
modextrapaths = {
    'BIODB': '',
}

# Keep an already existing install directory in place when building.
keeppreviousinstall = True
moduleclass = 'data'

# Printed when the module is loaded.
modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"

28 changes: 28 additions & 0 deletions easyconfigs/e/ena/ena-20230925.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# EasyBuild easyconfig that publishes the 20230925 ENA mirror as a module.
easyblock = 'Bundle'

name = 'ena'
version = '20230925'

homepage = 'https://www.ebi.ac.uk/ena'
description = """The European Nucleotide Archive (ENA) captures and presents information relating to experimental workflows that are based around nucleotide sequencing. """

toolchain = SYSTEM

# No sources are fetched by this easyconfig.
source_urls = ['']
sources = []

dependencies = []

# Both mirror sub-directories must exist for the install to pass the sanity check.
sanity_check_paths = {
    'files': [],
    'dirs': ['sequence', 'wgs'],
}

# '' is the install directory itself.
modextrapaths = {
    'BIODB': '',
}

# Keep an already existing install directory in place when building.
keeppreviousinstall = True
moduleclass = 'data'

# Printed when the module is loaded.
modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"

12 changes: 6 additions & 6 deletions easyconfigs/e/ena/ena_download.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# ----------------Load Modules--------------------
module load globus-cli/3.10.1-IGB-gcc-8.2.0-Python-3.7.2
module load globus-cli/3.18.0-IGB-gcc-8.2.0-Python-3.7.2

# ----------------Commands------------------------

Expand All @@ -13,12 +13,12 @@ VERSION=$1
MIRROR_DIR=/private_stores/mirror/ena/${VERSION}

echo "Downloading Files: `date "+%Y-%m-%d %k:%M:%S"`"
mkdir -p ${MIRROR_DIR}
mkdir ${MIRROR_DIR}/wgs
mkdir ${MIRROR_DIR}/sequence
#mkdir -p ${MIRROR_DIR}
#mkdir ${MIRROR_DIR}/wgs
#mkdir ${MIRROR_DIR}/sequence

globus transfer --preserve-timestamp --skip-source-errors -r "fd9c190c-b824-11e9-98d7-0a63aa6b37da:/gridftp/pub/databases/ena/wgs/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${MIRROR_DIR}/wgs/"
globus transfer --preserve-timestamp --skip-source-errors -r "fd9c190c-b824-11e9-98d7-0a63aa6b37da:/gridftp/pub/databases/ena/sequence/update/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${MIRROR_DIR}/sequence/"
globus transfer --preserve-timestamp --skip-source-errors --delete -r "47772002-3e5b-4fd3-b97c-18cee38d6df2:/pub/databases/ena/wgs/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${MIRROR_DIR}/wgs/"
globus transfer --preserve-timestamp --skip-source-errors --delete -r "47772002-3e5b-4fd3-b97c-18cee38d6df2:/pub/databases/ena/sequence/snapshot_latest/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${MIRROR_DIR}/sequence/"



Expand Down
29 changes: 29 additions & 0 deletions easyconfigs/f/foldseek-db/foldseek-db-20230921.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# EasyBuild easyconfig that publishes the Foldseek databases (afdb, pdb) as a module.
easyblock = 'Bundle'

name = 'foldseek-db'
version = '20230921'

homepage = 'https://github.com/steineggerlab/foldseek'
description = """Foldseek computes for each match a simple estimate for the probability that the match is a true positive match given its structural bit score. Here, hits within the same superfamily are TP, hits to another fold are FP, and hits to the same family or to another superfamily are ignored."""

toolchain = SYSTEM

# No sources are fetched by this easyconfig.
source_urls = ['']
sources = []

dependencies = []

# Each database directory and its main file must exist.
sanity_check_paths = {
    'files': ['afdb/afdb', 'pdb/pdb'],
    'dirs': ['afdb', 'pdb'],
}

# '' is the install directory itself.
modextrapaths = {
    'BIODB': '',
}

# Keep an already existing install directory in place when building.
keeppreviousinstall = True
moduleclass = 'data'

# Printed when the module is loaded.
modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"

3 changes: 2 additions & 1 deletion easyconfigs/g/gtdb/gtdb-207.eb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ sanity_check_paths = {
}

modextrapaths = {
'BIODB':''
'BIODB':'',
'GTDBTK_DATA_PATH': 'auxillary_files/release214_v2'
}

keeppreviousinstall = True
Expand Down
30 changes: 30 additions & 0 deletions easyconfigs/g/gtdb/gtdb-214.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# EasyBuild easyconfig that publishes GTDB release 214 as a module.
easyblock = 'Bundle'

name = 'gtdb'
version = '214'

homepage = 'https://gtdb.ecogenomic.org/'
description = "GENOME TAXONOMY DATABASE"

toolchain = SYSTEM

# No sources are fetched by this easyconfig.
source_urls = ['']
sources = []

dependencies = []

# All three top-level data directories must exist.
sanity_check_paths = {
    'files': [],
    'dirs': ['auxillary_files', 'genomic_files_all', 'genomic_files_reps'],
}

# Values are relative to the install directory; '' is the install directory itself.
modextrapaths = {
    'BIODB': '',
    'GTDBTK_DATA_PATH': 'auxillary_files/release214',
}

# Keep an already existing install directory in place when building.
keeppreviousinstall = True
moduleclass = 'data'

# Printed when the module is loaded.
modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"

43 changes: 43 additions & 0 deletions easyconfigs/g/gtdb/gtdb_delete.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
# ----------------SLURM Parameters----------------
#SBATCH -p admin
#SBATCH -n 4
#SBATCH --mem=20g
#SBATCH -N 1
#SBATCH --mail-user=datamover@igb.illinois.edu
#SBATCH --mail-type=ALL
#SBATCH -J gtdb_delete
#SBATCH -D /home/a-m/datamover/jobs
#SBATCH -o %x-%j.out
# ----------------Load Modules--------------------
module load pigz/2.4-IGB-gcc-8.2.0
# ----------------Commands------------------------
#
# Deletes every *.tar.gz file below the mirror directory of one gtdb release.
# Usage: sbatch gtdb_delete.sh VERSION
# Exits 1 when VERSION is missing, or with find's exit status when deleting fails.
#

DATABASE="gtdb"

if [ -z "$1" ];
then
	echo "Please specify ${DATABASE} version number";
	exit 1;
fi

VERSION=$1
MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}


echo "`date "+%Y-%m-%d %k:%M:%S"` Deleting tar.gz Files"

# Use '-exec ... +' rather than '\;': only the '+' form makes find return a
# non-zero status when an rm invocation fails.
find "${MIRROR_DIR}" -type f -name '*.tar.gz' -exec rm -f {} +
# Save the status right away; the test and echo below would overwrite $?.
STATUS=$?
if [ ${STATUS} -ne 0 ]
then
	echo "`date "+%Y-%m-%d %k:%M:%S"` Delete files Failed"
	exit ${STATUS}
else
	echo "`date "+%Y-%m-%d %k:%M:%S"` Delete Files Complete"
fi


2 changes: 1 addition & 1 deletion easyconfigs/g/gtdb/gtdb_extract.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}

echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting Files"

find ${MIRROR_DIR} -type f -name '*.tar' -exec tar -xf {} \;
find ${MIRROR_DIR} -type f -name '*.tar.gz' -execdir tar -I pigz -xvf {} \;
if [ $? -ne 0 ]
then
echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting files Failed"
Expand Down
28 changes: 28 additions & 0 deletions easyconfigs/i/interpro/interpro-97.0.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# EasyBuild easyconfig that publishes InterPro release 97.0 as a module.
easyblock = 'Bundle'

name = 'interpro'
# Must match the release in this file's name (interpro-97.0.eb); it determines
# the module version and the install directory.
version = '97.0'

homepage = 'https://www.ebi.ac.uk/interpro/'
description = """InterPro provides functional analysis of proteins by classifying them into families and predicting domains and important sites."""

toolchain = SYSTEM

# No sources are fetched by this easyconfig.
source_urls = ['']
sources = []

dependencies = []

# The install is only considered valid when interpro.xml exists.
sanity_check_paths = {
    'files': ['interpro.xml'],
    'dirs': [],
}

# '' is the install directory itself.
modextrapaths = {
    'BIODB': '',
}

# Keep an already existing install directory in place when building.
keeppreviousinstall = True
moduleclass = 'data'

# Printed when the module is loaded.
modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"

27 changes: 2 additions & 25 deletions easyconfigs/i/interpro/interpro_download.sh
Original file line number Diff line number Diff line change
@@ -1,20 +1,5 @@
#!/bin/bash
# ----------------SLURM Parameters----------------
#SBATCH -p admin
#SBATCH -n 1
#SBATCH --mem=20g
#SBATCH -N 1
#SBATCH --mail-user=datamover@igb.illinois.edu
#SBATCH --mail-type=ALL
#SBATCH -J interpro_download
#SBATCH -D /home/a-m/datamover/jobs
#SBATCH -o %x-%j.out
# ----------------Load Modules--------------------
# ----------------Commands------------------------
#
# Replace DATABASE with name of database you are downloading
# Replace WEBSITE with remote location of database
#
module load globus-cli/3.18.0-IGB-gcc-8.2.0-Python-3.7.2

DATABASE="interpro"

Expand All @@ -29,14 +14,6 @@ MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}

echo "`date "+%Y-%m-%d %k:%M:%S"` Downloading Files"
mkdir -p ${MIRROR_DIR}
rsync -av rsync://ftp.ebi.ac.uk/pub/databases/interpro/${VERSION}/ ${MIRROR_DIR}/
if [ $? -ne 0 ]
then
echo "`date "+%Y-%m-%d %k:%M:%S"` Downloading Files Failed"
exit $?
else
echo "`date "+%Y-%m-%d %k:%M:%S"` Downloading Files Complete"
fi

globus transfer -r --exclude "reference_proteomes" --exclude "taxonomic_divisions" "47772002-3e5b-4fd3-b97c-18cee38d6df2:/pub/databases/interpro/releases/$VERSION/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${MIRROR_DIR}/"


18 changes: 14 additions & 4 deletions easyconfigs/i/interpro/interpro_extract.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#SBATCH -N 1
#SBATCH --mail-user=datamover@igb.illinois.edu
#SBATCH --mail-type=ALL
#SBATCH -J DATABASE_extract
#SBATCH -J interpro_extract
#SBATCH -D /home/a-m/datamover/jobs
#SBATCH -o %x-%j.out
# ----------------Load Modules--------------------
Expand All @@ -17,7 +17,7 @@ module load pigz/2.4-IGB-gcc-8.2.0
# Replace WEBSITE with remote location of database#
#

DATABSE="interpro"
DATABASE="interpro"

if [ -z "$1" ];
then
Expand All @@ -30,8 +30,18 @@ MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}


echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting Files"

pigz -p ${SLURM_NTASKS} -dr ${MIRROR_DIR}
# Unpack every *.tar.gz archive into the directory that contains it.
# NOTE: $(find ...) is word-split, so paths containing whitespace are not supported.
for f in $(find "${MIRROR_DIR}" -name '*.tar.gz');
do
	# -C takes the archive's parent directory, computed with dirname(1).
	tar -xvzf "$f" -C "$(dirname "$f")"
	if [ $? -ne 0 ]; then
		echo "`date "+%Y-%m-%d %k:%M:%S"` Error extracting file: $f"
		exit 1
	else
		echo "`date "+%Y-%m-%d %k:%M:%S"` Done extracting file: $f"
	fi
done

pigz -p $SLURM_NTASKS -dr $MIRROR_DIR
if [ $? -ne 0 ]
then
echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting files Failed"
Expand Down
4 changes: 2 additions & 2 deletions easyconfigs/r/R/R_mirror_update.pl
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
my $cran_dir="/private_stores/mirror/R/cran";
my $cran_exclude="--exclude='bin'";

my @bioc_versions = ('3.16');
my @bioc_versions = ('3.18');

foreach my $version (@bioc_versions) {

#Rsync the files
my $command ="rsync -e 'ssh -i ~/.ssh/id_rsa' -zrtlv --delete $bioc_exclude bioc-rsync\@master.bioconductor.org:$version/ $bioc_dir/packages/$version/";
my $command ="rsync -e 'ssh -i ~/.ssh/id_rsa' -zrtlv $bioc_exclude bioc-rsync\@master.bioconductor.org:$version/ $bioc_dir/packages/$version/";
print $command . "\n";
system($command);

Expand Down
15 changes: 13 additions & 2 deletions easyconfigs/templates/extract.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,19 @@ VERSION=$1
MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}


echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting Files"

echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting tar.gz Files"
# Unpack every *.tar.gz archive into the directory that contains it.
# NOTE: $(find ...) is word-split, so paths containing whitespace are not supported.
for f in $(find "${MIRROR_DIR}" -name '*.tar.gz');
do
	# -C takes the archive's parent directory, computed with dirname(1).
	tar -xvzf "$f" -C "$(dirname "$f")"
	# Save tar's status right away; the test and echo below would overwrite $?.
	STATUS=$?
	if [ ${STATUS} -ne 0 ]; then
		echo "`date "+%Y-%m-%d %k:%M:%S"` Error extracting file: $f"
		exit ${STATUS}
	else
		echo "`date "+%Y-%m-%d %k:%M:%S"` Done extracting file: $f"
	fi
done

echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting .gz Files with Pigz"
pigz -p ${SLURM_NTASKS} -dr ${MIRROR_DIR}
if [ $? -ne 0 ]
then
Expand Down
Loading

0 comments on commit 83e8ba0

Please sign in to comment.