Skip to content

Commit

Permalink
Updated scripts to use globus commands
Browse files Browse the repository at this point in the history
Updated extraction scripts
Updated pfam
Updated interpro
Updated uniprot
Updated ena
  • Loading branch information
Data Mover committed Sep 23, 2024
1 parent b56160d commit ebb8ae1
Show file tree
Hide file tree
Showing 10 changed files with 221 additions and 39 deletions.
28 changes: 28 additions & 0 deletions easyconfigs/e/ena/ena-20240909.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# EasyBuild easyconfig for the ENA data mirror, release 20240909.
# Nothing is fetched or compiled here: the recipe has no sources and no
# dependencies, and only describes the module for an already populated tree.
easyblock = 'Bundle'

name = 'ena'
version = '20240909'

homepage = 'https://www.ebi.ac.uk/ena'
description = """The European Nucleotide Archive (ENA) captures and presents information relating to experimental workflows that are based around nucleotide sequencing. """

toolchain = SYSTEM

# No sources and no dependencies are declared for this data-only module.
source_urls = ['']
sources = []
dependencies = []

# Sanity check: both data subdirectories are listed as required directories.
sanity_check_paths = {
    'files': [],
    'dirs': ['sequence', 'wgs'],
}

# BIODB is given an empty relative path.
# NOTE(review): presumably this resolves to the installation root - confirm.
modextrapaths = {
    'BIODB': '',
}

keeppreviousinstall = True
moduleclass = 'data'

# Message printed when the module is loaded.
modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"

4 changes: 2 additions & 2 deletions easyconfigs/e/ena/ena_download.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ echo "Downloading Files: `date "+%Y-%m-%d %k:%M:%S"`"
#mkdir ${MIRROR_DIR}/wgs
#mkdir ${MIRROR_DIR}/sequence

globus transfer --preserve-timestamp --skip-source-errors --delete -r "47772002-3e5b-4fd3-b97c-18cee38d6df2:/pub/databases/ena/wgs/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${MIRROR_DIR}/wgs/"
globus transfer --preserve-timestamp --skip-source-errors --delete -r "47772002-3e5b-4fd3-b97c-18cee38d6df2:/pub/databases/ena/sequence/snapshot_latest/" "1ccc563b-0542-44e5-a13c-fc4b00281b72:${MIRROR_DIR}/sequence/"
globus transfer --preserve-timestamp --skip-source-errors --delete -r "47772002-3e5b-4fd3-b97c-18cee38d6df2:/pub/databases/ena/wgs/" "4a467fda-f559-4fc3-b54a-e2842f439e06:${MIRROR_DIR}/wgs/"
globus transfer --preserve-timestamp --skip-source-errors --delete -r "47772002-3e5b-4fd3-b97c-18cee38d6df2:/pub/databases/ena/sequence/snapshot_latest/" "4a467fda-f559-4fc3-b54a-e2842f439e06:${MIRROR_DIR}/sequence/"



Expand Down
48 changes: 48 additions & 0 deletions easyconfigs/e/ena/ena_extract.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/bin/bash
# ----------------SLURM Parameters----------------
#SBATCH -p admin
#SBATCH -n 1
#SBATCH --mem=20g
#SBATCH -N 1
#SBATCH --mail-user=datamover@igb.illinois.edu
#SBATCH --mail-type=ALL
#SBATCH -J ena_extract
#SBATCH -D /home/a-m/datamover/jobs
#SBATCH -o %x-%j.out
# ----------------Load Modules--------------------
# NOTE(review): pigz is loaded here, but the extraction loop in this script
# calls gunzip - confirm whether this module is still required.
module load pigz/2.4-IGB-gcc-8.2.0
# ----------------Commands------------------------
#
# Usage: ena_extract.sh <version>
#
# Decompresses the *.gz files of one mirrored ENA release and then resets
# directory/file permissions. <version> selects the release directory
# /private_stores/mirror/ena/<version>.
#

# Database name; used in the usage message and in the mirror path below.
DATABASE="ena"

# The release version is the only positional argument; stop when it is missing.
if [ -z "$1" ];
then
echo "Please specify ${DATABASE} version number";
exit 1;
fi

VERSION=$1
# Root of the release that the rest of the script operates on.
MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}


echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting Files"
#######################################
# Decompress every *.gz entry found below a directory.
# Arguments: $1 - directory to search
# Outputs:   one timestamped "Done"/"Error" line per file on stdout
# Returns:   exits the script with status 1 on the first gunzip failure
#######################################
extract_gz_files() {
  local search_dir=$1
  local -a gz_files=()
  local f

  # Collect the matches NUL-delimited so paths containing spaces, newlines or
  # glob characters stay intact. The list is snapshotted before any file is
  # touched, as the original $(find ...) expansion did.
  while IFS= read -r -d '' f; do
    gz_files+=("$f")
  done < <(find "$search_dir" -name '*.gz' -print0)

  for f in "${gz_files[@]}"; do
    if gunzip -- "$f"; then
      echo "$(date "+%Y-%m-%d %k:%M:%S") Done extracting file: $f"
    else
      echo "$(date "+%Y-%m-%d %k:%M:%S") Error extracting file: $f"
      exit 1
    fi
  done
}

extract_gz_files "${MIRROR_DIR}"

echo "`date "+%Y-%m-%d %k:%M:%S"` Fix Permissions Start"
#######################################
# Reset permissions below a directory: 775 for directories, 664 for files.
# Arguments: $1 - root directory to process
#######################################
fix_permissions() {
  local root=$1
  # The quoted root keeps paths with whitespace intact; "{} +" batches many
  # entries into each chmod call instead of starting one process per entry.
  find "$root" -type d -exec chmod 775 {} +
  find "$root" -type f -exec chmod 664 {} +
}

fix_permissions "${MIRROR_DIR}"
echo "`date "+%Y-%m-%d %k:%M:%S"` Fix Permissions Completed"

28 changes: 28 additions & 0 deletions easyconfigs/i/interpro/interpro-101.0.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# EasyBuild easyconfig for the InterPro data mirror, release 101.0.
# Nothing is fetched or compiled here: the recipe has no sources and no
# dependencies, and only describes the module for an already populated tree.
easyblock = 'Bundle'

name = 'interpro'
version = '101.0'

homepage = 'https://www.ebi.ac.uk/interpro/'
description = """InterPro provides functional analysis of proteins by classifying them into families and predicting domains and important sites."""

toolchain = SYSTEM

# No sources and no dependencies are declared for this data-only module.
source_urls = ['']
sources = []
dependencies = []

# Sanity check: interpro.xml is listed as a required file.
sanity_check_paths = {
    'files': ['interpro.xml'],
    'dirs': [],
}

# BIODB is given an empty relative path.
# NOTE(review): presumably this resolves to the installation root - confirm.
modextrapaths = {
    'BIODB': '',
}

keeppreviousinstall = True
moduleclass = 'data'

# Message printed when the module is loaded.
modloadmsg = "%(name)s/%(version)s database is located at %(installdir)s\n"

15 changes: 3 additions & 12 deletions easyconfigs/i/interpro/interpro_extract.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
# ----------------SLURM Parameters----------------
#SBATCH -p admin
#SBATCH -n 4
#SBATCH -n 1
#SBATCH --mem=20g
#SBATCH -N 1
#SBATCH --mail-user=datamover@igb.illinois.edu
Expand Down Expand Up @@ -30,9 +30,9 @@ MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}


echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting Files"
for f in $(find ${MIRROR_DIR} -name '*.tar.gz');
for f in $(find ${MIRROR_DIR} -name '*.gz');
do
tar -xvzf $f -C dirname $f
gunzip $f
if [ $? -ne 0 ]; then
echo "`date "+%Y-%m-%d %k:%M:%S"` Error extracting file: $f"
exit 1
Expand All @@ -41,15 +41,6 @@ do
fi
done

pigz -p $SLURM_NTASKS -dr $MIRROR_DIR
if [ $? -ne 0 ]
then
echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting files Failed"
exit $?
else
echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting Files Complete"
fi

echo "`date "+%Y-%m-%d %k:%M:%S"` Fix Permissions Start"
find ${MIRROR_DIR} -type d -exec chmod 775 {} \;
find ${MIRROR_DIR} -type f -exec chmod 664 {} \;
Expand Down
57 changes: 57 additions & 0 deletions easyconfigs/i/interpro/interpro_extract.sh.backup
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/bin/bash
# ----------------SLURM Parameters----------------
#SBATCH -p admin
#SBATCH -n 4
#SBATCH --mem=20g
#SBATCH -N 1
#SBATCH --mail-user=datamover@igb.illinois.edu
#SBATCH --mail-type=ALL
#SBATCH -J interpro_extract
#SBATCH -D /home/a-m/datamover/jobs
#SBATCH -o %x-%j.out
# ----------------Load Modules--------------------
# pigz is used further down for the recursive decompression step.
module load pigz/2.4-IGB-gcc-8.2.0
# ----------------Commands------------------------
#
# Usage: pass the InterPro release version as the only argument.
#
# Works on /private_stores/mirror/interpro/<version>: extracts the *.tar.gz
# archives, runs pigz -dr over the tree, then resets directory/file
# permissions.
#

# Database name; used in the usage message and in the mirror path below.
DATABASE="interpro"

# The release version is the only positional argument; stop when it is missing.
if [ -z "$1" ];
then
echo "Please specify ${DATABASE} version number";
exit 1;
fi

VERSION=$1
# Root of the release that the rest of the script operates on.
MIRROR_DIR=/private_stores/mirror/${DATABASE}/${VERSION}


echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting Files"
#######################################
# Unpack every *.tar.gz archive found below a directory, each one into the
# directory that contains it.
# Arguments: $1 - directory to search
# Outputs:   tar's verbose listing plus one timestamped status line per
#            archive on stdout
# Returns:   exits the script with status 1 on the first tar failure
#######################################
extract_tar_archives() {
  local search_dir=$1
  local -a archives=()
  local f

  # NUL-delimited snapshot of the matches, so paths with whitespace or glob
  # characters are handled as single entries.
  while IFS= read -r -d '' f; do
    archives+=("$f")
  done < <(find "$search_dir" -name '*.tar.gz' -print0)

  for f in "${archives[@]}"; do
    # -C needs the archive's parent directory. The previous form passed the
    # literal word "dirname" as the directory and the archive path as a
    # member name, so extraction could not succeed.
    if tar -xvzf "$f" -C "$(dirname -- "$f")"; then
      echo "$(date "+%Y-%m-%d %k:%M:%S") Done extracting file: $f"
    else
      echo "$(date "+%Y-%m-%d %k:%M:%S") Error extracting file: $f"
      exit 1
    fi
  done
}

extract_tar_archives "${MIRROR_DIR}"

# Recursively decompress whatever is left under MIRROR_DIR with pigz.
# SLURM_NTASKS supplies the thread count; default to 1 when it is not set so
# that -p always receives a value.
pigz -p "${SLURM_NTASKS:-1}" -dr "${MIRROR_DIR}"
pigz_status=$?
if [ "${pigz_status}" -ne 0 ]; then
  echo "$(date "+%Y-%m-%d %k:%M:%S") Extracting files Failed"
  # Exit with the saved pigz status. A bare $? at this point would hold the
  # status of the echo above (0) and the job would be reported as successful.
  exit "${pigz_status}"
else
  echo "$(date "+%Y-%m-%d %k:%M:%S") Extracting Files Complete"
fi

echo "`date "+%Y-%m-%d %k:%M:%S"` Fix Permissions Start"
#######################################
# Reset permissions below a directory: 775 for directories, 664 for files.
# Arguments: $1 - root directory to process
#######################################
fix_permissions() {
  local root=$1
  # The quoted root keeps paths with whitespace intact; "{} +" batches many
  # entries into each chmod call instead of starting one process per entry.
  find "$root" -type d -exec chmod 775 {} +
  find "$root" -type f -exec chmod 664 {} +
}

fix_permissions "${MIRROR_DIR}"
echo "`date "+%Y-%m-%d %k:%M:%S"` Fix Permissions Completed"

28 changes: 28 additions & 0 deletions easyconfigs/p/pfam/pfam-37.0.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# EasyBuild easyconfig for the Pfam data mirror, release 37.0.
# Nothing is fetched or compiled here: the recipe has no sources and no
# dependencies, and only describes the module for an already populated tree.
easyblock = 'Bundle'

name = 'pfam'
version = '37.0'

homepage = 'https://pfam.xfam.org/'
description = """The Pfam database is a large collection of protein families, each represented by multiple sequence alignments and hidden Markov models (HMMs)"""

toolchain = SYSTEM

# No sources and no dependencies are declared for this data-only module.
source_urls = ['']
sources = []
dependencies = []

# Sanity check: Pfam-A.fasta is listed as a required file.
sanity_check_paths = {
    'files': ['Pfam-A.fasta'],
    'dirs': [],
}

# BIODB is given an empty relative path.
# NOTE(review): presumably this resolves to the installation root - confirm.
modextrapaths = {
    'BIODB': '',
}

keeppreviousinstall = True
moduleclass = 'data'

# Message printed when the module is loaded.
modloadmsg = "%(name)s/%(version)s database and indexes are located at %(installdir)s\n"

35 changes: 25 additions & 10 deletions easyconfigs/p/pfam/pfam_extract.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
# ----------------SLURM Parameters----------------
#SBATCH -p admin
#SBATCH -n 4
#SBATCH -n 1
#SBATCH --mem=20g
#SBATCH -N 1
#SBATCH --mail-user=datamover@igb.illinois.edu
Expand All @@ -10,7 +10,7 @@
#SBATCH -D /home/a-m/datamover/jobs
#SBATCH -o %x-%j.out
# ----------------Load Modules--------------------
module load pigz/2.4-IGB-gcc-8.2.0

# ----------------Commands------------------------

if [ -z "$1" ];
Expand All @@ -25,14 +25,29 @@ MIRROR_DIR=/private_stores/mirror/pfam/${VERSION}

echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting Files"

pigz -p $SLURM_NTASKS -dr $MIRROR_DIR
if [ $? -ne 0 ]
then
echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting files Failed"
exit 1
else
echo "Extracting Files Complete: `date "+%Y-%m-%d %k:%M:%S"`"
fi
echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting .gz Files"
#######################################
# Decompress every *.gz entry found below a directory.
# Arguments: $1 - directory to search
# Outputs:   one timestamped "Done"/"Error" line per file on stdout
# Returns:   exits the script with status 1 on the first gunzip failure
#######################################
extract_gz_files() {
  local search_dir=$1
  local -a gz_files=()
  local f

  # NUL-delimited snapshot of the matches, taken before any file is touched,
  # so paths with whitespace or glob characters are handled as single entries.
  while IFS= read -r -d '' f; do
    gz_files+=("$f")
  done < <(find "$search_dir" -name '*.gz' -print0)

  for f in "${gz_files[@]}"; do
    if gunzip -- "$f"; then
      echo "$(date "+%Y-%m-%d %k:%M:%S") Done extracting file: $f"
    else
      echo "$(date "+%Y-%m-%d %k:%M:%S") Error extracting file: $f"
      # Must be an explicit non-zero status: "exit $?" after the echo above
      # would exit 0 and the failed job would be reported as successful.
      exit 1
    fi
  done
}

extract_gz_files "${MIRROR_DIR}"

echo "`date "+%Y-%m-%d %k:%M:%S"` Extracting .tgz Files"
#######################################
# Unpack every *.tgz archive found below a directory, each one into the
# directory that contains it.
# Arguments: $1 - directory to search
# Outputs:   tar's verbose listing plus one timestamped status line per
#            archive on stdout
# Returns:   exits the script with status 1 on the first tar failure
#######################################
extract_tgz_files() {
  local search_dir=$1
  local -a archives=()
  local f

  # NUL-delimited snapshot of the matches, so paths with whitespace or glob
  # characters are handled as single entries.
  while IFS= read -r -d '' f; do
    archives+=("$f")
  done < <(find "$search_dir" -name '*.tgz' -print0)

  for f in "${archives[@]}"; do
    if tar -xvzf "$f" -C "$(dirname -- "$f")"; then
      echo "$(date "+%Y-%m-%d %k:%M:%S") Done extracting file: $f"
    else
      echo "$(date "+%Y-%m-%d %k:%M:%S") Error extracting file: $f"
      # Must be an explicit non-zero status: "exit $?" after the echo above
      # would exit 0 and the failed job would be reported as successful.
      exit 1
    fi
  done
}

extract_tgz_files "${MIRROR_DIR}"

echo "`date "+%Y-%m-%d %k:%M:%S"` Fix Permissions Start"
find $MIRROR_DIR -type d -exec chmod 775 {} \;
Expand Down
4 changes: 2 additions & 2 deletions easyconfigs/u/uniprot/uniprot_2024_04.eb
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ sanity_check_paths = {
'db/uniref/uniref90/uniref90.fasta',
'db/uniref/uniref50/uniref50.fasta'
],
'dirs': ['db','blastdb_v4','blastdb_v5','diamond']
'dirs': ['db','blastdb_v5','diamond']
}

modextrapaths = {'BIODB':'db',
'BLASTDB': ['blastdb_v4','blastdb_v5'],
'BLASTDB': ['blastdb_v5'],
'DIAMONDDB': 'diamond',
}

Expand Down
13 changes: 0 additions & 13 deletions easyconfigs/u/uniprot/uniprot_indexes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ fi
VERSION=$1
MIRROR_DIR=/private_stores/mirror/uniprot
FASTA_DIR=$MIRROR_DIR/$VERSION/db
BLASTV4_DIR=$MIRROR_DIR/$VERSION/blastdb_v4
BLASTV5_DIR=$MIRROR_DIR/$VERSION/blastdb_v5
DIAMOND_DIR=$MIRROR_DIR/$VERSION/diamond
DIAMOND_OPTS="--quiet --threads $SLURM_NTASKS"
Expand All @@ -39,7 +38,6 @@ fi


echo "`date "+%Y-%m-%d %k:%M:%S"` Creating Directories"
mkdir -p $BLASTV4_DIR
mkdir -p $BLASTV5_DIR
mkdir -p $DIAMOND_DIR

Expand All @@ -50,17 +48,6 @@ for f in ${FASTA_FILES[@]}; do
FASTA_NAME=`basename $f`
DB_NAME=`basename $f .fasta`

#Make blast v4 indexes
echo "`date "+%Y-%m-%d %k:%M:%S"` Creating Blast v4 Index for File: $FULL_PATH"

makeblastdb -dbtype prot -title $DB_NAME -in $FULL_PATH -out $BLASTV4_DIR/$DB_NAME -blastdb_version 4
if [ $? -ne 0 ]; then
echo "`date "+%Y-%m-%d %k:%M:%S"` Error creating Blast v4 index for file: $FULL_PATH"
exit 1
else
echo "`date "+%Y-%m-%d %k:%M:%S"` Done Creating Blast v4 Index for File: $FULL_PATH"
fi

#Make blast v5 indexes
echo "`date "+%Y-%m-%d %k:%M:%S"` Creating Blast v5 Index for File: $FULL_PATH"
makeblastdb -dbtype prot -title $DB_NAME -in $FULL_PATH -out $BLASTV5_DIR/$DB_NAME -blastdb_version 5
Expand Down

0 comments on commit ebb8ae1

Please sign in to comment.