diff --git a/README.md b/README.md index 53f1807..9a21eb4 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@

--- -[![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io) +[![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pep.databio.org) PEPPRO is a pipeline designed to process PRO-seq (and GRO-seq) data. For more information see: http://peppro.databio.org/ diff --git a/checkinstall b/checkinstall new file mode 100755 index 0000000..63d31f9 --- /dev/null +++ b/checkinstall @@ -0,0 +1,486 @@ +#!/usr/bin/env bash +# +# PEPPRO pipeline installation check +# + +if [ $# -gt 0 ] ; then + echo "Usage: checkinstall" + exit 1 +fi + +set -o pipefail +# set -e + +echo -e "-----------------------------------------------------------" +echo -e " " +echo -e " PEPPRO installation check " +echo -e " " +echo -e "-----------------------------------------------------------" + + +################################################################################ +# Helpful functions +trim() { + local var="$*" + # remove leading whitespace characters + var="${var#"${var%%[![:space:]]*}"}" + # remove trailing whitespace characters + var="${var%"${var##*[![:space:]]}"}" + printf '%s' "$var" +} + +is_executable() { + if [ -x "$(command -v $1)" ]; then + return 0 + else + return 1 + fi +} + +pip_show() { + if pip show -q $1; then + return 0 + else + return 1 + fi +} + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +NC='\033[0m' # No Color + +fail() { + printf "${RED}\u2716 $@${NC}\n" +} + +success() { + printf "${GREEN}\xE2\x9C\x94 $@${NC}\n" +} + +warn() { + printf "${YELLOW}\u26A0 $@${NC}\n" +} + +################################################################################ +echo -e "Checking base requirements... " + +BASE_REQS=0 + +declare -a requiredPkgs=("looper") + +for package in ${requiredPkgs[@]}; do + if ! pip_show $package; then + echo $(fail "ERROR: PEPPRO requires the Python package, $package. 
Try pip install $package.") + printf "\n" + exit 1 + fi +done + +if [ $BASE_REQS -eq 0 ]; then + echo $(success "SUCCESS: Minimum requirements met.") +fi + + +################################################################################ +echo -e "-----------------------------------------------------------" +echo -e "Checking native installation... " +NATIVE_INSTALL=0 + +# Check Python +if ! is_executable "python"; then + echo $(warn "WARNING: PEPPRO requires python 3.0 or greater. Install python and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 +else + ver=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]\).*/\1\2/') + if [ "$ver" -lt "30" ]; then + echo $(warn "WARNING: PEPPRO requires python 3.0 or greater. Update python and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + fi +fi + +# Check Python packages +if ! is_executable "pip"; then + echo $(warn "WARNING: Please install pip and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 +fi + +if [ -f "requirements.txt" ]; then + REQS="requirements.txt" +else + exec 3< <(curl -fsSL https://raw.githubusercontent.com/databio/peppro/master/requirements.txt); REQS=/dev/fd/3 # the loop below reads "< $REQS", so REQS must be a path: stream the download on fd 3 instead of storing its text +fi + +while IFS= read -r line; do + [ "${line:0:1}" = "#" ] && continue + IFS='>=' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPPRO requires the Python package, $package, >= $required. 
Try pip install $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + if [ $package == "cutadapt" ]; then + installed=$(cutadapt --version) + installed=$(trim ${installed}) + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + fi + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + echo $(warn "WARNING: PEPPRO requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + echo $(warn "WARNING: PEPPRO requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + echo $(warn "WARNING: PEPPRO requires the python package, $package, >= $required. 
Try pip install --upgrade $package and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + BULKER_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed}") + fi + fi +done < "$REQS" + +# Check tool installation +declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bigWigCat" "wigToBigWig" "bowtie2" "fastp" "fastq_pair" "flash" "preseq" "samtools" "seqtk" "seqkit" "Rscript") + +for cmd in "${requiredCommands[@]}"; do + if ! is_executable "$cmd"; then + echo $(warn "WARNING: Install $cmd and checkinstall again.") + # printf "\n" + NATIVE_INSTALL=1 + else + echo -e $(success "SUCCESS: ${cmd}") + fi +done + +## Check R packages +if ! is_executable "R"; then + echo $(warn "WARNING: PEPPRO requires R 3.5 or greater. Install R and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 +else + rVer=$(R --version 2>&1 | grep 'R version' | awk '{print $3}') + rVer=$(echo "${rVer//.}") + if [ "$rVer" -lt "350" ]; then + echo $(warn "WARNING: Please update R to >=3.5 and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + fi +fi + +declare -a requiredRPackages=("optigrab" "devtools" "GenomicDistributions" "PEPPROr" "data.table" "pepr" "gplots" "grid" "ggplot2" "scales" "IRanges" "GenomicRanges") +for package in "${requiredRPackages[@]}"; do + cmd=$(echo "Rscript -e 'library(\"$package\")'") + packageInstalled=$(eval $cmd 2>&1) + if [ $? -ne 0 ] || [[ "$packageInstalled" == *Error* ]]; then # a missing Rscript exits non-zero without printing "Error", so test the status too + echo $(warn "WARNING: Please install the R package, $package, and checkinstall again.") + printf "\n" + NATIVE_INSTALL=1 + else + echo -e $(success "SUCCESS: R package: ${package}") + fi +done + +################################################################################ +echo -e "-----------------------------------------------------------" +echo -e "Checking conda installation... 
" +CONDA_INSTALL=0 + +if ! is_executable "conda"; then + echo $(warn "WARNING: Install conda to use conda environments and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 +else + eval "$(conda shell.bash hook)" + conda activate peppro || CONDA_INSTALL=1 # without the peppro environment the checks below would inspect some other environment + + unset PYTHONPATH + unset R_LIBS + + # Check Python + if ! is_executable "python"; then + echo $(warn "WARNING: PEPPRO requires python 3.0 or greater. Install python and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + #echo "which python: $(which python)" + ver=$(python -V 2>&1 | sed 's/.* \([0-9]\).\([0-9]\).*/\1\2/') + if [ "$ver" -lt "30" ]; then + echo $(warn "WARNING: PEPPRO requires python 3.0 or greater. Update python and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + fi + fi + + # Check Python packages + if ! is_executable "pip"; then + echo $(warn "WARNING: PEPPRO requires pip. Please install pip and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + fi + + if [ -f "requirements.txt" ]; then + REQS="requirements.txt" + else + exec 3< <(curl -fsSL https://raw.githubusercontent.com/databio/peppro/master/requirements.txt); REQS=/dev/fd/3 # the loop below reads "< $REQS", so REQS must be a path: stream the download on fd 3 instead of storing its text + fi + + while IFS= read -r line; do + [ "${line:0:1}" = "#" ] && continue + IFS='>=' read -r -a array <<< "$line" + package=${array[0]} + required=${array[2]} + required=$(trim ${required}) + IFS='.' read -r -a required_version <<< "$required" + declare -i rmajor + declare -i rminor + declare -i rpatch + rmajor=$(echo "${required_version[0]}" | awk '{ print $1+0; exit }') + rminor=$(echo "${required_version[1]}" | awk '{ print $1+0; exit }') + rpatch=$(echo "${required_version[2]}" | awk '{ print $1+0; exit }') + + if ! pip_show "${package}"; then + echo $(warn "WARNING: PEPPRO requires the Python package, $package, >= $required. 
Try pip install $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + if [ $package == "cutadapt" ]; then + installed=$(cutadapt --version) + installed=$(trim ${installed}) + else + installed=$(pip show ${package} | grep -iw 'Version' | awk -F':' '{print $2}' | tr -d '\n') + installed=$(trim ${installed}) + fi + IFS='.' read -r -a installed_version <<< "$installed" + declare -i imajor + declare -i iminor + declare -i ipatch + imajor=$(echo "${installed_version[0]}" | awk '{ print $1+0; exit }') + iminor=$(echo "${installed_version[1]}" | awk '{ print $1+0; exit }') + ipatch=$(echo "${installed_version[2]}" | awk '{ print $1+0; exit }') + + if ! [ -z "$required" ]; then + if [ $imajor -lt $rmajor ]; then + #echo -e "Installed major: ${imajor}\tRequired major: ${rmajor}" + echo $(warn "WARNING: PEPPRO requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -lt $rminor ]; then + #echo -e "Installed minor: ${iminor}\tRequired minor: ${rminor}" + echo $(warn "WARNING: PEPPRO requires the python package, $package, >= $required. Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + elif [ $imajor -eq $rmajor ] && [ $iminor -eq $rminor ] && [ $ipatch -lt $rpatch ]; then + #echo -e "Installed patch: ${ipatch}\tRequired patch: ${rpatch}" + echo $(warn "WARNING: PEPPRO requires the python package, $package, >= $required. 
Try pip install --upgrade $package and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: ${required}\tinstalled: ${installed}") + fi + else + echo -e $(success "SUCCESS: Python package ${package}\trequired: any\tinstalled: ${installed}") + fi + fi + done < "$REQS" + + # Check tool installation + declare -a requiredCommands=("perl" "awk" "grep" "sed" "bedtools" "bigWigCat" "wigToBigWig" "bowtie2" "fastp" "fastq_pair" "flash" "preseq" "samtools" "seqtk" "seqkit" "Rscript") + + for cmd in "${requiredCommands[@]}"; do + if ! is_executable "$cmd"; then + echo $(warn "WARNING: Please install $cmd and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: ${cmd}") + fi + done + + ## Check R packages + if ! is_executable "R"; then + echo $(warn "WARNING: PEPPRO requires R 3.5 or greater.\n Please install R>=3.5 and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 # flag the conda route as unusable instead of exiting, so the bulker checks and the summary still run + else + rVer=$(R --version 2>&1 | grep 'R version' | awk '{print $3}') + rVer=$(echo "${rVer//.}") + if [ "$rVer" -lt "350" ]; then + echo $(warn "WARNING: PEPPRO requires R 3.5 or greater. 
Update R and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + fi + fi + + declare -a requiredRPackages=("optigrab" "devtools" "GenomicDistributions" "PEPPROr" "data.table" "pepr" "gplots" "grid" "ggplot2" "scales" "IRanges" "GenomicRanges") + for package in "${requiredRPackages[@]}"; do + cmd=$(echo "Rscript -e 'library(\"$package\")'") + packageInstalled=$(eval $cmd 2>&1) + if [ $? -ne 0 ] || [[ "$packageInstalled" == *Error* ]]; then # a missing Rscript exits non-zero without printing "Error", so test the status too + echo $(warn "WARNING: Please install the R package, $package, and checkinstall again.") + printf "\n" + CONDA_INSTALL=1 + else + echo -e $(success "SUCCESS: R package: ${package}") + fi + done + + conda deactivate +fi + +################################################################################ +echo -e "-----------------------------------------------------------" +echo -e "Checking bulker installation... " +BULKER_INSTALL=0 + +if ! is_executable "docker"; then + DOCKER=1 +else + DOCKER=0 +fi + +if ! is_executable "singularity"; then + SINGULARITY=1 +else + SINGULARITY=0 +fi + +if [ "$DOCKER" -eq 0 ]; then + CMD_CHECK=$(docker --help) # NOTE(review): --help likely succeeds without a running daemon; "docker info" may be the intended probe - confirm + if [ $? -eq 0 ]; then + echo -e $(success "SUCCESS: docker.") + else + echo -e $(warn "WARNING: Docker is a recognized command, but does not appear to be active. Please ensure docker is running and checkinstall again.") + DOCKER=1 + fi +fi + +if [ "$SINGULARITY" -eq 0 ]; then + echo -e $(success "SUCCESS: singularity.") +fi + +if [ "$DOCKER" -eq 1 ] && [ "$SINGULARITY" -eq 1 ]; then + echo -e $(fail "ERROR: bulker") + BULKER_INSTALL=1 +else + if ! 
is_executable "bulker"; then + echo $(warn "WARNING: To use bulker, pip install bulker and checkinstall again.") + printf "\n" + BULKER_INSTALL=1 + else + BULKER_INSTALL=0 + echo -e $(success "SUCCESS: bulker") + fi + + CWD=$(pwd) + + if [ -f "sample_pipeline_interface.yaml" ]; then + IFACE="sample_pipeline_interface.yaml" + CRATE=$(cat $IFACE | grep 'bulker_crate' | tr " " "\n" | tail -n 1) + else + IFACE=$(curl https://raw.githubusercontent.com/databio/peppro/master/sample_pipeline_interface.yaml) + CRATE=$(echo $IFACE | tr " " "\n" | grep -A1 'bulker_crate' | tail -n 1) + fi + + yes n | bulker load $CRATE + if [ "${PIPESTATUS[1]}" -ne 0 ]; then # test bulker's own status: with pipefail, $? also reflects "yes" being killed by SIGPIPE + echo $(warn "WARNING: Could not bulker load ${CRATE}. Check out https://bulker.databio.org/en/latest/install/.") + printf "\n" + BULKER_INSTALL=1 + fi + + if [ -f "$CWD/pipelines/peppro.py" ]; then + PIPELINE="$CWD/pipelines/peppro.py" + else + PIPELINE=$(curl https://raw.githubusercontent.com/databio/peppro/master/pipelines/peppro.py) # NOTE(review): this stores the script text, not a path, so the bulker run below receives the text as arguments - confirm intent + fi + + CMD_CHECK=$(bulker run ${CRATE} $PIPELINE --help) + EXIT_CODE=$(echo $?) + isActivatable=$(echo "${EXIT_CODE}" | awk '{ print $1+0; exit }') + if [ "$isActivatable" -eq 0 ]; then + BULKER_INSTALL=0 + echo -e $(success "SUCCESS: bulker run ${CRATE}") + else + echo $(warn "WARNING: Could not activate the bulker crate, ${CRATE}. 
Check out https://bulker.databio.org/en/latest/install/.") + printf "\n" + BULKER_INSTALL=1 + fi +fi + +################################################################################ +echo -e "-----------------------------------------------------------" +echo -e " PEPPRO checkinstall results " + +if [ "$NATIVE_INSTALL" -eq 0 ]; then + echo -e $(success "SUCCESS: PEPPRO can be run using native installations!") +else + echo -e $(fail "ERROR: PEPPRO cannot be run using native installations.") +fi + +if [ "$CONDA_INSTALL" -eq 0 ]; then + echo -e $(success "SUCCESS: PEPPRO can be run using conda installation!") +else + echo -e $(fail "ERROR: PEPPRO cannot be run via conda.") +fi + +if [ "$DOCKER" -eq 0 ]; then + echo -e $(success "SUCCESS: PEPPRO can be run using docker!") +else + echo -e $(fail "ERROR: PEPPRO cannot be run using docker.") +fi + +if [ "$SINGULARITY" -eq 0 ]; then + echo -e $(success "SUCCESS: PEPPRO can be run using singularity!") +else + echo -e $(fail "ERROR: PEPPRO cannot be run using singularity.") +fi + +if [ "$BULKER_INSTALL" -eq 0 ]; then + echo -e $(success "SUCCESS: PEPPRO can be run using bulker!") +else + echo -e $(fail "ERROR: PEPPRO cannot be run using bulker.") +fi + +if [ "$NATIVE_INSTALL" -eq 1 ] && [ "$CONDA_INSTALL" -eq 1 ] && [ "$BULKER_INSTALL" -eq 1 ]; then + echo -e "ERROR: PEPPRO is not successfully installed. Check the above output for direction on missing tools or packages." +fi + +echo -e "-----------------------------------------------------------" diff --git a/containers/peppro.Dockerfile b/containers/peppro.Dockerfile new file mode 100644 index 0000000..6056a07 --- /dev/null +++ b/containers/peppro.Dockerfile @@ -0,0 +1,282 @@ +# Pull base image +FROM phusion/baseimage:master + +# Who maintains this image +LABEL maintainer Jason Smith "jasonsmith@virginia.edu" + +# Version info +LABEL version 0.10.0 + +# Use baseimage-docker's init system. 
+CMD ["/sbin/my_init"] + +# Install dependencies +RUN add-apt-repository ppa:deadsnakes/ppa +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install --assume-yes \ + curl \ + cmake \ + default-jre \ + default-jdk \ + git \ + gsl-bin \ + libbz2-dev \ + libgsl-dbg \ + libgsl-dev \ + libcommons-math3-java \ + libcurl4-gnutls-dev \ + libjbzip2-java \ + libpng-dev \ + liblua5.1-0-dev \ + libisal-dev \ + libdeflate-dev \ + libssl-dev \ + libtbb2 \ + libtbb-dev \ + libbam-dev \ + libssl-dev \ + libtbb2 \ + libtbb-dev \ + lua-filesystem-dev \ + lua-lpeg-dev \ + lua-md5-dev \ + libexpat1-dev \ + libtre-dev \ + libcairo2-dev \ + libpango1.0-dev \ + libsqlite3-dev \ + libxml2-dev \ + openssl \ + pigz=2.4-1 \ + python3.8 \ + python3-pip \ + python3-dev \ + software-properties-common \ + build-essential \ + rustc \ + wget \ + zlib1g \ + zlib1g-dev + +# Install MySQL server +RUN DEBIAN_FRONTEND=noninteractive apt-get install --assume-yes mysql-server \ + mysql-client \ + libmysqlclient-dev + +# Install python tools (version specifiers are quoted so the shell does not treat ">=" as an output redirect) +RUN python3.8 -m pip install -U pip +RUN pip install "attmap>=0.12.9" && \ + pip install "cython>=0.29" && \ + pip install cykhash && \ + pip install "jinja2>=2.11.2" && \ + pip install "jsonschema>=3.0.1" && \ + pip install "logmuse>=0.2.5" && \ + pip install "numpy>=1.17" && \ + pip install https://github.com/pepkit/looper/zipball/master && \ + pip install pararead && \ + pip install "pandas>=0.20.2" && \ + pip install "peppy>=0.31.0" && \ + pip install "piper>=0.12.1" && \ + pip install "psutil>=5.6.3" && \ + pip install "pysam>=0.13" && \ + pip install "pyyaml>=3.13" && \ + pip install "refgenconf>=0.7.0" && \ + pip install "refgenie>=0.9.3" && \ + pip install "ubiquerg>=0.6.1" && \ + pip install "yacman>=0.6.7" && \ + pip install cutadapt + +# Install R +RUN apt update -qq && \ + DEBIAN_FRONTEND=noninteractive apt --assume-yes install --no-install-recommends dirmngr +RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 
&& \ + add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/" + +RUN DEBIAN_FRONTEND=noninteractive apt-get --assume-yes install r-base r-base-dev r-base-core r-recommended && \ + echo "r <- getOption('repos'); r['CRAN'] <- 'http://cran.us.r-project.org'; options(repos = r);" > ~/.Rprofile + +RUN Rscript -e "install.packages('argparser')" && \ + Rscript -e "install.packages('optigrab')" && \ + Rscript -e "install.packages('data.table')" && \ + Rscript -e "install.packages('xml2')" && \ + Rscript -e "install.packages('roxygen2')" && \ + Rscript -e "install.packages('rversions')" && \ + Rscript -e "install.packages('callr')" && \ + Rscript -e "install.packages('pkgbuild')" && \ + Rscript -e "install.packages('rcmdcheck')" && \ + Rscript -e "install.packages('testthat')" && \ + Rscript -e "install.packages('devtools')" + +RUN Rscript -e "devtools::install_github('pepkit/pepr')" && \ + Rscript -e "install.packages('data.table')" && \ + Rscript -e "install.packages('BiocManager')" && \ + Rscript -e "BiocManager::install('GenomicRanges')" && \ + Rscript -e "BiocManager::install('BSgenome')" && \ + Rscript -e "BiocManager::install('GenomicFeatures')" && \ + Rscript -e "BiocManager::install('ensembldb')" && \ + Rscript -e "BiocManager::install('ExperimentHub')" && \ + Rscript -e "devtools::install_github('databio/GenomicDistributions')" && \ + Rscript -e "install.packages('http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.2.tar.gz', repos=NULL)" &&\ + Rscript -e "install.packages('ggrepel')" && \ + Rscript -e "install.packages('ggplot2')" && \ + Rscript -e "install.packages('gplots')" && \ + Rscript -e "install.packages('grid')" && \ + Rscript -e "install.packages('gtable')" && \ + Rscript -e "install.packages('scales')" && \ + Rscript -e "install.packages('stringr')" && \ + Rscript -e "devtools::install_github('databio/peppro/PEPPROr/', ref = 'master')" + +# Install htslib +WORKDIR /home/src/ +RUN wget 
https://github.com/samtools/htslib/releases/download/1.12/htslib-1.12.tar.bz2 && \ + tar xf htslib-1.12.tar.bz2 && \ + cd /home/src/htslib-1.12 && \ + ./configure --prefix /home/src/ && \ + make && \ + make install + +# Install samtools +WORKDIR /home/src/ +RUN wget https://github.com/samtools/samtools/releases/download/1.12/samtools-1.12.tar.bz2 && \ + tar xf samtools-1.12.tar.bz2 && \ + cd /home/src/samtools-1.12 && \ + ./configure && \ + make && \ + make install && \ + ln -s /home/src/samtools-1.12/samtools /usr/bin/ + +# Install bedtools +RUN DEBIAN_FRONTEND=noninteractive apt-get install --assume-yes \ + ant \ + bedtools>=2.29.2 + +# Install bowtie2 +WORKDIR /home/src/ +RUN wget -O bowtie2-2.4.2-source.zip 'https://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.4.2/bowtie2-2.4.2-source.zip?ts=gAAAAABgfxZxKMUjBU0A0XjfO55q36KUoO9RRemjzTT_WCDpSSZCy8NtKrFODKV4xS_135KTiIdnBSaqmvHuQw9l6nqM2EULvw%3D%3D&r=https%3A%2F%2Fsourceforge.net%2Fprojects%2Fbowtie-bio%2Ffiles%2Fbowtie2%2F2.4.2%2Fbowtie2-2.4.2-source.zip%2Fdownload' && \ + unzip bowtie2-2.4.2-source.zip && \ + cd /home/src/bowtie2-2.4.2 && \ + make && \ + make install && \ + ln -s /home/src/bowtie2-2.4.2/bowtie2 /usr/bin/ + +# Install seqkit +WORKDIR /home/src/ +RUN wget https://github.com/shenwei356/seqkit/releases/download/v0.10.1/seqkit_linux_amd64.tar.gz && \ + tar -zxvf seqkit_linux_amd64.tar.gz && \ + ln -s /home/src/seqkit /usr/bin/ + +# Install fastp +WORKDIR /home/src/ +RUN git clone https://github.com/OpenGene/fastp.git && \ + cd fastp && \ + make && \ + make install && \ + ln -s /home/src/fastp/fastp /usr/bin/ + +# Install seqtk +WORKDIR /home/src/ +RUN git clone https://github.com/lh3/seqtk.git && \ + cd seqtk && \ + make && \ + ln -s /home/src/seqtk/seqtk /usr/bin/ + +# Install preseq +WORKDIR /home/src/ +RUN wget https://github.com/smithlabcode/preseq/releases/download/v3.1.2/preseq-3.1.2.tar.gz && \ + tar xf preseq-3.1.2.tar.gz && \ + cd preseq-3.1.2 && \ + mkdir build && cd build && \ + 
../configure --enable-hts \ + CPPFLAGS=-I"/home/src/include" \ + LDFLAGS="-L/home/src/lib -Wl,-R/home/src/lib" && \ + make && \ + make install + +# Install UCSC tools +WORKDIR /home/tools/ +RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/wigToBigWig && \ + wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigWigCat && \ + chmod +x /home/tools/wigToBigWig && \ + chmod +x /home/tools/bigWigCat && \ + ln -s /home/tools/wigToBigWig /usr/bin/ && \ + ln -s /home/tools/bigWigCat /usr/bin/ + +# Install FLASH +WORKDIR /home/src/ +RUN wget -O FLASH-1.2.11.tar.gz http://ccb.jhu.edu/software/FLASH/FLASH-1.2.11-Linux-x86_64.tar.gz && \ + tar xf FLASH-1.2.11.tar.gz && \ + ln -s /home/src/FLASH-1.2.11-Linux-x86_64/flash /usr/bin/ + +# Install fastq_pair +WORKDIR /home/src/ +RUN git clone https://github.com/linsalrob/fastq-pair.git && \ + cd fastq-pair/ &&\ + mkdir build && cd build && \ + cmake /home/src/fastq-pair/ && \ + make && \ + make install + +# OPTIONAL REQUIREMENTS +# Install fastqc +WORKDIR /home/tools/ +RUN wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.9.zip && \ + unzip fastqc_v0.11.9.zip && \ + cd /home/tools/FastQC && \ + chmod 755 fastqc && \ + ln -s /home/tools/FastQC/fastqc /usr/bin/ + +# Install fqdedup +WORKDIR /home/tools/ +RUN git clone https://github.com/guertinlab/fqdedup.git && \ + cd fqdedup && \ + cargo build --release && \ + ln -s /home/tools/fqdedup/target/release/fqdedup /usr/bin/ + +# Install fastx_toolkit +WORKDIR /home/tools/ +RUN git clone https://github.com/agordon/libgtextutils.git && \ + cd libgtextutils && \ + ./reconf && \ + ./configure && \ + make && \ + make install && \ + cd /home/tools/ && \ + git clone https://github.com/agordon/fastx_toolkit && \ + cd fastx_toolkit && \ + ./reconf && \ + sed -i 's/-Werror//g' configure.ac && \ + ./configure && \ + make && \ + make install + +# Install genometools +WORKDIR /home/tools/ +RUN wget http://genometools.org/pub/genometools-1.6.1.tar.gz && \ 
+ tar xf genometools-1.6.1.tar.gz && \ + cd /home/tools/genometools-1.6.1 && \ + make useshared=yes && \ + make install + +# Install seqOutBias +WORKDIR /home/tools/ +RUN wget -O seqOutBias-v1.3.0.tar.gz 'https://github.com/guertinlab/seqOutBias/archive/refs/tags/v1.3.0.tar.gz' && \ + tar xf seqOutBias-v1.3.0.tar.gz && \ + cd seqOutBias-1.3.0 && \ + cargo build --release && \ + ln -s /home/tools/seqOutBias-1.3.0/target/release/seqOutBias /usr/bin/ + +# Set environment variables +ENV PATH=/home/tools/bin:/home/tools/:/home/tools/bin/kentUtils/:/home/src/bowtie2-2.4.2:/home/src/skewer:/home/src/samtools-1.12:/home/src/htslib-1.12:$PATH \ + R_LIBS_USER=/usr/local/lib/R/site-library/ \ + PYTHONPATH=/usr/local/lib/python3.8/dist-packages:$PYTHONPATH + +# Set Python3 as python +RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1 + +# Define default command +WORKDIR /home/ +CMD ["/bin/bash"] + +# Clean up APT when done. +RUN apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + diff --git a/docs/annotation.md b/docs/annotation.md index cda133e..6ecfd67 100644 --- a/docs/annotation.md +++ b/docs/annotation.md @@ -1,51 +1,53 @@ # Custom reference data -The pipeline uses reference data at various stages, such as for alignment, calculating TSS enrichments, and other QC scores. If you're using a common genome assembly, these resources are pre-built and can be easily downloaded using `refgenie pull`, as described in the setup instructions. If the resources are not available, you'll have to build them. This document outlines how we created the reference data, so you can recreate it if you need to. The easiest way to do this is use `refgenie build`. All you need to do is: +The pipeline uses reference data at various stages, such as for alignment, calculating mRNA contamination, pause indicies, and other QC scores. If you're using a common genome assembly, these resources are pre-built and can be easily downloaded using `refgenie pull`. 
If the resources are not available, you'll have to build them. This document outlines how we created the reference data, so you can recreate it if you need to. The easiest way to do this is to use `refgenie build`. This assumes you've [already installed and initialized `refgenie`](http://refgenie.databio.org/en/latest/install/). [More detail on all of the assets `PEPPRO` utilizes](assets.md) is available if you're interested. -## 1: Build the fasta asset +## 1. Build the fasta asset -You need a FASTA file for your genome. You can insert this file into refgenie like this: +We'll use `hg38` as an example genome. + +You need a FASTA file for your genome. You can have `refgenie` manage this file like this: ```console -refgenie build -g GENOME -a fasta --files fasta=/path/to/file.fa +refgenie build hg38/fasta --files fasta=/path/to/hg38.fa ``` ## 2. Build the bowtie2_index To build a bowtie2_index and have it managed by `refgenie` you'll, of course, need [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) already installed. You will also need the requisite FASTA file, which you just added in step 1. ```console -refgenie build -g GENOME -a bowtie2_index +refgenie build hg38/bowtie2_index ``` -## 3: Build the ensembl_gtf asset +## 3. Build the ensembl_gtf asset -The ensembl_gtf asset includes several related assets (*e.g.* pause index gene bodies and TSS's) the pipeline will employ. To build an ensembl_gtf asset, you need an Ensembl GTF file (or equivalent) for your genome. You can have refgenie build and manage this file as follows: +The `ensembl_gtf` asset includes several related assets (*e.g.* pause index gene bodies and TSS's) the pipeline will employ. To build an `ensembl_gtf` asset, you need an Ensembl GTF file (or equivalent) for your genome. 
You can have `refgenie` build and manage this file as follows: ```console -refgenie build -g GENOME -a ensembl-gtf --files ensembl_gtf=/path/to/Homo_sapiens.GRCh38.97.gtf.gz +refgenie build hg38/ensembl_gtf --files ensembl_gtf=/path/to/Homo_sapiens.GRCh38.97.gtf.gz ``` -## 4: Build the refgene_anno asset +## 4. Build the refgene_anno asset -The refgene_anno asset actually includes several related assets that we'll need (*e.g.* TSS and premature mRNA annotations). To build these, for example for hg38, you will need to [download a refGene annotation](http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz). Build it for a any genome like so: +The `refgene_anno` asset actually includes several related assets that we'll need (*e.g.* TSS and premature mRNA annotations). To build these, for example for `hg38`, you will need to [download a refGene annotation](http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz). Build it like so: ```console -refgenie build -g GENOME -a refgene_anno --files refgene=/path/to/refGene.txt.gz +refgenie build hg38/refgene_anno --files refgene=/path/to/refGene.txt.gz ``` -## 5: Build the feat_annotation asset +## 5. Build the feat_annotation asset The `feat_annotation` asset includes feature annotations used to calculate the FRiF and cFRiF. `Refgenie` can automatically build this after you have the above assets installed: ```console -refgenie build -g GENOME -a feat_annotation +refgenie build hg38/feat_annotation ``` -That's it! These assets will be automatically detected by PEPPRO if you build them like this with `refgenie`. +That's it! These assets will be automatically detected by `PEPPRO` if you build them like this with `refgenie` and [use `refgenie` configuration files](https://github.com/databio/peppro/blob/master/examples/meta/peppro_test_refgenie.yaml). 
### Create a custom feature annotation file -The pipeline will calculate the fraction (and proportion) of reads in genomic features using the feat_annotation asset, but you can also construct this file yourself. +The pipeline will calculate the fraction (and proportion) of reads in genomic features using the `feat_annotation` asset, but you can also construct this file yourself. -This annotation file is really just a modified `BED` file, with the chromosomal coordinates and type of feature included. For example, the [downloadable `hg38_annotations.bed.gz` file](http://big.databio.org/pepatac/hg38_annotations.bed.gz) looks like so: +This annotation file is really just a modified `BED` file, with the chromosomal coordinates and type of feature included. For example, the [downloadable `hg38_annotations.bed.gz` file](http://big.databio.org/peppro/hg38_annotations.bed.gz) looks like so: ``` chr1 28200 30001 Promoter . * @@ -65,6 +67,6 @@ Just like a standard `BED` file, the first three fields are: 2. **chromStart** - the starting position of the feature 3. **chromEnd** - the ending position of the feature -Column four is the **name** column, in our case the name of our feature of interest. The fifth column is the **score**, which would determine how darkly an item would be displayed in a genome browser if you chose to set that or if the information in your file of interest has ascribed a score to the features. The final, sixth, column is the **strand** column. +Column four is the **name** column, in our case the name of our feature of interest. The fifth column is the **score**, which would determine how darkly an item would be displayed in a genome browser if you chose to set that or if the information in your file of interest has ascribed a score to the feature. The final, sixth, column is the **strand** column. After creating your `BED` file, you can point the pipeline to it using the `--anno-name` option followed with the path to your file. 
The pipeline will then use that file to determine the fractions of reads that cover those features. diff --git a/docs/assets.md b/docs/assets.md new file mode 100644 index 0000000..8918261 --- /dev/null +++ b/docs/assets.md @@ -0,0 +1,113 @@ +# Genome assets + +`PEPPRO` can use either manually constructed or `refgenie` managed assets. `Refgenie` streamlines sample processing, where once assets are built by `refgenie` there is minimal argument calls to `PEPPRO` to use all assets. Pipeline assets include: + +**Required** + +| `PEPPRO` argument | `refgenie` asset name | Description | +|--------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------| +| `--genome-index` | [`bowtie2_index`](http://refgenie.databio.org/en/latest/available_assets/#bowtie2_index) | A genome index file constructed from `bowtie2-build` | +| `--chrom-sizes` | With `refgenie`, this asset is built automatically when you build/pull the [`fasta`](http://refgenie.databio.org/en/latest/available_assets/#fasta) asset. | A text file containing "chr" and "size" columns. | + +**Optional** + +| `PEPPRO` argument | `refgenie` asset name | Description | +|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `--prealignment-names` | Human readable genome alias(es) for `refgenie` managed `bowtie2_index` asset(s). | A space-delimited list of genome names. 
*e.g.* ["rCRSd", "human_repeats"] | +| `--prealignment-index` | [`bowtie2_index`](http://refgenie.databio.org/en/latest/available_assets/#bowtie2_index) | A genome index file constructed from `bowtie2-build`. Used for manually pointing to prealignment genome indices when using `bowtie2` (default) for alignment. | +| `--TSS-name` | [`refgene_anno`](http://refgenie.databio.org/en/latest/available_assets/#refgene_anno). `refgenie` `build/pull` the TSS annotation file with this asset. | Transcription start site (TSS) annotations. *e.g.* [refGene.txt.gz](https://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/refGene.txt.gz) | +| `--anno-name` | [`feat_annotation`](annotation.md) | A BED-style file with "chr", "start", "end", "genomic feature name", "score" and "strand" columns. | +| `--pi-tss` | [`ensembl_gtf.ensembl_tss`](http://refgenie.databio.org/en/latest/available_assets/#ensembl_gtf) | A derived asset from an Ensembl GTF file. Represents all possible TSSs. | +| `--pi-body` | [`ensembl_gtf.ensembl_gene_body`](http://refgenie.databio.org/en/latest/available_assets/#ensembl_gtf) | A derived asset from an Ensembl GTF file. Represents all possible gene body coordinates. | +| `--pre-name` | [`refgene_anno.refgene_pre_mRNA`](http://refgenie.databio.org/en/latest/available_assets/#refgene_anno) | Asset derived from a refGene annotation file. Represents premature mRNA coordinates. | +| `--exon-name` | [`refgene_anno.refgene_exon`](http://refgenie.databio.org/en/latest/available_assets/#refgene_anno) | Asset derived from a refGene annotation file. Represents all exon coordinates. | +| `--intron-name` | [`refgene_anno.refgene_intron`](http://refgenie.databio.org/en/latest/available_assets/#refgene_anno) | Asset derived from a refGene annotation file. Represents all intron coordinates. | +| `--fasta` | [`fasta`](https://refgenie.databio.org/en/latest/available_assets/#fasta) The `fasta` asset. | A genome fasta file. Required for `--sob` argument. 
| +| `--search-file` | [`tallymer_index`](https://refgenie.databio.org/en/latest/available_assets/#tallymer_index) The `search_file` is built from this `refgenie` asset. | File used to search an index of k-mers in the genome of the same size as input read lengths. Only required for `--sob` argument | + +## Using `refgenie` managed assets + +`PEPPRO` can utilize [`refgenie`](http://refgenie.databio.org/) assets. Because assets are user-dependent, these files must be available natively. Therefore, you need to [install and initialize a refgenie config file](http://refgenie.databio.org/en/latest/install/). For example: + +```console +pip install refgenie +export REFGENIE=/path/to/your_genome_folder/genome_config.yaml +refgenie init -c $REFGENIE +``` + +Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. + +Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download all standard assets for `hg38` like so: + +```console +refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb +refgenie build hg38/feat_annotation +``` + +`PEPPRO` also requires a `fasta` and `bowtie2_index` asset for any prealignment genomes: + +```console +refgenie pull human_rDNA/fasta human_rDNA/bowtie2_index +``` + +Furthermore, you can [learn more about using `seqOutBias` and the required `tallymer_index` here](sob.md). + +### Example using `refgenie` managed assets + +When using `refgenie`, you only need to provide the `--genome` and `--prealignment-names` arguments to provide the pipeline with every required index and optional annotation file that exists for those genomes. This means the TSS file, feature annotation file, and blacklist will all be used without needing to directly specify the paths to these files.
+ +From the `peppro/` repository directory: +```console +looper run examples/meta/peppro_test_refgenie.yaml +``` + +## Using manually managed assets + +Assets may also be managed manually and specified directly to the pipeline. While this frees you from needing `refgenie` installed and initialized, it does require a few more arguments to be specified. + +The TSS annotation file may be specified using `--TSS-name`. This file is a `BED6` (e.g. chr, start, end, name, score, strand) formatted file. + +The `feat_annotation` asset may also be directly specified using `--anno-name`. Read [more about using custom reference data](annotation.md). + +The `pi_tss` asset, representing all possible TSSs for calculating the pause index, may be directly specified using `--pi-tss`. This file is a `BED6` (e.g. chr, start, end, name, score, strand) formatted file. + +The `pi_body` asset, representing all possible gene bodies for calculating the pause index, may be directly specified using `--pi-body`. This file is a `BED6` (e.g. chr, start, end, name, score, strand) formatted file. + +The `pre_name` asset, representing premature mRNA sequence coordinates, may be directly specified using `--pre-name`. This file is a `BED6` (e.g. chr, start, end, name (a gene name), score, strand) formatted file. + +The `exon_name` asset, representing gene exon coordinates, may be directly specified using `--exon-name`. This file is a `BED6` (e.g. chr, start, end, name (the name of the gene the exon is from), score, strand) formatted file. + +The `intron_name` asset, representing gene intron coordinates, may be directly specified using `--intron-name`. This file is a `BED6` (e.g. chr, start, end, name (the name of the gene the intron is from), score, strand) formatted file. + +### Example using manually managed assets + +Even if you are *not* using `refgenie`, you can still grab all required and optional assets from the `refgenie` servers.
`Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. That's what you'll see being used in the example below when we manually download these assets. Therefore, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8` is the digest for "human_rDNA." + +From within the `peppro/` repository: +```console +wget -O hg38.fasta.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta?tag=default +wget -O hg38.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/bowtie2_index?tag=default +wget -O hg38.ensembl_gtf.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/ensembl_gtf?tag=default +wget -O hg38.ensembl_rb.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/ensembl_rb?tag=default +wget -O hg38.refgene_anno.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/refgene_anno?tag=default +wget -O hg38.feat_annotation.gz http://big.databio.org/peppro/hg38_annotations.bed.gz +wget -O human_rDNA.fasta.tgz http://refgenomes.databio.org/v3/assets/archive/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8/fasta?tag=default +wget -O human_rDNA.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8/bowtie2_index?tag=default +``` + +Then, extract these files to the `peppro/` parent directory: +```console +tar xvf hg38.fasta.tgz +tar xvf hg38.bowtie2_index.tgz +mv hg38.feat_annotation.gz default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.annotation.bed.gz +tar xvf hg38.refgene_anno.tgz +tar xvf hg38.ensembl_rb.tgz +tar xvf hg38.ensembl_gtf.tgz +tar xvf human_rDNA.fasta.tgz +tar xvf human_rDNA.bowtie2_index.tgz +``` + 
+From the `peppro/` repository folder (using the manually downloaded genome assets): +```console +looper run examples/meta/peppro_test.yaml +``` diff --git a/docs/changelog.md b/docs/changelog.md index f733d3b..e310e16 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,20 @@ # Change log All notable changes to this project will be documented in this file. +## [0.10.0] -- 2022-01-26 + +### Added + - Include single, monolithic style Dockerfile + - Added conda install option and guide + - Added descriptions of assets used to docs + +### Changed + - Updated requirements.txt for package updates + - Update bulker crate version + - Made refgenie optional + - Updated installation guides for native, containers, multi-container, and conda. + - Updated detailed installation guide + - Updated guide for running samples on a cluster ## [0.9.11] -- 2021-03-04 diff --git a/docs/detailed_install.md b/docs/detailed_install.md index 0e30ce8..2691373 100644 --- a/docs/detailed_install.md +++ b/docs/detailed_install.md @@ -2,72 +2,72 @@ This guide walks you through the minutiae of how to install each prerequisite component. We'll presume you're installing this in a Linux environment. If not the case, you'll need to go to each tool's respective site to find alternative installation approaches and options. -## Install required software +You have several options for installing the software prerequisites: 1) use a container, either [a single container](run-container.md) or with a [multi-container environment manager](run-bulker.md), in which case you need only either `docker` or `singularity`; 2) [install via `conda`](run-conda.md) or 3) install all prerequisites natively. We'll install everything natively in this guide. -You have two options for installing the software prerequisites: 1) use a container, in which case you need only either `docker` or `singularity`; or 2) install all prerequisites natively. We'll install everything natively in this guide. 
If you want to try the container approach, read [PEPPRO in containers](container.md). +## 1. Install required software -To use `PEPPRO`, we need the following software: -**Python packages**. The pipeline uses [`pypiper`](http://pypiper.readthedocs.io/en/latest/) to run a single sample, [`looper`](http://looper.readthedocs.io/en/latest/) to handle multi-sample projects (for either local or cluster computation), [`pararead`](https://github.com/databio/pararead) for parallel processing sequence reads, [`refgenie`](http://refgenie.databio.org/en/latest/) to organize and build reference assemblies, [`cutadapt`](https://cutadapt.readthedocs.io/) to remove adapters, [`refgenie`](http://refgenie.databio.org/) to manage genome assets, and the common `python` libraries [`numpy`](https://www.numpy.org/) and [`pandas`](https://pandas.pydata.org/). You can do a user-specific install using the included requirements.txt file in the pipeline directory: +**Python packages**. The pipeline uses [`pypiper`](http://pypiper.readthedocs.io/en/latest/) to run a single sample, [`looper`](http://looper.readthedocs.io/en/latest/) to handle multi-sample projects (for either local or cluster computation), [`pararead`](https://github.com/databio/pararead) for parallel processing sequence reads, [`refgenie`](http://refgenie.databio.org/en/latest/), optionally, to organize, build, and manage reference assemblies, [`cutadapt`](https://cutadapt.readthedocs.io/) to remove adapters, and a handful of `python` libraries. You can do a user-specific install of required `python` packages using the included `requirements.txt` file in the pipeline directory: ```console pip install --user -r requirements.txt ``` -Remember to add your user specific install location to your `PATH`. -```console -export PATH="$PATH:$HOME/.local/bin/" -``` **Required executables**. We will need some common bioinformatics tools installed. 
The complete list (including optional tools) is specified in the pipeline configuration file (pipelines/peppro.yaml) tools section. -The following tools are used by the pipeline: +The following tools are used by the pipeline by default: -* [bedtools (v2.25.0+)](http://bedtools.readthedocs.io/en/latest/) -* [bowtie2 (v2.2.9+)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) +* [bedtools (v2.30.0+)](http://bedtools.readthedocs.io/en/latest/) +* [bowtie2 (v2.4.2+)](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) +* [cutadapt](https://cutadapt.readthedocs.io/en/stable/) *installed via `pip`* * [fastq-pair](https://github.com/linsalrob/fastq-pair.git) * [flash](https://ccb.jhu.edu/software/FLASH/) -* [preseq](http://smithlabresearch.org/software/preseq/) -* [picard](https://broadinstitute.github.io/picard/) -* [samtools (v1.7)](http://www.htslib.org/) +* [picard](https://broadinstitute.github.io/picard/) *which is required by the `python` package `pypiper`* +* [preseq (v2.0.3)](http://smithlabresearch.org/software/preseq/) +* [samtools (v1.14+)](http://www.htslib.org/) * [seqkit](https://bioinf.shenwei.me/seqkit/) * [seqtk](https://github.com/lh3/seqtk) -* Two specific UCSC tools (v3.5.1) +* Two UCSC tools (v3.5.1+) * [bigWigCat (v4)](http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/) * [wigToBigWig (v4)](https://www.encodeproject.org/software/wigtobigwig/) -#### bedtools -We'll install each of these pieces of software before moving forward. Let's start right at the beginning and install `bedtools`. We're going to install from source, but if you would prefer to install from a package manager, you can follow the instructions in the [bedtools' installation guide](http://bedtools.readthedocs.io/en/latest/content/installation.html). +We'll install each of these pieces of software before moving forward. Let's create an initial working directory to download and make all of this software. 
+```console +mkdir tools && cd tools/ +``` + +### bedtools +We're going to install from source, but if you would prefer to install from a package manager, you can follow the instructions in the [bedtools' installation guide](https://bedtools.readthedocs.io/en/latest/content/installation.html). ```console -cd tools/ -wget https://github.com/arq5x/bedtools2/releases/download/v2.25.0/bedtools-2.25.0.tar.gz -tar -zxvf bedtools-2.25.0.tar.gz -rm bedtools-2.25.0.tar.gz +wget https://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools-2.30.0.tar.gz +tar -zxvf bedtools-2.30.0.tar.gz +rm bedtools-2.30.0.tar.gz cd bedtools2 make ``` Now, let's add `bedtools` to our `PATH` environment variable. Look here to [learn more about the concept of environment variables](https://www.digitalocean.com/community/tutorials/how-to-read-and-set-environmental-and-shell-variables-on-a-linux-vps) if you are unfamiliar. ```console -export PATH="$PATH:/path/to/peppro_tutorial/tools/bedtools2/bin/" +export PATH="$PATH:/path/to/tools/bedtools2/bin/" ``` -#### bowtie2 +### bowtie2 Next, let's install `bowtie2`. For more more specific instruction, [read the author's installation guide](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#building-from-source). ```console cd ../ -wget https://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.3.4.1/bowtie2-2.3.4.1-source.zip -unzip bowtie2-2.3.4.1-source.zip -rm bowtie2-2.3.4.1-source.zip -cd bowtie2-2.3.4.1 +wget https://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.4.2/bowtie2-2.4.2-source.zip +unzip bowtie2-2.4.2-source.zip +rm bowtie2-2.4.2-source.zip +cd bowtie2-2.4.2/ make -cd ../ ``` Again, let's add `bowtie2` to our `PATH` environment variable: ``` -export PATH="$PATH:/path/to/peppro_tutorial/tools/bowtie2-2.3.4.1/" +export PATH="$PATH:/path/to/tools/bowtie2-2.4.2/" ``` Great! On to the next one.
-#### fastq_pair -Finally, because PRO-seq treats read1 differently than read2 in paired-end data, we need to resync paired-end files after processing. We [use `fastq_pair`](https://github.com/linsalrob/fastq-pair/blob/master/INSTALLATION.md) to do so efficiently. +### fastq_pair +Finally, because PRO-seq treats read1 differently than read2 in paired-end data, we need to re-sync paired-end files after processing. We [use `fastq_pair`](https://github.com/linsalrob/fastq-pair/blob/master/INSTALLATION.md) to do so efficiently. ```console +cd ../ git clone https://github.com/linsalrob/fastq-pair.git cd fastq-pair/ mkdir build @@ -75,76 +75,89 @@ cd build/ cmake3 .. make make install -cd ../../ ``` ### flash -To obtain a plot to evaluate library quality when we have paired-end reads, we use FLASH to generate a distribution of reads. +To obtain a plot to evaluate library quality when we have paired-end reads, [we use `FLASH`](https://ccb.jhu.edu/software/FLASH/) to generate a distribution of reads. ```console +cd ../../ wget http://ccb.jhu.edu/software/FLASH/FLASH-1.2.11-Linux-x86_64.tar.gz tar xvfz FLASH-1.2.11-Linux-x86_64.tar.gz ``` And let's add `FLASH` to our `PATH` environment variable: ``` -export PATH="$PATH:/path/to/peppro_tutorial/tools/FLASH-1.2.11-Linux-x86_64/" +export PATH="$PATH:/path/to/tools/FLASH-1.2.11-Linux-x86_64/" ``` -#### picard +### picard `PEPPRO` is built using `PyPiper` and relies upon the `PyPiper NGSTK` tool kit which itself employs `Picard`. [Read the `picard` installation guide](http://broadinstitute.github.io/picard/) for more assistance. ```console -wget https://github.com/broadinstitute/picard/releases/download/2.20.3/picard.jar +wget https://github.com/broadinstitute/picard/releases/download/2.26.10/picard.jar chmod +x picard.jar ``` -Create an environmental variable pointing to the `picard.jar` file called `$PICARD`. 
Alternatively, [update the `peppro.yaml` file](https://github.com/databio/peppro/blob/master/pipelines/peppro.yaml) with the full PATH to the `picard.jar` file. +Create an environmental variable pointing to the `picard.jar` file called `PICARD`. Alternatively, [update the `peppro.yaml` file](https://github.com/databio/peppro/blob/master/pipelines/peppro.yaml) with the full `PATH` to the `picard.jar` file. ``` -export PICARD="/path/to/peppro_tutorial/tools/picard.jar" +export PICARD="/path/to/tools/picard.jar" ``` -#### preseq -The pipeline uses `preseq` to calculate library complexity. Check out the author's [page for more instruction](https://github.com/smithlabcode/preseq). +### HTSlib +`PEPPRO` uses `samtools`, and `samtools` requires `HTSlib` internally. So first we'll install `HTSlib`. ```console -wget http://smithlabresearch.org/downloads/preseq_linux_v2.0.tar.bz2 -tar xvfj preseq_linux_v2.0.tar.bz2 +wget https://github.com/samtools/htslib/releases/download/1.14/htslib-1.14.tar.bz2 +tar xvfj htslib-1.14.tar.bz2 +cd htslib-1.14/ +./configure ``` -Add to `PATH`! + +Alternatively, if you do not have the ability to install `HTSlib` to the default location, you can specify using the `--prefix=/install/destination/dir/` option. [Learn more about the `--prefix` option here](http://samtools.github.io/bcftools/howtos/install.html). Otherwise, we will install to the default location. ```console -export PATH="$PATH:/path/to/peppro_tutorial/tools/preseq_v2.0/" +make +make install ``` -#### samtools +### samtools Next up, `samtools`. 
```console -wget https://github.com/samtools/samtools/releases/download/1.10/samtools-1.10.tar.bz2 -tar xvfj samtools-1.10.tar.bz2 -rm samtools-1.10.tar.bz2 -cd samtools-1.10/ +wget https://github.com/samtools/samtools/releases/download/1.14/samtools-1.14.tar.bz2 +tar xvfj samtools-1.14.tar.bz2 +rm samtools-1.14.tar.bz2 +cd samtools-1.14/ ./configure ``` -Alternatively, if you do not have the ability to install `samtools` to the default location, you can specify using the `--prefix=/install/destination/dir/` option. [Learn more about the `--prefix` option here](http://samtools.github.io/bcftools/howtos/install.html). +Alternatively, if you do not have the ability to install `samtools` to the default location, you can specify using the `--prefix=/install/destination/dir/` option. [Learn more about the `--prefix` option here](http://samtools.github.io/bcftools/howtos/install.html). Otherwise, we will install to the default location. ```console make make install ``` -As for our other tools, add `samtools` to our `PATH` environment variable: + +### preseq +The pipeline uses `preseq` to calculate library complexity. Check out the author's [page for more instruction](https://github.com/smithlabcode/preseq). +```console +wget https://github.com/smithlabcode/preseq/releases/download/v2.0.3/preseq_v2.0.3.tar.bz2 +tar xvfj preseq_v2.0.3.tar.bz2 +cd preseq/ +make all SAMTOOLS_DIR=/path/to/tools/samtools-1.14 ``` -export PATH="$PATH:/path/to/peppro_tutorial/tools/samtools-1.10/" +Add to `PATH`! +```console +export PATH="$PATH:/path/to/tools/preseq/" ``` -#### seqkit +### seqkit Let's grab `seqkit` now. Check out [the author's installation guide](https://github.com/shenwei356/seqkit#installation) for more instruction if necessary. 
```console cd ../ -wget https://github.com/shenwei356/seqkit/releases/download/v0.10.1/seqkit_linux_amd64.tar.gz +wget https://github.com/shenwei356/seqkit/releases/download/v2.1.0/seqkit_linux_amd64.tar.gz tar -zxvf seqkit_linux_amd64.tar.gz ``` And then make sure that executable is in our `PATH`. ```console -export PATH="$PATH:/path/to/peppro_tutorial/tools/" +export PATH="$PATH:/path/to/tools/" ``` -#### UCSC utilities +### UCSC utilities Finally, we need a few of the UCSC utilities. You can install the [entire set of tools](https://github.com/ENCODE-DCC/kentUtils) should you choose, but here we'll just grab the subset that we need. ``` wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/wigToBigWig @@ -152,21 +165,18 @@ wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigWigCat chmod 755 wigToBigWig chmod 755 bigWigCat ``` -Add our `tools/` directory to our `PATH` environment variable. -``` -export PATH="$PATH:/path/to/peppro_tutorial/tools/" -``` -That should do it! Now we'll install some **optional** packages. Of course, these are not required, but for the purposes of this tutorial we're going to be completionists. -### Optional software +That should do it! + +## 2. Install R packages -`PEPPRO` uses `R` to generate quality control plots. These are **optional** and the pipeline will run without them, but you would not get any QC plots. If you need to don't have [R installed, you can follow these instructions](https://cran.r-project.org/doc/manuals/r-release/R-admin.html). We'll use and install the necessary packages in this example. Here is the list of required packages: +`PEPPRO` uses `R` to generate quality control plots. These are technically **optional** and the pipeline will run without them, but you would not get any QC plots. If you need to but don't have [R installed, you can follow these instructions](https://cran.r-project.org/doc/manuals/r-release/R-admin.html). We'll use and install the necessary packages in this example. 
Here is the list of required packages: - - [data.table (v1.11.2)](https://cran.r-project.org/package=data.table) - - [devtools](https://cran.r-project.org/web/packages/devtools/index.html) - - [GenomicDistributions (v0.5)](http://code.databio.org/GenomicDistributions/index.html) - - [ggplot2 (v2.2.1)](https://cran.r-project.org/package=ggplot2) - - [pepr (v0.2.1)](http://code.databio.org/pepr/) + - [data.table (v1.14.2)](https://cran.r-project.org/package=data.table) + - [devtools (v2.4.3)](https://cran.r-project.org/web/packages/devtools/index.html) + - [GenomicDistributions (v1.3.2)](http://code.databio.org/GenomicDistributions/index.html) + - [ggplot2 (v3.3.5)](https://cran.r-project.org/package=ggplot2) + - [pepr (v0.4.0)](http://code.databio.org/pepr/) - [optigrab (v0.9.2.1)](https://cran.r-project.org/web/packages/optigrab/index.html) To install the needed packages, enter the following command in the pipeline folder: @@ -176,44 +186,60 @@ Rscript -e 'devtools::install_github("pepkit/pepr")' Rscript -e 'install.packages("BiocManager")' Rscript -e 'BiocManager::install("GenomicRanges")' Rscript -e 'devtools::install_github("databio/GenomicDistributions")' -Rscript -e 'install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.1.tar.gz", repos=NULL)' +wget http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.2.tar.gz +Rscript -e 'install.packages("GenomicDistributionsData_0.0.2.tar.gz", type="source", repos=NULL)' Rscript -e 'devtools::install(file.path("PEPPROr/"), dependencies=TRUE, repos="https://cloud.r-project.org/")' ``` To extract files quicker, `PEPPRO` can also utilize `pigz` in place of `gzip` if you have it installed. Let's go ahead and do that now. It's not required, but it can help speed everything up when you have many samples to process.
``` -cd /path/to/peppro_tutorial/tools/ -wget http://zlib.net/pigz/pigz-2.4.tar.gz -tar xvfz pigz-2.4.tar.gz -rm pigz-2.4.tar.gz -cd pigz-2.4/ +wget https://zlib.net/pigz/pigz-2.7.tar.gz +tar xvfz pigz-2.7.tar.gz +rm pigz-2.7.tar.gz +cd pigz-2.7/ make +cd ../ ``` Don't forget to add this to your `PATH` too! ``` -export PATH="$PATH:/path/to/peppro_tutorial/tools/pigz-2.4/" +export PATH="$PATH:/path/to/tools/pigz-2.7/" ``` -## Download `refgenie` assets +## 3. Download genomic assets using `refgenie` -PEPPRO uses [`refgenie`](http://refgenie.databio.org/) assets for alignment, quality control reports, and some outputs. You can initialize a refgenie config file like this: +`PEPPRO` can use [`refgenie`](http://refgenie.databio.org/) to simplify asset management for alignment, quality control reports, and some outputs. You can initialize a `refgenie` config file like this: ```console -export REFGENIE=your_genome_folder/genome_config.yaml +pip install refgenie +export REFGENIE=genome_config.yaml refgenie init -c $REFGENIE ``` -Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. +Add the `export REFGENIE=genome_config.yaml` line to your `.bashrc` or `.profile` to ensure it persists. Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command: ```console -refgenie pull -g hg38 -a bowtie2_index ensembl_gtf ensembl_rb refgene_anno feat_annotation +refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb +refgenie build hg38/feat_annotation ``` PEPPRO also requires `bowtie2_index` for any pre-alignment genomes: ```console -refgenie pull -g human_rDNA -a bowtie2_index +refgenie pull human_rDNA/fasta human_rDNA/bowtie2_index ``` -That's it! 
Everything we need to run `PEPPRO` to its full potential should be installed. \ No newline at end of file +That's it! Everything we need to run `PEPPRO` to its full potential should be installed. + +## 4. Confirm installation + +You can confirm the pipeline is now executable natively using the included `checkinstall` script. This can either be run directly from the `peppro/` repository... + +```console +./checkinstall +``` + +or from the web: +```console +curl -sSL https://raw.githubusercontent.com/databio/peppro/master/checkinstall | bash +``` diff --git a/docs/install.md b/docs/install.md index 16c6c0c..4b027fe 100644 --- a/docs/install.md +++ b/docs/install.md @@ -1,130 +1,21 @@ -# Install and run PEPPRO +# Install and run PEPPRO -## 1: Clone the `PEPPRO` pipeline +We provide several methods to set up `PEPPRO`. A fundamental challenge of any complex pipeline is that it relies on many independent tools. Installing all of these from scratch can be a chore, although the common use of many of the required bioinformatic tools means they are likely to already be available on an HPC or server. Installation can also be addressed through the use of containers, although that requires setting up and learning to use containers. No single approach appears to resolve all challenges for all users, but we've done our best to provide various ways to ease setup as much as possible. -``` -git clone https://github.com/databio/peppro.git -``` - -## 2: Install required software - -PEPPRO requires a set of Python and R packages to run. - -### Python packages - -`PEPPRO` uses several packages under the hood. Make sure you're up-to-date with a user-specific install: - -```{bash} -cd peppro -pip install --user -r requirements.txt -``` +1. [Run the pipeline using the multi-container environment manager, `bulker`.](run-bulker.md) +2. [Run the pipeline using a single, monolithic container.](run-container.md) +3. [Run the pipeline in a conda environment.](run-conda.md) +4.
[Run the pipeline natively.](detailed_install.md) -### R package +## Confirm installation -`PEPPRO` uses R to produce QC plots, and we include an R package for these functions. The `PEPPRO` package relies on a handful of additional packages. +After setting up your environment to run `PEPPRO`, you can confirm which means of running the pipeline are now executable using the included `checkinstall` script. This can either be run directly from the `peppro/` repository: -To install the prerequisite packages from the command line: ```console -Rscript -e 'install.packages("devtools")' -Rscript -e 'devtools::install_github("pepkit/pepr")' -Rscript -e 'install.packages("BiocManager")' -Rscript -e 'BiocManager::install("GenomicRanges")' -Rscript -e 'devtools::install_github("databio/GenomicDistributions")' -Rscript -e 'BiocManager::install(c("BSgenome", "GenomicFeatures", "ensembldb"))' -Rscript -e 'install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.1.tar.gz", repos=NULL)' +./checkinstall ``` -Then, install the `PEPPRO` package. From the `peppro/` directory: +or from the web: ```console -Rscript -e 'devtools::install(file.path("PEPPROr/"), dependencies=TRUE, repos="https://cloud.r-project.org/")' - -``` - -### Tools - -The pipeline also relies on a set of publicly available bioinformatic tools, but if you don't want to install the prerequisite software used by PEPPRO natively, you can learn to [run PEPPRO using containers](container.md) and skip this step.
- -Otherwise, you'll need to install the following: [bedtools](https://bedtools.readthedocs.io/en/latest/content/installation.html), [bigWigCat](http://hgdownload.soe.ucsc.edu/admin/exe/), [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml), [fastq-pair](https://github.com/linsalrob/fastq-pair.git), [flash](https://ccb.jhu.edu/software/FLASH/), [picard](https://broadinstitute.github.io/picard/), [preseq](http://smithlabresearch.org/software/preseq/), [seqkit](https://bioinf.shenwei.me/seqkit/), [samtools](http://www.htslib.org/), [seqtk](https://github.com/lh3/seqtk), and [wigToBigWig](http://hgdownload.soe.ucsc.edu/admin/exe/). If you need help, we have [detailed installation instructions](detailed_install.md) for installing these. - -## 3: Download `refgenie` assets - -PEPPRO uses [`refgenie`](http://refgenie.databio.org/) assets for alignment. If you haven't already, initialize a refgenie config file like this: - -```console -export REFGENIE=your_genome_folder/genome_config.yaml -refgenie init -c $REFGENIE -``` - -Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. - -Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command: - -```console -refgenie pull -g hg38 -a fasta bowtie2_index ensembl_gtf ensembl_rb refgene_anno feat_annotation -``` -PEPPRO also requires `bowtie2_index` for any pre-alignment genomes: - -```console -refgenie pull -g human_rDNA -a bowtie2_index -``` - -### Optional software - -Optionally, `PEPPRO` can mix and match tools for adapter removal, read trimming, deduplication, and reverse complementation. The use of `fqdedup`, in particular, is useful if you wish to minimize memory use at the expense of speed. 
We suggest using the default tools simply due to the fact that `fastx toolkit` has not been supported since 2012. `seqOutBias` can be used to take into account the mappability at a given read length to filter the sample signal. - -*Optional tools:* - -* [fastp](https://github.com/OpenGene/fastp) -* [fqdedup](https://github.com/guertinlab/fqdedup) -* [fastx toolkit](http://hannonlab.cshl.edu/fastx_toolkit/) -* [seqOutBias](https://github.com/guertinlab/seqOutBias) -* [fastqc](https://www.bioinformatics.babraham.ac.uk/projects/download.html#fastqc) -* [pigz (v2.3.4+)](https://zlib.net/pigz/) - -## 4: Run an example project through PEPPRO - -Start by running the example project (`peppro_test.yaml`) in the [`examples/meta/`](https://github.com/databio/peppro/tree/master/examples/meta) folder. PEPPRO uses a project management tool called [looper](https://looper.databio.org) to run the pipeline across samples in a project. Let's use the `-d` argument to do a *dry run*, which will create job scripts for every sample in a project, but will not execute them: - -``` -cd peppro -looper run -d examples/meta/peppro_test.yaml -``` - -If the looper executable is not in your `$PATH`, add the following line to your `.bashrc` or `.profile`: -``` -export PATH=$PATH:~/.local/bin -``` -If that worked, let's actually run the example by taking out the `-d` flag: - -```console -looper run examples/meta/peppro_test.yaml -``` - -Or, if you're using [`bulker`](https://bulker.databio.org/en/latest/) to run the pipeline in containers: - -```console -bulker activate databio/peppro -looper run examples/meta/peppro_test.yaml -``` - -There are lots of other cool things you can do with looper, like dry runs, summarize results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [`looper` docs](http://looper.databio.org/). 
- -## 5: Configure your project files - -To run your own samples, you'll need to organize them in **PEP format**, which is explained in [how to create a PEP](http://pep.databio.org/en/latest/simple_example/#how-do-i-create-my-own-pep-a-simple-example) and is universal to all pipelines that read PEPs, including `PEPPRO`. To get you started, there are examples you can adapt in the `examples/` folder (*e.g.* [example test PEP](https://github.com/databio/peppro/tree/master/examples/meta/peppro_test.yaml)). In short, you need two files for your project: - - 1. project config file -- describes output locations, pointers to data, etc. - 2. sample annotation file -- comma-separated value (CSV) list of your samples. - -The sample annotation file must specify these columns: - -- sample_name -- library (*e.g.* 'PRO', 'PROSEQ', 'PRO-seq', 'GRO', 'GROSEQ', 'GRO-seq') -- organism (*e.g.* 'human' or 'mouse') -- read1 -- read2 (if paired) -- anything else you wish to include - -## Next steps - -This is just the beginning. For your next step, the [extended tutorial](tutorial.md) will walk you through a real project. Or, take a look at one of other detailed user guide links in the side menu. +curl -sSL https://raw.githubusercontent.com/databio/peppro/master/checkinstall | bash +``` \ No newline at end of file diff --git a/docs/run-bulker.md b/docs/run-bulker.md new file mode 100644 index 0000000..f92841b --- /dev/null +++ b/docs/run-bulker.md @@ -0,0 +1,244 @@ +# Run PEPPRO with a multi-container environment manager. + +Whether you are using `docker` or `singularity`, we have a solution to run the pipeline using containers that reduces the installation burden. + +In addition to cloning the `PEPPRO` repository, this requires the installation and configuration of a single python package, our [multi-container environment manager `bulker`](https://bulker.databio.org/en/latest/). We support using `bulker` for a few reasons: + +1.
It simplifies container use by wrapping the complexities of `docker` or `singularity` calls so that you can use a containerized program without even realizing you're using a container. You can call a program at the command line the same as you would *without* using bulker. +2. Similar to a dockerfile, you can distribute sets of tools *but* as a separate set of containers, not a single, unwieldy, and monolithic container. +3. Since `bulker` commands behave like native commands, a workflow becomes automatically containerized with bulker. +4. Finally, this makes bulker environments very portable, since the only requirement for native-like command use is `docker` or `singularity`. + +[`Bulker` has a guide to running `PEPPRO`](https://bulker.databio.org/en/latest/peppro/), but we'll go into more detail below. + +If you would still prefer using a single container, we do provide a [PEPPRO dockerfile](https://github.com/databio/peppro/blob/master/containers/peppro.Dockerfile) and support for [running the pipeline using a single, monolithic container](run-container.md). + +## Running `PEPPRO` using `bulker` + +### 1. Clone the `PEPPRO` pipeline + +```console +git clone https://github.com/databio/peppro.git +``` + +### 2. Get genome assets + +We [recommend `refgenie` to manage all required and optional genome assets](run-bulker.md#2a-initialize-refgenie-and-download-assets). However, [`PEPPRO` can also accept file paths to any of the assets](run-bulker.md#2b-download-assets-manually). + +#### 2a. Initialize `refgenie` and download assets + +`PEPPRO` can utilize [`refgenie`](http://refgenie.databio.org/) assets. Because assets are user-dependent, these files must still exist outside of a container system. Therefore, we need to [install and initialize a refgenie config file](http://refgenie.databio.org/en/latest/install/).
For example: + +```console +pip install refgenie +export REFGENIE=/path/to/your_genome_folder/genome_config.yaml +refgenie init -c $REFGENIE +``` + +Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. + +Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). + +```console +refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb +refgenie build hg38/feat_annotation +``` + +`PEPPRO` also requires a `bowtie2_index` asset for any pre-alignment genomes: + +```console +refgenie pull human_rDNA/fasta human_rDNA/bowtie2_index +``` + +#### 2b. Download assets manually + +If you prefer not to use `refgenie`, you can also download and construct assets manually. The minimum required assets for a genome includes: + +- a chromosome sizes file: a text file containing "chr" and "size" columns. +- a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). + +Optional assets include: + +- a TSS annotation file: a BED6 file containing "chr", "start", "end", "gene name", "score", and "strand" columns. +- a [genomic feature annotation file](annotation.md) +- a [BED6 file containing all possible TSSs used for pause-index calculation](assets.md) +- a [BED6 file containing all possible gene bodies used for pause-index calculation](assets.md) +- a [BED6 file containing gene's exon coordinates](assets.md) +- a [BED6 file containing gene's intron coordinates](assets.md) +- a [BED6 file containing premature mRNA gene coordinates](assets.md) + +Even if you are *not* using `refgenie`, you can still grab premade assets for all required and optional assets from the `refgenie` servers. `Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. 
That's what you'll see being used in the example below when we manually download these assets. Therefore, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8` is the digest for "human_rDNA." + +From the `peppro/` repository: +```console +wget -O hg38.fasta.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta?tag=default +wget -O hg38.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/bowtie2_index?tag=default +wget -O hg38.ensembl_gtf.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/ensembl_gtf?tag=default +wget -O hg38.ensembl_rb.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/ensembl_rb?tag=default +wget -O hg38.refgene_anno.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/refgene_anno?tag=default +wget -O hg38.feat_annotation.gz http://big.databio.org/peppro/hg38_annotations.bed.gz +wget -O human_rDNA.fasta.tgz http://refgenomes.databio.org/v3/assets/archive/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8/fasta?tag=default +wget -O human_rDNA.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8/bowtie2_index?tag=default +``` + +Then, extract these files: +```console +tar xvf hg38.fasta.tgz +tar xvf hg38.bowtie2_index.tgz +mv hg38.feat_annotation.gz default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.annotation.bed.gz +tar xvf hg38.refgene_anno.tgz +tar xvf hg38.ensembl_rb.tgz +tar xvf hg38.ensembl_gtf.tgz +tar xvf human_rDNA.fasta.tgz +tar xvf human_rDNA.bowtie2_index.tgz +``` + +### 3. 
Install and configure `bulker` + +Check out [the `bulker` setup guide to install bulker](https://bulker.databio.org/en/latest/install/) on your system. It is a straightforward python package with a few configuration steps required prior to use with `PEPPRO`. + +### 4. Confirm installation + +After setting up your environment to run `PEPPRO` with `bulker`, you can confirm the pipeline is now executable with `bulker` using the included `checkinstall` script. This can either be run directly from the `peppro/` repository... + +```console +./checkinstall +``` + +or from the web: +```console +curl -sSL https://raw.githubusercontent.com/databio/peppro/master/checkinstall | bash +``` + +### 5. Load the `PEPPRO` crate + +We've already produced a `bulker` crate for `PEPPRO` that includes all software needed to run the pipeline. We can load this crate directly from the [`bulker registry`](http://hub.bulker.io/): +```console +bulker load databio/peppro:1.0.1 -r +``` + +### 6. Activate the `PEPPRO` crate + +Now that we've loaded the `PEPPRO` crate, we need to activate that specific crate so its included tools are available. +```console +bulker activate databio/peppro:1.0.1 +``` +Now, you can run any of the commands in the crate as if they were natively installed, **but they're actually running in containers**! + +### 7. Run the sample processing pipeline + +Now we simply run the pipeline as we would with a native installation, but without having needed to install any additional tools! + +#### 7a. Run the pipeline using `looper` + +Since `bulker` automatically directs any calls to required software to instead be executed in containers, we can just run our project the exact same way we would when we installed everything natively!
+ +**Run the pipeline with looper and refgenie** +```console +looper run examples/meta/peppro_test_refgenie.yaml +``` + +**Run the pipeline with looper and manual asset specifications** +```console +looper run examples/meta/peppro_test.yaml +``` + +#### 7b. Run the pipeline at the command line + +If you are using `refgenie`, but running directly at the command-line you need to specify paths to any assets that you pulled above. When [the pipeline is run with `looper`](run-bulker.md#7a-run-the-pipeline-using-looper), you can simply specify human-readable aliases to auto-populate these variables. [See the looper refgenie configuration file for an example](examples/meta/peppro_test_refgenie.yaml). + +You can grab the path to the minimally required `--chrom-sizes` and `--genome-index` files as follows: +```console +refgenie seek hg38/fasta.chrom_sizes +refgenie seek hg38/bowtie2_index.dir +``` + +And if you are using pre-alignments, you need the genome index for any pre-alignment genomes, `--prealignment-index`: +```console +refgenie seek human_rDNA/bowtie2_index.dir +``` + +For the full potential of the pipeline, you'll also need the file paths for the following assets: + +| pipeline argument | `refgenie` command to retrieve file path | +|-------------------|----------------------------------------------------| +| `--TSS-name` | `refgenie seek hg38/refgene_anno.refgene_tss` | +| `--anno-name` | `refgenie seek hg38/feat_annotation` | +| `--pre-name` | `refgenie seek hg38/refgene_anno.refgene_pre_mRNA` | +| `--exon-name` | `refgenie seek hg38/refgene_anno.refgene_exon` | +| `--intron-name` | `refgenie seek hg38/refgene_anno.refgene_intron` | +| `--pi-tss` | `refgenie seek hg38/ensembl_gtf.ensembl_tss` | +| `--pi-body` | `refgenie seek hg38/ensembl_gtf.ensembl_gene_body` | + +You'll need to update the paths to the assets to reflect the results from `refgenie seek`. Below is an example where all those assets are local to the `peppro/` repository. 
+ +From the `peppro/` repository folder (using `refgenie` managed genome assets file paths): +```console +pipelines/peppro.py --single-or-paired single \ + --prealignment-index human_rDNA=default/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8 \ + --genome-index default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ + --chrom-sizes default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ + --genome hg38 \ + --sample-name test \ + --input examples/data/test_r1.fq.gz \ + --TSS-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_TSS.bed \ + --anno-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.annotation.bed.gz \ + --pre-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_pre-mRNA.bed \ + --exon-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_exons.bed \ + --intron-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_introns.bed \ + --pi-tss default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_TSS.bed \ + --pi-body default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_gene_body.bed \ + -O peppro_test +``` + +In the previous example, we used `refgenie` assets that we placed in the same location as if we manually downloaded assets to the `peppro/` repository, so the file paths here look the same. 
From the `peppro/` repository folder (using the manually downloaded genome assets): +```console +pipelines/peppro.py --single-or-paired single \ + --prealignment-index human_rDNA=default/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8 \ + --genome hg38 \ + --genome-index default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ + --chrom-sizes default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ + --sample-name test \ + --input examples/data/test_r1.fq.gz \ + --TSS-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_TSS.bed \ + --anno-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.annotation.bed.gz \ + --pre-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_pre-mRNA.bed \ + --exon-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_exons.bed \ + --intron-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_introns.bed \ + --pi-tss default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_TSS.bed \ + --pi-body default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_gene_body.bed \ + -O peppro_test +``` + +With a single processor, this will take around 30 minutes to complete. + +### 8. Run the project level pipeline + +`PEPPRO` also includes a project-level processing pipeline to summarize the pipeline with reports on sample library complexities and count matrices across the samples in the project. + +This should take < a minute on the test sample and will generate a `summary/` directory containing project level output in the parent project directory. To see more, you can [run through the extended tutorial](tutorial.md) to see this in action. + +**Run the project pipeline with looper and refgenie** +```console +looper runp examples/meta/peppro_test_refgenie.yaml +``` + +**Run the project pipeline with looper and manual asset specifications** +```console +looper runp examples/meta/peppro_test.yaml +``` + +### 9. 
Generate an HTML report using `looper` + +`Looper` can generate a pipeline HTML report that makes all our results easy to view and browse. Using the same configuration file we used to run the samples through the pipeline, we'll now employ the `report` function of `looper`. + +**Generate the HTML report with looper and refgenie managed assets** +```console +looper report examples/meta/peppro_test_refgenie.yaml +``` + +**Generate the HTML report with looper and manual asset specifications** +```console +looper report examples/meta/peppro_test.yaml +``` + diff --git a/docs/run-cluster.md b/docs/run-cluster.md new file mode 100644 index 0000000..f47cb44 --- /dev/null +++ b/docs/run-cluster.md @@ -0,0 +1,52 @@ +# Running on a cluster + +## Default computing options + +When you run your `PEPPRO` project using `looper run`, by default it will simply run each sample locally. You can change that using `looper run --package COMPUTE_PACKAGE`, where `COMPUTE_PACKAGE` is an option described below. This enables you to adjust your computing preferences on-the-fly. You have several built-in packages, which you can view by typing `divvy list`. Default packages include: + +- `--package slurm`. Submit the jobs to a `SLURM` cluster using `sbatch`. +- `--package sge`. Submit the jobs to a `SGE` cluster using `qsub`. +- `--package docker`. Submit the jobs locally using the `databio/peppro` docker image. +- `--package singularity`. Submit the jobs locally using the singularity image. +- `--package singularity_slurm`. Submit jobs using `sbatch`, but run them using the singularity image. + +To show how this works, let's run the example project using the `slurm` compute package. Use `-d` for a dry run to create the submit scripts but not run them.
+ +Using the [manually downloaded assets](assets.md#example-using-manually-managed-assets) (run from within the `peppro/` repository): +```console +looper run examples/meta/peppro_test.yaml -d \ + --package slurm +``` + +This will produce a job script: + +```console +cat peppro_test/submission/PEPPRO_test.sub +``` + +If all looks well, run looper without `-d` to actually submit the jobs. Read more to [learn how to run `PEPPRO` in containers](run-container.md). + +Using `refgenie` managed assets (run from within the `peppro/` repository): +```console +looper run examples/meta/peppro_test_refgenie.yaml -d \ + --package slurm +``` + +This will produce a job script: + +```console +cat peppro_test/submission/PEPPRO_test.sub +``` + +## Customizing compute options + +These default computing options may not fit your needs exactly. `PEPPRO` allows you to very easily change templates or add your own, so you can run `PEPPRO` in any possible computing environment. `PEPPRO` uses a standardized computing configuration called [`divvy`](https://divvy.databio.org). The instructions for changing these computing configuration options are universal for any software that relies on `divvy`. + +To customize your compute packages, you first create a `divvy` computing configuration file and point an environment variable (`DIVCFG`) to that file: + +```console +export DIVCFG="divvy_config.yaml" +divvy init $DIVCFG +``` + +Next, you edit that config file to add in any compute packages you need. `PEPPRO` will then give you access to any of your custom packages with `looper run --package COMPUTE_PACKAGE`. For complete instructions on how to create a custom compute package, read [how to configure divvy](https://divvy.databio.org/en/latest/configuration/). diff --git a/docs/run-conda.md b/docs/run-conda.md new file mode 100644 index 0000000..695df09 --- /dev/null +++ b/docs/run-conda.md @@ -0,0 +1,243 @@ +# Run PEPPRO in a conda environment. + +We also enable setup of the pipeline using `conda`.
As with container-based approaches, some native installation is required for complete setup. + +## 1. Clone the `PEPPRO` pipeline + +```console +git clone https://github.com/databio/peppro.git +``` + +## 2. Install bioinformatic tools + +Be prepared for this initial installation process to take more than an hour to complete. + +From the `peppro/` repository directory: +```{bash} +conda env create -f requirements-conda.yml +``` + +Note: The subsequent steps all assume you have installed using `conda`. Alternatively, you can [follow instructions to install each individual program natively](detailed-install.md). + +## 3. Install python packages + +`PEPPRO` uses several Python packages under the hood. Not all of these are available through `conda`, so we'll ensure they are installed ourselves to the `peppro` `conda` environment. From the `peppro/` directory: + +```{bash} +conda activate peppro +unset PYTHONPATH +python -m pip install --ignore-installed --upgrade -r requirements.txt +``` + +## 4. Install R packages + +`PEPPRO` uses `R` to generate quality control and read/peak annotation plots. We have packaged the `peppro` specific `R` code into a supporting package called [PEPPROr](https://github.com/databio/peppro/tree/master/PEPPROr). The `PEPPROr` package relies on a few additional packages which can be installed to the `conda` environment. + +To ensure these packages are installed to the `peppro` `conda` environment, make sure to point your `R_LIBS` environment variable to the `conda` environment `R` library. 
For example: +```{bash} +conda activate peppro +unset R_LIBS +export R_LIBS="$CONDA_PREFIX/lib/R/library" +``` + +From the `peppro/` directory, open `R` and install the following packages: +```{R} +install.packages("optigrab") +devtools::install_github("databio/GenomicDistributions") +install.packages("http://big.databio.org/GenomicDistributionsData/GenomicDistributionsData_0.0.2.tar.gz", repos=NULL) +devtools::install(file.path("PEPPROr/"), dependencies=TRUE, repos="https://cloud.r-project.org/") +``` + +## 5. Get genome assets + +### 5a. Initialize `refgenie` and download assets + +`PEPPRO` can utilize [`refgenie`](http://refgenie.databio.org/) assets. Because assets are user-dependent, these files must still be available natively. Therefore, we need to [install and initialize a refgenie config file.](http://refgenie.databio.org/en/latest/install/). For example: + +```console +pip install refgenie +export REFGENIE=genome_config.yaml +refgenie init -c $REFGENIE +``` + +Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. + +Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). Download these required assets with this command: + +```console +refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb +refgenie build hg38/feat_annotation +``` + +`PEPPRO` also requires a `fasta` and `bowtie2_index` asset for any pre-alignment genomes: + +```console +refgenie pull human_rDNA/fasta human_rDNA/bowtie2_index +``` + +### 5b. Download assets manually + +If you prefer not to use `refgenie`, you can also download [assets](assets.md) manually. To realize the full potential of the pipeline, you will need the following: + + - a chromosome sizes file: a text file containing "chr" and "size" columns. 
+ - a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). + - an [ensembl_gtf](http://refgenie.databio.org/en/latest/available_assets/#ensembl_gtf) asset used to build other derived assets including a comprehensive TSS annotation and gene body annotation. + - an [ensembl_rb](http://refgenie.databio.org/en/latest/available_assets/#ensembl_rb) asset containing known genomic features such as promoters and used to produce derived assets such as genomic feature annotations. + - a [refgene_anno](http://refgenie.databio.org/en/latest/available_assets/#refgene_anno) asset used to produce derived assets including transcription start sites (TSSs), exons, introns, and premature mRNA sequences. + - a [genomic feature annotation file](annotation.md) + +Even if you are *not* using `refgenie`, you can still grab all of these required and optional assets from the `refgenie` servers. `Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. That's what you'll see being used in the example below when we manually download these assets. Here, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8` is the digest for "human_rDNA."
+ +From within the `peppro/` repository: +```console +wget -O hg38.fasta.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta?tag=default +wget -O hg38.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/bowtie2_index?tag=default +wget -O hg38.ensembl_gtf.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/ensembl_gtf?tag=default +wget -O hg38.ensembl_rb.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/ensembl_rb?tag=default +wget -O hg38.refgene_anno.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/refgene_anno?tag=default +wget -O hg38.feat_annotation.gz http://big.databio.org/peppro/hg38_annotations.bed.gz +wget -O human_rDNA.fasta.tgz http://refgenomes.databio.org/v3/assets/archive/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8/fasta?tag=default +wget -O human_rDNA.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8/bowtie2_index?tag=default +``` + +Then, extract those files: +```console +tar xvf hg38.fasta.tgz +tar xvf hg38.bowtie2_index.tgz +mv hg38.feat_annotation.gz default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.annotation.bed.gz +tar xvf hg38.refgene_anno.tgz +tar xvf hg38.ensembl_rb.tgz +tar xvf hg38.ensembl_gtf.tgz +tar xvf human_rDNA.fasta.tgz +tar xvf human_rDNA.bowtie2_index.tgz +``` + +## 6. Confirm installation + +After setting up your environment to run `PEPPRO` with `conda`, you can confirm the pipeline is executable with `conda` using the included `checkinstall` script. This can either be run directly from the `peppro/` repository... + +```console +./checkinstall +``` + +or from the web: +```console +curl -sSL https://raw.githubusercontent.com/databio/peppro/master/checkinstall | bash +``` + +## 7.
Run the sample processing pipeline + +Now we can run the pipeline in the `peppro` conda environment. The easiest approach is to use `looper`, but you can also run the pipeline for a single sample directly at the command line. + +### 7a. Run the pipeline using `looper` + +`PEPPRO` can utilize a [pipeline submission engine called `looper`](http://looper.databio.org/en/latest/) to run the pipeline across each sample in a project. We can use the `-d` argument to first try a dry run, which will create job scripts for every sample in a project, but will not execute them. + +**Run the pipeline with looper and refgenie** +```console +looper run examples/meta/peppro_test_refgenie.yaml +``` + +**Run the pipeline with looper and manual asset specifications** +```console +looper run examples/meta/peppro_test.yaml +``` + +There are lots of other cool things you can do with `looper`, like the dry runs, or report results, check on pipeline run status, clean intermediate files to save disk space, lump multiple samples into one job, and more. For details, consult the [looper docs](http://looper.databio.org/). + +### 7b. Run the pipeline at the command line + +If you are using `refgenie`, but running directly at the command-line you need to specify paths to any assets that you pulled above. When [the pipeline is run with `looper`](run-conda.md#7a-run-the-pipeline-using-looper), you can simply specify human-readable aliases to auto-populate these variables. [See the looper refgenie configuration file for an example](examples/meta/peppro_test_refgenie.yaml). 
+ +You can grab the path to the minimally required `--chrom-sizes` and `--genome-index` files as follows: +```console +refgenie seek hg38/fasta.chrom_sizes +refgenie seek hg38/bowtie2_index.dir +``` + +And if you are using pre-alignments, you need the genome index for any pre-alignment genomes, `--prealignment-index`: +```console +refgenie seek human_rDNA/bowtie2_index.dir +``` + +For the full potential of the pipeline, you'll also need the file paths for the following assets: + +| pipeline argument | `refgenie` command to retrieve file path | +|-------------------|----------------------------------------------------| +| `--TSS-name` | `refgenie seek hg38/refgene_anno.refgene_tss` | +| `--anno-name` | `refgenie seek hg38/feat_annotation` | +| `--pre-name` | `refgenie seek hg38/refgene_anno.refgene_pre_mRNA` | +| `--exon-name` | `refgenie seek hg38/refgene_anno.refgene_exon` | +| `--intron-name` | `refgenie seek hg38/refgene_anno.refgene_intron` | +| `--pi-tss` | `refgenie seek hg38/ensembl_gtf.ensembl_tss` | +| `--pi-body` | `refgenie seek hg38/ensembl_gtf.ensembl_gene_body` | + +You'll need to update the paths to the assets to reflect the results from `refgenie seek`. Below is an example where all those assets are local to the `peppro/` repository. 
+ +From the `peppro/` repository folder (using `refgenie` managed genome assets file paths): +```console +pipelines/peppro.py --single-or-paired single \ + --prealignment-index human_rDNA=default/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8 \ + --genome-index default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ + --chrom-sizes default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ + --genome hg38 \ + --sample-name test \ + --input examples/data/test_r1.fq.gz \ + --TSS-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_TSS.bed \ + --anno-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.annotation.bed.gz \ + --pre-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_pre-mRNA.bed \ + --exon-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_exons.bed \ + --intron-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_introns.bed \ + --pi-tss default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_TSS.bed \ + --pi-body default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_gene_body.bed \ + -O peppro_test +``` + +In the previous example, we used `refgenie` assets that we placed in the same location as if we manually downloaded assets to the `peppro/` repository, so the file paths here look the same. 
From the `peppro/` repository folder (using the manually downloaded genome assets): +```console +pipelines/peppro.py --single-or-paired single \ + --prealignment-index human_rDNA=default/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8 \ + --genome hg38 \ + --genome-index default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ + --chrom-sizes default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ + --sample-name test \ + --input examples/data/test_r1.fq.gz \ + --TSS-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_TSS.bed \ + --anno-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.annotation.bed.gz \ + --pre-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_pre-mRNA.bed \ + --exon-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_exons.bed \ + --intron-name default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_introns.bed \ + --pi-tss default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_TSS.bed \ + --pi-body default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_gene_body.bed \ + -O peppro_test +``` + +## 8. Use `looper` to run the project level pipeline + +`PEPPRO` also includes a project-level processing pipeline to summarize the pipeline with reports on sample library complexities and count matrices across the samples in the project. + +**Run the project pipeline with looper and refgenie managed assets** +```console +looper runp examples/meta/peppro_test_refgenie.yaml +``` + +**Run the project pipeline with looper and manual asset specifications** +```console +looper runp examples/meta/peppro_test.yaml +``` + +This should take < a minute on the test sample and will generate a `summary/` directory containing project level output in the parent project directory. In this small example, there won't be a consensus peak set or count table because it is only a single sample. To see more, you can [run through the extended tutorial](tutorial.md) to see this in action. + +## 9. 
Generate an HTML report using `looper` + +`Looper` can generate a pipeline HTML report that makes all our results easy to view and browse. Using the same configuration file we used to run the samples through the pipeline, we'll now employ the `report` function of `looper`. + +**Generate the HTML report with looper and refgenie managed assets** +```console +looper report examples/meta/peppro_test_refgenie.yaml +``` + +**Generate the HTML report with looper and manual asset specifications** +```console +looper report examples/meta/peppro_test.yaml +``` \ No newline at end of file diff --git a/docs/run-container.md b/docs/run-container.md new file mode 100644 index 0000000..7c9c67c --- /dev/null +++ b/docs/run-container.md @@ -0,0 +1,218 @@ +# Run PEPPRO in a container. + +A popular approach is installing all dependencies in a container and just using that single container. This container can be used with either `docker` or `singularity`. You can run `PEPPRO` as an individual pipeline on a single sample using the container with `docker run` or `singularity exec`. Or, you can rely on `looper`, which is already set up to run any pipeline in existing containers using the `divvy` templating system. + +## Running `PEPPRO` using a single, monolithic container. + +### 1: Clone the `PEPPRO` pipeline + +```console +git clone https://github.com/databio/peppro.git +``` + +### 2: Get genome assets + +We [recommend `refgenie` to manage all required and optional genome assets](run-container.md#2a-initialize-refgenie-and-download-assets). However, [`PEPPRO` can also accept file paths to any of the assets](run-container.md#2b-download-assets-manually). + +#### 2a: Initialize `refgenie` and download assets + +`PEPPRO` can use [`refgenie`](http://refgenie.databio.org/) assets for alignment and annotation. Because assets are user-dependent, these files must still exist outside of a container system. 
We need to [install and initialize a refgenie config file](http://refgenie.databio.org/en/latest/install/). For example: + +```console +pip install refgenie +export REFGENIE=/path/to/your_genome_folder/genome_config.yaml +refgenie init -c $REFGENIE +``` + +Add the `export REFGENIE` line to your `.bashrc` or `.profile` to ensure it persists. + +Next, pull the assets you need. Replace `hg38` in the example below if you need to use a different genome assembly. If these assets are not available automatically for your genome of interest, then you'll need to [build them](annotation.md). + +```console +refgenie pull hg38/fasta hg38/bowtie2_index hg38/refgene_anno hg38/ensembl_gtf hg38/ensembl_rb +refgenie build hg38/feat_annotation +``` + +`PEPPRO` also requires a `fasta` and `bowtie2_index` asset for any pre-alignment genomes: + +```console +refgenie pull human_rDNA/fasta human_rDNA/bowtie2_index +``` + +#### 2b: Download assets manually + +If you prefer not to use `refgenie`, you can also download and construct assets manually. Again, because these are user-defined assets, they must exist outside of any container system. The minimum required assets for a genome include: +- a chromosome sizes file: a text file containing "chr" and "size" columns. +- a [`bowtie2` genome index](http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#the-bowtie2-build-indexer). +- an [ensembl_gtf](http://refgenie.databio.org/en/latest/available_assets/#ensembl_gtf) asset used to build other derived assets including a comprehensive TSS annotation and gene body annotation. +- an [ensembl_rb](http://refgenie.databio.org/en/latest/available_assets/#ensembl_rb) asset containing known genomic features such as promoters and used to produce derived assets such as genomic feature annotations. 
+- a [refgene_anno](http://refgenie.databio.org/en/latest/available_assets/#refgene_anno) asset used to produce derived assets including transcription start sites (TSSs), exons, introns, and premature mRNA sequences. +- a [genomic feature annotation file](annotation.md) (which may also be built locally with `refgenie build <genome>/feat_annotation`) + +You can still obtain the pre-constructed assets from the [`refgenie` servers](http://refgenomes.databio.org/v3/genomes/splash/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4). `Refgenie` uses algorithmically derived genome digests under-the-hood to unambiguously define genomes. That's what you'll see being used in the example below when we manually download these assets. Therefore, `2230c535660fb4774114bfa966a62f823fdb6d21acf138d4` is the digest for the human readable alias, "hg38", and `b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8` is the digest for "human_rDNA." +```console +wget -O hg38.fasta.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/fasta?tag=default +wget -O hg38.bowtie2_index.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/bowtie2_index?tag=default +wget -O hg38.ensembl_gtf.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/ensembl_gtf?tag=default +wget -O hg38.ensembl_rb.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/ensembl_rb?tag=default +wget -O hg38.refgene_anno.tgz http://refgenomes.databio.org/v3/assets/archive/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4/refgene_anno?tag=default +wget -O hg38.feat_annotation.gz http://big.databio.org/peppro/hg38_annotations.bed.gz +wget -O human_rDNA.fasta.tgz http://refgenomes.databio.org/v3/assets/archive/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8/fasta?tag=default +wget -O human_rDNA.bowtie2_index.tgz 
http://refgenomes.databio.org/v3/assets/archive/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8/bowtie2_index?tag=default +``` + +Then, extract these files: +```console +tar xf hg38.fasta.tgz +tar xf hg38.bowtie2_index.tgz +tar xf hg38.ensembl_gtf.tgz +tar xf hg38.ensembl_rb.tgz +tar xf hg38.refgene_anno.tgz +mv hg38.feat_annotation.gz default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.annotation.bed.gz +tar xf human_rDNA.fasta.tgz +tar xf human_rDNA.bowtie2_index.tgz +``` + +### 3. Pull the container image. + +**Docker**: You can pull the docker [databio/peppro image](https://hub.docker.com/r/databio/peppro/) from `dockerhub` like this: + +```console +docker pull databio/peppro +``` + +Or build the image using the included `Dockerfile` (you can use a recipe in the included `Makefile` in the `peppro/` repository): +```console +make docker +``` + +**Singularity**: You can [download the `singularity` image](http://big.databio.org/simages/peppro) or build it from the docker image using the `Makefile`: +```console +make singularity +``` + +Now you'll need to tell the pipeline where you saved the singularity image. You can either create an environment variable called `$SIMAGES` that points to the folder where your image is stored, or you can tweak the `pipeline_interface.yaml` file so that the `compute.singularity_image` attribute is pointing to the right location on disk. + +### 4. Confirm installation + +After setting up your environment to run `PEPPRO` using containers, you can confirm the pipeline is now executable with your container system using the included `checkinstall` script. This can either be run directly from the `peppro/` repository... + +```console +./checkinstall +``` + +or from the web: +```console +curl -sSL https://raw.githubusercontent.com/databio/peppro/master/checkinstall | bash +``` + +### 5. 
Run individual samples in a container + +Individual jobs can be run in a container by simply running the `peppro.py` command through `docker run` or `singularity exec`. You can run containers either on your local computer, or in an HPC environment, as long as you have `docker` or `singularity` installed. You will need to include any volumes that contain data required by the pipeline. For example, to utilize `refgenie` assets you'll need to ensure the volume containing those files is available. In the following example, we are including an environment variable (`$GENOMES`) which points to such a directory. + +For example, run it locally in `singularity` like this: +```console +singularity exec $SIMAGES/peppro pipelines/peppro.py --help +``` + +With `docker`, you can use: +```console +docker run --rm -it databio/peppro pipelines/peppro.py --help +``` + +### 6. Running multiple samples in a container with looper + +To run multiple samples in a container, you simply need to configure `looper` to use a container-compatible template. The looper documentation has instructions for [running jobs in containers](http://looper.databio.org/en/latest/containers/). + +### Container details + +#### Using `docker` +The pipeline has been successfully run in both a `Linux` and `MacOS` environment. With `docker` you need to bind mount your volume that contains the pipeline and your genome assets locations, as well as provide the container the same environment variables your host environment is using. + +In the first example, we're mounting our home user directory (`/home/jps3ag/`) which contains the parent directories to our genome assets and to the pipeline itself. We'll also provide the pipeline environment variables, such as `$HOME`. 
+ +Here's that example command in a Linux environment to run the test example through the pipeline (using the manually downloaded genome assets): +```console +docker run --rm -it --volume /home/jps3ag/:/home/jps3ag/ \ +  -e HOME='/home/jps3ag/' \ +  databio/peppro \ +  /home/jps3ag/src/peppro/pipelines/peppro.py --single-or-paired single \ +  --prealignment-index human_rDNA=/home/jps3ag/src/peppro/default/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8 \ +  --genome hg38 \ +  --genome-index /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ +  --chrom-sizes /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ +  --sample-name test \ +  --input /home/jps3ag/src/peppro/examples/data/test_r1.fq.gz \ +  --TSS-name /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_TSS.bed \ +  --anno-name /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.annotation.bed.gz \ +  --pre-name /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_pre-mRNA.bed \ +  --exon-name /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_exons.bed \ +  --intron-name /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_introns.bed \ +  --pi-tss /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_TSS.bed \ +  --pi-body /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_gene_body.bed \ +  -O $HOME/peppro_test +``` + +In this second example, we'll perform the same command in a `MacOS` environment using [`Docker` for `Mac`](https://docs.docker.com/desktop/mac/install/). + +This necessitates a few minor changes to run that same example: + +- replace `/home/` with `/Users/` format +- e.g. 
`--volume /Users/jps3ag/:/Users/jps3ag/` + +Be sure to [allocate sufficient memory](https://docs.docker.com/desktop/mac/#resources) (6-8GB should generally be adequate) in Docker for Mac. + +```console +docker run --rm -it --volume /Users/jps3ag/:/Users/jps3ag/ \ +  -e HOME="/Users/jps3ag/" \ +  databio/peppro \ +  /Users/jps3ag/src/peppro/pipelines/peppro.py --single-or-paired single \ +  --prealignment-index human_rDNA=/Users/jps3ag/src/peppro/default/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8 \ +  --genome hg38 \ +  --genome-index /Users/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ +  --chrom-sizes /Users/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ +  --sample-name test \ +  --input /Users/jps3ag/src/peppro/examples/data/test_r1.fq.gz \ +  --TSS-name /Users/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_TSS.bed \ +  --anno-name /Users/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.annotation.bed.gz \ +  --pre-name /Users/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_pre-mRNA.bed \ +  --exon-name /Users/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_exons.bed \ +  --intron-name /Users/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_introns.bed \ +  --pi-tss /Users/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_TSS.bed \ +  --pi-body /Users/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_gene_body.bed \ +  -O peppro_test +``` + +#### Using `singularity` + +First, build a singularity container from the docker image and create a running instance: +```console +singularity build peppro docker://databio/peppro:latest +singularity instance start -B /home/jps3ag/:/home/jps3ag/ peppro peppro_instance +``` + +Second, run your command. 
+```console +singularity exec instance://peppro_instance \ +  /home/jps3ag/src/peppro/pipelines/peppro.py --single-or-paired single \ +  --prealignment-index human_rDNA=/home/jps3ag/src/peppro/default/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8 \ +  --genome hg38 \ +  --genome-index /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ +  --chrom-sizes /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ +  --sample-name test \ +  --input /home/jps3ag/src/peppro/examples/data/test_r1.fq.gz \ +  --TSS-name /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_TSS.bed \ +  --anno-name /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.annotation.bed.gz \ +  --pre-name /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_pre-mRNA.bed \ +  --exon-name /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_exons.bed \ +  --intron-name /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_introns.bed \ +  --pi-tss /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_TSS.bed \ +  --pi-body /home/jps3ag/src/peppro/default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_gene_body.bed \ +  -O peppro_test +``` + +Third, close your instance when finished. +```console +singularity instance stop peppro_instance +``` + diff --git a/docs/run-directly.md b/docs/run-directly.md new file mode 100644 index 0000000..7e7dbc5 --- /dev/null +++ b/docs/run-directly.md @@ -0,0 +1,19 @@ +# Run the pipeline script directly + +The pipeline at its core is just a python script, and you can run it on the command line for a single sample (see [command-line usage](usage.md) or get this on the command line by running `pipelines/peppro.py --help`). 
You just need to pass a few command-line parameters to specify: paired-end status, any prealignment genome indices, primary genome name, index, and chromosome sizes assets, a sample name, an input file, and the output destination folder: + +From the `peppro/` repository folder (using the [manually downloaded genome assets](assets.md#example-using-manually-managed-assets)): +```console +pipelines/peppro.py --single-or-paired single \ +  --prealignment-index human_rDNA=default/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8 \ +  --genome hg38 \ +  --genome-index default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 \ +  --chrom-sizes default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes \ +  --sample-name test \ +  --input examples/data/test_r1.fq.gz \ +  -O peppro_test/ +``` + +This example should take about 30 minutes to complete. + +You can also run the pipeline directly at the command line [using `conda`](run-conda.md#7b-run-the-pipeline-at-the-command-line), [a single container](run-container.md), or with the [multi-container environment manager `bulker`](run-bulker.md#7b-run-the-pipeline-at-the-command-line). \ No newline at end of file diff --git a/docs/tutorial.md b/docs/tutorial.md index f7b1a62..adc6665 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -1,8 +1,8 @@ # PEPPRO pipeline step-by-step guide -In this guide, we'll walk you through the step by step procedure of running a tutorial PRO-seq dataset through the pipeline. The output from this process is the same as you see in the [example PRO-seq output](browse_output.md) we've provided. To use this tutorial, you should have a basic familiarity with [working in a command line driven environment](http://matt.might.net/articles/basic-unix/). You also need to have already installed `PEPPRO` prerequisites, which you can do following the [basic installation instructions](install.md). 
+In this guide, we'll walk you through the step by step procedure of running a tutorial PRO-seq dataset through the pipeline. The output from this process is the same as you see in the [example PRO-seq output](browse_output.md) we've provided. To use this tutorial, you should have a basic familiarity with [working in a command line driven environment](http://matt.might.net/articles/basic-unix/). You also need to have already installed `PEPPRO` prerequisites, which you can do following one of the [installation instructions](install.md). -## 1: Set up folders +## 1. Set up folders From an open terminal, let's first create a directory we'll use to run through this guide: ```console @@ -13,15 +13,14 @@ Let's move into our newly created directory and create a few more folders that w ```console cd peppro_tutorial/ mkdir data -mkdir genomes mkdir processed -mkdir templates +mkdir divvy_templates mkdir tools cd tools/ git clone https://github.com/databio/peppro.git ``` -## 2: Download tutorial read files +## 2. Download tutorial read files We're going to work with some files a little larger than the test data included in the pipeline so we can see all the features included in a full run of the pipeline. Go ahead and download the [tutorial_r1.fastq.gz](http://big.databio.org/peppro/fastq/tutorial_r1.fq.gz) and [tutorial_r2.fq.gz](http://big.databio.org/peppro/fastq/tutorial_r2.fq.gz) files. ```console @@ -35,26 +34,26 @@ mv tutorial_r1.fq.gz peppro/examples/data/ mv tutorial_r2.fq.gz peppro/examples/data/ ``` -## 3: Configure project files +## 3. Configure project files We're going to use `looper` to analyze our data. For that, we need to pass looper a configuration file. This project config file describes your project. See [`looper` docs](https://looper.readthedocs.io/en/latest/) for details. A configuration file has been provided for you in the pipeline itself, conveniently named `tutorial.yaml`. This configuration file also points to our sample. 
In this case, we've provided a sample for you with the pipeline. You don't have to do anything else at this point and may [skip right to running the sample if you'd like](tutorial.md#3-using-looper-to-run-the-pipeline). Otherwise, we'll briefly touch on what those configuration files look like. You can open the configuration file in your favorite text editor if you'd like to look closer. For the purposes of the tutorial you may safely move past this step should you choose. -``` +```console cd peppro/examples/meta/ nano tutorial.yaml ``` The following is what you should see in that configuration file. -``` +```console # Run tutorial samples through PEPPRO name: tutorial pep_version: 2.0.0 -sample_table: "tutorial.csv" +sample_table: tutorial.csv looper: - output_dir: "$PROCESSED/tutorial" # export PROCESSED="/path/to/your_output_folder/" - pipeline_interfaces: ["$CODEBASE/peppro/project_pipeline_interface.yaml"] # export CODEBASE="/path/to/your_tools_folder/" + output_dir: "$PROCESSED/tutorial" + pipeline_interfaces: ["$CODEBASE/peppro/project_pipeline_interface.yaml"] sample_modifiers: append: @@ -69,91 +68,136 @@ sample_modifiers: organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] then: genome: "hg38" - prealignments: ["human_rDNA", "rCRSd"] + prealignments: ["human_rDNA"] ``` There is also a sample annotation file referenced in our configuration file. The sample annotation file contains metadata and other information about our sample. Just like before, this file, named `tutorial.csv` has been provided. You may check it out if you wish, otherwise we're all set. If you open `tutorial.csv`, you should see the following: -``` +```console sample_name,organism,protocol,read_type,read1,read2 tutorial,human,PROSEQ,paired,R1,R2 ``` That's it! Let's analyze that sample! +## 4. Create environment variables -## 4: Create environment variables - -We also need to create some environment variables to help point `looper` to where we keep our data files and our tools. 
You may either set the environment variables up, like we're going to do now, or you may simply hard code the necessary locations in our configuration files. +We also need to create some environment variables to help point `looper` to where we keep our data files and our tools. You may either set the environment variables up, like we're going to do now, or you may simply hard code the necessary locations in the configuration files. First, let's create a `PROCESSED` variable that represents the location where we want to save output. -``` +```console export PROCESSED="/path/to/peppro_tutorial/processed/" ``` Second, we'll create a variable representing the root path to all our tools named `CODEBASE`. -``` +```console export CODEBASE="/path/to/peppro_tutorial/tools/" ``` (Add these environment variables to your `.bashrc` or `.profile` so you don't have to always do this step). Fantastic! Now that we have the pipeline and its requirements installed, we're ready to get our reference genome(s). -## 5: Use `looper` to run the pipeline +## 5. Use `looper` to run the pipeline Looper requires a few variables and configuration files to work for the specific user. Let's get those set up now. `Looper` uses [`divvy`](https://divvy.databio.org/) to manage computing resource configuration so that projects and pipelines can easily travel among environments. For more detailed information, [check out the `looper` docs](https://looper.readthedocs.io/en/latest/running-on-a-cluster/). Let's set it up. -``` +```console cd /path/to/peppro_tutorial/ export DIVCFG="/path/to/peppro_tutorial/compute_config.yaml" divvy init $DIVCFG ``` You can open that initialized file in your favorite text editor if you want to learn more about its structure. If you need to edit this file further for your own setup you can [learn more about that in the `looper` docs](https://looper.readthedocs.io/en/latest/index.html). 
-``` +```console nano compute_config.yaml +# Use this to change your cluster manager (SLURM, SGE, LFS, etc). +# Relative paths are relative to this compute environment configuration file. +# Compute resource parameters fill the submission_template file's fields. +adapters: + CODE: looper.command + JOBNAME: looper.job_name + CORES: compute.cores + LOGFILE: looper.log_file + TIME: compute.time + MEM: compute.mem + DOCKER_ARGS: compute.docker_args + DOCKER_IMAGE: compute.docker_image + SINGULARITY_IMAGE: compute.singularity_image + SINGULARITY_ARGS: compute.singularity_args compute_packages: default: - submission_template: templates/localhost_template.sub + submission_template: divvy_templates/localhost_template.sub + submission_command: . + local: + submission_template: divvy_templates/localhost_template.sub + submission_command: . + slurm: + submission_template: divvy_templates/slurm_template.sub + submission_command: sbatch + singularity: + submission_template: divvy_templates/localhost_singularity_template.sub + submission_command: . + singularity_args: "" + singularity_slurm: + submission_template: divvy_templates/slurm_singularity_template.sub + submission_command: sbatch + singularity_args: "" + bulker_local: + submission_template: divvy_templates/localhost_bulker_template.sub submission_command: sh + docker: + submission_template: divvy_templates/localhost_docker_template.sub + submission_command: . + docker_args: | + --user=$(id -u):$(id -g) \ + --env="DISPLAY" \ + --volume="/etc/group:/etc/group:ro" \ + --volume="/etc/passwd:/etc/passwd:ro" \ + --volume="/etc/shadow:/etc/shadow:ro" \ + --volume="/etc/sudoers.d:/etc/sudoers.d:ro" \ + --volume="/tmp/.X11-unix:/tmp/.X11-unix:rw" \ + --workdir="`pwd`" \ + ``` (Remember to add `DIVCFG` to your `.bashrc` or `.profile` to ensure it persists). The `looper` environment configuration file points to submission template(s) in order to know how to run a samples locally or using cluster resources. 
If you'd like to learn more, check out the [`DIVCFG` configuration file and submission templates](https://divvy.databio.org/). We're going to simply setup a local template for the purposes of this tutorial. You can also easily create [templates for cluster or container use as well](https://github.com/pepkit/divcfg/tree/master/templates)! Let's change to our `templates/` directory to make our first submission template. -``` -cd /path/to/peppro_tutorial/templates/ +```console +cd /path/to/peppro_tutorial/divvy_templates/ nano localhost_template.sub ``` Paste the following into the localhost_template.sub: -``` +```console #!/bin/bash echo 'Compute node:' `hostname` echo 'Start time:' `date +'%Y-%m-%d %T'` -{CODE} | tee {LOGFILE} +{ +{CODE} +} | tee {LOGFILE} ``` Save and close that file, and return to the pipeline repository directory. -``` +```console cd /path/to/peppro_tutorial/tools/peppro/ ``` Now, we'll use `looper` to run the sample pipeline locally. -``` +```console looper run examples/meta/tutorial.yaml ``` Congratulations! Your first sample should be running through the pipeline now. It takes right around 25 minutes for this process to complete using a single core and maxes at about 3.5 GB of memory. We will also use `looper` to run the project pipeline locally. At the project level we can aggregate all the samples in our project (just 1 in this simple case) and view everything together. -``` +```console looper runp examples/meta/tutorial.yaml ``` After the pipeline is finished, we can look through the output directory together. We've provided a breakdown of that directory in the [browse output page](browse_output.md). -## 6: Generate an `HTML` report using `looper` +## 6. Generate an `HTML` report using `looper` Let's take full advantage of `looper` and generate a pipeline `HTML` report that makes all our results easy to view and browse. 
If you'd like to skip right to the results and see what it looks like, [check out the tutorial results](files/examples/tutorial/tutorial_summary.html). Otherwise, let's generate a report ourselves. Using our same configuration file we used to run the samples through the pipeline, we'll now employ the `report` function of `looper`. -``` -looper report tutorial.yaml +```console +looper report examples/meta/tutorial.yaml ``` That's it! Easy, right? `Looper` conveniently provides you with the location where the HTML report is produced. You may either open the report with your preferred internet browser using the PATH provided, or we can change directories to the report's location and open it there. Let's go ahead and change into the directory that contains the report. -``` +```console cd /path/to/peppro_tutorial/processed/tutorial/ firefox tutorial_summary.html ``` diff --git a/docs/usage.md b/docs/usage.md index 7f172ca..9b6e09d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -8,24 +8,26 @@ ```{console} usage: peppro.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--logdev] [-C CONFIG_FILE] -O PARENT_OUTPUT_FOLDER - [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] -S SAMPLE_NAME -I - INPUT_FILES [INPUT_FILES ...] - [-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G GENOME_ASSEMBLY - [-Q SINGLE_OR_PAIRED] + [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] + [--pipeline-name PIPELINE_NAME] -S SAMPLE_NAME -I INPUT_FILES + [INPUT_FILES ...] 
[-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G + GENOME_ASSEMBLY [-Q SINGLE_OR_PAIRED] [--protocol {PRO,pro,PRO-SEQ,PRO-seq,proseq,PROSEQ,GRO,gro,groseq,GROSEQ,GRO-SEQ,GRO-seq}] [--adapter-tool {cutadapt,fastp}] [--dedup-tool {seqkit,fqdedup}] [--trimmer-tool {seqtk,fastx}] [--umi-len UMI_LEN] [--max-len MAX_LEN] [--sob] [--scale] - [--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] - [--TSS-name TSS_NAME] [--pi-tss ENSEMBL_TSS] + [--prealignment-names PREALIGNMENT_NAMES [PREALIGNMENT_NAMES ...]] + [--prealignment-index PREALIGNMENT_INDEX [PREALIGNMENT_INDEX ...]] + --genome-index GENOME_INDEX [--fasta FASTA] --chrom-sizes + CHROM_SIZES [--TSS-name TSS_NAME] [--pi-tss ENSEMBL_TSS] [--pi-body ENSEMBL_GENE_BODY] [--pre-name PRE_NAME] [--anno-name ANNO_NAME] [--exon-name EXON_NAME] [--intron-name INTRON_NAME] [--search-file SEARCH_FILE] [--coverage] [--keep] [--noFIFO] [--no-complexity] [--prioritize] [-V] -PEPPRO version 0.9.11 +PEPPRO version 0.10.0 optional arguments: -h, --help show this help message and exit @@ -46,6 +48,8 @@ optional arguments: [K|M|G|T]. -P NUMBER_OF_CORES, --cores NUMBER_OF_CORES Number of cores for parallelized processes + --pipeline-name PIPELINE_NAME + Name of the pipeline -I2 [INPUT_FILES2 [INPUT_FILES2 ...]], --input2 [INPUT_FILES2 [INPUT_FILES2 ...]] Secondary input files, such as read2 -Q SINGLE_OR_PAIRED, --single-or-paired SINGLE_OR_PAIRED @@ -67,9 +71,20 @@ optional arguments: --scale Scale signal tracks: Default is to scale by read count. If using seqOutBias, scales by the expected/observed cut frequency. - --prealignments PREALIGNMENTS [PREALIGNMENTS ...] - Space-delimited list of reference genomes to align to - before primary alignment. + --prealignment-names PREALIGNMENT_NAMES [PREALIGNMENT_NAMES ...] + Space-delimited list of prealignment genome names to + align to before primary alignment. + --prealignment-index PREALIGNMENT_INDEX [PREALIGNMENT_INDEX ...] 
+ Space-delimited list of prealignment genome name and + index files delimited by an equals sign to align to + before primary alignment. e.g. + rCRSd=/path/to/bowtie2_index/. + --genome-index GENOME_INDEX + Path to bowtie2 primary genome index file. + --fasta FASTA Path to primary genome fasta file. Required with + --sob. + --chrom-sizes CHROM_SIZES + Path to primary genome chromosome sizes file. --TSS-name TSS_NAME file_name of TSS annotation file. --pi-tss ENSEMBL_TSS file_name of pause index TSS annotation file. --pi-body ENSEMBL_GENE_BODY @@ -82,8 +97,9 @@ optional arguments: --intron-name INTRON_NAME file_name of intron annotation file. --search-file SEARCH_FILE - file_name of read length matched gt tallymer index - search file + Required for seqOutBias (--sob). Path to tallymer + index search file built with the same read length as + the input. --coverage Report library complexity using coverage: reads / (bases in genome / read length) --keep Keep prealignment BAM files diff --git a/examples/meta/peppro_test.yaml b/examples/meta/peppro_test.yaml index 7cd1715..7fed4a4 100644 --- a/examples/meta/peppro_test.yaml +++ b/examples/meta/peppro_test.yaml @@ -5,12 +5,12 @@ pep_version: 2.0.0 sample_table: "peppro_test.csv" looper: - output_dir: "$PROCESSED/peppro/peppro_test/" # export PROCESSED="/path/to/your_output_folder/" - pipeline_interfaces: ["$CODE/peppro/project_pipeline_interface.yaml"] # export CODE="/path/to/your_tools_folder/" + output_dir: peppro_test + pipeline_interfaces: ../../project_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. sample_modifiers: append: - pipeline_interfaces: ["$CODE/peppro/sample_pipeline_interface.yaml"] + pipeline_interfaces: ../../sample_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. #prioritize: True # Default is FALSE. 
Pass flag to prioritize features by the order they appear in the feat_annotation asset when calculating FRiF/PRiF #sob: True # Default is FALSE. Pass flag to use seqOutBias for signal track generation and to incorporate mappability #no_scale: True # Default is FALSE. Pass flag to not scale signal tracks @@ -21,16 +21,19 @@ sample_modifiers: derive: attributes: [read1] sources: - R1: "$CODE/peppro/examples/data/{sample_name}_r1.fq.gz" + R1: "examples/data/{sample_name}_r1.fq.gz" imply: - if: organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] then: - genome: "hg38" - prealignments: "human_rDNA" - adapter: "cutadapt" # Default - dedup: "seqkit" # Default - trimmer: "seqtk" # Default - protocol: "pro" # Default - umi_len: "0" # Default; no UMI - max_len: "-1" # Default; do NOT trim to a max length \ No newline at end of file + genome: hg38 + genome_index: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4 + chrom_sizes: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.chrom.sizes + prealignment_index: [human_rDNA=default/b769bcf2deaf9d061d94f2007a0e956249905c64653cb5c8] + TSS_name: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_TSS.bed + anno_name: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4.annotation.bed.gz + pre_name: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_pre-mRNA.bed + exon_name: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_exons.bed + intron_name: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_introns.bed + pi_tss: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_TSS.bed + pi_body: default/2230c535660fb4774114bfa966a62f823fdb6d21acf138d4_ensembl_gene_body.bed diff --git a/examples/meta/peppro_test_refgenie.yaml b/examples/meta/peppro_test_refgenie.yaml new file mode 100644 index 0000000..2323a4d --- /dev/null +++ b/examples/meta/peppro_test_refgenie.yaml @@ -0,0 +1,23 @@ +# Run test sample through PEPPRO +name: test + +pep_version: 2.0.0 +sample_table: 
peppro_test.csv + +looper: + output_dir: peppro_test + pipeline_interfaces: ../../project_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. + +sample_modifiers: + append: + pipeline_interfaces: ../../sample_pipeline_interface.yaml # PATH to the directory where looper will find the pipeline repository. + derive: + attributes: [read1] + sources: + R1: "examples/data/{sample_name}_r1.fq.gz" + imply: + - if: + organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] + then: + genome: hg38 + prealignment_names: ["human_rDNA"] diff --git a/examples/meta/tutorial.yaml b/examples/meta/tutorial.yaml index f21169c..caa1909 100644 --- a/examples/meta/tutorial.yaml +++ b/examples/meta/tutorial.yaml @@ -2,11 +2,11 @@ name: tutorial pep_version: 2.0.0 -sample_table: "tutorial.csv" +sample_table: tutorial.csv looper: - output_dir: "$PROCESSED/tutorial" # export PROCESSED="/path/to/your_output_folder/" - pipeline_interfaces: ["$CODEBASE/peppro/project_pipeline_interface.yaml"] # export CODEBASE="/path/to/your_tools_folder/" + output_dir: "$PROCESSED/tutorial" + pipeline_interfaces: ["$CODEBASE/peppro/project_pipeline_interface.yaml"] sample_modifiers: append: @@ -21,4 +21,4 @@ sample_modifiers: organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] then: genome: "hg38" - prealignments: ["human_rDNA", "rCRSd"] + prealignments: ["human_rDNA"] diff --git a/mkdocs.yml b/mkdocs.yml index 2dba947..6358103 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,6 +1,6 @@ theme: databio -site_name: peppro +site_name: PEPPRO site_author: Jason Smith site_url: http://peppro.databio.org/en/latest/ site_logo: img/peppro_logo_gray.svg @@ -19,14 +19,15 @@ nav: - Install and run: 'install.md' - Extended tutorial: 'tutorial.md' - How-to Guides: - - Run PEPPRO directly: 'run_direct.md' - - Run PEPPRO on a cluster: 'cluster.md' - - Run PEPPRO in containers: 'container.md' + - Run using bulker: 'run-bulker.md' + - Run using containers: 'run-container.md' + - Run using conda: 
'run-conda.md' + - Run natively: 'detailed-install.md' + - Configure assets: 'assets.md' - Configure prealignments: 'prealignments.md' - Configure seqOutBias assets: 'sob.md' - Configure UMI: 'umi.md' - Configure custom adapters: 'custom_adapters.md' - - Detailed install guide: 'detailed_install.md' - Use custom reference data: 'annotation.md' - Reference: - FAQ: 'faq.md' diff --git a/pipelines/peppro.py b/pipelines/peppro.py index b1a146e..57bbd04 100755 --- a/pipelines/peppro.py +++ b/pipelines/peppro.py @@ -5,7 +5,7 @@ __author__ = ["Jason Smith", "Nathan Sheffield", "Mike Guertin"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.9.11" +__version__ = "0.10.0" from argparse import ArgumentParser import os @@ -44,7 +44,8 @@ def parse_arguments(): parser = ArgumentParser(description='PEPPRO version ' + __version__) parser = pypiper.add_pypiper_args(parser, groups= ['pypiper', 'looper', 'ngs'], - required=["input", "genome", "sample-name", "output-parent"]) + required=["input", "genome", "sample-name", "output-parent", + "chrom_sizes", "genome_index"]) # Pipeline-specific arguments parser.add_argument("--protocol", dest="protocol", @@ -90,9 +91,32 @@ def parse_arguments(): "If using seqOutBias, scales by the expected/" "observed cut frequency.") - parser.add_argument("--prealignments", default=[], type=str, nargs="+", - help="Space-delimited list of reference genomes to " - "align to before primary alignment.") + # Prealignment genome assets + parser.add_argument("--prealignment-names", default=[], type=str, + nargs="+", + help="Space-delimited list of prealignment genome " + "names to align to before primary alignment.") + + parser.add_argument("--prealignment-index", default=[], type=str, + nargs="+", + help="Space-delimited list of prealignment genome " + "name and index files delimited by an equals sign " + "to align to before primary alignment. " + "e.g. 
rCRSd=/path/to/bowtie2_index/.") + + # Genome assets + parser.add_argument("--genome-index", default=None, required=True, + dest="genome_index", type=str, + help="Path to bowtie2 primary genome index file.") + + parser.add_argument("--fasta", default=None, required=False, + dest="fasta", type=str, + help="Path to primary genome fasta file. Required " + "with --sob.") + + parser.add_argument("--chrom-sizes", default=None, required=True, + dest="chrom_sizes", type=str, + help="Path to primary genome chromosome sizes file.") parser.add_argument("--TSS-name", default=None, dest="TSS_name", type=str, @@ -124,8 +148,9 @@ def parse_arguments(): parser.add_argument("--search-file", default=None, dest="search_file", type=str, - help="file_name of read length matched gt tallymer " - "index search file") + help="Required for seqOutBias (--sob). " + "Path to tallymer index search file built " + "with the same read length as the input.") parser.add_argument("--coverage", action='store_true', default=False, dest="coverage", @@ -1958,7 +1983,9 @@ def main(): res = pm.config.resources #sstructure = pm.sample_structure # maybe possible in the future? - # Check that the required tools are callable by the pipeline + ############################################################################ + # Confirm required tools are all callable # + ############################################################################ tool_list = [v for k,v in tools.items()] # extract tool list tool_list = [t.replace('fastx', 'fastx_trimmer') for t in tool_list] tool_list = [t.replace('seqoutbias', 'seqOutBias') for t in tool_list] @@ -1979,106 +2006,97 @@ def main(): pm.fail_pipeline(RuntimeError(err_msg)) if args.input2 and not args.paired_end: - err_msg = "Incompatible settings: You specified single-end, but provided --input2." 
+ err_msg = (f"Incompatible settings: You specified single-end, " + f"but provided --input2.") pm.fail_pipeline(RuntimeError(err_msg)) - # Set up reference resource according to genome prefix. - check_list = [ - {"asset_name":"fasta", "seek_key":"chrom_sizes", - "tag_name":"default", "arg":None, "user_arg":None, - "required":True}, - {"asset_name":"fasta", "seek_key":None, - "tag_name":"default", "arg":None, "user_arg":None, - "required":True}, - {"asset_name":BT2_IDX_KEY, "seek_key":None, - "tag_name":"default", "arg":None, "user_arg":None, - "required":True} - ] - # If user specifies TSS file, use that instead of the refgenie asset - if not (args.TSS_name): - check_list.append( - {"asset_name":"refgene_anno", "seek_key":"refgene_tss", - "tag_name":"default", "arg":"TSS_name", "user_arg":"TSS-name", - "required":False} - ) - # If user specifies a custom pause index TSS file, use that instead - if not (args.ensembl_tss): - check_list.append( - {"asset_name":"ensembl_gtf", "seek_key":"ensembl_tss", - "tag_name":"default", "arg":"ensembl_tss", "user_arg":"pi-tss", - "required":False} - ) - # If user specifies a custom pause index gene body file, use that instead - if not (args.ensembl_gene_body): - check_list.append( - {"asset_name":"ensembl_gtf", "seek_key":"ensembl_gene_body", - "tag_name":"default", "arg":"ensembl_gene_body", - "user_arg":"pi-body", "required":False} - ) - # If user specifies a custom premature RNA file, use that instead - if not (args.pre_name): - check_list.append( - {"asset_name":"refgene_anno", "seek_key":"refgene_pre_mRNA", - "tag_name":"default", "arg":"pre_name", "user_arg":"pre-name", - "required":False} - ) - # If user specifies feature annotation file, - # use that instead of the refgenie managed asset - if not (args.anno_name): - check_list.append( - {"asset_name":"feat_annotation", "seek_key":"feat_annotation", - "tag_name":"default", "arg":"anno_name", "user_arg":"anno-name", - "required":False} - ) - # If user specifies a custom exon 
file, use that instead - if not (args.exon_name): - check_list.append( - {"asset_name":"refgene_anno", "seek_key":"refgene_exon", - "tag_name":"default", "arg":"exon_name", "user_arg":"exon-name", - "required":False} - ) - # If user specifies a custom intron file, use that instead - if not (args.intron_name): - check_list.append( - {"asset_name":"refgene_anno", "seek_key":"refgene_intron", - "tag_name":"default", "arg":"intron_name", - "user_arg":"intron-name", "required":False} - ) - res, rgc = _add_resources(args, res, check_list) + ############################################################################ + # Set up reference resources # + ############################################################################ - # If the user specifies optional files, add those to our resources - if ((args.TSS_name) and os.path.isfile(args.TSS_name) and + # Add prealignment genome annotation files to resources + if args.prealignment_index: + pm.debug(f"prealignments: {args.prealignment_index}") + res.prealignment_index = args.prealignment_index + + # Add primary genome annotation files to resources + res.genome_index = args.genome_index + + if res.genome_index.endswith("."): + # Replace last occurrence of . with genome name + res.genome_index = os.path.abspath(( + res.genome_index[:res.genome_index.rfind(".")] + + args.genome_assembly) + ) + pm.debug(f"primary genome index: {args.genome_index}") + + if (args.chrom_sizes and os.path.isfile(args.chrom_sizes) and + os.stat(args.chrom_sizes).st_size > 0): + res.chrom_sizes = os.path.abspath(args.chrom_sizes) + + # Add optional files to resources + if args.sob and not args.search_file: + err_msg = (f"You specified --sob but did not include the path to" + f"the tallymer index search file. Specify this with" + f"--search-file ") + pm.fail_pipeline(RuntimeError(err_msg)) + if args.sob and not args.fasta: + err_msg = (f"You specified --sob but did not include the path to" + f"the genome fasta file. 
Specify this with" + f"--fasta ") + pm.fail_pipeline(RuntimeError(err_msg)) + if (args.fasta and os.path.isfile(args.fasta) and + os.stat(args.fasta).st_size > 0): + res.fasta = os.path.abspath(args.fasta) + if (args.search_file and os.path.isfile(args.search_file) and + os.stat(args.search_file).st_size > 0): + res.search_file = os.path.abspath(args.search_file) + if (args.TSS_name and os.path.isfile(args.TSS_name) and os.stat(args.TSS_name).st_size > 0): - res.refgene_tss = args.TSS_name - if ((args.ensembl_tss) and os.path.isfile(args.ensembl_tss) and + res.refgene_tss = os.path.abspath(args.TSS_name) + if (args.anno_name and os.path.isfile(args.anno_name) and + os.stat(args.anno_name).st_size > 0): + res.feat_annotation = os.path.abspath(args.anno_name) + if (args.ensembl_tss and os.path.isfile(args.ensembl_tss) and os.stat(args.ensembl_tss).st_size > 0): - res.ensembl_tss = args.ensembl_tss - if ((args.ensembl_gene_body) and os.path.isfile(args.ensembl_gene_body) and + res.ensembl_tss = os.path.abspath(args.ensembl_tss) + if (args.ensembl_gene_body and os.path.isfile(args.ensembl_gene_body) and os.stat(args.ensembl_gene_body).st_size > 0): - res.ensembl_gene_body = args.ensembl_gene_body - if ((args.pre_name) and os.path.isfile(args.pre_name) and + res.ensembl_gene_body = os.path.abspath(args.ensembl_gene_body) + if (args.pre_name and os.path.isfile(args.pre_name) and os.stat(args.pre_name).st_size > 0): - res.refgene_pre_mRNA = args.pre_name - if ((args.anno_name) and os.path.isfile(args.anno_name) and - os.stat(args.anno_name).st_size > 0): - res.feat_annotation = args.anno_name - if ((args.exon_name) and os.path.isfile(args.exon_name) and + res.pre_name = os.path.abspath(args.pre_name) + if (args.exon_name and os.path.isfile(args.exon_name) and os.stat(args.exon_name).st_size > 0): - res.refgene_exon = args.exon_name - if ((args.intron_name) and os.path.isfile(args.intron_name) and + res.exon_name = os.path.abspath(args.exon_name) + if (args.intron_name and 
os.path.isfile(args.intron_name) and os.stat(args.intron_name).st_size > 0): - res.refgene_intron = args.intron_name + res.intron_name = os.path.abspath(args.intron_name) # Adapter file can be set in the config; if left null, we use a default. # Expects headers to include >5prime and >3prime res.adapters = res.adapters or tool_path("adapter.fa") param.outfolder = outfolder - + # Report utilized assets assets_file = os.path.join(param.outfolder, "assets.tsv") + pm.debug(f"res: {res}") for asset in res: - message = "{}\t{}".format(asset, os.path.expandvars(res[asset])) - report_message(pm, assets_file, message) + if isinstance(res[asset], list): + for a in res[asset]: + if a is not None: + message = "{}\t{}".format(asset, os.path.expandvars(a)) + pm.debug(message) + report_message(pm, assets_file, message) + else: + if asset is not None: + message = "{}\t{}".format( + asset, os.path.expandvars(res[asset])) + pm.debug(message) + report_message(pm, assets_file, message) + + if not args.prealignment_index: + res.prealignment_index = None # Report primary genome message = "genome\t{}".format(args.genome_assembly) @@ -2110,8 +2128,6 @@ def main(): err_msg = "Could not find: {}" pm.fail_pipeline(IOError(err_msg.format(args.input2[0]))) - container = None # legacy - ########################################################################### # Grab and prepare input files # ########################################################################### @@ -2440,32 +2456,35 @@ def main(): pm.timestamp("### Prealignments") to_compress = [] - #if not pm.get_stat("Aligned_reads") or args.new_start: - if len(args.prealignments) == 0: - print("You may use `--prealignments` to align to references before " - "the genome alignment step. See docs.") + if res.prealignment_index is None or len(res.prealignment_index) == 0: + print("You may use `--prealignment-index` to align to references " + "before the genome alignment step. 
" + "See http://peppro.databio.org/en/latest/ for documentation.") + else: - print("Prealignment assemblies: " + str(args.prealignments)) # Loop through any prealignment references and map to them sequentially - for reference in args.prealignments: - bt2_index = rgc.seek(reference, BT2_IDX_KEY) - if not bt2_index.endswith(reference): - bt2_index = os.path.join( - rgc.seek(reference, BT2_IDX_KEY), reference) + for reference in res.prealignment_index: + pm.debug(f"prealignment reference: {reference}") + #res.genome_index = rgc.seek(reference, BT2_IDX_KEY) # DEPRECATED + genome, genome_index = reference.split('=') + if genome_index.endswith("."): + # Replace last occurrence of . with genome name + genome_index = genome_index[:genome_index.rfind(".")] + genome + genome_index = os.path.abspath(genome_index) if not args.complexity and int(args.umi_len) > 0: if args.no_fifo: unmap_fq1, unmap_fq2 = _align_with_bt2( args, tools, args.paired_end, False, unmap_fq1, - unmap_fq2, reference, - assembly_bt2=bt2_index, + unmap_fq2, genome, + assembly_bt2=genome_index, outfolder=param.outfolder, aligndir="prealignments", bt2_opts_txt=param.bowtie2_pre.params) unmap_fq1_dups, unmap_fq2_dups = _align_with_bt2( args, tools, args.paired_end, False, unmap_fq1_dups, - unmap_fq2_dups, reference, - assembly_bt2=bt2_index, + unmap_fq2_dups, genome, + assembly_bt2=genome_index, outfolder=param.outfolder, aligndir="prealignments", dups=True, @@ -2473,16 +2492,16 @@ def main(): else: unmap_fq1, unmap_fq2 = _align_with_bt2( args, tools, args.paired_end, True, unmap_fq1, - unmap_fq2, reference, - assembly_bt2=bt2_index, + unmap_fq2, genome, + assembly_bt2=genome_index, outfolder=param.outfolder, aligndir="prealignments", bt2_opts_txt=param.bowtie2_pre.params) unmap_fq1_dups, unmap_fq2_dups = _align_with_bt2( args, tools, args.paired_end, True, unmap_fq1_dups, - unmap_fq2_dups, reference, - assembly_bt2=bt2_index, + unmap_fq2_dups, genome, + assembly_bt2=genome_index, outfolder=param.outfolder, 
aligndir="prealignments", dups=True, @@ -2500,16 +2519,16 @@ def main(): if args.no_fifo: unmap_fq1, unmap_fq2 = _align_with_bt2( args, tools, args.paired_end, False, - unmap_fq1, unmap_fq2, reference, - assembly_bt2=bt2_index, + unmap_fq1, unmap_fq2, genome, + assembly_bt2=genome_index, outfolder=param.outfolder, aligndir="prealignments", bt2_opts_txt=param.bowtie2_pre.params) else: unmap_fq1, unmap_fq2 = _align_with_bt2( args, tools, args.paired_end, True, - unmap_fq1, unmap_fq2, reference, - assembly_bt2=bt2_index, + unmap_fq1, unmap_fq2, genome, + assembly_bt2=genome_index, outfolder=param.outfolder, aligndir="prealignments", bt2_opts_txt=param.bowtie2_pre.params) @@ -2578,12 +2597,7 @@ def main(): unmap_fq1_gz = unmap_fq1 + ".gz" unmap_fq2_gz = unmap_fq2 + ".gz" - bt2_index = rgc.seek(args.genome_assembly, BT2_IDX_KEY) - if not bt2_index.endswith(args.genome_assembly): - bt2_index = os.path.join( - rgc.seek(args.genome_assembly, BT2_IDX_KEY), - args.genome_assembly) - + # res.genome_index = rgc.seek(args.genome_assembly, BT2_IDX_KEY) # DEPRECATED if _itsa_file(unmap_fq1_gz) and not _itsa_file(unmap_fq1): cmd = (ngstk.ziptool + " -d " + unmap_fq1_gz) pm.run(cmd, mapping_genome_bam) @@ -2597,7 +2611,7 @@ def main(): cmd = tools.bowtie2 + " -p " + str(pm.cores) cmd += bt2_options cmd += " --rg-id " + args.sample_name - cmd += " -x " + bt2_index + cmd += " -x " + res.genome_index if args.paired_end: cmd += " --rf -1 " + unmap_fq1 + " -2 " + unmap_fq2 else: @@ -2620,7 +2634,7 @@ def main(): cmd_dups = tools.bowtie2 + " -p " + str(pm.cores) cmd_dups += bt2_options cmd_dups += " --rg-id " + args.sample_name - cmd_dups += " -x " + bt2_index + cmd_dups += " -x " + res.genome_index if args.paired_end: cmd_dups += " --rf -1 " + unmap_fq1_dups + " -2 " + unmap_fq2_dups else: @@ -2666,9 +2680,9 @@ def check_alignment_genome(temp_bam, bam): else: tr = 0 - if os.path.exists(res.refgene_pre_mRNA): + if os.path.exists(res.pre_name): cmd = (tools.samtools + " depth -b " + - 
res.refgene_pre_mRNA + " " + bam + + res.pre_name + " " + bam + " | awk '{counter++;sum+=$3}END{print sum/counter}'") rd = pm.checkprint(cmd) else: @@ -2706,7 +2720,7 @@ def check_alignment_genome(temp_bam, bam): unmapped_fq = unmapped_fq + ".gz" pm.run(cmd, unmapped_fq) - if not args.prealignments and os.path.exists(mapping_genome_bam_temp): + if not args.prealignment_names and os.path.exists(mapping_genome_bam_temp): # Index the temporary bam file cmd = tools.samtools + " index " + mapping_genome_bam_temp pm.run(cmd, temp_mapping_index) @@ -2788,33 +2802,14 @@ def check_alignment_genome(temp_bam, bam): else: max_len = int(DEFAULT_MAX_LEN) pm.report_result("Maximum_read_length", max_len) + pm.info(f"If args.sob is set, the args.search_file asset must be built " + f"using this read length: {max_len}." + f"See: https://refgenie.databio.org/en/latest/available_assets/#tallymer_index") else: max_len = int(pm.get_stat("Maximum_read_length")) - - # At this point we can check for seqOutBias required indicies. - # Can't do it earlier because we haven't determined the read_length of - # interest for mappability purposes. - if args.sob: - pm.debug("max_len: {}".format(max_len)) # DEBUG - if not args.search_file: - if max_len == DEFAULT_MAX_LEN: - search_asset = [{"asset_name":"tallymer_index", - "seek_key":"search_file", - "tag_name":"default", - "arg":"search_file", - "user_arg":"search-file", - "required":True}] - else: - search_asset = [{"asset_name":"tallymer_index", - "seek_key":"search_file", - "tag_name":max_len, - "arg":"search_file", - "user_arg":"search-file", - "required":True}] - elif ((args.search_file) and os.path.isfile(args.search_file) and - os.stat(args.search_file).st_size > 0): - res.search_file = args.search_file - res, rgc = _add_resources(args, res, search_asset) + pm.info(f"If args.sob is set, the args.search_file asset must be built " + f"using this read length: {max_len}." 
+ f"See: https://refgenie.databio.org/en/latest/available_assets/#tallymer_index") # Calculate size of genome if not pm.get_stat("Genome_size") or args.new_start: @@ -3298,22 +3293,22 @@ def count_unmapped_reads(): param.outfolder, "signal_" + args.genome_assembly) ngstk.make_dir(signal_folder) - if not os.path.exists(res.refgene_pre_mRNA): + if not os.path.exists(res.pre_name): print("Skipping FRiP and gene coverage calculation which require the " "pre-mRNA annotation file: {}" - .format(res.refgene_pre_mRNA)) + .format(res.pre_name)) else: pm.timestamp("### Calculate Fraction of Reads in pre-mature mRNA") if not pm.get_stat('Plus_FRiP') or args.new_start: # Plus - plus_frip = calc_frip(plus_bam, res.refgene_pre_mRNA, + plus_frip = calc_frip(plus_bam, res.pre_name, frip_func=ngstk.simple_frip, pipeline_manager=pm) pm.report_result("Plus_FRiP", round(plus_frip, 2)) if not pm.get_stat('Minus_FRiP') or args.new_start: # Minus - minus_frip = calc_frip(minus_bam, res.refgene_pre_mRNA, + minus_frip = calc_frip(minus_bam, res.pre_name, frip_func=ngstk.simple_frip, pipeline_manager=pm) pm.report_result("Minus_FRiP", round(minus_frip, 2)) @@ -3323,7 +3318,7 @@ def count_unmapped_reads(): args.sample_name + "_gene_coverage.bed") gene_sort = os.path.join(QC_folder, args.genome_assembly + "_gene_sort.bed") - cmd1 = ("grep -wf " + chr_keep + " " + res.refgene_pre_mRNA + + cmd1 = ("grep -wf " + chr_keep + " " + res.pre_name + " | " + tools.bedtools + " sort -i stdin -faidx " + chr_order + " > " + gene_sort) cmd2 = (tools.bedtools + " coverage -sorted -counts -s -a " + @@ -3621,8 +3616,8 @@ def count_unmapped_reads(): ############################################################################ # Report mRNA contamination # ############################################################################ - if (os.path.exists(res.refgene_exon) and - os.path.exists(res.refgene_intron)): + if (os.path.exists(res.exon_name) and + os.path.exists(res.intron_name)): pm.timestamp("### 
Calculate mRNA contamination") intron_exon = os.path.join(QC_folder, args.sample_name + @@ -3636,11 +3631,11 @@ def count_unmapped_reads(): "_exons_sort.bed") introns_sort = os.path.join(QC_folder, args.genome_assembly + "_introns_sort.bed") - cmd1 = ("grep -wf " + chr_keep + " " + res.refgene_exon + + cmd1 = ("grep -wf " + chr_keep + " " + res.exon_name + " | " + tools.bedtools + " sort -i stdin -faidx " + chr_order + " > " + exons_sort) # a single sort fails to sort a 1 bp different start position intron - cmd2 = ("grep -wf " + chr_keep + " " + res.refgene_intron + + cmd2 = ("grep -wf " + chr_keep + " " + res.intron_name + " | " + tools.bedtools + " sort -i stdin -faidx " + chr_order + " | " + tools.bedtools + " sort -i stdin -faidx " + chr_order + " > " + introns_sort) @@ -3759,9 +3754,6 @@ def count_unmapped_reads(): ############################################################################ # Produce BigWigs # ############################################################################ - genome_fq = rgc.seek(args.genome_assembly, - asset_name="fasta", - seek_key="fasta") plus_exact_bw = os.path.join( signal_folder, args.sample_name + "_plus_exact_body_0-mer.bw") plus_smooth_bw = os.path.join( diff --git a/project_pipeline_interface.yaml b/project_pipeline_interface.yaml index ac9ed8c..120fb33 100644 --- a/project_pipeline_interface.yaml +++ b/project_pipeline_interface.yaml @@ -15,7 +15,7 @@ command_template: > compute: singularity_image: ${SIMAGES}peppro docker_image: databio/peppro - bulker_crate: databio/peppro + bulker_crate: databio/peppro:1.0.1 size_dependent_variables: resources.tsv bioconductor: diff --git a/requirements-conda.yml b/requirements-conda.yml new file mode 100644 index 0000000..ec70f2d --- /dev/null +++ b/requirements-conda.yml @@ -0,0 +1,404 @@ +name: peppro +channels: + - conda-forge + - bioconda + - r + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_gnu + - _r-mutex=1.0.1=anacondar_1 + - 
alsa-lib=1.2.3=h516909a_0 + - attmap=0.13.0=pyhd8ed1ab_0 + - attrs=21.2.0=pyhd8ed1ab_0 + - bedtools=2.30.0=h7d7f7ad_2 + - binutils_impl_linux-64=2.36.1=h193b22a_2 + - binutils_linux-64=2.36=hf3e587d_1 + - bioconductor-annotationdbi=1.54.0=r41hdfd78af_0 + - bioconductor-annotationfilter=1.16.0=r41hdfd78af_0 + - bioconductor-annotationhub=3.0.0=r41hdfd78af_0 + - bioconductor-biobase=2.52.0=r41hd029910_0 + - bioconductor-biocfilecache=2.0.0=r41hdfd78af_0 + - bioconductor-biocgenerics=0.38.0=r41hdfd78af_0 + - bioconductor-biocio=1.2.0=r41hdfd78af_0 + - bioconductor-biocparallel=1.26.0=r41h399db7b_0 + - bioconductor-biocversion=3.13.1=r41hdfd78af_0 + - bioconductor-biomart=2.48.0=r41hdfd78af_0 + - bioconductor-biostrings=2.60.0=r41hd029910_0 + - bioconductor-bsgenome=1.60.0=r41hdfd78af_0 + - bioconductor-delayedarray=0.18.0=r41hd029910_0 + - bioconductor-ensembldb=2.16.0=r41hdfd78af_0 + - bioconductor-experimenthub=2.0.0=r41hdfd78af_0 + - bioconductor-genomeinfodb=1.28.0=r41hdfd78af_0 + - bioconductor-genomeinfodbdata=1.2.6=r41hdfd78af_0 + - bioconductor-genomicalignments=1.28.0=r41hd029910_0 + - bioconductor-genomicfeatures=1.44.0=r41hdfd78af_0 + - bioconductor-genomicranges=1.44.0=r41hd029910_0 + - bioconductor-interactivedisplaybase=1.30.0=r41hdfd78af_0 + - bioconductor-iranges=2.26.0=r41hd029910_0 + - bioconductor-keggrest=1.32.0=r41hdfd78af_0 + - bioconductor-matrixgenerics=1.4.0=r41hdfd78af_0 + - bioconductor-protgenerics=1.24.0=r41hdfd78af_0 + - bioconductor-rhtslib=1.24.0=r41hd029910_0 + - bioconductor-rsamtools=2.8.0=r41h399db7b_0 + - bioconductor-rtracklayer=1.52.0=r41hd029910_0 + - bioconductor-s4vectors=0.30.0=r41hd029910_0 + - bioconductor-summarizedexperiment=1.22.0=r41hdfd78af_0 + - bioconductor-xvector=0.32.0=r41hd029910_0 + - bioconductor-zlibbioc=1.38.0=r41hd029910_0 + - bowtie2=2.4.2=py39hc9c6fcd_2 + - brotlipy=0.7.0=py39h3811e60_1001 + - bwidget=1.9.14=ha770c72_1 + - bzip2=1.0.8=h7f98852_4 + - c-ares=1.18.1=h7f98852_0 + - 
ca-certificates=2021.10.8=ha878542_0 + - cairo=1.16.0=h6cf1ce9_1008 + - cffi=1.14.6=py39h4bc2ebd_1 + - chardet=4.0.0=py39hf3d152e_1 + - colorama=0.4.4=pyh9f0ad1d_0 + - commonmark=0.9.1=py_0 + - coreutils=8.31=h516909a_0 + - coverage=6.1.1=py39h3811e60_0 + - cryptography=35.0.0=py39h95dcef6_1 + - curl=7.79.1=h2574ce0_1 + - cutadapt>=2.9 + - cykhash=1.0.2=py39hf149a3a_2 + - cython=0.29.24=py39he80948d_0 + - fastqc=0.11.9=hdfd78af_1 + - fastq-pair + - fastp + - flash + - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 + - font-ttf-inconsolata=3.000=h77eed37_0 + - font-ttf-source-code-pro=2.038=h77eed37_0 + - font-ttf-ubuntu=0.83=hab24e00_0 + - fontconfig=2.13.1=hba837de_1005 + - fonts-conda-ecosystem=1=0 + - fonts-conda-forge=1=0 + - freetype=2.10.4=h0708190_1 + - fribidi=1.0.10=h36c2ea0_0 + - future=0.18.2=py39hf3d152e_3 + - gawk=5.1.0=h7f98852_0 + - gcc_impl_linux-64=9.4.0=h03d3576_11 + - gcc_linux-64=9.4.0=h391b98a_1 + - genrich=0.6.1=h5bf99c6_1 + - gettext=0.19.8.1=h73d1719_1008 + - gfortran_impl_linux-64=9.4.0=h0003116_11 + - gfortran_linux-64=9.4.0=hf0ab688_1 + - giflib=5.2.1=h36c2ea0_2 + - graphite2=1.3.13=h58526e2_1001 + - grep=3.4=h9d02d08_1 + - gsl=2.6=he838d99_2 + - gxx_impl_linux-64=9.4.0=h03d3576_11 + - gxx_linux-64=9.4.0=h0316aca_1 + - harfbuzz=2.9.1=h83ec7ef_1 + - hmmratac=1.2.10=hdfd78af_1 + - homer=4.11=pl5262h7d875b9_5 + - htslib=1.14=h9093b5e_0 + - icu=68.2=h9c3ff4c_0 + - importlib-metadata=4.8.1=py39hf3d152e_0 + - iniconfig=1.1.1=pyh9f0ad1d_0 + - jbig=2.1=h7f98852_2003 + - jinja2=3.0.2=pyhd8ed1ab_0 + - jpeg=9d=h36c2ea0_0 + - jsonschema=4.1.2=pyhd8ed1ab_0 + - kernel-headers_linux-64=2.6.32=he073ed8_15 + - krb5=1.19.2=hcc1bbae_2 + - lcms2=2.12=hddcbb42_0 + - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - lerc=2.2.1=h9c3ff4c_0 + - libblas=3.9.0=12_linux64_openblas + - libcblas=3.9.0=12_linux64_openblas + - libcurl=7.79.1=h2574ce0_1 + - libdeflate=1.7=h7f98852_5 + - libedit=3.1.20191231=he28a2e2_2 + - libev=4.33=h516909a_1 + - libffi=3.4.2=h9c3ff4c_4 + - 
libgcc-devel_linux-64=9.4.0=hd854feb_11 + - libgcc-ng=11.2.0=h1d223b6_11 + - libgfortran-ng=11.2.0=h69a702a_11 + - libgfortran5=11.2.0=h5c6108e_11 + - libgit2=1.3.0=hee63804_1 + - libglib=2.70.0=h174f98d_1 + - libgomp=11.2.0=h1d223b6_11 + - libiconv=1.16=h516909a_0 + - libidn2=2.3.2=h7f98852_0 + - liblapack=3.9.0=12_linux64_openblas + - libnghttp2=1.43.0=h812cca2_1 + - libopenblas=0.3.18=pthreads_h8fe5266_0 + - libpng=1.6.37=h21135ba_2 + - libsanitizer=9.4.0=h79bfe98_11 + - libssh2=1.10.0=ha56f1ee_2 + - libstdcxx-devel_linux-64=9.4.0=hd854feb_11 + - libstdcxx-ng=11.2.0=he4da1e4_11 + - libtiff=4.3.0=hf544144_1 + - libunistring=0.9.10=h7f98852_0 + - libuuid=2.32.1=h7f98852_1000 + - libwebp-base=1.2.1=h7f98852_0 + - libxcb=1.13=h7f98852_1003 + - libxml2=2.9.12=h72842e0_0 + - libzlib=1.2.11=h36c2ea0_1013 + - lz4-c=1.9.3=h9c3ff4c_1 + - make=4.3=hd18ef5c_1 + - markupsafe=2.0.1=py39h3811e60_0 + - more-itertools=8.10.0=pyhd8ed1ab_0 + - mysql-connector-c=6.1.11=h6eb9d5d_1007 + - ncurses=6.2=h58526e2_4 + - numpy=1.21.3=py39hdbf815f_0 + - openblas=0.3.18=pthreads_h4748800_0 + - openjdk=11.0.9.1=h5cc2fde_1 + - openssl=1.1.1l=h7f98852_0 + - oyaml=1.0=pyhd8ed1ab_0 + - packaging=21.0=pyhd8ed1ab_0 + - pango=1.48.10=hb8ff022_1 + - pathlib2=2.3.6=py39hf3d152e_0 + - pcre=8.45=h9c3ff4c_0 + - pcre2=10.37=h032f7d1_0 + - peppy=0.31.1=pyhd8ed1ab_0 + - perl=5.26.2=h36c2ea0_1008 + - picard=2.26.4=hdfd78af_0 + - pigz=2.6=h27826a3_0 + - pip=21.3.1=pyhd8ed1ab_0 + - piper=0.12.1=py_1 + - pixman=0.40.0=h36c2ea0_0 + - pluggy=1.0.0=py39hf3d152e_1 + - preseq=2.0.3=hc216eb9_5 + - psutil=5.8.0=py39h3811e60_1 + - pthread-stubs=0.4=h36c2ea0_1001 + - py=1.10.0=pyhd3deb0d_0 + - pycparser=2.20=pyh9f0ad1d_2 + - pyfaidx=0.6.3.1=pyh5e36f6f_0 + - pygments=2.10.0=pyhd8ed1ab_0 + - pyopenssl=21.0.0=pyhd8ed1ab_0 + - pyparsing=3.0.4=pyhd8ed1ab_0 + - pysocks=1.7.1=py39hf3d152e_3 + - pytest=6.2.5=py39hf3d152e_0 + - python=3.9.7=hb7a2778_3_cpython + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - 
python-levenshtein=0.12.2=py39h3811e60_0 + - python_abi=3.9=2_cp39 + - pytz=2021.3=pyhd8ed1ab_0 + - r-askpass=1.1=r41hcfec24a_2 + - r-assertthat=0.2.1=r41hc72bb7e_2 + - r-backports=1.3.0=r41hcfec24a_0 + - r-base=4.1.1=hb67fd72_0 + - r-essentials=4.1 + - r-gert + - r-base64enc=0.1_3=r41hcfec24a_1004 + - r-bh=1.75.0_0=r41hc72bb7e_0 + - r-biocmanager=1.30.16=r41hc72bb7e_0 + - r-bit=4.0.4=r41hcfec24a_0 + - r-bit64=4.0.5=r41hcfec24a_0 + - r-bitops=1.0_7=r41hcfec24a_0 + - r-blob=1.2.2=r41hc72bb7e_0 + - r-brew=1.0_6=r41hc72bb7e_1003 + - r-brio=1.1.2=r41hcfec24a_0 + - r-bslib=0.3.1=r41hc72bb7e_0 + - r-cachem=1.0.6=r41hcfec24a_0 + - r-callr=3.7.0=r41hc72bb7e_0 + - r-catools=1.18.2=r41h03ef668_0 + - r-cli=3.1.0=r41h03ef668_0 + - r-clipr=0.7.1=r41hc72bb7e_0 + - r-colorspace=2.0_2=r41hcfec24a_0 + - r-commonmark=1.7=r41hcfec24a_1002 + - r-covr=3.5.1=r41h03ef668_0 + - r-cpp11=0.4.0=r41hc72bb7e_0 + - r-crayon=1.4.2=r41hc72bb7e_0 + - r-credentials=1.3.1=r41hc72bb7e_0 + - r-crosstalk=1.1.1=r41hc72bb7e_0 + - r-curl=4.3.2=r41hcfec24a_0 + - r-data.table=1.14.2=r41hcfec24a_0 + - r-dbi=1.1.1=r41hc72bb7e_0 + - r-dbplyr=2.1.1=r41hc72bb7e_0 + - r-desc=1.4.0=r41hc72bb7e_0 + - r-devtools=2.4.2=r41hc72bb7e_0 + - r-diffobj=0.3.5=r41hcfec24a_0 + - r-digest=0.6.28=r41h03ef668_0 + - r-dplyr=1.0.7=r41h03ef668_0 + - r-dt=0.19=r41hc72bb7e_0 + - r-ellipsis=0.3.2=r41hcfec24a_0 + - r-evaluate=0.14=r41hc72bb7e_2 + - r-fansi=0.4.2=r41hcfec24a_0 + - r-farver=2.1.0=r41h03ef668_0 + - r-fastmap=1.1.0=r41h03ef668_0 + - r-filelock=1.0.2=r41hcfec24a_1002 + - r-fontawesome=0.2.2=r41hc72bb7e_0 + - r-formatr=1.11=r41hc72bb7e_0 + - r-fs=1.5.0=r41h03ef668_0 + - r-futile.logger=1.4.3=r41hc72bb7e_1003 + - r-futile.options=1.0.1=r41hc72bb7e_1002 + - r-generics=0.1.1=r41hc72bb7e_0 + - r-gert=1.4.1=r41h29657ab_1 + - r-ggplot2=3.3.5=r41hc72bb7e_0 + - r-gh=1.3.0=r41hc72bb7e_0 + - r-git2r=0.28.0=r41hf628c3e_1 + - r-gitcreds=0.1.1=r41hc72bb7e_0 + - r-glue=1.4.2=r41hcfec24a_0 + - r-gplots=3.1.1=r41hc72bb7e_0 + - 
r-gtable=0.3.0=r41hc72bb7e_3 + - r-gtools=3.9.2=r41hcfec24a_0 + - r-highr=0.9=r41hc72bb7e_0 + - r-hms=1.1.1=r41hc72bb7e_0 + - r-htmltools=0.5.2=r41h03ef668_0 + - r-htmlwidgets=1.5.4=r41hc72bb7e_0 + - r-httpuv=1.6.3=r41h03ef668_0 + - r-httr=1.4.2=r41hc72bb7e_0 + - r-ini=0.3.1=r41hc72bb7e_1003 + - r-isoband=0.2.5=r41h03ef668_0 + - r-jquerylib=0.1.4=r41hc72bb7e_0 + - r-jsonlite=1.7.2=r41hcfec24a_0 + - r-kernsmooth=2.23_20=r41h742201e_0 + - r-knitr=1.35=r41hc72bb7e_0 + - r-labeling=0.4.2=r41hc72bb7e_0 + - r-lambda.r=1.2.4=r41hc72bb7e_1 + - r-later=1.2.0=r41h03ef668_0 + - r-lattice=0.20_45=r41hcfec24a_0 + - r-lazyeval=0.2.2=r41hcfec24a_2 + - r-lifecycle=1.0.1=r41hc72bb7e_0 + - r-magrittr=2.0.1=r41hcfec24a_1 + - r-markdown=1.1=r41hcfec24a_1 + - r-mass=7.3_54=r41hcfec24a_0 + - r-matrix=1.3_4=r41he454529_0 + - r-matrixstats=0.61.0=r41hcfec24a_0 + - r-memoise=2.0.0=r41hc72bb7e_0 + - r-mgcv=1.8_38=r41he454529_0 + - r-mime=0.12=r41hcfec24a_0 + - r-munsell=0.5.0=r41hc72bb7e_1003 + - r-nlme=3.1_153=r41h859d828_0 + - r-openssl=1.4.5=r41he36bf35_1 + - r-pillar=1.6.4=r41hc72bb7e_0 + - r-pkgbuild=1.2.0=r41hc72bb7e_0 + - r-pkgconfig=2.0.3=r41hc72bb7e_1 + - r-pkgload=1.2.3=r41h03ef668_0 + - r-plogr=0.2.0=r41hc72bb7e_1003 + - r-png=0.1_7=r41hcfec24a_1004 + - r-praise=1.0.0=r41hc72bb7e_1004 + - r-prettyunits=1.1.1=r41hc72bb7e_1 + - r-processx=3.5.2=r41hcfec24a_0 + - r-progress=1.2.2=r41hc72bb7e_2 + - r-promises=1.2.0.1=r41h03ef668_0 + - r-ps=1.6.0=r41hcfec24a_0 + - r-purrr=0.3.4=r41hcfec24a_1 + - r-r6=2.5.1=r41hc72bb7e_0 + - r-rappdirs=0.3.3=r41hcfec24a_0 + - r-rcmdcheck=1.4.0=r41h785f33e_0 + - r-rcolorbrewer=1.1_2=r41h785f33e_1003 + - r-rcpp=1.0.7=r41h03ef668_0 + - r-rcurl=1.98_1.5=r41hcfec24a_0 + - r-rematch2=2.1.2=r41hc72bb7e_1 + - r-remotes=2.4.1=r41hc72bb7e_0 + - r-restfulr=0.0.13=r41hdf9a8c9_1 + - r-rex=1.2.0=r41hc72bb7e_1 + - r-rjson=0.2.20=r41h03ef668_1002 + - r-rlang=0.4.12=r41hcfec24a_0 + - r-roxygen2=7.1.2=r41h03ef668_0 + - r-rprojroot=2.0.2=r41hc72bb7e_0 + - 
r-rsqlite=2.2.8=r41h03ef668_0 + - r-rstudioapi=0.13=r41hc72bb7e_0 + - r-rversions=2.1.1=r41hc72bb7e_0 + - r-sass=0.4.0=r41h03ef668_0 + - r-scales=1.1.1=r41hc72bb7e_0 + - r-sessioninfo=1.2.0=r41hc72bb7e_0 + - r-shiny=1.7.1=r41h785f33e_0 + - r-snow=0.4_4=r41hc72bb7e_0 + - r-sourcetools=0.1.7=r41h9c3ff4c_1002 + - r-stringi=1.7.5=r41hcabe038_0 + - r-stringr=1.4.0=r41hc72bb7e_2 + - r-sys=3.4=r41hcfec24a_0 + - r-testthat=3.1.0=r41h03ef668_0 + - r-tibble=3.1.5=r41hcfec24a_0 + - r-tidyselect=1.1.1=r41hc72bb7e_0 + - r-usethis=2.1.3=r41hc72bb7e_0 + - r-utf8=1.2.2=r41hcfec24a_0 + - r-vctrs=0.3.8=r41hcfec24a_1 + - r-viridislite=0.4.0=r41hc72bb7e_0 + - r-waldo=0.3.1=r41hc72bb7e_0 + - r-whisker=0.4=r41hc72bb7e_1 + - r-withr=2.4.2=r41hc72bb7e_0 + - r-xfun=0.27=r41h03ef668_0 + - r-xml=3.99_0.8=r41hcfec24a_0 + - r-xml2=1.3.2=r41h03ef668_1 + - r-xopen=1.0.0=r41hc72bb7e_1003 + - r-xtable=1.8_4=r41hc72bb7e_3 + - r-yaml=2.2.1=r41hcfec24a_1 + - r-zip=2.2.0=r41hcfec24a_0 + - readline=8.1=h46c0cb4_0 + - refgenie=0.12.0=pyhdfd78af_0 + - rich=10.12.0=py39hf3d152e_0 + - rust=1.56.0=h61edd41_0 + - rust-std-x86_64-unknown-linux-gnu=1.56.0=hc1431ca_0 + - samblaster=0.1.26=h7d875b9_1 + - samtools=1.14=hb421002_0 + - sed=4.8=he412f7d_0 + - seqkit + - seqtk + - six=1.16.0=pyh6c4a22f_0 + - sqlite=3.36.0=h9cd32fc_2 + - sysroot_linux-64=2.12=he073ed8_15 + - tbb=2020.2=h4bd325d_4 + - tk=8.6.11=h27826a3_1 + - tktable=2.10=hb7b940f_3 + - toml=0.10.2=pyhd8ed1ab_0 + - tomli=1.2.2=pyhd8ed1ab_0 + - typing_extensions=3.10.0.2=pyha770c72_0 + - tzdata=2021e=he74cb21_0 + - ubiquerg=0.6.1=pyh9f0ad1d_0 + - ucsc-bedgraphtobigwig=377=h0b8a92a_2 + - ucsc-bedtobigbed=377=h0b8a92a_2 + - ucsc-bigwigcat=377=h0b8a92a_2 + - ucsc-bigwigmerge=377=h0b8a92a_2 + - ucsc-wigtobigwig=377=h0b8a92a_2 + - unzip=6.0=h7f98852_2 + - urllib3=1.26.7=pyhd8ed1ab_0 + - veracitools=0.1.3=py_0 + - wget=1.20.3=ha56f1ee_1 + - wheel=0.37.0=pyhd8ed1ab_1 + - xorg-fixesproto=5.0=h7f98852_1002 + - xorg-inputproto=2.3.2=h7f98852_1002 + - 
xorg-kbproto=1.0.7=h7f98852_1002 + - xorg-libice=1.0.10=h7f98852_0 + - xorg-libsm=1.2.3=hd9c2040_1000 + - xorg-libx11=1.7.2=h7f98852_0 + - xorg-libxau=1.0.9=h7f98852_0 + - xorg-libxdmcp=1.1.3=h7f98852_0 + - xorg-libxext=1.3.4=h7f98852_1 + - xorg-libxfixes=5.0.3=h7f98852_1004 + - xorg-libxi=1.7.10=h7f98852_0 + - xorg-libxrender=0.9.10=h7f98852_1003 + - xorg-libxt=1.2.1=h7f98852_2 + - xorg-libxtst=1.2.3=h7f98852_1002 + - xorg-recordproto=1.14.2=h7f98852_1002 + - xorg-renderproto=0.11.1=h7f98852_1002 + - xorg-xextproto=7.3.0=h7f98852_1002 + - xorg-xproto=7.0.31=h7f98852_1007 + - xz=5.2.5=h516909a_1 + - yacman=0.8.3=pyhd8ed1ab_0 + - yaml=0.2.5=h516909a_0 + - zipp=3.6.0=pyhd8ed1ab_0 + - zlib=1.2.11=h36c2ea0_1013 + - zstd=1.5.0=ha95c52a_0 + - pip: + - bio==1.3.2 + - biopython==1.79 + - biothings-client==0.2.6 + - certifi==2021.10.8 + - charset-normalizer==2.0.7 + - codecov==2.1.12 + - divvy==0.6.0 + - eido==0.1.5 + - hypothesis==4.38.0 + - idna==3.3 + - logmuse==0.2.7 + - looper==1.3.1 + - mygene==3.2.2 + - pandas==1.3.4 + - pararead==0.7.0 + - piper>=0.12.3 + - pyrsistent==0.18.0 + - pysam==0.17.0 + - pyyaml==6.0 + - refgenconf==0.12.1 + - requests==2.26.0 + - setuptools==58.4.0 + - tqdm==4.62.3 + diff --git a/requirements.txt b/requirements.txt index afc7fd0..2db2556 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,14 @@ +attmap>=0.13.0 +divvy>=0.6.0 +eido>=0.1.3 cutadapt>=2.9 -looper>=1.2 -numpy -pandas>=0.20.2 +looper>=1.3.1 +numpy>=1.17 +pandas>=1.3.4 pararead>=0.7.0 -piper>=0.12.1 -refgenconf -refgenie>=0.9.1 +peppy>=0.31.1 +piper>=0.12.3 +refgenconf>=0.7.0 +refgenie>=0.12.1 # Optional v0.9.12+ +ubiquerg>=0.6.1 +yacman>=0.6.7 \ No newline at end of file diff --git a/sample_pipeline_interface.yaml b/sample_pipeline_interface.yaml index 164cd2f..8856436 100644 --- a/sample_pipeline_interface.yaml +++ b/sample_pipeline_interface.yaml @@ -21,15 +21,21 @@ command_template: > {% if sample.max_len is defined %} --max-len {sample.max_len} {% endif %} {% if 
sample.sob is defined %} --sob {% endif %} {% if sample.scale is defined %} --scale {% endif %} - {% if sample.prealignments is defined %} --prealignments {sample.prealignments} {%- endif -%} - {% if sample.TSS_name is defined %} --TSS-name {sample.TSS_name} {% endif %} - {% if sample.ensembl_tss is defined %} --pi-tss {sample.ensembl_tss} {% endif %} - {% if sample.ensembl_gene_body is defined %} --pi-body {sample.ensembl_gene_body} {% endif %} - {% if sample.pre_name is defined %} --pre-name {sample.pre_name} {% endif %} - {% if sample.anno_name is defined %} --anno-name {sample.anno_name} {% endif %} - {% if sample.exon_name is defined %} --exon-name {sample.exon_name} {% endif %} - {% if sample.intron_name is defined %} --intron-name {sample.intron_name} {% endif %} - {% if sample.search_file is defined %} --search-file {sample.search_file} {% endif %} + {% if sample.genome_index is defined %} --genome-index { sample.genome_index } {% elif refgenie[sample.genome].bowtie2_index is defined %} --genome-index { refgenie[sample.genome].bowtie2_index.dir } {% endif %} + {% if sample.chrom_sizes is defined %} --chrom-sizes { sample.chrom_sizes } {% elif refgenie[sample.genome].fasta is defined %} --chrom-sizes { refgenie[sample.genome].fasta.chrom_sizes } {% endif %} + {% if sample.prealignment_index is defined %} --prealignment-index { sample.prealignment_index } {% endif %} + {% if sample.prealignment_names is defined %} --prealignment-index {% for p in sample.prealignment_names %} { p ~ '=' ~ refgenie[p].bowtie2_index.dir } {% endfor %} {% endif %} + {% if sample.TSS_name is defined %} --TSS-name { sample.TSS_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --TSS-name { refgenie[sample.genome].refgene_anno.refgene_tss } {% endif %} + {% if sample.ensembl_tss is defined %} --pi-tss { sample.ensembl_tss } {% elif refgenie[sample.genome].ensembl_gtf is defined %} --pi-tss { refgenie[sample.genome].ensembl_gtf.ensembl_tss } {% endif %} + {% if 
sample.ensembl_gene_body is defined %} --pi-body { sample.ensembl_gene_body } {% elif refgenie[sample.genome].ensembl_gtf is defined %} --pi-body { refgenie[sample.genome].ensembl_gtf.ensembl_gene_body } {% endif %} + {% if sample.pre_name is defined %} --pre-name { sample.pre_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --pre-name { refgenie[sample.genome].refgene_anno.refgene_pre_mRNA } {% endif %} + {% if sample.exon_name is defined %} --exon-name { sample.exon_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --exon-name { refgenie[sample.genome].refgene_anno.refgene_exon } {% endif %} + {% if sample.intron_name is defined %} --intron-name { sample.intron_name } {% elif refgenie[sample.genome].refgene_anno is defined %} --intron-name { refgenie[sample.genome].refgene_anno.refgene_intron } {% endif %} + {% if sample.anno_name is defined %} --anno-name { sample.anno_name } {% elif refgenie[sample.genome].feat_annotation is defined %} --anno-name { refgenie[sample.genome].feat_annotation.feat_annotation } {% endif %} + {% if sample.sob is defined %} {% if refgenie[sample.genome].tallymer_index is defined %} --search-file { refgenie[sample.genome].tallymer_index.search_file } {% endif %} {% endif %} + {% if sample.sob is defined %} {% if refgenie[sample.genome].fasta is defined %} --fasta { refgenie[sample.genome].fasta.fasta } {% endif %} {% endif %} + {% if sample.fasta is defined %} --fasta { sample.fasta } {% elif refgenie[sample.genome].fasta is defined %} --fasta { refgenie[sample.genome].fasta.fasta } {% endif %} + {% if sample.search_file is defined %} --search-file { sample.search_file } {% elif refgenie[sample.genome].tallymer_index is defined %} --search-file { refgenie[sample.genome].tallymer_index.search_file } {% endif %} {% if sample.coverage is defined %} --coverage {% endif %} {% if sample.keep is defined %} --keep {% endif %} {% if sample.no_fifo is defined %} --noFIFO {% endif %} @@ -39,9 +45,13 @@ 
command_template: > compute: singularity_image: ${SIMAGES}peppro docker_image: databio/peppro - bulker_crate: databio/peppro + bulker_crate: databio/peppro:1.0.1 size_dependent_variables: resources.tsv - +var_templates: + refgenie_config: "$REFGENIE" +pre_submit: + python_functions: + - refgenconf.looper_refgenie_populate bioconductor: readFunName: readPepproGeneCounts readFunPath: BiocProject/readPepproGeneCounts.R diff --git a/usage.txt b/usage.txt index 1541684..9ad5b51 100644 --- a/usage.txt +++ b/usage.txt @@ -1,23 +1,25 @@ usage: peppro.py [-h] [-R] [-N] [-D] [-F] [-T] [--silent] [--verbosity V] [--logdev] [-C CONFIG_FILE] -O PARENT_OUTPUT_FOLDER - [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] -S SAMPLE_NAME -I - INPUT_FILES [INPUT_FILES ...] - [-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G GENOME_ASSEMBLY - [-Q SINGLE_OR_PAIRED] + [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] + [--pipeline-name PIPELINE_NAME] -S SAMPLE_NAME -I INPUT_FILES + [INPUT_FILES ...] [-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G + GENOME_ASSEMBLY [-Q SINGLE_OR_PAIRED] [--protocol {PRO,pro,PRO-SEQ,PRO-seq,proseq,PROSEQ,GRO,gro,groseq,GROSEQ,GRO-SEQ,GRO-seq}] [--adapter-tool {cutadapt,fastp}] [--dedup-tool {seqkit,fqdedup}] [--trimmer-tool {seqtk,fastx}] [--umi-len UMI_LEN] [--max-len MAX_LEN] [--sob] [--scale] - [--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] - [--TSS-name TSS_NAME] [--pi-tss ENSEMBL_TSS] + [--prealignment-names PREALIGNMENT_NAMES [PREALIGNMENT_NAMES ...]] + [--prealignment-index PREALIGNMENT_INDEX [PREALIGNMENT_INDEX ...]] + --genome-index GENOME_INDEX [--fasta FASTA] --chrom-sizes + CHROM_SIZES [--TSS-name TSS_NAME] [--pi-tss ENSEMBL_TSS] [--pi-body ENSEMBL_GENE_BODY] [--pre-name PRE_NAME] [--anno-name ANNO_NAME] [--exon-name EXON_NAME] [--intron-name INTRON_NAME] [--search-file SEARCH_FILE] [--coverage] [--keep] [--noFIFO] [--no-complexity] [--prioritize] [-V] -PEPPRO version 0.9.11 +PEPPRO version 0.10.0 optional arguments: -h, --help show this help message and exit @@ -38,6 +40,8 
@@ optional arguments: [K|M|G|T]. -P NUMBER_OF_CORES, --cores NUMBER_OF_CORES Number of cores for parallelized processes + --pipeline-name PIPELINE_NAME + Name of the pipeline -I2 [INPUT_FILES2 [INPUT_FILES2 ...]], --input2 [INPUT_FILES2 [INPUT_FILES2 ...]] Secondary input files, such as read2 -Q SINGLE_OR_PAIRED, --single-or-paired SINGLE_OR_PAIRED @@ -59,9 +63,20 @@ optional arguments: --scale Scale signal tracks: Default is to scale by read count. If using seqOutBias, scales by the expected/observed cut frequency. - --prealignments PREALIGNMENTS [PREALIGNMENTS ...] - Space-delimited list of reference genomes to align to - before primary alignment. + --prealignment-names PREALIGNMENT_NAMES [PREALIGNMENT_NAMES ...] + Space-delimited list of prealignment genome names to + align to before primary alignment. + --prealignment-index PREALIGNMENT_INDEX [PREALIGNMENT_INDEX ...] + Space-delimited list of prealignment genome name and + index files delimited by an equals sign to align to + before primary alignment. e.g. + rCRSd=/path/to/bowtie2_index/. + --genome-index GENOME_INDEX + Path to bowtie2 primary genome index file. + --fasta FASTA Path to primary genome fasta file. Required with + --sob. + --chrom-sizes CHROM_SIZES + Path to primary genome chromosome sizes file. --TSS-name TSS_NAME file_name of TSS annotation file. --pi-tss ENSEMBL_TSS file_name of pause index TSS annotation file. --pi-body ENSEMBL_GENE_BODY @@ -74,8 +89,9 @@ optional arguments: --intron-name INTRON_NAME file_name of intron annotation file. --search-file SEARCH_FILE - file_name of read length matched gt tallymer index - search file + Required for seqOutBias (--sob). Path to tallymer + index search file built with the same read length as + the input. --coverage Report library complexity using coverage: reads / (bases in genome / read length) --keep Keep prealignment BAM files