Merge pull request #3 from lskatz/curtis-dev

Curtis dev
kapsakcj · Jun 13, 2019 · 73c518c · 73c518c
2 parents cef70a0 + ebe94c5
commit 73c518c
Show file tree

Hide file tree

Showing 6 changed files with 299 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -8,6 +8,7 @@ Started by [@lskatz](https://github.com/lskatz), contributions from [@kapsakcj](
   * [Scripts](#scripts)
   * [Workflows](#workflows)
     * [Guppy GPU basecalling and demultiplexing with qcat](#guppy-gpu-basecalling-and-demultiplexing-with-qcat)
+    * [Assembly with wtdbg2 and polishing with Nanopolish](#assembly-with-wtdbg2-and-polishing-with-nanopolish)
   * [Contributing](#contributing)
   * [Future Plans](#future-plans)
   * [Resources](#resources)
@@ -79,6 +80,76 @@ $OUTDIR
     └── logfile-gpu-basecalling.txt
 ```
 
+### Assembly with wtdbg2 and polishing with Nanopolish
+
+#### This workflow does the following:
+  * Takes in 2 arguments (in this order):
+    1. `$outdir` - an output directory
+    2. `$FAST5DIR` - a directory containing raw fast5 files
+  * Prepares a barcoded sample - concatenates all fastq files into one, compresses, and counts read lengths
+  * Assembles using wtdbg2
+  * Polishes using nanopolish
+
+#### Requirements
+  * Must have previously run the above script that basecalls reads on a GPU via node98.
+  * Not necessary to be on node98. Any server with the ability to `qsub` will work.
+  * `outdir` argument must be the same directory as the `OUTDIR` from the gpu-basecalling script
+    * Recommend `cd`'ing to that directory and use `.` as the `outdir` argument (see USAGE below)
+
+#### USAGE
+```bash
+Usage: 
+    # use your favorite queue, doesn't have to be all.q
+    qsub -q all.q ~/nanoporeWorkflow/workflows/workflow-after-gpu-basecalling.sh outdir/ fast5dir/
+
+    # example - if you are in your output directory from the gpu-basecalling script
+    cd outdir/
+    qsub -q all.q ~/nanoporeWorkflow/workflows/workflow-after-gpu-basecalling.sh . ../FAST5/
+
+# OUTPUT
+$OUTDIR
+├── demux
+│   ├── barcode12 # only showing one barcode for brevity
+│   │   ├── all-barcode12.fastq.gz
+│   │   ├── all-barcode12.fastq.gz.index
+│   │   ├── all-barcode12.fastq.gz.index.fai
+│   │   ├── all-barcode12.fastq.gz.index.gzi
+│   │   ├── all-barcode12.fastq.gz.index.readdb
+│   │   ├── consensus.ctg1:0-11000.log # A LOT OF THESE, for each chunk of each contig
+│   │   ├── consensus.ctg1:0-11000.vcf.gz # A LOT OF THESE, for each chunk of each contig
+│   │   ├── polished.fasta
+│   │   ├── rangesCounter.txt
+│   │   ├── readlengths.txt.gz
+│   │   ├── reads.bam
+│   │   ├── reads.bam.bai
+│   │   ├── unpolished.fasta
+│   │   ├── unpolished.fasta.fai
+│   │   ├── wtdbg2.1.dot.gz
+│   │   ├── wtdbg2.1.nodes
+│   │   ├── wtdbg2.1.reads
+│   │   ├── wtdbg2.2.dot.gz
+│   │   ├── wtdbg2.3.dot.gz
+│   │   ├── wtdbg2.alignments.gz
+│   │   ├── wtdbg2.binkmer
+│   │   ├── wtdbg2.closed_bins
+│   │   ├── wtdbg2.clps
+│   │   ├── wtdbg2.ctg.dot.gz
+│   │   ├── wtdbg2.ctg.lay.gz
+│   │   ├── wtdbg2.events
+│   │   ├── wtdbg2.frg.dot.gz
+│   │   ├── wtdbg2.frg.nodes
+│   │   └── wtdbg2.kmerdep
+└── log
+    ├── prepSample-35d239ad-f2c3-4472-b810-76f56ad43c1d.log # one of each of these logs for each barcode
+    ├── assemble-f73c45e5-ceb4-4aae-bc13-c9923adfe63a.log
+    └── polish-c1888109-727b-4a99-bc92-69c12e97222e.log
+```
+#### Notes on assembly and polishing workflow
+  * It will check for the following files, to determine if it should skip any of the steps. Helps if one part doesn't run correctly and you don't want to repeat a certain step, e.g. re-assembling.
+    * `03_preppSample-w-gpu.sh` looks for `./demux/barcodeXX/all-barcodeXX.fastq.gz` 
+    * `05_assemble.sh` looks for `./demux/barcodeXX/unpolished.fasta`
+    * `07_nanopolish.sh` looks for `./demux/barcodeXX/polished.fasta`
+
 ## Contributing
 If you are interested in contributing to nanoporeWorkflow, please take a look at the [contribution guidelines](CONTRIBUTING.md). We welcome issues or pull requests!
 

diff --git a/scripts/01_basecall-w-gpu.sh b/scripts/01_basecall-w-gpu.sh
@@ -102,8 +102,8 @@ else
                    -x "cuda:0 cuda:1" \
                    -m /opt/ont/guppy/data/template_r9.4.1_450bps_hac.jsn \
                    -c /opt/ont/guppy/data/dna_r9.4.1_450bps_hac.cfg \
-                   --chunk_size 1100 \
-                   --gpu_runners_per_device 7 \
+                   --chunk_size 1000 \
+                   --gpu_runners_per_device 2 \
                    --chunks_per_runner 1000 \
                    --chunks_per_caller 10000 \
                    --overlap 50 \

diff --git a/scripts/03_prepSample-w-gpu.sh b/scripts/03_prepSample-w-gpu.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+#$ -o prepSample.log
+#$ -e prepSample.err
+#$ -j y
+#$ -N wtdbg2
+#$ -pe smp 2-16
+#$ -V -cwd
+set -e
+
+#This function will check to make sure the directory doesn't already exist before trying to create it
+make_directory() {
+    if [ -e $1 ]; then
+        echo "Directory "$1" already exists"
+    else
+        mkdir $1
+        echo "Directory "$1" has been created"
+    fi
+}
+
+source /etc/profile.d/modules.sh
+module purge
+
+NSLOTS=${NSLOTS:=24}
+echo '$NSLOTS is set to:' $NSLOTS
+
+FASTQDIR=$1
+echo '$FASTQDIR is set to:' $FASTQDIR
+
+set -u
+
+if [ "$FASTQDIR" == "" ]; then
+    echo ""
+    echo "Usage: $0 barcode-fastq-dir"
+    echo ""
+    exit 1;
+fi;
+
+# Setup any debugging information
+date
+hostname
+
+# Setup tempdir
+tmpdir=$(mktemp -p . -d prepfastq.XXXXXX)
+trap ' { echo "END - $(date)"; rm -rf $tmpdir; } ' EXIT
+make_directory $tmpdir/log
+echo "$0: temp dir is $tmpdir";
+
+dir=$FASTQDIR
+echo '$dir is set to:' $dir
+
+BARCODE=$(basename ${dir})
+echo '$BARCODE is set to:' $BARCODE
+
+# check to see if reads have been compresesd and renamed to all-barcodeXX.fastq.gz
+if [[ -e ${dir}/all-${BARCODE}.fastq.gz ]]; then
+  echo "Reads have been concatenated and renamed by 03_prepSample script already. Exiting...."
+  exit 0
+fi
+
+### commented out since gzipping is done by previous script
+# Gzip them all
+#uncompressed=$(\ls $dir/*.fastq 2>/dev/null || true)
+#if [ "$uncompressed" != "" ]; then
+#  echo "$uncompressed" | xargs -P $NSLOTS gzip 
+#fi
+
+# Put all the individual gzip fastqs into a subdir,
+# Concatenate them, and then remove them.
+# Keep the aggregate fastq file.
+echo "concatenating .fastq.gz files in barcode sub-dir..."
+mkdir -p ${dir}/fastqChunks
+mv ${dir}/*.fastq.gz ${dir}/fastqChunks
+cat ${dir}/fastqChunks/*.fastq.gz > ${dir}/all-${BARCODE}.fastq.gz
+rm -rf $dir/fastqChunks
+
+# Combine reads and count lengths in one stream
+echo "combining reads and counting read lengths..."
+LENGTHS=${dir}/readlengths.txt.gz
+zcat ${dir}/all-${BARCODE}.fastq.gz | perl -lne '
+  next if($. % 4 != 2);
+  print length($_);
+' | sort -rn | gzip -cf > ${LENGTHS};
+echo "Finished combining reads and counting read lengths."
+
diff --git a/scripts/05_assemble.sh b/scripts/05_assemble.sh
@@ -1,24 +1,40 @@
 #!/bin/bash
 #$ -o wtdbg2.log
+#$ -e wtdbg2.err
 #$ -j y
 #$ -N wtdbg2
 #$ -pe smp 2-16
 #$ -V -cwd
 set -e
 
+#This function will check to make sure the directory doesn't already exist before trying to create it
+make_directory() {
+    if [ -e $1 ]; then
+        echo "Directory "$1" already exists"
+    else
+        mkdir $1
+        echo "Directory "$1" has been created"
+    fi
+}
+
 source /etc/profile.d/modules.sh
 module purge
 
 NSLOTS=${NSLOTS:=1}
+echo '$NSLOTS set to:' $NSLOTS
 
 INDIR=$1
+echo '$INDIR is set to:' $INDIR
+
 GENOMELENGTH=5000000 # TODO make this a parameter
-LONGREADCOVERAGE=50  # How much coverage to target with long reads
+LONGREADCOVERAGE=120  # How much coverage to target with long reads
 
 set -u
 
 if [ "$INDIR" == "" ]; then
-    echo "Usage: $0 projectDir"
+    echo ""
+    echo "Usage: $0 barcode-fastq-dir/"
+    echo ""
     exit 1;
 fi;
 
@@ -29,12 +45,23 @@ hostname
 # Setup tempdir
 tmpdir=$(mktemp -p . -d wtdbg2.XXXXXX)
 trap ' { echo "END - $(date)"; rm -rf $tmpdir; } ' EXIT
-mkdir $tmpdir/log
+make_directory $tmpdir/log
 echo "$0: temp dir is $tmpdir";
 
 dir=$INDIR
 
-LENGTHS=$dir/readlengths.txt
+BARCODE=$(basename ${dir})
+echo '$BARCODE is set to:' $BARCODE
+
+# check to see if sample has already been assembled, skip if so
+if [[ -e ${INDIR}/unpolished.fasta ]]; then
+  echo "Reads have been assembled by 05_assemble script already. Skipping...."
+  exit 0
+fi
+
+LENGTHS=$dir/readlengths.txt.gz
+echo '$LENGTHS set to:' $LENGTHS
+
 # Determine minimum read length for desired coverage.
 # Do this by reading lengths from biggest to smallest,
 # stopping when we get to the desired coverage and saving
@@ -43,10 +70,12 @@ MINLENGTH=$(zcat "$LENGTHS" | sort -rn | perl -lane 'chomp; $minlength=$_; $cum+
 echo "Min length for $LONGREADCOVERAGE coverage will be $MINLENGTH";
 
 # Assemble.
+echo "Assembling with wtdbg2..."
 module purge
 module load wtdbg2/2.4
-wtdbg2 -t $NSLOTS -i $dir/all.fastq.gz -fo $dir/wtdbg2 -p 19 -AS 2 -s 0.05 -L $MINLENGTH -g $GENOMELENGTH -X $LONGREADCOVERAGE
+wtdbg2 -t $NSLOTS -i ${dir}/all-${BARCODE}.fastq.gz -fo ${dir}/wtdbg2 -p 19 -AS 2 -s 0.05 -L $MINLENGTH -g $GENOMELENGTH -X $LONGREADCOVERAGE
 
 # Generate the actual assembly using wtpoa-cns
-wtpoa-cns -t $NSLOTS -i $dir/wtdbg2.ctg.lay.gz -o $dir/unpolished.fasta 
+echo "Generating consensus with wtpoa-cns...."
+wtpoa-cns -t 16 -i $dir/wtdbg2.ctg.lay.gz -o $dir/unpolished.fasta 
 
diff --git a/scripts/07_nanopolish.sh b/scripts/07_nanopolish.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 #$ -o nanopolish.log
+#$ -e nanopolish.err
 #$ -j y
 #$ -N nanopolish
 #$ -pe smp 2-16
@@ -10,9 +11,12 @@ source /etc/profile.d/modules.sh
 module purge
 
 NSLOTS=${NSLOTS:=48}
+echo '$NSLOTS set to:' $NSLOTS
 
 INDIR=$1
+echo '$INDIR set to:' $INDIR
 FAST5DIR=$2
+echo '$FAST5DIR set to:' $FAST5DIR
 
 set -u
 
@@ -30,31 +34,47 @@ tmpdir=$(mktemp -p . -d nanopolish.XXXXXX)
 trap ' { echo "END - $(date)"; rm -rf $tmpdir; } ' EXIT
 echo "$0: temp dir is $tmpdir";
 
-module purge
 module load nanopolish/0.11.1
 module load minimap2/2.16
 module load samtools/1.8
 module load tabix/0.2.6
 module load gcc/5.5
+module load Python/3.7
 
 dir=$INDIR
-FASTQ="$dir/all.fastq.gz"
+echo '$dir is set to:' ${dir}
+BARCODE=$(basename ${dir})
+echo '$BARCODE is set to:' $BARCODE
+
+FASTQ="$dir/all-${BARCODE}.fastq.gz"
+
+# check to see if assembly has been polished, skip if so
+if [[ -e ${dir}/polished.fasta ]]; then
+  echo "Assembly has already been polished by 07_nanopolish script. Exiting...."
+  exit 0
+fi
 
 #nanopolish index -d $FAST5DIR $FASTQ
 if [ ! -e "$dir/.nanopolish-index" ]; then
-  nanopolish index -s $dir/sequencing_summary.txt -d $FAST5DIR $FASTQ
+  echo "nanopolish indexing fast5 files..."
+  nanopolish index -s ${dir}/../sequencing_summary.txt -d $FAST5DIR $FASTQ
   touch $dir/.nanopolish-index
+else
+  echo "fast5 files have already been indexed by nanopolish. Skipping..."
 fi
 
 # Map the reads to get a bam
 if [ ! -e "$dir/.mapped-reads" ]; then
-  minimap2 -a -x map-ont -t $NSLOTS $dir/unpolished.fasta $dir/all.fastq.gz | \
-    samtools view -bS -T $dir/unpolished.fasta > $dir/unsorted.bam
-  samtools sort -l 1 --threads $(($NSLOTS - 1)) $dir/unsorted.bam > $dir/reads.bam
-  samtools index $dir/reads.bam
-  rm $dir/unsorted.bam
-  samtools faidx $dir/unpolished.fasta # unsure if this is needed
-  touch $dir/.mapped-reads
+  echo "mapping reads to the unpolished assembly with minimap2..."
+  minimap2 -a -x map-ont -t $NSLOTS ${dir}/unpolished.fasta ${dir}/all-${BARCODE}.fastq.gz | \
+    samtools view -bS -T ${dir}/unpolished.fasta > ${dir}/unsorted.bam
+  samtools sort -l 1 --threads $(($NSLOTS - 1)) ${dir}/unsorted.bam > ${dir}/reads.bam
+  samtools index ${dir}/reads.bam
+  rm ${dir}/unsorted.bam
+  samtools faidx ${dir}/unpolished.fasta # unsure if this is needed
+  touch ${dir}/.mapped-reads
+else
+  echo "reads have already been mapped to the unpolished assembly. Skipping..."
 fi
 
 # Start a loop based on suggested ranges using nanopolish_makerange.py
@@ -67,6 +87,7 @@ echo "0" > $dir/rangesCounter.txt
 echo "$RANGES" | xargs -P $NSLOTS -n 1 bash -c '
   window="$0";
   dir="'$dir'";
+  BARCODE="'$BARCODE'";
 
   # Progress counter
   lockfile -l 3 $dir/rangesCounter.txt.lock
@@ -82,7 +103,7 @@ echo "$RANGES" | xargs -P $NSLOTS -n 1 bash -c '
   fi
 
   echo "Nanopolish variants on $window ($counter/$numRanges)"
-  nanopolish variants --consensus -r $dir/all.fastq.gz -b $dir/reads.bam -g $dir/unpolished.fasta -t 1 --min-candidate-frequency 0.1 --min-candidate-depth 20 -w "$window" --max-haplotypes=1000 --ploidy 1 > $dir/consensus.$window.vcf 2>$dir/consensus.$window.log;
+  nanopolish variants --consensus -r $dir/all-${BARCODE}.fastq.gz -b $dir/reads.bam -g $dir/unpolished.fasta -t 1 --min-candidate-frequency 0.1 --min-candidate-depth 20 -w "$window" --max-haplotypes=1000 --ploidy 1 > $dir/consensus.$window.vcf 2>$dir/consensus.$window.log;
 
   # Record that we finished these results
   touch $dir/.$window-vcf
@@ -91,7 +112,8 @@ echo
 
 # Run merge on vcf files
 echo "nanopolish vcf2fasta"
-nanopolish vcf2fasta -g $dir/unpolished.fasta $dir/consensus.*.vcf > $dir/polished.fasta
+nanopolish vcf2fasta -g $dir/unpolished.fasta ${dir}/consensus.*.vcf > $dir/polished.fasta
+echo "nanopolish vcf2fasta finished"
 
 # help with some level of compression in the folder
 \ls $dir/consensus.*.vcf | xargs -P $NSLOTS -n 1 bgzip