From 3cf64d8ef6f7f913f40b8d8f7bcccc0d6fcf272e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Rafael=20Almeida?= Date: Tue, 1 Dec 2020 23:39:29 +0000 Subject: [PATCH] Update gto_relative_complexity_profile pipeline and add gto_fasta_align_bwa pipeline --- conda/build.sh | 2 +- conda/meta.yaml | 4 +- .../gto_fasta_align_bwa.sh | 177 ++++++++++++++++ .../gto_relative_complexity_profile.sh | 197 ++++++++++++++---- 4 files changed, 337 insertions(+), 43 deletions(-) create mode 100644 pipelines/gto_fasta_align_bwa/gto_fasta_align_bwa.sh diff --git a/conda/build.sh b/conda/build.sh index 6b3788d..2cd3c43 100755 --- a/conda/build.sh +++ b/conda/build.sh @@ -25,7 +25,7 @@ cp pipelines/gto_sars_simulation_detection/gto_sars_simulation_detection.sh $PRE cp pipelines/gto_download_sars2/gto_download_sars2.sh $PREFIX/bin/ cp pipelines/gto_find_best_sars2/gto_find_best_sars2.sh $PREFIX/bin/ cp pipelines/gto_relative_complexity_profile/gto_relative_complexity_profile.sh $PREFIX/bin/ - +cp pipelines/gto_fasta_align_bwa/gto_fasta_align_bwa.sh $PREFIX/bin/ cp bin/gto $PREFIX/bin/ cp bin/gto_amino_acid_compressor $PREFIX/bin/ diff --git a/conda/meta.yaml b/conda/meta.yaml index 36c08f4..89078dd 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -5,10 +5,10 @@ package: name: gto - version: '1.5.4' + version: '1.5.6' source: - git_rev: v1.5.4 + git_rev: v1.5.6 git_url: https://github.com/cobilab/gto.git requirements: diff --git a/pipelines/gto_fasta_align_bwa/gto_fasta_align_bwa.sh b/pipelines/gto_fasta_align_bwa/gto_fasta_align_bwa.sh new file mode 100644 index 0000000..41fee5b --- /dev/null +++ b/pipelines/gto_fasta_align_bwa/gto_fasta_align_bwa.sh @@ -0,0 +1,177 @@ +#!/bin/bash +# +TAG="X"; +RUN="0"; +THREADS="4"; +REFERENCE=""; +FORWARD="o_fw_pr.fq"; +REVERSE="o_rv_pr.fq"; +UNPAIRED="o_fw_unpr.fq,o_rv_unpr.fq"; +# +################################################################################ +# +SHOW_MENU () { + echo " ------------------------------------------------------- "; + echo " "; + echo " gto_align_reads.sh >> FASTQ reads alignment pipeline "; + echo " with sort, duplications removal, and indexes. "; + echo " "; + echo " Program options --------------------------------------- "; + echo " "; + echo " -h, --help Show this, "; + echo " -c, --install Installation (w/ conda), "; + echo " "; + echo " -n , --name TAG name for uniqueness, "; + echo " -t , --threads Number of threads. "; + echo " "; + echo " Input options ----------------------------------------- "; + echo " "; + echo " -r , --reference FASTA Reference filename, "; + echo " "; + echo " -1 , --forward FASTQ forward filename, "; + echo " -2 , --reverse FASTQ reverse filename, "; + echo " -U , --unpaired FASTQ unpaired filename. "; + echo " "; + echo " Example ----------------------------------------------- "; + echo " "; + echo " gto_align_reads.sh --threads 8 --reference B19.fa \\ "; + echo " --forward FW.fq --reverse RV.fq -U FW_U.fq,RV_U.fq "; + echo " "; + echo " ------------------------------------------------------- "; + } +# +################################################################################ +# +CHECK_INPUT () { + FILE=$1 + if [ -f "$FILE" ]; + then + echo "Input filename: $FILE" + else + echo -e "\e[31mERROR: input file not found ($FILE)!\e[0m"; + SHOW_MENU; + exit; + fi + } +# +################################################################################ +# +PROGRAM_EXISTS () { + if ! [ -x "$(command -v $1)" ]; + then + echo -e "\e[41mERROR\e[49m: $1 is not installed." >&2; + echo -e "\e[42mTIP\e[49m: Try: gto_align_reads.sh --install" >&2; + exit 1; + fi + } +# +################################################################################ +# +if [[ "$#" -lt 1 ]]; + then + HELP=1; + fi +# +POSITIONAL=(); +# +while [[ $# -gt 0 ]] + do + i="$1"; + case $i in + -h|--help|?) + HELP=1; + shift + ;; + -c|--install|--compile) + INSTALL=1; + shift + ;; + -n|--name|--tag) + TAG="$2"; + shift 2; + ;; + -t|--threads) + THREADS="$2"; + shift 2; + ;; + -r|--reference) + REFERENCE="$2"; + RUN=1; + shift 2; + ;; + -1|--forward) + FORWARD="$2"; + shift 2; + ;; + -2|--reverse) + REVERSE="$2"; + shift 2; + ;; + -U|--unpaired) + UNPAIRED="$2"; + shift 2; + ;; + -*) # unknown option with small + echo "Invalid arg ($1)!"; + echo "For help, try: gto_align_reads.sh -h" + exit 1; + ;; + esac + done +# +set -- "${POSITIONAL[@]}" # restore positional parameters +# +################################################################################ +# +if [[ "$HELP" -eq "1" ]]; + then + SHOW_MENU; + exit; + fi +# +################################################################################ +# +if [[ "$INSTALL" -eq "1" ]]; + then + conda install -c bioconda samtools --yes + conda install -c bioconda bowtie2 --yes + exit; + fi +# +################################################################################ +# +if [[ "$RUN" -eq "1" ]]; + then + # + echo "Using $THREADS threads ..."; + # + PROGRAM_EXISTS "bowtie2"; + PROGRAM_EXISTS "samtools"; + # + CHECK_INPUT "$REFERENCE"; + # + CHECK_INPUT "$FORWARD"; + CHECK_INPUT "$REVERSE"; + CHECK_INPUT "$UNPAIRED"; + # + rm -f index_file_$TAG* aligned_sorted-$TAG.bam aligned_sorted-$TAG.bam.bai aligned_sorted-$TAG.bam + # + bowtie2-build $REFERENCE index_file_$TAG + bowtie2 -a --threads $THREADS --very-sensitive -x index_file_$TAG -1 $FORWARD -2 $REVERSE -U $UNPAIRED > aligned-$TAG.sam + samtools sort --threads $THREADS aligned-$TAG.sam > aligned_sorted-$TAG.bam + samtools sort --threads $THREADS -n aligned_sorted-$TAG.bam > aligned_sorted_sorted-$TAG.bam + samtools fixmate --threads $THREADS -m aligned_sorted_sorted-$TAG.bam aligned_sorted_sorted-$TAG-fixmate.bam + samtools sort --threads $THREADS -o aligned_sorted_sorted-$TAG-fixmate-sort.bam aligned_sorted_sorted-$TAG-fixmate.bam + samtools markdup --threads $THREADS -r aligned_sorted_sorted-$TAG-fixmate-sort.bam aligned_sorted-$TAG.bam + samtools index -@ $THREADS aligned_sorted-$TAG.bam aligned_sorted-$TAG.bam.bai + samtools flagstat -@ $THREADS aligned_sorted-$TAG.bam > Alignments-Stats-$TAG.txt + # + rm -f *.bt2 aligned-$TAG.sam aligned_sorted_sorted-$TAG.bam aligned_sorted_sorted-$TAG-fixmate.bam aligned_sorted_sorted-$TAG-fixmate-sort.bam + # + echo "Reference : $REFERENCE (index included)"; + echo "Alignments : aligned_sorted-$TAG.bam (index included)"; + # + fi +# +################################################################################ +# \ No newline at end of file diff --git a/pipelines/gto_relative_complexity_profile/gto_relative_complexity_profile.sh b/pipelines/gto_relative_complexity_profile/gto_relative_complexity_profile.sh index bd95810..1942dcb 100644 --- a/pipelines/gto_relative_complexity_profile/gto_relative_complexity_profile.sh +++ b/pipelines/gto_relative_complexity_profile/gto_relative_complexity_profile.sh @@ -1,79 +1,196 @@ #!/bin/bash # -# ============================================================================== -# | | -# | THIS PROGRAM COMPUTES RELATIVE COMPLEXITY PROFILES WITH GTO | -# | =========================================================== | -# | | -# | ./gto_relative_complexity_profile.sh seqA.fa SeqB.fa | -# | | -# | FILE NEEDED TO THE COMPUTATION: | -# | | -# | $1: FILE WITH THE REFERENCE SEQUENCE IN FASTA | -# | $2: FILE WITH THE TARGET SEQUENCE IN FASTA | -# | | -# ============================================================================== +TAG="X"; +RUN="0"; +LEVEL=" -rm 5:10:1:0:0.85/0:0:0 -rm 11:20:1:0:0.9/4:10:0.9 "; +THREADS="8"; +WINDOW="5"; +REFERENCE=""; +TARGET=""; # -# ============================================================================== -# ================================ DEFINITIONS ================================= +################################################################################ # -#LEVEL=" -rm 5:10:0:0:0.85/0:0:0 -rm 11:20:0:0:0.9/4:10:0.9 "; -LEVEL=" $3 "; -WINDOW_SIZE=3; +SHOW_MENU () { + echo " ------------------------------------------------------- "; + echo " "; + echo " gto_relative_complexity_profile.sh >> creates relative "; + echo " complexity profiles "; + echo " "; + echo " Program options --------------------------------------- "; + echo " "; + echo " -h, --help Show this, "; + echo " -c, --install Installation (w/ conda), "; + echo " "; + echo " -n , --name TAG name for uniqueness, "; + echo " -t , --threads Number of threads, "; + echo " -w , --window Low-pass filter size, "; + echo " -l , --level Compression parameters. "; + echo " "; + echo " Input options ----------------------------------------- "; + echo " "; + echo " -r , --reference FASTA Reference filename, "; + echo " -t , --target FASTA target filename. "; + echo " "; + echo " Example ----------------------------------------------- "; + echo " "; + echo " gto_relative_complexity_profile.sh --threads 8 \\ "; + echo " --name x --reference ref.fa --target tar.fa \\ "; + echo " --window 5 --level \" -rm 11:20:0:0:0.9/4:10:0.9 \" "; + echo " "; + echo " ------------------------------------------------------- "; + } # -GET_GTO=0; -RUN_COMPARISON=1; +################################################################################ # -# ============================================================================== -# ================================== GET GTO =================================== +CHECK_INPUT () { + FILE=$1 + if [ -f "$FILE" ]; + then + echo "Input filename: $FILE" + else + echo -e "\e[31mERROR: input file not found ($FILE)!\e[0m"; + SHOW_MENU; + exit; + fi + } +# +################################################################################ +# +PROGRAM_EXISTS () { + if ! [ -x "$(command -v $1)" ]; + then + echo -e "\e[41mERROR\e[49m: $1 is not installed." >&2; + echo -e "\e[42mTIP\e[49m: Try: gto_relative_complexity_profile.sh --install" >&2; + exit 1; + fi + } +# +################################################################################ +# +if [[ "$#" -lt 1 ]]; + then + HELP=1; + fi +# +POSITIONAL=(); +# +while [[ $# -gt 0 ]] + do + i="$1"; + case $i in + -h|--help|?) + HELP=1; + shift + ;; + -c|--install|--compile) + INSTALL=1; + shift + ;; + -n|--name|--tag) + TAG="$2"; + shift 2; + ;; + -t|--threads) + THREADS="$2"; + shift 2; + ;; + -r|--reference) + REFERENCE="$2"; + RUN=1; + shift 2; + ;; + -t|--target) + TARGET="$2"; + shift 2; + ;; + -w|--window) + WINDOW="$2"; + shift 2; + ;; + -l|--level) + LEVEL="$2"; + shift 2; + ;; + -*) # unknown option with small + echo "Invalid arg ($1)!"; + echo "For help, try: gto_relative_complexity_profile.sh -h" + exit 1; + ;; + esac + done # -if [[ "$GET_GTO" -eq "1" ]]; +set -- "${POSITIONAL[@]}" # restore positional parameters +# +################################################################################ +# +if [[ "$HELP" -eq "1" ]]; + then + SHOW_MENU; + exit; + fi +# +################################################################################ +# +if [[ "$INSTALL" -eq "1" ]]; then conda install -c cobilab gto --yes + exit; fi # -# ============================================================================== -# =============================== RUN COMPARISON =============================== +################################################################################ # -if [[ "$RUN_COMPARISON" -eq "1" ]]; +if [[ "$RUN" -eq "1" ]]; then - gto_fasta_rand_extra_chars < $2 | gto_fasta_to_seq > SEQ; # - # GET WINDOW SIZE BY SEQUENCE SIZE - LENGTH=`gto_info < SEQ | grep "Number of sym" | awk '{ print $5}'`; - echo "WINDOW SIZE: $WINDOW_SIZE"; + PROGRAM_EXISTS "gto"; + # + CHECK_INPUT "$REFERENCE"; + CHECK_INPUT "$TARGET"; # - gto_geco -v $LEVEL -e -r $1 SEQ + rm -f SEQ-$TAG SEQ-$TAG.iae SEQ-$TAG.ub $REFERENCE-$TARGET-$TAG.pos $REFERENCE-$TARGET-$TAG.x ; # - gto_upper_bound -u 2 < SEQ.iae > SEQ_UB - gto_filter -d 0 -w $WINDOW_SIZE -c < SEQ_UB > FIL_UB.x - gto_segment -t 1 < FIL_UB.x > $1.positions + echo "Using window size: $WINDOW"; + echo "Randomizing chars outside {A,C,G,T}"; + gto_fasta_rand_extra_chars < $TARGET | gto_fasta_to_seq > SEQ-$TAG; + echo "Running compression ... "; + gto_geco -v $LEVEL -e -r $REFERENCE SEQ-$TAG; + echo "Processing output ... "; + gto_upper_bound -u 2 < SEQ-$TAG.iae > SEQ-$TAG.ub + gto_filter -d 0 -w $WINDOW -c < SEQ-$TAG.ub > $REFERENCE-$TARGET-$TAG.x + gto_segment -t 1 < $REFERENCE-$TARGET-$TAG.x > $REFERENCE-$TARGET-$TAG.pos # + echo "Generating plot with Gnuplot ... "; gnuplot << EOF reset set terminal pdfcairo enhanced color font 'Verdana,12' - set output "profile-$1.pdf" + set output "$REFERENCE-$TARGET-$TAG.pdf" set style line 101 lc rgb '#000000' lt 1 lw 4 set border 3 front ls 101 set tics nomirror out scale 0.75 set format '%g' set size ratio 0.1 unset key - set yrange [0:2] + set yrange [0:2] set xrange [:] set xtics auto set ytics 0.5 - set grid + set grid set ylabel "Bps" set xlabel "Length" set border linewidth 1.5 set style line 1 lc rgb '#0060ad' lt 1 lw 2 pt 5 ps 0.4 # --- blue set style line 2 lc rgb '#0060ad' lt 1 lw 4 pt 6 ps 0.4 # --- green - plot "FIL_UB.x" using 1:2 with lines ls 1 + plot "$REFERENCE-$TARGET-$TAG.x" using 1:2 with lines ls 1 EOF + # + rm -f SEQ-$TAG SEQ-$TAG.iae SEQ-$TAG.ub ; + # + echo "PDF plot : $REFERENCE-$TARGET-$TAG.pdf"; + echo "Segmented regions : $REFERENCE-$TARGET-$TAG.pos"; + echo "Information values : $REFERENCE-$TARGET-$TAG.x"; + # fi # # # ============================================================================== -# ============================================================================== - +# ============================================================================== \ No newline at end of file