Skip to content

Commit

Permalink
updates for second run
Browse files Browse the repository at this point in the history
  • Loading branch information
avantonder committed Jul 3, 2024
1 parent aa71e72 commit 07b8a57
Show file tree
Hide file tree
Showing 90 changed files with 2,713 additions and 307 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file added course_files/.DS_Store
Binary file not shown.
Binary file added course_files/scripts/.DS_Store
Binary file not shown.
16 changes: 16 additions & 0 deletions course_files/scripts/M_tuberculosis/01-run_fetchngs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

# Prerequisite: the nextflow environment has to be active, i.e.
#   mamba activate nextflow

# results folder; created here and also passed to the pipeline as --outdir
outdir="results/fetchngs"
mkdir -p "$outdir"

# FIX!!
# pipeline options, one per line so each is easy to spot and edit
fetchngs_opts=(
  -profile singularity
  --max_memory '16.GB'
  --max_cpus 8
  --input SAMPLES
  --outdir "$outdir"
  --nf_core_pipeline viralrecon
)

# launch the pipeline with the options above
nextflow run nf-core/fetchngs "${fetchngs_opts[@]}"
19 changes: 19 additions & 0 deletions course_files/scripts/M_tuberculosis/02-run_bacqc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash

# Requires the nextflow environment:
#   mamba activate nextflow

# results folder; created here and handed to the pipeline via --outdir
outdir="results/bacqc"
mkdir -p "$outdir"

# the same database directory is given to both --kraken2db and --brackendb
kraken_db="databases/minikraken2_v1_8GB"

# FIX!!
# options for the bacQC run, one per line
bacqc_opts=(
  -r main
  -resume
  -profile singularity
  --max_memory '16.GB'
  --max_cpus 8
  --input FIX_SAMPLESHEET
  --outdir "$outdir"
  --kraken2db "$kraken_db"
  --brackendb "$kraken_db"
  --genome_size FIX_GENOME_SIZE
)

# start the pipeline
nextflow run avantonder/bacQC "${bacqc_opts[@]}"
17 changes: 17 additions & 0 deletions course_files/scripts/M_tuberculosis/03-run_bactmap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash

# Needs the nextflow environment to be active beforehand:
#   mamba activate nextflow

# make sure the results folder exists
mkdir -p results/bactmap

# FIX!!
# the pipeline call, wrapped in a function so the options read as one list
run_bactmap() {
  nextflow run nf-core/bactmap \
    -resume \
    -profile singularity \
    --max_memory '16.GB' \
    --max_cpus 8 \
    --input FIX_SAMPLESHEET \
    --outdir results/bactmap \
    --reference FIX_REFERENCE_FASTA \
    --genome_size 4.3M
}

# run the pipeline
run_bactmap
52 changes: 52 additions & 0 deletions course_files/scripts/M_tuberculosis/04-pseudogenome_check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/bash

# Runs `seqtk comp` on every per-sample pseudogenome FASTA in $fasta_dir,
# writes one table per file to $outdir/seqtk, then calls seqtk_parser.py on
# that folder and moves the resulting mapping_summary.tsv into $outdir.
#
# before running this script make sure to
# mamba activate seqtk

#### Settings #####

# directory with pseudogenome FASTA
fasta_dir="results/bactmap/pseudogenomes"

# output directory for results
outdir="results/bactmap/pseudogenomes_check"

# path to seqtk_parser.py
parser="scripts/seqtk_parser.py"

# combined alignment stored next to the per-sample files; it is not a single
# sample, so it is excluded from the per-sample loop
alignment_name="aligned_pseudogenomes.fas"

#### End of settings ####

#### Analysis ####
# WARNING: be careful changing the code below

# exit upon any error, unset variable or failed pipeline stage
set -euo pipefail

# an unmatched glob expands to nothing instead of the literal pattern
shopt -s nullglob

# create output directory
mkdir -p "$outdir/seqtk"

# loop through each pseudogenome
# The combined alignment is skipped by name rather than being renamed out of
# the way and back again: a failure part-way through a rename/restore pair
# would leave the alignment under the wrong name for later scripts and re-runs.
processed=0
for filepath in "$fasta_dir"/*.fas; do
  # file name of the pseudogenome; the .fas extension is kept so the output
  # tables are named exactly as before (<name>.fas.tsv)
  sample=$(basename "$filepath")

  if [[ "$sample" == "$alignment_name" ]]; then
    continue
  fi

  # print a message
  echo "Processing $sample"

  # run seqtk command
  seqtk comp "$filepath" > "${outdir}/seqtk/${sample}.tsv"
  processed=$((processed + 1))
done

# nothing to summarise: stop with a clear message instead of running the parser
if (( processed == 0 )); then
  echo "No per-sample .fas files found in $fasta_dir" >&2
  exit 1
fi

# run seqtk_parser.py
python "$parser" --input_dir "$outdir/seqtk"

# move mapping_summary.tsv (expected in the current directory after the parser
# has run) to the output directory
mv mapping_summary.tsv "$outdir"
33 changes: 33 additions & 0 deletions course_files/scripts/M_tuberculosis/05-mask_pseudogenome.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash

# Copies the pseudogenome alignment into $outdir and runs
# remove_blocks_from_aln.py on the copy with the co-ordinates in $bed,
# writing aligned_pseudogenomes_masked.fas next to it.
#
# before running this script make sure to
# mamba activate remove_blocks

#### Settings #####

# directory with pseudogenome FASTA
fasta_dir="results/bactmap/pseudogenomes"

# output directory for results
outdir="results/bactmap/masked_alignment"

# path to bed file with masking co-ordinates
bed="resources/masking/MTBC0_Goigetal_regions_toDiscard.bed"

#### End of settings ####

#### Analysis ####
# WARNING: be careful changing the code below

# exit upon any error, unset variable or failed pipeline stage
set -euo pipefail

# alignment to be masked
alignment="$fasta_dir/aligned_pseudogenomes.fas"

# check both inputs up front so a wrong setting gives a clear message
for required in "$alignment" "$bed"; do
  if [[ ! -f "$required" ]]; then
    echo "Required input file not found: $required" >&2
    exit 1
  fi
done

# create output directory
mkdir -p "$outdir"

# copy pseudogenome alignment to output directory
cp "$alignment" "$outdir"

# mask alignment with co-ordinates in bed file
remove_blocks_from_aln.py \
  -a "$outdir/aligned_pseudogenomes.fas" \
  -t "$bed" \
  -o "$outdir/aligned_pseudogenomes_masked.fas"
28 changes: 28 additions & 0 deletions course_files/scripts/M_tuberculosis/06-run_iqtree.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash

# Extracts the variable sites and the constant-site counts from an alignment
# with snp-sites, then runs iqtree on the SNP alignment, passing the counts
# through -fconst.
#
# before running this script make sure to
# mamba activate iqtree

# stop at the first failing step, so iqtree is never started on the output of
# a failed snp-sites run
set -euo pipefail

# create output directory
mkdir -p results/snp-sites/
mkdir -p results/iqtree/

# FIX!!
# extract variable sites
snp-sites FIX_INPUT_PSEUDOGENOMES_FASTA > results/snp-sites/aligned_pseudogenomes_masked_snps.fas

# FIX!!
# count invariant sites
snp-sites -C FIX_INPUT_PSEUDOGENOMES_FASTA > results/snp-sites/constant_sites.txt

# read the counts once and refuse to continue if they are missing: an empty
# value would make iqtree take the next option (-s) as the -fconst argument
constant_sites=$(< results/snp-sites/constant_sites.txt)
if [[ -z "$constant_sites" ]]; then
  echo "results/snp-sites/constant_sites.txt is empty" >&2
  exit 1
fi

# FIX!!
# Run iqtree
iqtree \
  -fconst "$constant_sites" \
  -s FIX_INPUT_SNP_ALIGNMENT \
  --prefix results/iqtree/Nam_TB \
  -nt AUTO \
  -ntmax 8 \
  -mem 8G \
  -m GTR+F+I \
  -bb 1000
50 changes: 50 additions & 0 deletions course_files/scripts/M_tuberculosis/07-run_tb-profiler.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/bin/bash

# Runs `tb-profiler profile` on every read pair found in $fastq_dir, writing a
# per-sample log to $outdir, then collates the per-sample results and moves
# the collated files into $outdir.
#
# before running this script make sure to
# mamba activate tb-profiler

#### Settings #####

# directory with paired FASTQ files named <sample>_1.fastq.gz and
# <sample>_2.fastq.gz
fastq_dir="data/reads"

# output directory for results
outdir="results/tb-profiler"

# set prefix for collated results
prefix="Nam_TB"

#### End of settings ####

#### Analysis ####
# WARNING: be careful changing the code below

# an unmatched glob expands to nothing instead of the literal pattern
shopt -s nullglob

# create output directory
mkdir -p "$outdir"

# collect the forward-read files; stop early when there is nothing to process
read1_files=("$fastq_dir"/*_1.fastq.gz)
if (( ${#read1_files[@]} == 0 )); then
  echo "No *_1.fastq.gz files found in $fastq_dir" >&2
  exit 1
fi

# loop through each set of fastq files
failed=0
for filepath in "${read1_files[@]}"; do
  # path without the _1.fastq.gz suffix; used for the mate file and the name
  stem=${filepath%_1.fastq.gz}

  # get the sample name
  sample=$(basename "$stem")

  # stderr of tb-profiler for this sample
  log="$outdir/$sample.log"

  # print a message
  echo "Processing $sample"

  # run tb-profiler command and report the outcome; a failed sample does not
  # stop the loop, so the remaining samples are still processed
  if tb-profiler profile -1 "$filepath" -2 "${stem}_2.fastq.gz" -p "$sample" -t 8 --csv -d "$outdir" 2> "$log"; then
    echo "tb-profiler completed successfully for $sample."
  else
    echo "tb-profiler failed for $sample. See $log for details." >&2
    failed=$((failed + 1))
  fi
done

if (( failed > 0 )); then
  echo "tb-profiler failed for $failed sample(s)." >&2
fi

# run tb-profiler collate
tb-profiler collate -d "$outdir/results" --prefix "$prefix"

# move collated result to tb-profiler results directory
collated=("${prefix}".*)
if (( ${#collated[@]} > 0 )); then
  mv "${collated[@]}" "$outdir"
else
  echo "No collated files matching ${prefix}.* were found." >&2
fi
16 changes: 16 additions & 0 deletions course_files/scripts/M_tuberculosis/08-run_pairsnp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

# Prerequisite:
#   mamba activate pairsnp

# input: masked variants file the pairwise SNP distances are extracted from
snp_alignment="preprocessed/snp-sites/aligned_pseudogenomes_masked_snps.fas"

# output: destination folder and the file pairsnp's stdout is written to
transmission_dir="results/transmission/"
distances_csv="results/transmission/aligned_pseudogenomes_masked_snps.csv"

# make sure the destination folder exists
mkdir -p "$transmission_dir"

# run pairsnp and capture its output
pairsnp "$snp_alignment" -c > "$distances_csv"
25 changes: 25 additions & 0 deletions course_files/scripts/M_tuberculosis/08-run_treetime.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash

# Removes the outgroup (MTBC0) from the masked alignment and from the rooted
# tree, then runs TreeTime on the two outgroup-free files.
#
# before running this script make sure to
# mamba activate treetime

# stop at the first failing step so TreeTime never runs on partial inputs
set -euo pipefail

# output directory, shared by the intermediate files and the TreeTime results
outdir="results/treetime"

# outgroup-free inputs for TreeTime; each path is defined once so the step
# that writes the file and the step that reads it cannot drift apart
alignment_no_outgroup="$outdir/aligned_pseudogenomes_masked_no_outgroups.fas"
tree_no_outgroup="$outdir/Nam_TB_rooted_no_outgroup.treefile"

# create output directory
mkdir -p "$outdir"

# Remove outgroup from alignment
seqkit grep -v -p MTBC0 results/bactmap/masked_alignment/aligned_pseudogenomes_masked.fas > "$alignment_no_outgroup"

# Remove outgroup from rooted tree
# The pruned tree is written into $outdir, which is where TreeTime reads it.
# NOTE(review): remove_outgroup.py, Nam_TB_rooted.treefile and TB_metadata.tsv
# are resolved relative to the current directory - confirm their locations.
python remove_outgroup.py -i Nam_TB_rooted.treefile -g MTBC0 -o "$tree_no_outgroup"

# Run TreeTime
treetime --tree "$tree_no_outgroup" \
  --dates TB_metadata.tsv \
  --name-column sample \
  --date-column Date.sample.collection \
  --aln "$alignment_no_outgroup" \
  --outdir "$outdir" \
  --report-ambiguous \
  --time-marginal only-final \
  --clock-std-dev 0.00003 \
  --relax 1.0 0
Loading

0 comments on commit 07b8a57

Please sign in to comment.