diff --git a/.DS_Store b/.DS_Store
index 31d82f6..6dbcd5f 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/course_files/.DS_Store b/course_files/.DS_Store
new file mode 100644
index 0000000..2fce51d
Binary files /dev/null and b/course_files/.DS_Store differ
diff --git a/course_files/scripts/.DS_Store b/course_files/scripts/.DS_Store
new file mode 100644
index 0000000..c616221
Binary files /dev/null and b/course_files/scripts/.DS_Store differ
diff --git a/course_files/scripts/M_tuberculosis/01-run_fetchngs.sh b/course_files/scripts/M_tuberculosis/01-run_fetchngs.sh
new file mode 100644
index 0000000..6a12a43
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/01-run_fetchngs.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate nextflow
+
+# create output directory
+mkdir -p results/fetchngs
+
+# FIX!!
+# run the pipeline
+nextflow run nf-core/fetchngs \
+    -profile singularity \
+    --max_memory '16.GB' --max_cpus 8 \
+    --input SAMPLES \
+    --outdir results/fetchngs \
+    --nf_core_pipeline viralrecon
diff --git a/course_files/scripts/M_tuberculosis/02-run_bacqc.sh b/course_files/scripts/M_tuberculosis/02-run_bacqc.sh
new file mode 100644
index 0000000..27811f1
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/02-run_bacqc.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate nextflow
+
+# create output directory
+mkdir -p results/bacqc
+
+# FIX!!
+# run the pipeline
+nextflow run avantonder/bacQC \
+    -r main \
+    -resume -profile singularity \
+    --max_memory '16.GB' --max_cpus 8 \
+    --input FIX_SAMPLESHEET \
+    --outdir results/bacqc \
+    --kraken2db databases/minikraken2_v1_8GB \
+    --brackendb databases/minikraken2_v1_8GB \
+    --genome_size FIX_GENOME_SIZE
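For the `FIX!!` exercise in `01-run_fetchngs.sh`, nf-core/fetchngs takes `--input` as a file of database run accessions, one per line. A minimal sketch of one way to provide it, assuming SRA/ENA run accessions (the IDs below are placeholders):

```bash
# create a list of run accessions to download (placeholder IDs)
cat > samples.csv <<'EOF'
ERR1234567
ERR1234568
EOF

# the SAMPLES placeholder in the script would then become:
#   --input samples.csv
```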
diff --git a/course_files/scripts/M_tuberculosis/03-run_bactmap.sh b/course_files/scripts/M_tuberculosis/03-run_bactmap.sh
new file mode 100644
index 0000000..328d41d
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/03-run_bactmap.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate nextflow
+
+# create output directory
+mkdir -p results/bactmap
+
+# FIX!!
+# run the pipeline
+nextflow run nf-core/bactmap \
+    -resume -profile singularity \
+    --max_memory '16.GB' --max_cpus 8 \
+    --input FIX_SAMPLESHEET \
+    --outdir results/bactmap \
+    --reference FIX_REFERENCE_FASTA \
+    --genome_size 4.3M
diff --git a/course_files/scripts/M_tuberculosis/04-pseudogenome_check.sh b/course_files/scripts/M_tuberculosis/04-pseudogenome_check.sh
new file mode 100644
index 0000000..3dacf66
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/04-pseudogenome_check.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate seqtk
+
+#### Settings #####
+
+# directory with pseudogenome FASTA
+
+fasta_dir="results/bactmap/pseudogenomes"
+
+# output directory for results
+outdir="results/bactmap/pseudogenomes_check"
+
+# path to seqtk_parser.py
+parser="scripts/seqtk_parser.py"
+
+#### End of settings ####
+
+#### Analysis ####
+# WARNING: be careful changing the code below
+
+# exit upon any error
+set -e
+
+# temporarily rename aligned_pseudogenomes.fas so the whole-genome alignment is skipped by the *.fas loop below
+mv $fasta_dir/aligned_pseudogenomes.fas $fasta_dir/aligned_pseudogenomes.fasta
+
+# loop through each pseudogenome
+for filepath in $fasta_dir/*.fas
+do
+    # get the sample name
+    sample=$(basename $filepath)
+
+    # print a message
+    echo "Processing $sample"
+
+    # run seqtk command
+    seqtk comp $filepath > ${outdir}/seqtk/${sample}.tsv
+done
+
+# run seqtk_parser.py
+python $parser --input_dir $outdir/seqtk
+
+# move mapping_summary.tsv to results/bactmap/pseudogenomes_check
+mv mapping_summary.tsv $outdir
+
+# restore the original name of the whole-genome alignment
+mv $fasta_dir/aligned_pseudogenomes.fasta $fasta_dir/aligned_pseudogenomes.fas
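Once `mapping_summary.tsv` has been generated, a quick way to sanity-check it is to flag samples where only a small fraction of the reference was covered. A minimal sketch, using the columns written by `seqtk_parser.py` (`%ref mapped` is the eighth column) and an illustrative 90% cut-off that is not course-prescribed:

```bash
# print sample name and %ref mapped for anything below 90%
awk -F'\t' 'NR > 1 && $8 < 90 {print $1, $8}' \
    results/bactmap/pseudogenomes_check/mapping_summary.tsv
```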
diff --git a/course_files/scripts/M_tuberculosis/05-mask_pseudogenome.sh b/course_files/scripts/M_tuberculosis/05-mask_pseudogenome.sh
new file mode 100644
index 0000000..022be94
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/05-mask_pseudogenome.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate remove_blocks
+
+#### Settings #####
+
+# directory with pseudogenome FASTA
+
+fasta_dir="results/bactmap/pseudogenomes"
+
+# output directory for results
+outdir="results/bactmap/masked_alignment"
+
+# path to bed file with masking co-ordinates
+bed="resources/masking/MTBC0_Goigetal_regions_toDiscard.bed"
+
+#### End of settings ####
+
+#### Analysis ####
+# WARNING: be careful changing the code below
+
+# exit upon any error
+set -e
+
+# create output directory
+mkdir -p $outdir
+
+# copy pseudogenome alignment to output directory
+cp $fasta_dir/aligned_pseudogenomes.fas $outdir
+
+# mask alignment with co-ordinates in bed file
+remove_blocks_from_aln.py -a $outdir/aligned_pseudogenomes.fas -t $bed -o $outdir/aligned_pseudogenomes_masked.fas
diff --git a/course_files/scripts/M_tuberculosis/06-run_iqtree.sh b/course_files/scripts/M_tuberculosis/06-run_iqtree.sh
new file mode 100644
index 0000000..b757b79
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/06-run_iqtree.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate iqtree
+
+# create output directory
+mkdir -p results/snp-sites/
+mkdir -p results/iqtree/
+
+# FIX!!
+# extract variable sites
+snp-sites FIX_INPUT_PSEUDOGENOMES_FASTA > results/snp-sites/aligned_pseudogenomes_masked_snps.fas
+
+# FIX!!
+# count invariant sites
+snp-sites -C FIX_INPUT_PSEUDOGENOMES_FASTA > results/snp-sites/constant_sites.txt
+
+# FIX!!
+# Run iqtree
+iqtree \
+    -fconst $(cat results/snp-sites/constant_sites.txt) \
+    -s FIX_INPUT_SNP_ALIGNMENT \
+    --prefix results/iqtree/Nam_TB \
+    -nt AUTO \
+    -ntmax 8 \
+    -mem 8G \
+    -m GTR+F+I \
+    -bb 1000
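The `-fconst` line in the script above works because `snp-sites -C` prints the counts of constant A, C, G and T sites as a single comma-separated line, which IQ-TREE accepts directly (the numbers below are illustrative only):

```bash
cat results/snp-sites/constant_sites.txt
# 758993,1449022,1444881,758652

# which command substitution expands into:
#   iqtree -fconst 758993,1449022,1444881,758652 ...
```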
diff --git a/course_files/scripts/M_tuberculosis/07-run_tb-profiler.sh b/course_files/scripts/M_tuberculosis/07-run_tb-profiler.sh
new file mode 100644
index 0000000..c1f972f
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/07-run_tb-profiler.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate tb-profiler
+
+#### Settings #####
+
+# directory with FASTQ files
+
+fastq_dir="data/reads"
+
+# output directory for results
+outdir="results/tb-profiler"
+
+# set prefix for collated results
+prefix="Nam_TB"
+
+#### End of settings ####
+
+#### Analysis ####
+# WARNING: be careful changing the code below
+
+# create output directory
+mkdir -p $outdir
+
+# loop through each set of fastq files
+for filepath in $fastq_dir/*_1.fastq.gz
+do
+    # get the sample name
+    sample=$(basename ${filepath%_1.fastq.gz})
+
+    # print a message
+    echo "Processing $sample"
+
+    # run tb-profiler command
+    tb-profiler profile -1 $filepath -2 ${filepath%_1.fastq.gz}_2.fastq.gz -p $sample -t 8 --csv -d $outdir 2> $outdir/"$sample".log
+
+    # Check if tb-profiler exited with an error
+    if [ $? -ne 0 ]; then
+        echo "tb-profiler failed for $sample. See $sample.log for details."
+    else
+        echo "tb-profiler completed successfully for $sample."
+    fi
+done
+
+# run tb-profiler collate
+tb-profiler collate -d $outdir/results --prefix $prefix
+
+# move collated result to tb-profiler results directory
+mv ${prefix}.* $outdir
diff --git a/course_files/scripts/M_tuberculosis/08-run_pairsnp.sh b/course_files/scripts/M_tuberculosis/08-run_pairsnp.sh
new file mode 100644
index 0000000..16e7496
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/08-run_pairsnp.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate pairsnp
+
+# create output directory
+mkdir -p results/transmission/
+
+# masked variants file to extract pairwise SNP distances from
+snp_file="preprocessed/snp-sites/aligned_pseudogenomes_masked_snps.fas"
+
+# output file
+outfile="results/transmission/aligned_pseudogenomes_masked_snps.csv"
+
+# Run pairsnp
+pairsnp $snp_file -c > $outfile
diff --git a/course_files/scripts/M_tuberculosis/08-run_treetime.sh b/course_files/scripts/M_tuberculosis/08-run_treetime.sh
new file mode 100644
index 0000000..c64aca9
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/08-run_treetime.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate treetime
+
+# create output directory
+mkdir -p results/treetime/
+
+# Remove outgroup from alignment
+seqkit grep -v -p MTBC0 results/bactmap/masked_alignment/aligned_pseudogenomes_masked.fas > results/treetime/aligned_pseudogenomes_masked_no_outgroups.fas
+
+# Remove outgroup from rooted tree
+python remove_outgroup.py -i Nam_TB_rooted.treefile -g MTBC0 -o results/treetime/Nam_TB_rooted_no_outgroup.treefile
+
+# Run TreeTime
+treetime --tree results/treetime/Nam_TB_rooted_no_outgroup.treefile \
+    --dates TB_metadata.tsv \
+    --name-column sample \
+    --date-column Date.sample.collection \
+    --aln results/treetime/aligned_pseudogenomes_masked_no_outgroups.fas \
+    --outdir results/treetime \
+    --report-ambiguous \
+    --time-marginal only-final \
+    --clock-std-dev 0.00003 \
+    --relax 1.0 0
\ No newline at end of file
diff --git a/course_files/scripts/M_tuberculosis/09-transmission.R b/course_files/scripts/M_tuberculosis/09-transmission.R
new file mode 100644
index 0000000..f704fc8
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/09-transmission.R
@@ -0,0 +1,162 @@
+#############################
+## Load required libraries ##
+#############################
+
+library(tidyverse)
+library(data.table)
+library(tidygraph)
+library(ggraph)
+library(igraph)
+
+#########################
+## Read in pairsnp CSV ##
+#########################
+
+# Update the path
+
+snp_matrix <- "preprocessed/transmission/aligned_pseudogenomes_masked_snps.csv"
+
+snp_matrix_df <- read_csv(snp_matrix, col_names = FALSE)
+
+######################
+## Read in metadata ##
+######################
+
+# Update the path
+
+metadata <- read_tsv("TB_metadata.tsv")
+
+################################
+## Add column names to matrix ##
+################################
+
+snp_matrix_df <- transpose(snp_matrix_df, make.names = 'X1')
+
+###########################################
+## Convert pairsnp output into dataframe ##
+###########################################
+
+snp_matrix_df_decon <- data.frame( t(combn(names(snp_matrix_df),2)),
+                                   dist=t(snp_matrix_df)[lower.tri(snp_matrix_df)] )
+
+colnames(snp_matrix_df_decon) <- c("Taxon1", "Taxon2", "dist")
+
+##############################################
+## Plot histogram of pairwise SNP distances ##
+##############################################
+
+snp_histogram <- ggplot(data = snp_matrix_df_decon,
+                        aes(x = dist)) +
+  geom_histogram(bins = 200,
+                 alpha = 0.8,
+                 position = "identity") +
+  theme_bw() +
+  xlab("Pairwise SNP distance") +
+  ylab("Frequency")
+
+################################################
+## Create new df containing nodes for network ##
+################################################
+
+metadata_nodes <- metadata %>%
+  mutate(id = row_number()) %>%
+  select(id,
+         sample,
+         longitude,
+         latitude,
+         Region,
+         Sex,
+         HIV.status,
+         Site.of..TB)
+
+#######################
+## Set SNP threshold ##
+#######################
+
+threshold <- 15
+
+#######################################
+## Extract pairwise SNP comparisons ##
+#######################################
+
+edges <- snp_matrix_df_decon %>%
+  filter(dist <= threshold) %>%
+  select(Taxon1,
+         Taxon2,
+         dist)
+
+edges <- edges %>%
+  left_join(metadata_nodes[,c(1,2)], by = c("Taxon1" = "sample")) %>%
+  rename(from.id = id)
+
+edges <- edges %>%
+  left_join(metadata_nodes[,c(1,2)], by = c("Taxon2" = "sample")) %>%
+  select(from.id, to.id = id, dist)
+
+###########################
+## Create network object ##
+###########################
+
+network <- tbl_graph(nodes = metadata_nodes, edges = edges, directed = TRUE)
+
+network_components <- components(network)
+
+metadata_nodes_networked <- metadata_nodes %>%
+  mutate(network_id = network_components$membership)
+
+###################################
+## Filter out network singletons ##
+###################################
+
+metadata_nodes_networked_filtered <- metadata_nodes_networked %>%
+  group_by(network_id) %>%
+  filter(n() > 1) %>%
+  ungroup() %>%
+  mutate(id = row_number()) %>%
+  select(id,
+         sample,
+         longitude,
+         latitude,
+         Region,
+         Sex,
+         HIV.status,
+         Site.of..TB)
+
+######################################
+## Extract pairwise SNP comparisons ##
+######################################
+
+edges_filtered <- snp_matrix_df_decon %>%
+  filter(dist <= threshold) %>%
+  select(Taxon1,
+         Taxon2,
+         dist)
+
+edges_filtered <- edges_filtered %>%
+  left_join(metadata_nodes_networked_filtered[,c(1,2)], by = c("Taxon1" = "sample")) %>%
+  rename(from.id = id)
+
+edges_filtered <- edges_filtered %>%
+  left_join(metadata_nodes_networked_filtered[,c(1,2)], by = c("Taxon2" = "sample")) %>%
+  select(from.id, to.id = id, dist)
+
+###############################
+## Create new network object ##
+###############################
+
+network_filtered <- tbl_graph(nodes = metadata_nodes_networked_filtered, edges = edges_filtered, directed = TRUE)
+
+###################################################
+## Plot network without isolates not in networks ##
+###################################################
+
+network_filtered_plot <- ggraph(network_filtered, layout = "nicely") +
+  geom_edge_link(aes(label = dist),
+                 edge_colour = 'black',
+                 edge_width = 1,
+                 angle_calc = "along",
+                 label_dodge = unit(2.5,'mm'),
+                 label_size = 3) +
+  geom_node_point(aes(colour = Region, shape = Sex), size = 6) +
+  theme_graph() +
+  labs(colour = "Region", shape = "Sex")
diff --git a/course_files/scripts/M_tuberculosis/fastq_dir_to_samplesheet.py b/course_files/scripts/M_tuberculosis/fastq_dir_to_samplesheet.py
new file mode 100644
index 0000000..447238b
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/fastq_dir_to_samplesheet.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import glob
+import argparse
+
+
+def parse_args(args=None):
+    Description = (
+        "Generate nf-core/viralrecon samplesheet from a directory of FastQ files."
+    )
+    Epilog = "Example usage: python fastq_dir_to_samplesheet.py <FASTQ_DIR> <SAMPLESHEET_FILE>"
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("FASTQ_DIR", help="Folder containing raw FastQ files.")
+    parser.add_argument("SAMPLESHEET_FILE", help="Output samplesheet file.")
+    parser.add_argument(
+        "-r1",
+        "--read1_extension",
+        type=str,
+        dest="READ1_EXTENSION",
+        default="_R1_001.fastq.gz",
+        help="File extension for read 1.",
+    )
+    parser.add_argument(
+        "-r2",
+        "--read2_extension",
+        type=str,
+        dest="READ2_EXTENSION",
+        default="_R2_001.fastq.gz",
+        help="File extension for read 2.",
+    )
+    parser.add_argument(
+        "-se",
+        "--single_end",
+        dest="SINGLE_END",
+        action="store_true",
+        help="Single-end information will be auto-detected but this option forces paired-end FastQ files to be treated as single-end so only read 1 information is included in the samplesheet.",
+    )
+    parser.add_argument(
+        "-sn",
+        "--sanitise_name",
+        dest="SANITISE_NAME",
+        action="store_true",
+        help="Whether to further sanitise FastQ file name to get sample id. Used in conjunction with --sanitise_name_delimiter and --sanitise_name_index.",
+    )
+    parser.add_argument(
+        "-sd",
+        "--sanitise_name_delimiter",
+        type=str,
+        dest="SANITISE_NAME_DELIMITER",
+        default="_",
+        help="Delimiter to use to sanitise sample name.",
+    )
+    parser.add_argument(
+        "-si",
+        "--sanitise_name_index",
+        type=int,
+        dest="SANITISE_NAME_INDEX",
+        default=1,
+        help="After splitting FastQ file name by --sanitise_name_delimiter all elements before this index (1-based) will be joined to create final sample name.",
+    )
+    return parser.parse_args(args)
+
+
+def fastq_dir_to_samplesheet(
+    fastq_dir,
+    samplesheet_file,
+    read1_extension="_R1_001.fastq.gz",
+    read2_extension="_R2_001.fastq.gz",
+    single_end=False,
+    sanitise_name=False,
+    sanitise_name_delimiter="_",
+    sanitise_name_index=1,
+):
+    def sanitize_sample(path, extension):
+        """Retrieve sample id from filename"""
+        sample = os.path.basename(path).replace(extension, "")
+        if sanitise_name:
+            sample = sanitise_name_delimiter.join(
+                os.path.basename(path).split(sanitise_name_delimiter)[
+                    :sanitise_name_index
+                ]
+            )
+        return sample
+
+    def get_fastqs(extension):
+        """
+        Needs to be sorted to ensure R1 and R2 are in the same order
+        when merging technical replicates. Glob is not guaranteed to produce
+        sorted results.
+        See also https://stackoverflow.com/questions/6773584/how-is-pythons-glob-glob-ordered
+        """
+        return sorted(
+            glob.glob(os.path.join(fastq_dir, f"*{extension}"), recursive=False)
+        )
+
+    read_dict = {}
+
+    ## Get read 1 files
+    for read1_file in get_fastqs(read1_extension):
+        sample = sanitize_sample(read1_file, read1_extension)
+        if sample not in read_dict:
+            read_dict[sample] = {"R1": [], "R2": []}
+        read_dict[sample]["R1"].append(read1_file)
+
+    ## Get read 2 files
+    if not single_end:
+        for read2_file in get_fastqs(read2_extension):
+            sample = sanitize_sample(read2_file, read2_extension)
+            read_dict[sample]["R2"].append(read2_file)
+
+    ## Write to file
+    if len(read_dict) > 0:
+        out_dir = os.path.dirname(samplesheet_file)
+        if out_dir and not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+
+        with open(samplesheet_file, "w") as fout:
+            header = ["sample", "fastq_1", "fastq_2"]
+            fout.write(",".join(header) + "\n")
+            for sample, reads in sorted(read_dict.items()):
+                for idx, read_1 in enumerate(reads["R1"]):
+                    read_2 = ""
+                    if idx < len(reads["R2"]):
+                        read_2 = reads["R2"][idx]
+                    sample_info = ",".join([sample, read_1, read_2])
+                    fout.write(f"{sample_info}\n")
+    else:
+        error_str = (
+            "\nWARNING: No FastQ files found so samplesheet has not been created!\n\n"
+        )
+        error_str += "Please check the values provided for the:\n"
+        error_str += "  - Path to the directory containing the FastQ files\n"
+        error_str += "  - '--read1_extension' parameter\n"
+        error_str += "  - '--read2_extension' parameter\n"
+        print(error_str)
+        sys.exit(1)
+
+
+def main(args=None):
+    args = parse_args(args)
+
+    fastq_dir_to_samplesheet(
+        fastq_dir=args.FASTQ_DIR,
+        samplesheet_file=args.SAMPLESHEET_FILE,
+        read1_extension=args.READ1_EXTENSION,
+        read2_extension=args.READ2_EXTENSION,
+        single_end=args.SINGLE_END,
+        sanitise_name=args.SANITISE_NAME,
+        sanitise_name_delimiter=args.SANITISE_NAME_DELIMITER,
+        sanitise_name_index=args.SANITISE_NAME_INDEX,
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
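Since the course reads use the `_1.fastq.gz`/`_2.fastq.gz` naming described in the data preparation materials rather than the script's `_R1_001.fastq.gz` default, an invocation might look like this (the paths are assumptions based on the course directory layout):

```bash
python scripts/fastq_dir_to_samplesheet.py data/reads samplesheet.csv \
    -r1 "_1.fastq.gz" \
    -r2 "_2.fastq.gz"
```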
diff --git a/course_files/scripts/M_tuberculosis/merge_tb_data.py b/course_files/scripts/M_tuberculosis/merge_tb_data.py
new file mode 100644
index 0000000..d3cf00f
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/merge_tb_data.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import glob
+import errno
+import argparse
+import pandas as pd
+
+def parser_args(args=None):
+    """
+    Function for input arguments for merge_tb_data.py
+    """
+    Description = 'Merge TB-profiler output and sample_info.csv to create a summary table'
+    Epilog = """Example usage: python merge_tb_data.py """
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("-s", "--sample_info", type=str, default="sample_info.csv", help="Sample metadata file (default: 'sample_info.csv').")
+    parser.add_argument("-t", "--tbp_file", type=str, default="tbprofiler.txt", help="TB-profiler summary file (default: 'tbprofiler.txt').")
+    parser.add_argument("-o", "--output_file", type=str, default="TB_metadata.tsv", help="Merged TB summary file (default: 'TB_metadata.tsv').")
+    return parser.parse_args(args)
+
+def make_dir(path):
+    """
+    Function for making a directory from a provided path
+    """
+    if not len(path) == 0:
+        try:
+            os.makedirs(path)
+        except OSError as exception:
+            if exception.errno != errno.EEXIST:
+                raise
+
+def read_sample_info(sample_info):
+    """
+    Function for reading sample_info.csv
+    """
+    sample_info_read = pd.read_csv(sample_info, sep=',')
+
+    return sample_info_read
+
+def read_tbprofiler(tbp_out):
+    """
+    Function for reading tb-profiler summary file
+    """
+    tbp_out_read = pd.read_csv(tbp_out, sep='\t')
+
+    return tbp_out_read
+
+def metadata_merge(df1, df2):
+    """
+    Function for merging two dataframes using the column 'sample'
+    """
+    merged = pd.merge(df1, df2, on = ['sample'])
+
+    return merged
+
+def main(args=None):
+    args = parser_args(args)
+
+    ## Create output directory if it doesn't exist
+    out_dir = os.path.dirname(args.output_file)
+    make_dir(out_dir)
+
+    ## Read in sample info file
+    sample_info = read_sample_info(args.sample_info)
+
+    ## Read in TB-profiler output file
+    tb_profiler = read_tbprofiler(args.tbp_file)
+
+    ## Merge sample info and TB-profiler outputs
+    merged_df = metadata_merge(sample_info, tb_profiler)
+
+    ## Write output file
+    merged_df.to_csv(args.output_file, sep = '\t', header = True, index = False)
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
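A possible invocation, assuming the collated TB-profiler output produced by `07-run_tb-profiler.sh` (prefix `Nam_TB`, moved into `results/tb-profiler`) is the typing input:

```bash
python scripts/merge_tb_data.py \
    -s sample_info.csv \
    -t results/tb-profiler/Nam_TB.txt \
    -o TB_metadata.tsv
```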
diff --git a/course_files/scripts/M_tuberculosis/remove_outgroup.py b/course_files/scripts/M_tuberculosis/remove_outgroup.py
new file mode 100644
index 0000000..eaa99b2
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/remove_outgroup.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import errno
+import argparse
+from Bio import Phylo
+
+def parser_args(args=None):
+    """
+    Function for input arguments for remove_outgroup.py
+    """
+    Description = 'Read a phylogenetic tree and remove a specified outgroup'
+    Epilog = """Example usage: python remove_outgroup.py -i rooted_tree.newick -g OUTGROUP -o rooted_tree_no_outgroup.newick"""
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("-i", "--input_tree", help="Phylogenetic tree in Newick format")
+    parser.add_argument("-g", "--outgroup", help="Name of outgroup to remove from tree")
+    parser.add_argument("-o", "--output_tree", help="Rooted phylogenetic tree with outgroup removed")
+    return parser.parse_args(args)
+
+def make_dir(path):
+    """
+    Function for making a directory from a provided path
+    """
+    if not len(path) == 0:
+        try:
+            os.makedirs(path)
+        except OSError as exception:
+            if exception.errno != errno.EEXIST:
+                raise
+
+def drop_outgroup(input_file, output_file, outgroup):
+    """
+    Function for removing an outgroup from a phylogenetic tree
+    """
+    # Read the tree from the file
+    tree = Phylo.read(input_file, 'newick')
+
+    # Find the outgroup clade
+    outgroup_clade = tree.find_any(name=outgroup)
+
+    if outgroup_clade:
+        # Remove the outgroup clade from the tree
+        tree.prune(outgroup_clade)
+    else:
+        print(f"Outgroup '{outgroup}' not found in the tree.")
+
+    # Write the rooted tree to the output file
+    Phylo.write(tree, output_file, 'newick')
+    print(f"Tree with outgroup removed written to {output_file}")
+
+def main(args=None):
+    args = parser_args(args)
+
+    ## Create output directory if it doesn't exist
+    out_dir = os.path.dirname(args.output_tree)
+    make_dir(out_dir)
+
+    ## Read in tree and remove the outgroup
+    input_file = args.input_tree
+    output_file = args.output_tree
+    outgroup = args.outgroup
+
+    drop_outgroup(input_file, output_file, outgroup)
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/course_files/scripts/M_tuberculosis/root_tree.py b/course_files/scripts/M_tuberculosis/root_tree.py
new file mode 100644
index 0000000..69e1843
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/root_tree.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import errno
+import argparse
+from Bio import Phylo
+
+def parser_args(args=None):
+    """
+    Function for input arguments for root_tree.py
+    """
+    Description = 'Read and root a phylogenetic tree'
+    Epilog = """Example usage: python root_tree.py -i tree.newick -g OUTGROUP -o rooted_tree.newick"""
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("-i", "--input_tree", help="Phylogenetic tree in Newick format")
+    parser.add_argument("-g", "--outgroup", help="Name of outgroup to root tree with")
+    parser.add_argument("-o", "--output_tree", help="Rooted phylogenetic tree")
+    return parser.parse_args(args)
+
+def make_dir(path):
+    """
+    Function for making a directory from a provided path
+    """
+    if not len(path) == 0:
+        try:
+            os.makedirs(path)
+        except OSError as exception:
+            if exception.errno != errno.EEXIST:
+                raise
+
+def read_and_root_tree(input_file, output_file, outgroup=None):
+    """
+    Function for reading and rooting a phylogenetic tree
+    """
+    # Read the tree from the file
+    tree = Phylo.read(input_file, 'newick')
+
+    # If an outgroup is specified, root the tree using the outgroup
+    if outgroup:
+        tree.root_with_outgroup(outgroup)
+    else:
+        # Otherwise, root the tree at the midpoint
+        tree.root_at_midpoint()
+
+    # Write the rooted tree to the output file
+    Phylo.write(tree, output_file, 'newick')
+    print(f"Rooted tree written to {output_file}")
+
+def main(args=None):
+    args = parser_args(args)
+
+    ## Create output directory if it doesn't exist
+    out_dir = os.path.dirname(args.output_tree)
+    make_dir(out_dir)
+
+    ## Read in and root phylogenetic tree
+    input_file = args.input_tree
+    output_file = args.output_tree
+    outgroup = args.outgroup  # Optional: specify the outgroup for rooting
+
+    read_and_root_tree(input_file, output_file, outgroup)
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/course_files/scripts/M_tuberculosis/seqtk_parser.py b/course_files/scripts/M_tuberculosis/seqtk_parser.py
new file mode 100644
index 0000000..91fe471
--- /dev/null
+++ b/course_files/scripts/M_tuberculosis/seqtk_parser.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import glob
+import errno
+import argparse
+import pandas as pd
+
+def parser_args(args=None):
+    """
+    Function for input arguments for seqtk_parser.py
+    """
+    Description = 'Collect seqtk comp outputs and create a summary table'
+    Epilog = """Example usage: python seqtk_parser.py """
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("-i", "--input_dir", help="directory containing pseudogenome files")
+    parser.add_argument("-of", "--output_file", type=str, default="mapping_summary.tsv", help="seqtk comp summary file (default: 'mapping_summary.tsv').")
+    return parser.parse_args(args)
+
+def make_dir(path):
+    """
+    Function for making a directory from a provided path
+    """
+    if not len(path) == 0:
+        try:
+            os.makedirs(path)
+        except OSError as exception:
+            if exception.errno != errno.EEXIST:
+                raise
+
+def seqtk_to_dataframe(file_list):
+    """
+    Function for creating a dataframe from a list of seqtk comp files
+    """
+    seqtk_file_list = [pd.read_csv(f, sep='\t', header=None) for f in file_list]
+    seqtk_df = pd.concat(seqtk_file_list, ignore_index=True)
+    seqtk_df = seqtk_df.iloc[:, 0:6]
+    seqtk_df.columns = ['sample', 'ref_length', '#A', '#C', '#G', '#T']
+    column_list = ['#A', '#C', '#G', '#T']
+    seqtk_df["mapped"] = seqtk_df[column_list].sum(axis=1)
+    seqtk_df["%ref mapped"] = seqtk_df['mapped'] / seqtk_df['ref_length'] * 100
+
+    return seqtk_df
+
+def main(args=None):
+    args = parser_args(args)
+
+    ## Create output directory if it doesn't exist
+    out_dir = os.path.dirname(args.output_file)
+    make_dir(out_dir)
+
+    ## Create list of seqtk comp tsv outputs
+    fastq_dir=args.input_dir
+    seqtk_files = sorted(glob.glob(os.path.join(fastq_dir, '*.tsv')))
+
+    ## Create dataframe
+    seqtk_df = seqtk_to_dataframe(seqtk_files)
+
+    ## Write output file
+    seqtk_df.to_csv(args.output_file, sep = '\t', header = True, index = False)
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/course_files/scripts/S_aureus/01-run_assemblebac.sh b/course_files/scripts/S_aureus/01-run_assemblebac.sh
new file mode 100644
index 0000000..bd43fa4
--- /dev/null
+++ b/course_files/scripts/S_aureus/01-run_assemblebac.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate nextflow
+
+nextflow run avantonder/assembleBAC \
+    -r main \
+    -resume -profile singularity \
+    --max_memory '16.GB' --max_cpus 8 \
+    --input SAMPLESHEET \
+    --outdir results/assemblebac \
+    --baktadb databases/db-light \
+    --genome_size GENOME_SIZE \
+    --checkm2db databases/CheckM2_database/uniref100.KO.1.dmnd
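A sketch of the assembleBAC script with its two placeholders filled in, assuming a samplesheet generated with `fastq_dir_to_samplesheet.py` and the approximate 2.8 Mb _S. aureus_ genome size (both values are illustrative):

```bash
nextflow run avantonder/assembleBAC \
    -r main \
    -resume -profile singularity \
    --max_memory '16.GB' --max_cpus 8 \
    --input samplesheet.csv \
    --outdir results/assemblebac \
    --baktadb databases/db-light \
    --genome_size 2.8M \
    --checkm2db databases/CheckM2_database/uniref100.KO.1.dmnd
```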
diff --git a/course_files/scripts/S_aureus/02-run_panaroo.sh b/course_files/scripts/S_aureus/02-run_panaroo.sh
new file mode 100644
index 0000000..8825fdc
--- /dev/null
+++ b/course_files/scripts/S_aureus/02-run_panaroo.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate panaroo
+
+# create output directory
+mkdir -p results/panaroo/
+mkdir -p results/snp-sites/
+
+# FIX!!
+# Run panaroo
+panaroo \
+    --input FIX_PATH_TO_INPUT_FILES \
+    --out_dir FIX_OUTPUT_DIRECTORY \
+    --clean-mode strict \
+    --alignment core \
+    --core_threshold 0.98 \
+    --remove-invalid-genes \
+    --threads 8
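Panaroo takes annotated assemblies (GFF3 files, such as those produced by Bakta in the assembleBAC step) as its input. A sketch with the placeholders filled in; the exact sub-path of the annotation files is an assumption about the pipeline's output layout:

```bash
panaroo \
    --input results/assemblebac/bakta/*.gff3 \
    --out_dir results/panaroo \
    --clean-mode strict \
    --alignment core \
    --core_threshold 0.98 \
    --remove-invalid-genes \
    --threads 8
```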
diff --git a/course_files/scripts/S_aureus/03-run_iqtree.sh b/course_files/scripts/S_aureus/03-run_iqtree.sh
new file mode 100644
index 0000000..f87c763
--- /dev/null
+++ b/course_files/scripts/S_aureus/03-run_iqtree.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate iqtree
+
+# create output directory
+mkdir -p results/snp-sites/
+mkdir -p results/iqtree/
+
+# FIX!!
+# extract variable sites
+snp-sites FIX_INPUT_CORE_ALIGNMENT > results/snp-sites/core_gene_alignment_snps.aln
+
+# FIX!!
+# count invariant sites
+snp-sites -C FIX_INPUT_CORE_ALIGNMENT > results/snp-sites/constant_sites.txt
+
+# FIX!!
+# Run iqtree
+iqtree \
+    -fconst $(cat results/snp-sites/constant_sites.txt) \
+    -s FIX_INPUT_SNP_ALIGNMENT \
+    --prefix results/iqtree/School_Staph \
+    -nt AUTO \
+    -ntmax 8 \
+    -mem 8G \
+    -m MFP \
+    -bb 1000
diff --git a/course_files/scripts/S_aureus/fastq_dir_to_samplesheet.py b/course_files/scripts/S_aureus/fastq_dir_to_samplesheet.py
new file mode 100644
index 0000000..447238b
--- /dev/null
+++ b/course_files/scripts/S_aureus/fastq_dir_to_samplesheet.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import glob
+import argparse
+
+
+def parse_args(args=None):
+    Description = (
+        "Generate nf-core/viralrecon samplesheet from a directory of FastQ files."
+    )
+    Epilog = "Example usage: python fastq_dir_to_samplesheet.py <FASTQ_DIR> <SAMPLESHEET_FILE>"
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("FASTQ_DIR", help="Folder containing raw FastQ files.")
+    parser.add_argument("SAMPLESHEET_FILE", help="Output samplesheet file.")
+    parser.add_argument(
+        "-r1",
+        "--read1_extension",
+        type=str,
+        dest="READ1_EXTENSION",
+        default="_R1_001.fastq.gz",
+        help="File extension for read 1.",
+    )
+    parser.add_argument(
+        "-r2",
+        "--read2_extension",
+        type=str,
+        dest="READ2_EXTENSION",
+        default="_R2_001.fastq.gz",
+        help="File extension for read 2.",
+    )
+    parser.add_argument(
+        "-se",
+        "--single_end",
+        dest="SINGLE_END",
+        action="store_true",
+        help="Single-end information will be auto-detected but this option forces paired-end FastQ files to be treated as single-end so only read 1 information is included in the samplesheet.",
+    )
+    parser.add_argument(
+        "-sn",
+        "--sanitise_name",
+        dest="SANITISE_NAME",
+        action="store_true",
+        help="Whether to further sanitise FastQ file name to get sample id. Used in conjunction with --sanitise_name_delimiter and --sanitise_name_index.",
+    )
+    parser.add_argument(
+        "-sd",
+        "--sanitise_name_delimiter",
+        type=str,
+        dest="SANITISE_NAME_DELIMITER",
+        default="_",
+        help="Delimiter to use to sanitise sample name.",
+    )
+    parser.add_argument(
+        "-si",
+        "--sanitise_name_index",
+        type=int,
+        dest="SANITISE_NAME_INDEX",
+        default=1,
+        help="After splitting FastQ file name by --sanitise_name_delimiter all elements before this index (1-based) will be joined to create final sample name.",
+    )
+    return parser.parse_args(args)
+
+
+def fastq_dir_to_samplesheet(
+    fastq_dir,
+    samplesheet_file,
+    read1_extension="_R1_001.fastq.gz",
+    read2_extension="_R2_001.fastq.gz",
+    single_end=False,
+    sanitise_name=False,
+    sanitise_name_delimiter="_",
+    sanitise_name_index=1,
+):
+    def sanitize_sample(path, extension):
+        """Retrieve sample id from filename"""
+        sample = os.path.basename(path).replace(extension, "")
+        if sanitise_name:
+            sample = sanitise_name_delimiter.join(
+                os.path.basename(path).split(sanitise_name_delimiter)[
+                    :sanitise_name_index
+                ]
+            )
+        return sample
+
+    def get_fastqs(extension):
+        """
+        Needs to be sorted to ensure R1 and R2 are in the same order
+        when merging technical replicates. Glob is not guaranteed to produce
+        sorted results.
+        See also https://stackoverflow.com/questions/6773584/how-is-pythons-glob-glob-ordered
+        """
+        return sorted(
+            glob.glob(os.path.join(fastq_dir, f"*{extension}"), recursive=False)
+        )
+
+    read_dict = {}
+
+    ## Get read 1 files
+    for read1_file in get_fastqs(read1_extension):
+        sample = sanitize_sample(read1_file, read1_extension)
+        if sample not in read_dict:
+            read_dict[sample] = {"R1": [], "R2": []}
+        read_dict[sample]["R1"].append(read1_file)
+
+    ## Get read 2 files
+    if not single_end:
+        for read2_file in get_fastqs(read2_extension):
+            sample = sanitize_sample(read2_file, read2_extension)
+            read_dict[sample]["R2"].append(read2_file)
+
+    ## Write to file
+    if len(read_dict) > 0:
+        out_dir = os.path.dirname(samplesheet_file)
+        if out_dir and not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+
+        with open(samplesheet_file, "w") as fout:
+            header = ["sample", "fastq_1", "fastq_2"]
+            fout.write(",".join(header) + "\n")
+            for sample, reads in sorted(read_dict.items()):
+                for idx, read_1 in enumerate(reads["R1"]):
+                    read_2 = ""
+                    if idx < len(reads["R2"]):
+                        read_2 = reads["R2"][idx]
+                    sample_info = ",".join([sample, read_1, read_2])
+                    fout.write(f"{sample_info}\n")
+    else:
+        error_str = (
+            "\nWARNING: No FastQ files found so samplesheet has not been created!\n\n"
+        )
+        error_str += "Please check the values provided for the:\n"
+        error_str += "  - Path to the directory containing the FastQ files\n"
+        error_str += "  - '--read1_extension' parameter\n"
+        error_str += "  - '--read2_extension' parameter\n"
+        print(error_str)
+        sys.exit(1)
+
+
+def main(args=None):
+    args = parse_args(args)
+
+    fastq_dir_to_samplesheet(
+        fastq_dir=args.FASTQ_DIR,
+        samplesheet_file=args.SAMPLESHEET_FILE,
+        read1_extension=args.READ1_EXTENSION,
+        read2_extension=args.READ2_EXTENSION,
+        single_end=args.SINGLE_END,
+        sanitise_name=args.SANITISE_NAME,
+        sanitise_name_delimiter=args.SANITISE_NAME_DELIMITER,
+        sanitise_name_index=args.SANITISE_NAME_INDEX,
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/course_files/scripts/S_aureus/merge_staph_data.py b/course_files/scripts/S_aureus/merge_staph_data.py
new file mode 100644
index 0000000..10ee8ee
--- /dev/null
+++ b/course_files/scripts/S_aureus/merge_staph_data.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import glob
+import errno
+import argparse
+import pandas as pd
+
+def parser_args(args=None):
+    """
+    Function for input arguments for merge_staph_data.py
+    """
+    Description = 'Merge Pathogenwatch output and sample_info.csv to create a summary table'
+    Epilog = """Example usage: python merge_staph_data.py """
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("-s", "--sample_info", type=str, default="sample_info.csv", help="Sample metadata file (default: 'sample_info.csv').")
+    parser.add_argument("-t", "--typing_file", type=str, default="pathogenwatch-typing.csv", help="Pathogenwatch typing file (default: 'pathogenwatch-typing.csv').")
+    parser.add_argument("-a", "--amr_file", type=str, default="pathogenwatch-amr-profile.csv", help="Pathogenwatch AMR file (default: 'pathogenwatch-amr-profile.csv').")
+    parser.add_argument("-o", "--output_file", type=str, default="staph_metadata.tsv", help="Merged Staph summary file (default: 'staph_metadata.tsv').")
+    return parser.parse_args(args)
+
+def make_dir(path):
+    """
+    Function for making a directory from a provided path
+    """
+    if not len(path) == 0:
+        try:
+            os.makedirs(path)
+        except OSError as exception:
+            if exception.errno != errno.EEXIST:
+                raise
+
+def read_sample_info(sample_info):
+    """
+    Function for reading sample_info.csv
+    """
+    sample_info_read = pd.read_csv(sample_info, sep=',')
+
+    return sample_info_read
+
+def read_pathogenwatch(pw_out):
+    """
+    Function for reading Pathogenwatch CSV files
+    """
+    pw_out_read = pd.read_csv(pw_out, sep=',')
+
+    return pw_out_read
+
+def metadata_merge(df1, df2, on_column):
+    """
+    Function for merging two dataframes using a column name
+    """
+    merged_df = pd.merge(df1, df2, on = on_column)
+
+    return merged_df
+
+def pathogenwatch_rename(df):
+    """
+    Function for renaming sample names to remove '_contigs' and
+    renaming column 'NAME' to 'sample'
+    """
+    df['NAME'] = df['NAME'].str.replace('_contigs','')
+    df = df.rename(columns = {'NAME' : 'sample'})
+
+    return df
+
+def main(args=None):
+    args = parser_args(args)
+
+    ## Create output directory if it doesn't exist
+    out_dir = os.path.dirname(args.output_file)
+    make_dir(out_dir)
+
+    ## Read in sample info file
+    sample_info = read_sample_info(args.sample_info)
+
+    ## Read in Pathogenwatch typing file
+    typing = read_pathogenwatch(args.typing_file)
+
+    ## Read in Pathogenwatch AMR file
+    amr_profile = read_pathogenwatch(args.amr_file)
+
+    ## Merge Pathogenwatch outputs
+    merged_pw_df = metadata_merge(typing, amr_profile, 'NAME')
+
+    ## Edit Pathogenwatch dataframe
+    merged_pw_df = pathogenwatch_rename(merged_pw_df)
+
+    ## Merge sample_info.csv and Pathogenwatch dataframe
+    final_merged_df = metadata_merge(sample_info, merged_pw_df, 'sample')
+
+    ## Write output file
+    final_merged_df.to_csv(args.output_file, sep = '\t', header = True, index = False)
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/course_files/scripts/S_pneumoniae/01-run_bactmap.sh b/course_files/scripts/S_pneumoniae/01-run_bactmap.sh
new file mode 100644
index 0000000..49b1ac6
--- /dev/null
+++ b/course_files/scripts/S_pneumoniae/01-run_bactmap.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate nextflow
+
+# create output directory
+mkdir -p results/bactmap
+
+# FIX!!
+# run the pipeline
+nextflow run nf-core/bactmap \
+    -resume -profile singularity \
+    --max_memory '16.GB' --max_cpus 8 \
+    --input SAMPLESHEET \
+    --outdir results/bactmap \
+    --reference REFERENCE \
+    --genome_size 2.0M
diff --git a/course_files/scripts/S_pneumoniae/02-pseudogenome_check.sh b/course_files/scripts/S_pneumoniae/02-pseudogenome_check.sh
new file mode 100644
index 0000000..6a7000e
--- /dev/null
+++ b/course_files/scripts/S_pneumoniae/02-pseudogenome_check.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate seqtk
+
+#### Settings #####
+
+# directory with pseudogenome FASTA
+
+fasta_dir="results/bactmap/pseudogenomes"
+
+# output directory for results
+outdir="results/bactmap/pseudogenomes_check"
+
+# path to seqtk_parser.py
+parser="scripts/seqtk_parser.py"
+
+#### End of settings ####
+
+
+#### Analysis ####
+# WARNING: be careful changing the code below
+
+# exit upon any error
+set -e
+
+# create output directory
+mkdir -p $outdir/seqtk
+
+# loop through each pseudogenome, skipping the whole-genome alignment
+for filepath in $(ls $fasta_dir/*.fas | grep -v "aligned_")
+do
+    # get the sample name
+    sample=$(basename $filepath)
+
+    # print a message
+    echo "Processing $sample"
+
+    # run seqtk command
+    seqtk comp $filepath > ${outdir}/seqtk/${sample}.tsv
+done
+
+# run seqtk_parser.py
+python $parser --input_dir $outdir/seqtk
+
+# move mapping_summary.tsv to results/bactmap/pseudogenomes_check
+mv mapping_summary.tsv $outdir
diff --git a/course_files/scripts/S_pneumoniae/03-run_gubbins.sh b/course_files/scripts/S_pneumoniae/03-run_gubbins.sh
new file mode 100644
index 0000000..788637c
--- /dev/null
+++ b/course_files/scripts/S_pneumoniae/03-run_gubbins.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate gubbins
+
+# create output directory
+mkdir -p results/gubbins/
+
+# reference genome gff
+gff=resources/reference/GCF_000299015.1_ASM29901v1_genomic.gff
+
+# pseudogenome alignment to detect recombination in
+alignment="results/bactmap/pseudogenomes/aligned_pseudogenomes.fas"
+
+# prefix for gubbins outputs
+prefix="sero1"
+
+# output directory for results
+outdir="results/gubbins"
+
+# name of masked alignment
+outfile="results/gubbins/aligned_pseudogenomes_masked.fas"
+
+# run gubbins
+run_gubbins.py --prefix $prefix --tree-builder iqtree $alignment
+
+# move gubbins outputs to results directory
+mv ${prefix}.* $outdir
+
+# mask original alignment with recombinant regions
+mask_gubbins_aln.py --aln $alignment --gff $outdir/${prefix}.recombination_predictions.gff --out $outfile
+
+# run plot_gubbins.R to create plot of recombinant regions
+plot_gubbins.R -t $outdir/${prefix}.final_tree.tre -r $outdir/${prefix}.recombination_predictions.gff -a $gff -o $outdir/${prefix}.recombination.png
diff --git a/course_files/scripts/S_pneumoniae/04-run_iqtree.sh b/course_files/scripts/S_pneumoniae/04-run_iqtree.sh
new file mode 100644
index 0000000..5982540
--- /dev/null
+++ b/course_files/scripts/S_pneumoniae/04-run_iqtree.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate iqtree
+
+# create output directory
+mkdir -p results/snp-sites/
+mkdir -p results/iqtree/
+
+# FIX!!
+# extract variable sites
+snp-sites FIX_INPUT_PSEUDOGENOMES_FASTA > results/snp-sites/aligned_pseudogenomes_masked_snps.fas
+
+# FIX!!
+# count invariant sites
+snp-sites -C FIX_INPUT_PSEUDOGENOMES_FASTA > results/snp-sites/constant_sites.txt
+
+# FIX!!
+# Run iqtree
+iqtree \
+    -fconst FIX_CONSTANT_SITES \
+    -s FIX_INPUT_SNP_ALIGNMENT \
+    --prefix results/iqtree/sero1 \
+    -nt AUTO \
+    -ntmax 8 \
+    -mem 8G \
+    -m MFP \
+    -bb 1000
diff --git a/course_files/scripts/S_pneumoniae/05-run_poppunk.sh b/course_files/scripts/S_pneumoniae/05-run_poppunk.sh
new file mode 100644
index 0000000..a20a3e8
--- /dev/null
+++ b/course_files/scripts/S_pneumoniae/05-run_poppunk.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate poppunk
+
+# create output directory
+mkdir -p results/poppunk/
+
+# PopPUNK database
+db="databases/poppunk/GPS_v8_ref"
+
+# GPSC designations
+clusters="databases/poppunk/GPS_v8_external_clusters.csv"
+
+# list of assemblies
+assemblies="assemblies.txt"
+
+# output directory for results
+outdir="results/poppunk"
+
+# run PopPUNK
+poppunk_assign --db $db --external-clustering $clusters --query $assemblies --output $outdir --threads 8
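`poppunk_assign` expects the `--query` file to list the genomes to assign, typically as two tab-separated columns: a sample name and the path to its assembly. A minimal sketch with placeholder names and paths:

```bash
# create the query file (sample names and assembly paths are placeholders)
cat > assemblies.txt <<'EOF'
sample1	results/assemblebac/assemblies/sample1_contigs.fa
sample2	results/assemblebac/assemblies/sample2_contigs.fa
EOF
```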
diff --git a/course_files/scripts/S_pneumoniae/06-plot_phylogeny.R b/course_files/scripts/S_pneumoniae/06-plot_phylogeny.R
new file mode 100644
index 0000000..aa1559a
--- /dev/null
+++ b/course_files/scripts/S_pneumoniae/06-plot_phylogeny.R
@@ -0,0 +1,75 @@
+#############################
+## Load required libraries ##
+#############################
+
+library(tidyverse)
+library(ape)       # provides read.tree() and root()
+library(ggtree)
+library(ggnewscale)
+
+#################################
+## Assign colours for metadata ##
+#################################
+
+colourList <- c("#0000cc","#ff0000","#660000","#9900ff","#ff66cc","#006600","#9999ff","#996600","#663366","#99cc66",
+                "#c4dc00","#ff6600","#666600","#cc9933","#99cc00","#cc99ff","#ccccff","#ffcc66","#336666","#000066","#666666","#663300","#1dc39f",
+                "#ddff33","#ffd700","#ffac2a","#c4dc00","#8f007f","#444c04","#e8ff2a","#8f0038","#388f00","#dc00c4","#1dc34f","#c34c1d","#ff6666",
+                "#ffcccc", "#FFFFD9", "#EDF8B1", "#C7E9B4", "#7FCDBB")
+
+##########################
+## Read in metadata TSV ##
+##########################
+
+# Update the path
+metadata <- read_tsv("pneumo_metadata.tsv")
+
+############################################
+## Create metadata df for ggtree plotting ##
+############################################
+
+metadataDF <- as.data.frame(metadata[2:10])
+
+metadataDF$`MLST ST (PubMLST)` <- as.factor(metadataDF$`MLST ST (PubMLST)`)
+
+rownames(metadataDF) <- metadata$sample
+
+##############################
+## Read in IQTREE phylogeny ##
+##############################
+
+# Update the path
+pneumo_phylogeny <- read.tree("sero1.treefile")
+
+#####################################
+## Root with reference as outgroup ##
+#####################################
+
+pneumo_phylogeny <- root(pneumo_phylogeny, outgroup = "NC_018630.1", resolve.root = TRUE)
+
+############################
+## Plot phylogenetic tree ##
+############################
+
+pneumo_phylogeny_plot <- ggtree(pneumo_phylogeny) +
+  geom_tiplab(align = F,
+              size = 2) +
+  xlim_tree(0.002) +
+  geom_treescale(x = 0, y = 25) +
+  theme_tree()
+
+##############################################
+## Plot phylogeny with ST as metadata strip ##
+##############################################
+
+pneumo_phylogeny_plot_meta <- gheatmap(pneumo_phylogeny_plot,
+                                       metadataDF[9],
+                                       offset = 0.0005,
+                                       width = 0.05,
+                                       color = NA,
+                                       colnames = T,
+                                       colnames_angle=90,
+                                       colnames_position = "top",
+                                       hjust = 0,
+                                       font.size = 4) +
+  scale_fill_manual(values = colourList, name = "MLST ST (PubMLST)") +
+  coord_cartesian(clip = "off")
diff --git a/course_files/scripts/S_pneumoniae/07-run_funcscan.sh b/course_files/scripts/S_pneumoniae/07-run_funcscan.sh
new file mode 100644
index 0000000..3c3c150
--- /dev/null
+++ b/course_files/scripts/S_pneumoniae/07-run_funcscan.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# before running this script make sure to
+# mamba activate nextflow
+
+# create output directory
+mkdir -p results/funcscan
+
+# FIX!!
+# run the pipeline
+nextflow run nf-core/funcscan \
+    -resume -profile singularity \
+    --max_memory '16.GB' --max_cpus 8 \
+    --input FIX_PATH_TO_SAMPLESHEET \
+    --outdir FIX_PATH_TO_OUTPUT_DIRECTORY \
+    --run_arg_screening \
+    --arg_skip_deeparg
diff --git a/course_files/scripts/S_pneumoniae/fastq_dir_to_samplesheet.py b/course_files/scripts/S_pneumoniae/fastq_dir_to_samplesheet.py
new file mode 100644
index 0000000..447238b
--- /dev/null
+++ b/course_files/scripts/S_pneumoniae/fastq_dir_to_samplesheet.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import glob
+import argparse
+
+
+def parse_args(args=None):
+    Description = (
+        "Generate nf-core/viralrecon samplesheet from a directory of FastQ files."
+    )
+    Epilog = "Example usage: python fastq_dir_to_samplesheet.py <FASTQ_DIR> <SAMPLESHEET_FILE>"
+
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("FASTQ_DIR", help="Folder containing raw FastQ files.")
+    parser.add_argument("SAMPLESHEET_FILE", help="Output samplesheet file.")
+    parser.add_argument(
+        "-r1",
+        "--read1_extension",
+        type=str,
+        dest="READ1_EXTENSION",
+        default="_R1_001.fastq.gz",
+        help="File extension for read 1.",
+    )
+    parser.add_argument(
+        "-r2",
+        "--read2_extension",
+        type=str,
+        dest="READ2_EXTENSION",
+        default="_R2_001.fastq.gz",
+        help="File extension for read 2.",
+    )
+    parser.add_argument(
+        "-se",
+        "--single_end",
+        dest="SINGLE_END",
+        action="store_true",
+        help="Single-end information will be auto-detected but this option forces paired-end FastQ files to be treated as single-end so only read 1 information is included in the samplesheet.",
+    )
+    parser.add_argument(
+        "-sn",
+        "--sanitise_name",
+        dest="SANITISE_NAME",
+        action="store_true",
+        help="Whether to further sanitise FastQ file name to get sample id. Used in conjunction with --sanitise_name_delimiter and --sanitise_name_index.",
+    )
+    parser.add_argument(
+        "-sd",
+        "--sanitise_name_delimiter",
+        type=str,
+        dest="SANITISE_NAME_DELIMITER",
+        default="_",
+        help="Delimiter to use to sanitise sample name.",
+    )
+    parser.add_argument(
+        "-si",
+        "--sanitise_name_index",
+        type=int,
+        dest="SANITISE_NAME_INDEX",
+        default=1,
+        help="After splitting FastQ file name by --sanitise_name_delimiter all elements before this index (1-based) will be joined to create final sample name.",
+    )
+    return parser.parse_args(args)
+
+
+def fastq_dir_to_samplesheet(
+    fastq_dir,
+    samplesheet_file,
+    read1_extension="_R1_001.fastq.gz",
+    read2_extension="_R2_001.fastq.gz",
+    single_end=False,
+    sanitise_name=False,
+    sanitise_name_delimiter="_",
+    sanitise_name_index=1,
+):
+    def sanitize_sample(path, extension):
+        """Retrieve sample id from filename"""
+        sample = os.path.basename(path).replace(extension, "")
+        if sanitise_name:
+            sample = sanitise_name_delimiter.join(
+                os.path.basename(path).split(sanitise_name_delimiter)[
+                    :sanitise_name_index
+                ]
+            )
+        return sample
+
+    def get_fastqs(extension):
+        """
+        Needs to be sorted to ensure R1 and R2 are in the same order
+        when merging technical replicates. Glob is not guaranteed to produce
+        sorted results.
+        See also https://stackoverflow.com/questions/6773584/how-is-pythons-glob-glob-ordered
+        """
+        return sorted(
+            glob.glob(os.path.join(fastq_dir, f"*{extension}"), recursive=False)
+        )
+
+    read_dict = {}
+
+    ## Get read 1 files
+    for read1_file in get_fastqs(read1_extension):
+        sample = sanitize_sample(read1_file, read1_extension)
+        if sample not in read_dict:
+            read_dict[sample] = {"R1": [], "R2": []}
+        read_dict[sample]["R1"].append(read1_file)
+
+    ## Get read 2 files
+    if not single_end:
+        for read2_file in get_fastqs(read2_extension):
+            sample = sanitize_sample(read2_file, read2_extension)
+            read_dict[sample]["R2"].append(read2_file)
+
+    ## Write to file
+    if len(read_dict) > 0:
+        out_dir = os.path.dirname(samplesheet_file)
+        if out_dir and not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+
+        with open(samplesheet_file, "w") as fout:
+            header = ["sample", "fastq_1", "fastq_2"]
+            fout.write(",".join(header) + "\n")
+            for sample, reads in sorted(read_dict.items()):
+                for idx, read_1 in enumerate(reads["R1"]):
+                    read_2 = ""
+                    if idx < len(reads["R2"]):
+                        read_2 = reads["R2"][idx]
+                    sample_info = ",".join([sample, read_1, read_2])
+                    fout.write(f"{sample_info}\n")
+    else:
+        error_str = (
+            "\nWARNING: No FastQ files found so samplesheet has not been created!\n\n"
+        )
+        error_str += "Please check the values provided for the:\n"
+        error_str += "  - Path to the directory containing the FastQ files\n"
+        error_str += "  - '--read1_extension' parameter\n"
+        error_str += "  - '--read2_extension' parameter\n"
+        print(error_str)
+        sys.exit(1)
+
+
+def main(args=None):
+    args = parse_args(args)
+
+    fastq_dir_to_samplesheet(
+        fastq_dir=args.FASTQ_DIR,
+        samplesheet_file=args.SAMPLESHEET_FILE,
+        read1_extension=args.READ1_EXTENSION,
+        read2_extension=args.READ2_EXTENSION,
+        single_end=args.SINGLE_END,
+        sanitise_name=args.SANITISE_NAME,
+        sanitise_name_delimiter=args.SANITISE_NAME_DELIMITER,
+        sanitise_name_index=args.SANITISE_NAME_INDEX,
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/course_files/scripts/S_pneumoniae/merge_pneumo_data.py b/course_files/scripts/S_pneumoniae/merge_pneumo_data.py
new file mode 100644
index 0000000..8cf5ded
--- /dev/null
+++ b/course_files/scripts/S_pneumoniae/merge_pneumo_data.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import glob
+import errno
+import argparse
+import pandas as pd
+
+def parser_args(args=None):
+    """
+    Function for input arguments for merge_pneumo_data.py
+    """
+    Description = 'Merge Pathogenwatch output and sample_info.csv to create a summary table'
+    Epilog = """Example usage: python merge_pneumo_data.py """
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("-s", "--sample_info", type=str, default="sample_info.csv", help="Sample metadata file (default: 'sample_info.csv').")
+    parser.add_argument("-t", "--typing_file", type=str, default="pathogenwatch-typing.csv", help="Pathogenwatch typing file (default: 'pathogenwatch-typing.csv').")
+    parser.add_argument("-a", "--amr_file", type=str, default="pathogenwatch-amr-profile.csv", help="Pathogenwatch AMR file (default: 'pathogenwatch-amr-profile.csv').")
+    parser.add_argument("-o", "--output_file", type=str, default="pneumo_metadata.tsv", help="Merged Pneumo summary file (default: 'pneumo_metadata.tsv').")
+    return parser.parse_args(args)
+
+def make_dir(path):
+    """
+    Function for making a directory from a provided path
+    """
+    if not len(path) == 0:
+        try:
+            os.makedirs(path)
+        except OSError as exception:
+            if exception.errno != errno.EEXIST:
+                raise
+
+def read_sample_info(sample_info):
+    """
+    Function for reading sample_info.csv
+    """
+    sample_info_read = pd.read_csv(sample_info, sep=',')
+
+    return sample_info_read
+
+def read_pathogenwatch(pw_out):
+    """
+    Function for reading Pathogenwatch CSV files
+    """
+    pw_out_read = pd.read_csv(pw_out, sep=',')
+
+    return pw_out_read
+
+def metadata_merge(df1, df2, on_column):
+    """
+    Function for merging two dataframes using a column name
+    """
+    merged_df = pd.merge(df1, df2, on = on_column)
+
+    return merged_df
+
+def pathogenwatch_rename(df):
+    """
+    Function for renaming sample names to remove '_contigs' and
+    renaming column 'NAME' to 'sample'
+    """
+    df['NAME'] = df['NAME'].str.replace('_contigs','')
+    df = df.rename(columns = {'NAME' : 'sample'})
+
+    return df
+
+def main(args=None):
+    args = parser_args(args)
+
+    ## Create output directory if it doesn't exist
+    out_dir = os.path.dirname(args.output_file)
+    make_dir(out_dir)
+
+    ## Read in sample info file
+    sample_info = read_sample_info(args.sample_info)
+
+    ## Read in Pathogenwatch typing file
+    typing = read_pathogenwatch(args.typing_file)
+
+    ## Read in Pathogenwatch AMR file
+    amr_profile = read_pathogenwatch(args.amr_file)
+
+    ## Merge Pathogenwatch outputs
+    merged_pw_df = metadata_merge(typing, amr_profile, 'NAME')
+
+    ## Edit Pathogenwatch dataframe
+    merged_pw_df = pathogenwatch_rename(merged_pw_df)
+
+    ## Merge sample_info.csv and Pathogenwatch dataframe
+    final_merged_df = metadata_merge(sample_info, merged_pw_df, 'sample')
+
+    ## Remove _T1 from sample_ids to match tip labels
+    final_merged_df['sample'] = final_merged_df['sample'].str.replace('_T1','')
+
+    ## Write output file
+    final_merged_df.to_csv(args.output_file, sep = '\t', header = True, index = False)
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/course_files/scripts/S_pneumoniae/seqtk_parser.py b/course_files/scripts/S_pneumoniae/seqtk_parser.py
new file mode 100644
index 0000000..91fe471
--- /dev/null
+++ b/course_files/scripts/S_pneumoniae/seqtk_parser.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import glob
+import errno
+import argparse
+import pandas as pd
+
+def parser_args(args=None):
+    """
+    Function for input arguments for seqtk_parser.py
+    """
+    Description = 'Collect seqtk comp outputs and create a summary table'
+    Epilog = """Example usage: python seqtk_parser.py """
+    parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
+    parser.add_argument("-i", "--input_dir", help="directory containing pseudogenome files")
+    parser.add_argument("-of", "--output_file", type=str, default="mapping_summary.tsv", help="seqtk comp summary file (default: 'mapping_summary.tsv').")
+    return parser.parse_args(args)
+
+def make_dir(path):
+    """
+    Function for making a directory from a provided path
+    """
+    if not len(path) == 0:
+        try:
+            os.makedirs(path)
+        except OSError as exception:
+            if exception.errno != errno.EEXIST:
+                raise
+
+def seqtk_to_dataframe(file_list):
+    """
+    Function for creating a dataframe from a list of seqtk comp files
+    """
+    seqtk_file_list = [pd.read_csv(f, sep='\t', header=None) for f in file_list]
+    seqtk_df = pd.concat(seqtk_file_list, ignore_index=True)
+    seqtk_df = seqtk_df.iloc[:, 0:6]
+    seqtk_df.columns = ['sample', 'ref_length', '#A', '#C', '#G', '#T']
+    column_list = ['#A', '#C', '#G', '#T']
+    seqtk_df["mapped"] = seqtk_df[column_list].sum(axis=1)
+    seqtk_df["%ref mapped"] = seqtk_df['mapped'] / seqtk_df['ref_length'] * 100
+
+    return seqtk_df
+
+def main(args=None):
+    args = parser_args(args)
+
+    ## Create output directory if it doesn't exist
+    out_dir = os.path.dirname(args.output_file)
+    make_dir(out_dir)
+
+    ## Create list of seqtk comp tsv outputs
+    fastq_dir=args.input_dir
+    seqtk_files = sorted(glob.glob(os.path.join(fastq_dir, '*.tsv')))
+
+    ## Create dataframe
+    seqtk_df = seqtk_to_dataframe(seqtk_files)
+
+    ## Write output file
+    seqtk_df.to_csv(args.output_file, sep = '\t', header = True, index = False)
+
+if __name__ == '__main__':
+    sys.exit(main())
\ No newline at end of file
diff --git a/index.md b/index.md
index 969a13a..c3c1186 100644
--- a/index.md
+++ b/index.md
@@ -7,7 +7,7 @@ number-sections: false
 
 ## Overview
 
-This comprehensive course equips you with essential skills and knowledge in bacterial genomics analysis, primarily using Illumina-sequenced samples. You'll gain an understanding of how to select the most appropriate analysis workflow, tailored to the genome diversity of a given bacterial species. Through hands-on training, you'll apply both _de novo_ assembly and reference-based mapping approaches to obtain bacterial genomes for your isolates. You will apply standardised workflows for genome assembly and annotation, including quality assessment criteria to ensure the reliability of your results. Furthermore, you'll learn how to construct phylogenetic trees using whole genome and core genome alignments, enabling you to explore the evolutionary relationships among bacterial isolates. Lastly, you'll apply methods to detect antimicrobial resistance genes. As examples we will use _Mycobacterium tuberculosis_, _Staphylococcus aureus_ and _Streptococcus pneumoniae_, allowing you to become well-equipped to conduct bacterial genomics analyses on a range of species.
+This comprehensive course equips you with essential skills and knowledge in bacterial genomics analysis, primarily using Illumina-sequenced samples. You'll gain an understanding of how to select the most appropriate analysis workflow, tailored to the genome diversity of a given bacterial species. Through hands-on training, you'll apply both _de novo_ assembly and reference-based mapping approaches to obtain bacterial genomes for your isolates. You will apply standardised workflows for genome assembly and annotation, including quality assessment criteria to ensure the reliability of your results. Along with typing bacteria using methods such as MLST, you'll learn how to construct phylogenetic trees using whole genome and core genome alignments, enabling you to explore the evolutionary relationships among bacterial isolates. You'll extend this to estimate a time-scaled phylogeny using a starting phylogenetic tree. Lastly, you'll apply methods to detect antimicrobial resistance genes. As examples we will use _Mycobacterium tuberculosis_, _Staphylococcus aureus_ and _Streptococcus pneumoniae_, allowing you to become well-equipped to conduct bacterial genomics analyses on a range of species.
 
 ::: {.callout-tip}
 ### Learning Objectives
@@ -20,6 +20,8 @@ By the end of this course, you will be able to:
 - Evaluate the quality of assembled genomes and determine their suitability for downstream analysis.
 - Detect and remove recombinant regions.
 - Construct phylogenetic trees using both whole genome and core genome alignments.
+- Estimate a time-scaled phylogeny using an initial maximum likelihood phylogenetic tree and sample dates.
+- Conduct genomic epidemiology and strain typing.
- Detect the presence of antimicrobial resistance genes in your isolates. ::: diff --git a/materials/.DS_Store b/materials/.DS_Store index aad9492..102ec03 100644 Binary files a/materials/.DS_Store and b/materials/.DS_Store differ diff --git a/materials/02-know_your_bug.md b/materials/01-know_your_bug.md similarity index 100% rename from materials/02-know_your_bug.md rename to materials/01-know_your_bug.md diff --git a/materials/02-preparing_data.md b/materials/02-preparing_data.md new file mode 100644 index 0000000..b5c5b67 --- /dev/null +++ b/materials/02-preparing_data.md @@ -0,0 +1,76 @@ +--- +title: "Preparing data" +--- + +::: {.callout-tip} +## Learning Objectives + +- Recognise the importance of organising your files and folders when starting a bioinformatic analysis. +- Investigate the content of sequencing FASTQ files. +- Recognise the importance of metadata for data interpretation and discuss its relevance for pathogen surveillance. + +::: + +## Preparing Files + +On our computers, we have a directory in `~/Course_Materials` for each of the three species we will be working with on the course, and this is where we will do all our analysis. We already included the following in each species directory: + +- `data/` - contains the sequencing files we will be working with. +- `preprocessed/` - contains the results for all the analysis we'll be running during the course (see box below). +- `resources/` - where we include other files we will be using such as reference genomes. +- `databases/` - where we include public databases needed by some tools. +- `scripts/` - where we include some scripts that we will use during the course. You will have to edit some of these scripts as part of the exercises. +- `sample_info.csv` - a table with some metadata for our samples. + +:::{.callout-warning} + +We have a lot to fit in this week and, during testing, we found that running all the pipelines and tools on even the genomes we selected from larger datasets was going to take too long. We therefore decided to analyse only five samples during the exercises you'll be doing this week. This will allow you to set up and run the various scripts, see the pipelines and tools running and see the outputs in the time we've set aside for each section. The outputs will always be sent to a `results` directory. However, we've provided the results you would have obtained if you'd run the scripts on all of the data in a `preprocessed` directory. For some of the exercises, you'll be making use of the outputs in this directory instead of what you create yourselves, as this will hopefully give you a more realistic set of results like the ones you may eventually generate when working with larger datasets. We will make it clear in the course materials and exercises when you should be working with the results in the `preprocessed` directory instead of the `results` directory. ::: + +### Data + +Your analysis starts with FASTQ files (if you need a reminder of what a FASTQ file is, look at the [Intro to NGS > FASTQ](03-intro_ngs.html#FASTQ_Files) section of the materials). The Illumina files come as compressed FASTQ files (`.fastq.gz` format) and there are two files per sample, corresponding to read 1 and read 2.
+This is indicated by the file name suffix: + +- `*_1.fastq.gz` for read 1 +- `*_2.fastq.gz` for read 2 + +You can look at the files you have available in any of the species directories from the command line using: + +```bash +ls data/reads +``` + +### Metadata + +A critical step in any analysis is to make sure that our samples have all the relevant metadata associated with them. This is important to make sense of our results and produce informative reports at the end. There are many types of information that can be collected from each sample and for effective genomic surveillance, we need at the very minimum three pieces of information: + +- **When**: date when the sample was collected (not when it was sequenced!). +- **Where**: the location where the sample was collected (not where it was sequenced!). +- **Source**: the source of the sample, e.g. host, environment. + +Of course, this is the **minimum** metadata we need for a useful analysis. The more information you collect about each sample, the more questions you can ask from your data -- so always remember to record as much information as possible for each sample. + +:::{.callout-warning} +#### Dates in Spreadsheet Programs + +Note that programs such as _Excel_ often convert date columns to their own format, and this can cause problems when analysing data later on. +The ISO standard for dates is YYYY-MM-DD and this is how we recommend you store your dates. However, by default _Excel_ displays dates as DD/MM/YYYY. +You can change how _Excel_ displays dates by highlighting the date column, right-clicking and selecting Format Cells, then selecting "Date" and picking the format that matches YYYY-MM-DD. +However, every time you open the CSV file, _Excel_ annoyingly converts it back to its default format! + +To make sure no date information is lost due to _Excel_'s behaviour, it's a good idea to **store information about year, month and day in separate columns** (stored just as regular numbers). + +::: + +## Summary + +::: {.callout-tip} +## Key Points + +- Proper file and folder organisation ensures clarity, reproducibility, and efficiency throughout your bioinformatic analysis. +- Organising files by project and creating directories for data, scripts and results helps prevent data mix-ups and confusion. +- FASTQ files, which contain the raw sequencing data, can be quickly investigated using standard command line tools, for example to count how many reads are available. +- Metadata provides context to biological data, including sample information, experimental conditions, and data sources. +- In pathogen surveillance, metadata helps trace the origin and characteristics of samples, aiding outbreak investigation.
+::: \ No newline at end of file diff --git a/materials/01-intro_ngs.md b/materials/03-intro_ngs.md similarity index 99% rename from materials/01-intro_ngs.md rename to materials/03-intro_ngs.md index 1df022a..31e2898 100644 --- a/materials/01-intro_ngs.md +++ b/materials/03-intro_ngs.md @@ -30,7 +30,7 @@ The video below from the iBiology team gives a great overview of these technolog ### Illumina Sequencing -Illumina's technology has become a widely popular method, with many applications to study transcriptomes (RNA-seq), epigenomes (ATAC-seq, BS-seq), DNA-protein interactions (ChIP-seq), chromatin conformation (Hi-C/3C-Seq), population and quantitative genetics (variant detection, GWAS), de-novo genome assembly, amongst [many others](https://emea.illumina.com/content/dam/illumina-marketing/documents/products/research_reviews/sequencing-methods-review.pdf). +Illumina's technology has become a widely popular method, with many applications to study transcriptomes (RNA-seq), epigenomes (ATAC-seq, BS-seq), DNA-protein interactions (ChIP-seq), chromatin conformation (Hi-C/3C-Seq), population and quantitative genetics (variant detection, GWAS), de-novo genome assembly, amongst [many others](https://moodle2.units.it/pluginfile.php/156101/mod_resource/content/0/sequencing-methods-review.pdf). An overview of the sequencing procedure is shown in the animation video below. Generally, samples are processed to generate so-called _sequencing libraries_, where the genetic material (DNA or RNA) is processed to generate fragments of DNA with attached oligo adapters necessary for the sequencing procedure (if the starting material is RNA, it can be converted to DNA by a step of reverse transcription). diff --git a/materials/03-nextflow.md b/materials/04-nextflow.md similarity index 100% rename from materials/03-nextflow.md rename to materials/04-nextflow.md diff --git a/materials/04-nf_core.md b/materials/05-nf_core.md similarity index 100% rename from materials/04-nf_core.md rename to materials/05-nf_core.md diff --git a/materials/06-downloading.md b/materials/06-downloading.md new file mode 100644 index 0000000..5ef62cb --- /dev/null +++ b/materials/06-downloading.md @@ -0,0 +1,166 @@ +--- +title: "Downloading sequence data" +--- + +::: {.callout-tip} +## Learning Objectives + +- Download sequence data from a public database with the nf-core pipeline fetchngs. + +::: + +## Pipeline Overview + +![The fetchngs pipeline](images/nf-core-fetchngs_metro_map_grey.png) + +**fetchngs** is a bioinformatics analysis pipeline written in Nextflow to automatically download and process raw FASTQ files from public databases. Identifiers can be provided in a file and any type of accession ID found in the SRA, ENA, DDBJ and GEO databases is supported. If run accessions (SRR/ERR/DRR) are provided, these will be resolved back to the sample accessions (SRX/ERX/DRX) to allow multiple runs for the same sample to be merged. As well as the FASTQ files, `fetchngs` will also produce a `samplesheet.csv` file containing the sample metadata obtained from the ENA. This file can be used as input for other nf-core and Nextflow pipelines like the ones we'll be using this week. + +## Prepare a samples file + +`fetchngs` requires a samples file with the accessions you would like to download. The file requires the suffix `.csv` but does not need to be in CSV format.
Each line should contain a single database ID: + +``` +ERR9907668 +ERR9907669 +ERR9907670 +ERR9907671 +ERR9907672 +``` + +:::{.callout-exercise} +#### Exercise: Preparing a samples file + +Your first task is to create a `samples.csv` file to be used as input for `fetchngs`. Use the following accessions: + +``` +ERR9907668 +ERR9907669 +ERR9907670 +ERR9907671 +ERR9907672 +``` + +Make sure you save the file in the `~/Course_Materials/M_tuberculosis` directory. + +:::{.callout-answer} + +We opened a text editor (e.g. nano, vim), copied and pasted the accessions and saved the file as `samples.csv` in the `~/Course_Materials/M_tuberculosis` directory. + +::: +::: + +## Running fetchngs + +Now that we have the `samples.csv` file, we can run the `fetchngs` pipeline. First, let's activate the `nextflow` software environment: + +```bash +mamba activate nextflow +``` + +There are [many options](https://nf-co.re/fetchngs/1.12.0/parameters) that can be used to customise the pipeline but a typical command is shown below: + +```bash +nextflow run nf-core/fetchngs \ + -profile singularity \ + --max_memory '16.GB' --max_cpus 8 \ + --input SAMPLES \ + --outdir results/fetchngs \ + --nf_core_pipeline viralrecon +``` + +The options we used are: + +- `-profile singularity` - indicates we want to use the _Singularity_ program to manage all the software required by the pipeline (another option is to use `docker`). See [Data & Setup](../setup.md) for details about their installation. +- `--max_memory` and `--max_cpus` - set the maximum RAM and number of CPUs available to the pipeline. You can check the resources on your computer with the commands `free -h` and `nproc --all`, respectively. +- `--input` - the samples file with the accessions to be downloaded, as explained above. +- `--outdir` - the directory where the results will be saved. +- `--nf_core_pipeline` - name of a supported nf-core pipeline, e.g. 'viralrecon'. A samplesheet for direct use with that pipeline will be created with the appropriate columns. + +:::{.callout-note} +#### Maximum Memory and CPUs + +In our `Nextflow` command above we have set `--max_memory '16.GB' --max_cpus 8` to limit the resources used in the analysis. +This is suitable for the computers we are using in this workshop. +However, make sure to set these options to the maximum resources available on the computer where you process your data. +::: + +:::{.callout-exercise} +#### Exercise: Running fetchngs + +Your next task is to download sequence data with `fetchngs`. In the folder `scripts` (in the `M_tuberculosis` analysis directory) you will find a script named `01-run_fetchngs.sh`. This script contains the code to run `fetchngs`. + +- Edit this script, adjusting it to fit your input file. + +- Run the script using `bash scripts/01-run_fetchngs.sh`. + +- If the script is running successfully it should start printing the progress of each job in the `fetchngs` pipeline. You can also follow the run in more detail via Nextflow's log file, as shown in the sketch below.
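+If the console output is not enough to tell what is going on, Nextflow also writes a detailed log file to the directory you launched the pipeline from. A minimal sketch (the `.nextflow.log` file is standard Nextflow behaviour, not specific to `fetchngs`):
+
+```bash
+# follow the Nextflow log file as the pipeline runs (Ctrl+C stops following)
+tail -f .nextflow.log
+```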
+ +:::{.callout-answer} + +- The fixed script is: + +```bash +#!/bin/bash + +nextflow run nf-core/fetchngs \ + -profile singularity \ + --max_memory '16.GB' --max_cpus 8 \ + --input samples.csv \ + --outdir results/fetchngs \ + --nf_core_pipeline viralrecon +``` + +- We ran the script as instructed using: + +```bash +bash scripts/01-run_fetchngs.sh +``` + +- While it was running it printed a message on the screen: + +``` +executor > local (20) +[5d/682f49] process > NFCORE_FETCHNGS:SRA:SRA_IDS_TO_RUNINFO (ERR9907668) [100%] 5 of 5 ✔ +[9d/ab8a7b] process > NFCORE_FETCHNGS:SRA:SRA_RUNINFO_TO_FTP (5) [100%] 5 of 5 ✔ +[f9/39e27c] process > NFCORE_FETCHNGS:SRA:SRA_FASTQ_FTP (ERX9450498_ERR9907670) [ 80%] 4 of 5 +[82/c5f847] process > NFCORE_FETCHNGS:SRA:FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS:CUSTOM_SRATOOLSNCBISETTINGS (ncbi-settings) [100%] 1 of 1 ✔ +[- ] process > NFCORE_FETCHNGS:SRA:FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS:SRATOOLS_PREFETCH - +[- ] process > NFCORE_FETCHNGS:SRA:FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS:SRATOOLS_FASTERQDUMP - +[2e/4fd490] process > NFCORE_FETCHNGS:SRA:SRA_TO_SAMPLESHEET (ERX9450498_ERR9907670) [100%] 4 of 4 +[- ] process > NFCORE_FETCHNGS:SRA:SRA_MERGE_SAMPLESHEET - +[- ] process > NFCORE_FETCHNGS:SRA:MULTIQC_MAPPINGS_CONFIG - +``` + +::: +::: + +## `fetchngs` results + +Once `fetchngs` has run, we can look at the various directories it created in `results/fetchngs`: + +| Directory | Description | +|:-- | :---------- | +|`custom` | Contains settings to help the pipeline run | +|`fastq` | Paired-end/single-end reads downloaded from the SRA/ENA/DDBJ/GEO for each accession in the `samples.csv` file | +|`metadata` | Contains the re-formatted ENA metadata for each sample | +|`samplesheet` | Contains the samplesheet with collated metadata and paths to downloaded FASTQ files | +|`pipeline_info` | Contains information about the pipeline run | + +:::{.callout-warning} +#### The Nextflow work directory + +Each step of the pipeline produces one or more files that are not saved to the results directory but are kept in the work directory. This means that if, for whatever reason, the pipeline doesn't finish successfully you can resume it. However, once the pipeline has completed successfully, you no longer need this directory (it can take up a lot of space) so you can delete it: + +```bash +rm -rf work +``` + +::: + +## Summary + +::: {.callout-tip} +## Key Points + +- The nf-core fetchngs pipeline can be used to quickly download sequence data from public databases such as the ENA and GEO. +- The pipeline also produces a `samplesheet.csv` file that can be used as input for other nf-core and Nextflow pipelines. +::: \ No newline at end of file diff --git a/materials/05-intro_tb.md b/materials/07-intro_tb.md similarity index 95% rename from materials/05-intro_tb.md rename to materials/07-intro_tb.md index 3413d1e..6b9e0fe 100644 --- a/materials/05-intro_tb.md +++ b/materials/07-intro_tb.md @@ -22,7 +22,7 @@ _Mycobacterium tuberculosis_, the bacterium that causes tuberculosis (TB) in hum In 1998, the first complete genome sequence of a _M. tuberculosis_ strain, the virulent laboratory reference strain H37Rv, was published (Cole 1998). The genome of _M. tuberculosis_ is a single circular chromosome that is approximately 4.4 million base pairs in size and contains around 4000 genes. _M. tuberculosis_ is a member of the Mycobacterium tuberculosis complex (MTBC), which includes different lineages, some referred to as _M. 
tuberculosis sensu stricto_ (lineage 1 to lineage 4 and lineage 7), others as _M. africanum_ (lineage 5 and lineage 6), two recently discovered lineages (lineage 8 and lineage 9), and several animal-associated ecotypes such as _M. bovis_ and _M. caprae_. Some lineages are geographically widespread, while others, like L5 and L6 (mainly found in West Africa), are more restricted. A simplified phylogeny showing the relationship of the various MTBC lineages is shown below. -![Phylogeny of M. tuberculosis lineage strains. Simplified maximum likelihood phylogeny of the 9 lineages of M. tuberculosis, as well as the related M. bovis strain and the M. canettii outgroup strain used as a root. (Coscolla 2021; Koleske 2023)](images/mtbc.jpg) +![Phylogeny of _M. tuberculosis_ lineage strains. Simplified maximum likelihood phylogeny of the 10 human-associated lineages of _M. tuberculosis_ and the related animal strains. _M. canettii_ was used as a root.](images/MTBC.png) Increasingly, _M. tuberculosis_ is resistant to many of the frontline antimicrobials used to treat TB, such as isoniazid and rifampicin. This poses an enormous clinical, financial, and public health challenge across the world. Traditionally, susceptibility of TB isolates to different antimicrobials was conducted in the laboratory but in recent years, antimicrobial profiling using genomic sequencing has been shown to be nearly as accurate as lab methods, especially for the most commonly used drugs. Catalogues of genetic variants that are known to confer resistance to particular antimicrobials are used to type TB genomes, potentially saving time and money. diff --git a/materials/06-intro_qc.md b/materials/08-intro_qc.md similarity index 99% rename from materials/06-intro_qc.md rename to materials/08-intro_qc.md index 6ea2ba1..5cdf4b5 100644 --- a/materials/06-intro_qc.md +++ b/materials/08-intro_qc.md @@ -15,7 +15,7 @@ title: "Introduction to QC" Before we look at our genomic data, let's take time to explore what to look out for when performing **Q**uality **C**ontrol **(QC)** checks on our sequence data. For this course, we will largely focus on next generation sequences obtained from Illumina sequencers. -As you may already know from [Introduction to NGS](01-intro_ngs.md), the main output files expected from our Illumina sequencer are FASTQ files. +As you may already know from [Introduction to NGS](03-intro_ngs.md), the main output files expected from our Illumina sequencer are FASTQ files. ## QC assessment of NGS data diff --git a/materials/07-bacqc.md b/materials/09-bacqc.md similarity index 80% rename from materials/07-bacqc.md rename to materials/09-bacqc.md index 411ea91..e2f1262 100644 --- a/materials/07-bacqc.md +++ b/materials/09-bacqc.md @@ -35,63 +35,12 @@ Along with the outputs produced by the above tools, the pipeline produces the fo - `species_composition.tsv` - final summary of taxonomic assignment in TSV format - `multiqc_report.html` - final summary of sequence quality, trimming and species composition for input files in HTML format -## Preparing Files - -On our computers, for each of the three species we will be working with on the course, we have a directory in `~/Course_Materials` for each species where we will do all our analysis. We already included the following in each species directory: - -- `data/` - contains the sequencing files we will be working with. -- `preprocessed/` - contains the results for all the analysis we'll be running during the course (see box below).
-- `resources/` - where we include other files we will be using such as reference genomes. -- `databases/` - where we include public databases needed by some tools. -- `scripts/` - where we include some scripts that we will use during the course. You will have to edit some of these scripts as part of the exercises. -- `sample_info.csv` - a table with some metadata for our samples. - -:::{.callout-warning} - -We have a lot to fit in this week, and during testing, we found that running all the pipelines and tools on even the genomes we selected from larger datasets was going to take too long. So, what we decided, was to only analyse five samples during the exercises you'll be doing this week. This will allow you to set up and run the various scripts, see the pipelines and tools running and see the outputs in the time we've set aside for each section. The outputs will always be sent to a `results` directory. However, we've provided the results you would have obtained if you'd run the scripts on all of the data in a `preprocessed` directory. For some of the exercises, you'll be making use of the outputs in this directory instead of what you create yourselves as this will hopefully give you a more realistic set of results like the ones you may eventually generate when working with larger datasets. We will make it clear in the course materials and exercises when you should be working with the results in the `preprocessed` directory instead of the `results` directory. -::: - -### Data - -Your analysis starts with FASTQ files (if you need a reminder of what a FASTQ file is, look at the [Intro to NGS > FASTQ](01-intro_ngs.html#FASTQ_Files) section of the materials). The Illumina files come as compressed FASTQ files (`.fastq.gz` format) and there's two files per sample, corresponding to read 1 and read 2. -This is indicated by the file name suffix: - -- `*_1.fastq.gz` for read 1 -- `*_2.fastq.gz` for read 2 - -You can look at the files you have available in any of the species directories from the command line using: - -```bash -ls data/reads -``` - -### Metadata - -A critical step in any analysis is to make sure that our samples have all the relevant metadata associated with them. This is important to make sense of our results and produce informative reports at the end. There are many types of information that can be collected from each sample and for effective genomic surveillance, we need at the very minimum three pieces of information: - -- **When**: date when the sample was collected (not when it was sequenced!). -- **Where**: the location where the sample was collected (not where it was sequenced!). -- **Source**: the source of the the sample e.g. host, environment. - -Of course, this is the **minimum** metadata we need for a useful analysis. The more information you collect about each sample, the more questions you can ask from your data -- so always remember to record as much information as possible for each sample. - -:::{.callout-warning} -#### Dates in Spreadsheet Programs - -Note that programs such as _Excel_ often convert date columns to their own format, and this can cause problems when analysing data later on. -The ISO standard for dates is YYYY-MM-DD and this is how we recommend you store your dates. However, by default _Excel_ displays dates as DD/MM/YYYY. -You can change how _Excel_ displays dates by highlighting the date column, right-clicking and selecting Format cells, then select "Date" and pick the format that matches YYYY-MM-DD. 
-However, every time you open the CSV file, _Excel_ annoyingly converts it back to its default format! - -To make sure no date information is lost due to _Excel_'s behaviour, it's a good idea to **store information about year, month and day in separate columns** (stored just as regular numbers). - -::: - ## Prepare a samplesheet -Before we run `bacQC`, we need to prepare a CSV file with information about our sequencing files which will be used as an input to the `bacQC` pipeline (for this exercise we're going to QC the TB dataset described in [Introduction to _Mycobacterium tuberculosis_](05-intro_tb.md)). The pipeline's [documentation](https://github.com/avantonder/bacQC/blob/main/docs/usage.md) gives details about the format of this samplesheet. +Before we run `bacQC`, we need to prepare a CSV file with information about our sequencing files which will be used as an input to the `bacQC` pipeline (for this exercise we're going to QC the TB dataset described in [Introduction to _Mycobacterium tuberculosis_](07-intro_tb.md)). The pipeline's [documentation](https://github.com/avantonder/bacQC/blob/main/docs/usage.md) gives details about the format of this samplesheet. :::{.callout-exercise} +#### Exercise: Prepare a samplesheet Prepare the input samplesheet for `bacQC`. You can do this using _Excel_, making sure you save it as a CSV file (File > Save As... and choose "CSV" as the file format). Alternatively, you can use the `fastq_dir_to_samplesheet.py` script that can be found in the `scripts` directory: @@ -141,13 +90,13 @@ The options we used are: - `--genome_size` - the estimated genome size of your samples - `fastq-scan` uses this to calculate the depth of coverage across the genome. :::{.callout-exercise} -#### Running bacQC +#### Exercise: Running bacQC -Your first task is to run the `bacQC` pipeline on your data. In the folder `scripts` (within your analysis directory) you will find a script named `01-run_bacqc.sh`. This script contains the code to run `bacQC`. +Your first task is to run the `bacQC` pipeline on your data. In the folder `scripts` (within your analysis directory) you will find a script named `02-run_bacqc.sh`. This script contains the code to run `bacQC`. - Edit this script, adjusting it to fit your input files and the estimated genome size of _M. tuberculosis_. -- Run the script using `bash scripts/01-run_bacqc.sh`. +- Run the script using `bash scripts/02-run_bacqc.sh`. - If the script is running successfully it should start printing the progress of each job in the bacQC pipeline. This will take a little while to finish. @@ -172,7 +121,7 @@ nextflow run avantonder/bacQC \ - We ran the script as instructed using: ```bash -bash scripts/01-run_bacQC.sh +bash scripts/02-run_bacqc.sh ``` - While it was running it printed a message on the screen: @@ -203,14 +152,6 @@ executor > local (1), slurm (100) ::: ::: -:::{.callout-note} -#### Maximum Memory and CPUs - -In our `Nextflow` command above we have set `--max_memory '16.GB' --max_cpus 8` to limit the resources used in the analysis. -This is suitable for the computers we are using in this workshop. -However, make sure to set these options to the maximum resources available on the computer where you process your data. -::: - ## `bacQC` results In the previous exercise, we left `bacQC` running.
While it runs, we can look at the preprocessed output (`preprocessed/bacqc`) to see the various directories containing output files created by `bacQC`: @@ -248,7 +189,7 @@ This is a compilation of statistics collected from the outputs of `fastp`, `fast #### fastQC -These plots will resemble some of the plots we showed you in [Introduction to QC](06-intro_qc.md) with the main difference being that they contain the results for all samples in summary plots generated by `MultiQC`. The first plot shows the number of sequences in each sample as a barplot (there should be the same number of forward and reverse reads if it's paired-end sequencing). An estimate of duplicated reads - i.e. reads that are exactly the same - is also shown. +These plots will resemble some of the plots we showed you in [Introduction to QC](08-intro_qc.md) with the main difference being that they contain the results for all samples in summary plots generated by `MultiQC`. The first plot shows the number of sequences in each sample as a barplot (there should be the same number of forward and reverse reads if it's paired-end sequencing). An estimate of duplicated reads - i.e. reads that are exactly the same - is also shown. ![Barplot of sequence counts.](images/bacqc_fastqc_seq_counts.png) @@ -391,7 +332,7 @@ The columns are: This is quite a simple output and shows that there was very little contamination or non-target sequencing in this run. This may not always be the case - for some runs you may see a variety of different contaminants. :::{.callout-exercise} -#### Check the bacQC results +#### Exercise: Check the bacQC results To assess the quality of our sequence data, we can use the outputs generated by **bacQC**, found in `preprocessed/bacqc`. @@ -415,17 +356,6 @@ Make a note of any samples you think should be removed from any downstream anal ::: ::: -:::{.callout-warning} -#### The Nextflow work directory - -Each step of the pipeline produces one or more files that are not saved to the results directory but are kept in the work directory. This means that if, for whatever reason, the pipeline doesn't finish successfully you can resume it. However, once the pipeline has completed successfully, you no longer need this directory (it can take up a lot of space) so you can delete it: - -```bash -rm -rf work -``` - -::: - ## Summary ::: {.callout-tip} diff --git a/materials/08-mapping.md b/materials/10-mapping.md similarity index 100% rename from materials/08-mapping.md rename to materials/10-mapping.md diff --git a/materials/09-bactmap.md b/materials/11-bactmap.md similarity index 97% rename from materials/09-bactmap.md rename to materials/11-bactmap.md index 49b429e..48f86e6 100644 --- a/materials/09-bactmap.md +++ b/materials/11-bactmap.md @@ -65,9 +65,9 @@ The options we used are: :::{.callout-exercise} -#### Running nf-core/bactmap +#### Exercise: Running nf-core/bactmap -Your next task is to run the **bactmap** pipeline on your data. In the folder `scripts` (within your analysis directory) you will find a script named `02-run_bactmap.sh`. This script contains the code to run bactmap. +Your next task is to run the **bactmap** pipeline on your data. In the folder `scripts` (within your analysis directory) you will find a script named `03-run_bactmap.sh`. This script contains the code to run bactmap. - Edit this script, adjusting it to fit your input files and the name and location of the reference you're going to map to: - use the same samplesheet as before with the `avantonder/bacQC` pipeline.
@@ -75,7 +75,7 @@ Your next task is to run the **bactmap** pipeline on your data. In the folder ` - Activate the Nextflow software environment: `mamba activate nextflow`. -- Run the script using `bash scripts/02-run_bactmap.sh`. +- Run the script using `bash scripts/03-run_bactmap.sh`. If the script is running successfully it should start printing the progress of each job in the bactmap pipeline. The pipeline will take a while to run. @@ -98,7 +98,7 @@ nextflow run nf-core/bactmap \ We ran the script as instructed using: ```bash -bash scripts/02-run_bactmap.sh +bash scripts/03-run_bactmap.sh ``` While it was running it printed a message on the screen: @@ -155,7 +155,7 @@ This is a compilation of statistics collected from the outputs of tools such as #### fastp -There are a number of plots showing the results of the `fastp` step in the pipeline. These plots are explained in [The bacQC pipeline](07-bacqc.md). +There are a number of plots showing the results of the `fastp` step in the pipeline. These plots are explained in [The bacQC pipeline](09-bacqc.md). #### Samtools @@ -241,28 +241,28 @@ To calculate the percentage of the reference mapped we divide the sum of 'A's, ' This is more than 90% so we can proceed with the analysis of this sample. :::{.callout-exercise} -#### How much of the reference was mapped? +#### Exercise: How much of the reference was mapped? We have calculated the percentage of the reference mapped for a single sample. However, we have five samples that we need to repeat the analysis on. To do this, we've provided a script that runs `seqtk comp` on all the samples in the `pseudogenomes` directory using a _for loop_. -- In the folder `scripts` (inside your analysis directory), you'll find a script named `03-pseudogenome_check.sh`. +- In the folder `scripts` (inside your analysis directory), you'll find a script named `04-pseudogenome_check.sh`. - Open the script, which you will notice is composed of two sections: - `#### Settings ####` where we define some variables for input and output file names. If you were running this script on your own data, you may want to edit the directories in this section. - `#### Analysis ####` this is where `seqtk comp` is run on each sample as detailed in @sec-seqtk. You should not change the code in this section, although examining it is a good way to learn about running a _for loop_. - Activate the software environment: `mamba activate seqtk` -- Run the script with `bash scripts/03-pseudogenome_check.sh`. If the script is running successfully it should print a message on the screen as the samples are processed. +- Run the script with `bash scripts/04-pseudogenome_check.sh`. If the script is running successfully it should print a message on the screen as the samples are processed. - Once the analysis finishes, open the `mapping_summary.tsv` file in _Excel_ from your file browser. - Sort the results by the `%ref mapped` column and identify the sample which has the lowest percentage of the reference mapped. :::{.callout-answer} -We opened the script `03-pseudogenome_check.sh` and these are the settings we used: +We opened the script `04-pseudogenome_check.sh` and these are the settings we used: - `fasta_dir="results/bactmap/pseudogenomes"` - the name of the directory with the pseudogenomes produced by `bactmap` in it. - `outdir="results/bactmap/pseudogenomes_check"` - the name of the directory where we want to save our results.
- `parser="scripts/seqtk_parser.py"` - the path to a Python script that takes the `seqtk` TSV files as input and does the calculation we performed above for all the samples. -We then ran the script using `bash scripts/03-pseudogenome_check.sh`. The script prints a message while it's running: +We then ran the script using `bash scripts/04-pseudogenome_check.sh`. The script prints a message while it's running: ```bash Processing ERX9450498_ERR9907670.fas @@ -352,10 +352,10 @@ Done. The masked final alignment will be saved to the `results/bactmap/masked_alignment/` directory. -Alternatively we've provided a script, `04-mask_pseudogenome.sh` in the `scripts` directory which could be used instead with `bash`: +Alternatively, we've provided a script, `05-mask_pseudogenome.sh`, in the `scripts` directory which could be used instead with `bash`: ```bash -bash scripts/04-mask_pseudogenome.sh +bash scripts/05-mask_pseudogenome.sh ``` ## Summary diff --git a/materials/10-phylogenetics.md b/materials/12-phylogenetics.md similarity index 92% rename from materials/10-phylogenetics.md rename to materials/12-phylogenetics.md index 70be229..43b4bfb 100644 --- a/materials/10-phylogenetics.md +++ b/materials/12-phylogenetics.md @@ -113,7 +113,7 @@ The two most commonly used muliple sequence alignments in bacterial genomics are ## Building a phylogenetic tree -Following that very brief introduction to phylogenetics, we can start building our first phylogenetic tree using the masked alignment file we created in the [previous section](09-bactmap.md). +Following that very brief introduction to phylogenetics, we can start building our first phylogenetic tree using the masked alignment file we created in the [previous section](11-bactmap.md). We start our analysis by activating our software environment, to make all the necessary tools available: @@ -234,13 +234,13 @@ There are several files with the following extensions: The main files of interest are the report file (`.iqtree`) and the tree file (`.treefile`) in standard [Newick format](https://en.wikipedia.org/wiki/Newick_format). :::{.callout-exercise} -#### Tree inference +#### Exercise: Tree inference -Produce a tree from the masked pseudogenome alignment from we created in the [previous section](09-bactmap.md). +Produce a tree from the masked pseudogenome alignment we created in the [previous section](11-bactmap.md). - Activate the software environment: `mamba activate iqtree`. -- Fix the script provided in `scripts/05-iqtree.sh`. See @sec-iqtree if you need a hint of how to fix the code in the script. -- Run the script using `bash scripts/05-iqtree.sh`. Several messages will be printed on the screen while `iqtree` runs. +- Fix the script provided in `scripts/06-iqtree.sh`. See @sec-iqtree if you need a hint of how to fix the code in the script. +- Run the script using `bash scripts/06-iqtree.sh`. Several messages will be printed on the screen while `iqtree` runs. :::{.callout-answer} @@ -290,11 +290,26 @@ Nam_TB.bionj Nam_TB.log Nam_TB.mldist Nam_TB.ckp.gz Nam_TB.iqtree Nam_TB.treefile ``` -The main file of interest is `Nam_TB.treefile`, which contains our tree in the standard [Newick format](https://en.wikipedia.org/wiki/Newick_format). We will visualize this tree alongside relevant metadata in [Visualising phylogenies](12-tree_visualization.md). +The main file of interest is `Nam_TB.treefile`, which contains our tree in the standard [Newick format](https://en.wikipedia.org/wiki/Newick_format).
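+Since Newick is just plain text, you can take a quick look at the tree itself from the command line. A minimal sketch, assuming the `IQ-TREE` outputs were written to a `results/iqtree` directory (adjust the path to match where your own output files ended up):
+
+```bash
+# print the Newick string: nested parentheses of sample names with branch lengths
+cat results/iqtree/Nam_TB.treefile
+```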
We will root and then visualize this tree alongside relevant metadata in [Visualising phylogenies](14-tree_visualization.md). ::: ::: +### Rooting a phylogenetic tree + +Rooting a phylogenetic tree is essential for making sense of evolutionary relationships and for providing a temporal context to the diversification of species. It transforms an unrooted tree, which simply shows relationships without direction, into a meaningful representation of evolutionary history. The most common way to accurately root a phylogenetic tree is to include an outgroup that is known to be more distantly related to all of the taxa in the analysis than they are to each other. In our example we mapped our TB sequences to the MTBC0 reference, which is an outgroup to all members of the MTBC, so we'll use this to root our tree before visualizing it. There are a few different tools that could be used to root a phylogenetic tree but we've provided a Python script, `root_tree.py`, to do this. You can run the script using the following command: + +```bash +python scripts/root_tree.py -i preprocessed/iqtree/Nam_TB.treefile -g MTBC0 -o Nam_TB_rooted.treefile +``` + +The options we used are: + +- `-i` - the TB phylogenetic tree we inferred with `IQ-TREE`. +- `-g` - the name of the outgroup to root the tree with (in this case MTBC0). +- `-o` - the rooted phylogenetic tree. + +This will create a NEWICK file called `Nam_TB_rooted.treefile` in your analysis directory. + ## Summary ::: {.callout-tip} diff --git a/materials/11-tb_profiler.md b/materials/13-tb_profiler.md similarity index 97% rename from materials/11-tb_profiler.md rename to materials/13-tb_profiler.md index 28b3814..5ee662c 100644 --- a/materials/11-tb_profiler.md +++ b/materials/13-tb_profiler.md @@ -112,29 +112,29 @@ Hopefully, we'll be able to successfully run `TB-Profiler` on all our samples bu :::{.callout-exercise} -#### Run TB-Profiler on all samples +#### Exercise: Run TB-Profiler on all samples We have run `TB-Profiler` on a single sample. However, we have five samples that we need to repeat the analysis on. To do this, we've provided a script that runs `TB-Profiler` on all the FASTQ files for all the samples in the `data/reads` directory using a _for loop_. -- In the folder `scripts` (inside your analysis directory), you'll find a script named `06-run_tb-profiler.sh`. +- In the folder `scripts` (inside your analysis directory), you'll find a script named `07-run_tb-profiler.sh`. - Open the script, which you will notice is composed of two sections: - `#### Settings ####` where we define some variables for input and output file names. If you were running this script on your own data, you may want to edit the directories in this section. - `#### Analysis ####` this is where `TB-Profiler` is run on each sample as detailed in @sec-tbprofiler. You should not change the code in this section. - Activate the software environment: `mamba activate tb-profiler` -- Run the script with `bash scripts/06-run_tb-profiler.sh`. If the script is running successfully it should print a message on the screen as the samples are processed. +- Run the script with `bash scripts/07-run_tb-profiler.sh`. If the script is running successfully it should print a message on the screen as the samples are processed. - Once the analysis finishes, open the `Nam_TB.txt` file and copy/paste the data into _Excel_ (we've provided the results for all 50 samples in the `preprocessed` directory). You can also tally the lineage calls from the command line, as shown in the sketch below these questions. - How many Lineage 2 isolates are there in the dataset? - Are any of the isolates antimicrobial-susceptible?
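As a quick alternative to pasting into _Excel_, you can tally the lineage assignments directly from the command line. A minimal sketch, assuming the collated `Nam_TB.txt` is a tab-separated file with a `main_lineage` column and was saved to `results/tb-profiler` (check the header and path of your own output):

```bash
# find the main_lineage column from the header row, then count isolates per lineage
awk -F'\t' 'NR==1 {for (i = 1; i <= NF; i++) if ($i == "main_lineage") col = i; next} {print $col}' \
    results/tb-profiler/Nam_TB.txt | sort | uniq -c
```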
:::{.callout-answer} -We opened the script `06-run_tb-profiler.sh` and these are the settings we used: +We opened the script `07-run_tb-profiler.sh` and these are the settings we used: - `fastq_dir="data/reads"` - the name of the directory with the read 1 and read 2 FASTQ files in it. - `outdir="results/tb-profiler"` - the name of the directory where we want to save our results. - `prefix="Nam_TB"` - the prefix for the output files created using `tb-profiler collate`. -We then ran the script using `bash scripts/06-run_tb-profiler.sh`. The script prints a message while it's running: +We then ran the script using `bash scripts/07-run_tb-profiler.sh`. The script prints a message while it's running: ```bash Processing ERX9450498_ERR9907670.fas diff --git a/materials/12-tree_visualization.md b/materials/14-tree_visualization.md similarity index 69% rename from materials/12-tree_visualization.md rename to materials/14-tree_visualization.md index 5ddd649..80d4a87 100644 --- a/materials/12-tree_visualization.md +++ b/materials/14-tree_visualization.md @@ -17,67 +17,57 @@ In order to use this platform you will first need to [**create an account**](htt ## Uploading tree files and metadata -Once you've logged into Microreact, you can upload the Namibian TB tree file (`Nam_TB.treefile`) and combined metadata TSV file (`TB_metadata.tsv`) we created earlier. +Once you've logged into Microreact, you can upload the rooted Namibian TB tree file (`Nam_TB_rooted.treefile`) and combined metadata TSV file (`TB_metadata.tsv`) we created earlier. -1. First, copy `Nam_TB.treefile` from the `preprocessed` directory to the analysis directory so that both files are in the same location: - -```bash -cp preprocessed/iqtree/Nam_TB.treefile . -``` - -2. Click on the **UPLOAD** link in the top-right corner of the page: +1. Click on the **UPLOAD** link in the top-right corner of the page: ![](images/microreact_upload1.png) -3. Click the **+** button on the bottom-right corner then **Browse Files** to upload the files: +2. Click the **+** button on the bottom-right corner then **Browse Files** to upload the files: ![](images/microreact_upload2.png) -1. This will open a file browser, where you can select the tree file and metadata from your local machine. Go to the `M_tuberculosis` directory where you have the results we've generated so far this week. Click and select the `Nam_TB.treefile` and `TB_metadata.tsv` files while holding the Ctrl key. Click Open on the dialogue window after you have selected both files. +3. This will open a file browser, where you can select the tree file and metadata from your local machine. Go to the `M_tuberculosis` directory where you have the results we've generated so far this week. Click and select the `Nam_TB_rooted.treefile` and `TB_metadata.tsv` files while holding the Ctrl key. Click Open on the dialogue window after you have selected both files. ![](images/microreact_upload3.png) -5. A new dialogue box will open with files you've uploaded and the File kind which is done automatically by Microreact. As the File kind for both files is correct, go ahead and click **CONTINUE**: +4. A new dialogue box will open with files you've uploaded and the File kind which is done automatically by Microreact. As the File kind for both files is correct, go ahead and click **CONTINUE**: ![](images/microreact_upload4.png) -6. Microreact will load the data and process it. 
The final step before we can have a look at the tree and annotate it is to confirm to Microreact which column in the metadata corresponds to the tip labels in the tree so it can match them. By default Microreact will use the first column, in this case `sample` which is correct so click **CONTINUE**: +5. Microreact will load the data and process it. The final step before we can have a look at the tree and annotate it is to confirm to Microreact which column in the metadata corresponds to the tip labels in the tree so it can match them. By default Microreact will use the first column, in this case `sample` which is correct so click **CONTINUE**: ![](images/microreact_upload5.png) -7. You should now see three windows in front of you. The top-left has a map with the locations of your isolates based on the longitude and latitude values included in `TB_metadata.tsv`. The top-right has the phylogenetic tree with a separate colour for each tip (by default Microreact will colour the tips by `BorstelID`). Across the bottom you have the metadata from `TB_metadata.tsv`. +6. You should now see three windows in front of you. The top-left has a map with the locations of your isolates based on the longitude and latitude values included in `TB_metadata.tsv`. The top-right has the phylogenetic tree with a separate colour for each tip (by default Microreact will colour the tips by `BorstelID`). Across the bottom you have the metadata from `TB_metadata.tsv`. ![](images/microreact_loaded.png) -8. The first thing we're going to do is change the colour of the tip nodes to **Region**. Click on the **Eye** icon in the top-right hand corner and change **Colour Column** to **Region**: +7. The first thing we're going to do is change the colour of the tip nodes to **Region**. Click on the **Eye** icon in the top-right hand corner and change **Colour Column** to **Region**: ![](images/microreact_region1.png) -9. This will change the colour of the tip nodes as well as the pie charts on the map - each region has its own colour as you'd expect: +8. This will change the colour of the tip nodes as well as the pie charts on the map - each region has its own colour as you'd expect: ![](images/microreact_region2.png) -10. At this point, before we proceed any further, let's save the project to your accounts. Click on the **Save** button in the top-right corner, change the project name to **Namibia TB** and add some kind of description so you know what the dataset is. Then click **Save as a New Project** (another dialogue box will appear asking if you want to share your project; for now, close this box): +9. At this point, before we proceed any further, let's save the project to your accounts. Click on the **Save** button in the top-right corner, change the project name to **Namibia TB** and add some kind of description so you know what the dataset is. Then click **Save as a New Project** (another dialogue box will appear asking if you want to share your project; for now, close this box): ![](images/microreact_save.png) -11. Now, let's make our phylogenetic tree a bit more informative. First, let's add the tip labels to the display by clicking on the left-hand of the two buttons in the phylogeny window then the drop down arrow next to **Nodes & Labels**. Now click the slider next to **Leaf Labels** and the slider next to **Align Leaf Labels**. We'll also make the text a little smaller by moving the slider to _12px_: +10. Now, let's make our phylogenetic tree a bit more informative. 
First, let's add the tip labels to the display by clicking on the left-hand of the two buttons in the phylogeny window, then the drop-down arrow next to **Nodes & Labels**. Now click the slider next to **Leaf Labels** and the slider next to **Align Leaf Labels**. We'll also make the text a little smaller by moving the slider to _12px_: ![](images/microreact_tips1.png) -12. The tip labels are still the European Nuclotide Accessions we used to download the FASTQ files. Let's change the tip labels to the `BorstelID` which is what's used in the paper. Click on the **Eye** icon again and change **Labels Column** to `BorstelID`: +11. The tip labels are still the European Nucleotide Accessions we used to download the FASTQ files. Let's change the tip labels to the `BorstelID`, which is what's used in the paper. Click on the **Eye** icon again and change **Labels Column** to `BorstelID`: ![](images/microreact_tips2.png) -13. It's often useful to root a phylogenetic tree as it will more accurately reflect the relationships between our samples. As we have the ancestral reference sequence `MTBC0` which we used to build the tree included in our tree, we can use this as our root. To root the tree, hover over the node that joins `MTBC0` to three other samples and right click when a circle appears. Then click **Set as Root (Re-root)**: - -![](images/microreact_root.png) - -14. From the rooted tree, we can see we have two distinct clades within the tree. These are the two major lineages we identified in our dataset (Lineage 1 and Lineage 2). To make this clearer, change the colour of the tip nodes to `main_lineage` and click on **Legend** on the far right-hand side of the plot. Now we have a tree and map annotated with the two lineages in our dataset: +12. From the rooted tree, we can see we have two distinct clades within the tree. These are the two major lineages we identified in our dataset (Lineage 2 and Lineage 4). To make this clearer, change the colour of the tip nodes to `main_lineage` and click on **Legend** on the far right-hand side of the plot. Now we have a tree and map annotated with the two lineages in our dataset: ![](images/microreact_lineage.png) -15. The last thing we're going to do is add a histogram to show the frequency of lineages across the different regions to our Microreact window. Click on the **Pencil** icon on the top-right corner and click **Create New Chart** then move your mouse into the right hand side of the metadata box at the bottom of the window and click when you see the blue box appear. A blank chart should appear. Click **Chart Type** and select **Bar chart** and change the **X Axis Column** to `Region`. The plot should auto populate with the region on the X-axis and the Number of entries on the Y-axis. The bars are coloured according to `main_lineage` which is what we're currently using to colour our plots: +13. The last thing we're going to do is add a histogram to show the frequency of lineages across the different regions to our Microreact window. Click on the **Pencil** icon on the top-right corner and click **Create New Chart** then move your mouse into the right hand side of the metadata box at the bottom of the window and click when you see the blue box appear. A blank chart should appear. Click **Chart Type** and select **Bar chart** and change the **X Axis Column** to `Region`. The plot should auto populate with the region on the X-axis and the Number of entries on the Y-axis.
The bars are coloured according to `main_lineage` which is what we're currently using to colour our plots: ![](images/microreact_hist.png) diff --git a/materials/13-group_exercise_1.md b/materials/15-group_exercise_1.md similarity index 100% rename from materials/13-group_exercise_1.md rename to materials/15-group_exercise_1.md diff --git a/materials/16-dating.md b/materials/16-dating.md new file mode 100644 index 0000000..f6ec159 --- /dev/null +++ b/materials/16-dating.md @@ -0,0 +1,185 @@ +--- +title: "Estimating time-scaled phylogenies" +--- + +::: {.callout-tip} +## Learning Objectives + +- Describe how incorporating temporal data into phylogenetic tree inference can be used to estimate the timing of evolutionary events. +- Understand the different types of tools that can be used to estimate time-scaled phylogenies. +- Learn how to assess whether there is a molecular clock signal in your data. +- Describe how TreeTime can be used to infer a time-scaled phylogenetic tree. +- Generate a time-scaled phylogeny with TreeTime. +- Visualize your time-scaled phylogeny with FigTree. +::: + +## Time-scaled phylogenies + +Time-scaled phylogenetics is an approach in evolutionary biology that integrates temporal data into the construction of phylogenetic trees, providing a framework to estimate the timing of evolutionary events. This method combines molecular sequence data with collection dates and known mutation rates to produce trees where branch lengths represent time, rather than merely genetic change. By doing so, it allows researchers to infer not only the relationships between different species but also the chronological sequence of divergence events. This approach can uncover insights into the rates of evolution, the timing of speciation events, and the impact of historical environmental changes on evolutionary processes. + +The incorporation of time into phylogenetic analysis enhances our understanding of the evolutionary timeline and facilitates more accurate reconstructions of ancestral states. For example, it enables the estimation of the age of the most recent common ancestor of a group of species, offering a temporal perspective on the diversification of lineages. Time-scaled phylogenetics is crucial for fields such as paleontology, where it can help correlate fossil records with molecular data, and for biogeography, where it aids in understanding the temporal patterns of species distributions. By integrating genetic, paleontological, and geochronological data, this method provides a comprehensive view of evolutionary history that is essential for unraveling the complexities of life's past. + +![Bayesian maximum clade credibility tree of 261 MTBC genomes with estimated divergence dates shown in years before present (Bos _et al._ 2014)](images/Bos_MTBC_2014.jpg) + +## Tools for estimating time-scaled phylogenies + +The most commonly used tools for estimating time-scaled phylogenies include BEAST (Bayesian Evolutionary Analysis Sampling Trees), MrBayes, and RAxML. BEAST is a powerful software package that uses Bayesian inference to estimate phylogenies and divergence times simultaneously, incorporating molecular clock models and various priors on the rates of evolution. It is particularly well-suited for complex datasets and allows for the incorporation of different types of data, such as molecular sequences and fossil calibrations.
MrBayes, another Bayesian inference tool, is also widely used for phylogenetic analysis and can estimate time-scaled trees by applying relaxed or strict molecular clocks. RAxML (Randomized Axelerated Maximum Likelihood), although primarily a maximum likelihood-based tool, has features for dating phylogenies when combined with other tools that can handle molecular clock models. + +Despite its widespread use and powerful capabilities, BEAST has several drawbacks when estimating time-scaled phylogenies. One significant challenge is its computational intensity; BEAST requires substantial processing power and time, especially for large datasets or complex models, making it less accessible for researchers without high-performance computing resources. Additionally, BEAST's reliance on Bayesian inference means that results can be highly sensitive to the choice of priors, which requires careful consideration and can introduce subjective bias. The complexity of the software also presents a steep learning curve for new users, necessitating substantial expertise to correctly implement and interpret analyses. These limitations highlight the need for cautious application and interpretation of BEAST's results in phylogenetic studies. + +## Assessing molecular clock signal + +Estimating a molecular clock signal in genome data involves determining the rate at which genetic mutations accumulate over time, providing a "clock" to date evolutionary events. We do this by extracting the root-to-tip distances for each genome in a phylogenetic tree ("genetic divergence") and plotting them against the collection date for each genome. A positive correlation between genetic divergence and time indicates a molecular clock signal, with the slope being the mutation rate over time and the x-intercept the inferred date of the most recent common ancestor (MRCA). It's worth remembering that this analysis is designed to provide an indication of a molecular clock signal in your data, and a shallow slope doesn't necessarily mean that you won't be able to infer a robust time-scaled phylogenetic tree. + +![Example of a root-to-tip distance plot to assess molecular clock signal (van Tonder _et al._ 2021)](images/r2t_plot.png) + +## TreeTime + +`TreeTime` is a computational tool designed to estimate time-scaled phylogenies with a focus on efficiency and simplicity. It employs a maximum likelihood framework to integrate molecular sequence data with temporal information, such as sampling dates, to infer the timing of evolutionary events. `TreeTime` optimizes the placement of mutations along the phylogenetic tree while simultaneously adjusting branch lengths to reflect chronological time. This method leverages a relaxed molecular clock model, allowing for variations in the rate of evolution across different branches. By combining sequence data with temporal constraints, `TreeTime` rapidly produces time-calibrated phylogenies that are both accurate and computationally efficient, making it particularly useful for analyzing large datasets, such as those encountered in viral and bacterial evolution studies. Its user-friendly interface and robust performance make `TreeTime` an accessible and valuable tool for researchers aiming to elucidate the temporal dynamics of evolutionary processes. + +### Running `TreeTime` + +We're going to run `TreeTime` on the rooted phylogenetic tree of our Namibian TB genomes.
As well as the tree, we also need the masked alignment we created in [The nf-core/bactmap pipeline](11-bactmap.md) and, as we're inferring a time-scaled phylogenetic tree, the sample collection dates that can be found in the `TB_metadata.tsv` file. Before we run `TreeTime`, we need to remove the outgroup MTBC0 from both the alignment and the phylogenetic tree: + +```bash +mamba activate treetime + +# create output directory +mkdir -p results/treetime/ + +# Remove outgroup from alignment +seqkit grep -v -p MTBC0 preprocessed/bactmap/masked_alignment/aligned_pseudogenomes_masked.fas > results/treetime/aligned_pseudogenomes_masked_no_outgroups.fas + +# Remove outgroup from rooted tree +python scripts/remove_outgroup.py -i Nam_TB_rooted.treefile -g MTBC0 -o Nam_TB_rooted_no_outgroup.treefile + +# Move no outgroup tree to results/treetime +mv Nam_TB_rooted_no_outgroup.treefile results/treetime +``` + +Now we can run `TreeTime`: + +```bash +# Run TreeTime +treetime --tree results/treetime/Nam_TB_rooted_no_outgroup.treefile \ --dates TB_metadata.tsv \ --name-column sample \ --date-column Date.sample.collection \ --aln results/treetime/aligned_pseudogenomes_masked_no_outgroups.fas \ --outdir results/treetime \ --report-ambiguous \ --time-marginal only-final \ --clock-std-dev 0.00003 \ --relax 1.0 0 +``` + +The options used are: + +- `--tree results/treetime/Nam_TB_rooted_no_outgroup.treefile` - the rooted phylogenetic tree with the outgroup removed. +- `--dates TB_metadata.tsv` - a TSV file containing the sample collection dates. +- `--name-column sample` - the column within the TSV file that contains the sample names (this needs to match the names in the tree). +- `--date-column Date.sample.collection` - the column within the TSV file that contains the sample collection dates. +- `--aln results/treetime/aligned_pseudogenomes_masked_no_outgroups.fas` - the masked whole genome alignment with the outgroup removed. +- `--outdir results/treetime` - directory to save the output files to. +- `--report-ambiguous` - include transitions involving ambiguous states. +- `--time-marginal only-final` - assigns nodes to their most likely divergence time after integrating over all possible configurations of the other nodes. +- `--clock-std-dev 0.00003` - standard deviation of the provided clock rate estimate. +- `--relax 1.0 0` - use an autocorrelated molecular clock. A coupling of 0 (`--relax 1.0 0`) corresponds to an uncorrelated clock. + +We can look at the output folder: + +```bash +ls results/treetime +``` + +``` +aligned_pseudogenomes_masked_no_outgroups.fas dates.tsv root_to_tip_regression.pdf timetree.pdf +ancestral_sequences.fasta divergence_tree.nexus sequence_evolution_model.txt trace_run.log +auspice_tree.json molecular_clock.txt substitution_rates.tsv +branch_mutations.txt Nam_TB_rooted_no_outgroup.treefile timetree.nexus +``` + +There are several files produced by TreeTime but the most useful ones for our purposes are: + +- `root_to_tip_regression.pdf` - contains a plot showing the correlation between root-to-tip distance and collection date. +- `timetree.pdf` - contains a plot showing the time-scaled phylogenetic tree produced by TreeTime. +- `timetree.nexus` - contains the time-scaled phylogeny in NEXUS format. + +:::{.callout-exercise} +#### Exercise: Build a time-scaled phylogenetic tree + +We've provided a script, `08-run_treetime.sh`, to remove the outgroup from the alignment and tree and then run `TreeTime`. + +- Activate the software environment: `mamba activate treetime`. 
+- Run the script with `bash scripts/08-run_treetime.sh`. If the script is running successfully it should print a message on the screen as `TreeTime` constructs a time-scaled phylogenetic tree. +- Assess the strength of the molecular clock signal in the data. +- Examine the dates estimated by TreeTime. Do these seem reasonable based on what we know about the MTBC? + +:::{.callout-answer} + +We ran the script using `bash scripts/08-run_treetime.sh`. The script prints a message while it's running: + +```bash +0.00 -TreeAnc: set-up + +153.00 TreeTime.reroot: with method or node: least-squares + +153.00 TreeTime.reroot: rerooting will ignore covariance and shared ancestry. + +153.04 TreeTime.reroot: with method or node: least-squares + +153.04 TreeTime.reroot: rerooting will ignore covariance and shared ancestry. + +164.17 ###TreeTime.run: INITIAL ROUND +... +``` + +We opened the `root_to_tip_regression.pdf` file in the `results/treetime` directory: + +![Root-to-tip correlation](images/root_to_tip_regression.pdf) + +We can see that the correlation between root-to-tip distance and collection date isn't particularly strong. However, the inferred mutation rate (the slope) is 1.14e-06 mutations per site per year (~5 mutations per genome per year), which is around 10 times as fast as published estimates for the mutation rate of TB. This is the mutation rate that TreeTime uses as its prior, so we should bear that in mind when we consider the dates in the time-scaled phylogenetic tree. + +Then we looked at the `timetree.pdf` file: + +![Time-scaled phylogenetic tree](images/timetree.pdf) + +We can see that the date of the MRCA for the genomes we included in our analysis is around 1906. Given this is the MRCA of the L2 and L4 genomes in our dataset, this is likely to be a gross under-estimate (look at the Bos tree in the introduction, which inferred the split between L2 and L4 to have occurred at least 1779 years before present). The likely reasons for this result include the small number of genomes we've included, the narrow range of their collection dates, the high mutation rate taken from the root-to-tip regression and the fact that we've included three L2 genomes in the dataset. The analysis was run this way to make use of the results we'd already generated, so in practice you'd want to give more careful consideration to which genomes you include when building a time-scaled phylogenetic tree. For instance, it may have been better to remove the L2 genomes and just calculate the time-scaled evolutionary history of the Namibian L4 genomes. + +::: + +::: + +### Visualizing time-scaled phylogenetic trees + +Now that we've generated a time-scaled phylogenetic tree with `TreeTime`, we can use `FigTree` to visualize the tree (the PDF produced by `TreeTime` doesn't include the sample IDs and we can't edit it). You can open `FigTree` from the terminal by running the command `figtree`. + +To open the tree: + +1. Go to File > Open... and browse to the `results/treetime` folder with the `TreeTime` output files. + +2. Select the `timetree.nexus` file and click Open. You will be presented with a visual representation of the tree: + +![](images/timetree_figtree_1.png) + +3. First, let's add the time scale on the bottom. Click the arrow and tick box next to **Scale Axis**. Then click the tick box next to **Reverse axis**: + +![](images/timetree_figtree_2.png) + +4. Now we need to correct the scale to account for our sample collection dates. 
Click the arrow next to **Time Scale** and input **2017.8** (the date of the most recently sampled genome) in the box next to **Offset by:** + +![](images/timetree_figtree_3.png) + +## Summary + +::: {.callout-tip} +## Key Points + +- Temporal data can be incorporated into the inference of phylogenetic trees to provide estimates for dates of evolutionary events. +- The most commonly used tools to infer time-scaled phylogenies are Bayesian and often require significant computational resources as well as time to run. +- Molecular clock signal in your data can be assessed by looking for a positive correlation between sampling date and root-to-tip distance. +- TreeTime is a computational tool designed to estimate time-scaled phylogenies with a focus on efficiency and simplicity. +- FigTree can be used to visualize time-scaled phylogenetic trees. +::: + +## References \ No newline at end of file diff --git a/materials/14-transmission.md b/materials/17-transmission.md similarity index 97% rename from materials/14-transmission.md rename to materials/17-transmission.md index dbd7d41..2da7c5c 100644 --- a/materials/14-transmission.md +++ b/materials/17-transmission.md @@ -50,20 +50,20 @@ The option we used is: The pairwise SNP matrix will be saved to the `results/transmission/` directory. -Alternatively we've provided a script, `07-run_pairsnp.sh` in the `scripts` directory which could be used instead with `bash`: +Alternatively we've provided a script, `08-run_pairsnp.sh` in the `scripts` directory which could be used instead with `bash`: ```bash -bash scripts/07-run_pairsnp.sh +bash scripts/08-run_pairsnp.sh ``` ## Calculating and plotting transmission networks in R -Now that we've generated a pairwise SNP distance matrix, we can use **R** to calculate and plot our transmission network using a pre-determined threshold of **5** SNPs to identify putative transmission events. Open RStudio then open the script `08-transmission.R` in the `scripts` directory. Run the code in the script, going line by line (remember in RStudio you can run code from the script panel using Ctrl + Enter). As you run the code check the tables that are created (in your "Environment" panel on the top-right) and see if the SNP matrix was correctly imported. Once you reach the end of the script, you should have created a plot showing the putative transmission networks identified in the data with the nodes coloured by Sex and the pairwise SNP distances shown along the edges: +Now that we've generated a pairwise SNP distance matrix, we can use **R** to calculate and plot our transmission network using a pre-determined threshold of **5** SNPs to identify putative transmission events. Open RStudio then open the script `09-transmission.R` in the `scripts` directory. Run the code in the script, going line by line (remember in RStudio you can run code from the script panel using Ctrl + Enter). As you run the code check the tables that are created (in your "Environment" panel on the top-right) and see if the SNP matrix was correctly imported. 
Once you reach the end of the script, you should have created a plot showing the putative transmission networks identified in the data with the nodes coloured by Sex and the pairwise SNP distances shown along the edges: ![Putative transmission networks generated using a 5 SNP threshold](images/5_snp_network.png) :::{.callout-exercise} -#### Adjust the pairwise SNP threshold +#### Exercise: Adjust the pairwise SNP threshold As discussed in the introduction above, various SNP thresholds are used when inferring putative transmission networks in TB. We used the most conservative threshold of 5 SNPs. For this exercise: diff --git a/materials/17-intro_staph.md b/materials/18-intro_staph.md similarity index 100% rename from materials/17-intro_staph.md rename to materials/18-intro_staph.md diff --git a/materials/18-assembly_annotation.md b/materials/19-assembly_annotation.md similarity index 100% rename from materials/18-assembly_annotation.md rename to materials/19-assembly_annotation.md diff --git a/materials/20-assemblebac.md b/materials/20-assemblebac.md index 732b4cc..e4ceddf 100644 --- a/materials/20-assemblebac.md +++ b/materials/20-assemblebac.md @@ -33,7 +33,7 @@ Along with the outputs produced by the above tools, the pipeline produces the fo ## Prepare a samplesheet -As with `bacQC` and `bactmap`, we need to prepare a CSV file containing the information about our sequencing files, which will be used as an input to the `assembleBAC` pipeline. Refer back to the [bacQC pipeline](07-bacqc.md#prepare-a-samplesheet) page for how to do this. +As with `bacQC` and `bactmap`, we need to prepare a CSV file containing the information about our sequencing files, which will be used as an input to the `assembleBAC` pipeline. Refer back to the [bacQC pipeline](09-bacqc.md#prepare-a-samplesheet) page for how to do this. ## Running assembleBAC @@ -67,12 +67,12 @@ Remember, the first step of any analysis of a new sequence dataset is to perform ::: :::{.callout-exercise} -#### Running assembleBAC +#### Exercise: Running assembleBAC Your next task is to run the **assembleBAC** pipeline on your data. Make sure you start this exercise from the `S_aureus` directory. - If you haven't done so already, make sure to create a samplesheet for your samples. - This follows the same format as detailed for the [bacQC pipeline](07-bacqc.md#prepare-a-samplesheet), so you can use the same python script as shown in that section. + This follows the same format as detailed for the [bacQC pipeline](09-bacqc.md#prepare-a-samplesheet), so you can use the same python script as shown in that section. - In the folder `scripts` (within your analysis directory) you will find a script named `01-run_assemblebac.sh`. This script contains the code to run this pipeline. Edit this script, adjusting it to fit your input files and the estimated genome size of _Staphylococcus aureus_. diff --git a/materials/21-assembly_qc.md b/materials/21-assembly_qc.md index 9c3b32b..83b399a 100644 --- a/materials/21-assembly_qc.md +++ b/materials/21-assembly_qc.md @@ -100,10 +100,8 @@ Assessing the accuracy of our genome includes addressing issues such as: Assessing these aspects of a genome assembly can be challenging, primarily because the true state of the organism's genome is often unknown, especially in the case of new genome assemblies. -## Exercises - :::{.callout-exercise} -#### Assembly contiguity +#### Exercise: Assembly contiguity To assess the contiguity of your assemblies, you ran `QUAST` as part of the assembleBAC pipeline. 
Open the file `transposed_report.tsv` in the `preprocessed/assemblebac/metadata` directory. This should open the file in _Excel_. - Answer the following questions: @@ -132,7 +130,7 @@ Isolate ERX3876939_ERR3864886_T1_contigs has more than 200 contigs. A this is hi ::: :::{.callout-exercise} -#### Assembly completeness +#### Exercise: Assembly completeness To assess the completeness of your assembly, we ran the _CheckM2_ software on your assembly files as part of the assembleBAC pipeline. Go to the ``preprocessed/assemblebac/metadata/`` directory and open the `checkm2_summary.tsv` file in _Excel_. diff --git a/materials/22-strain_typing.md b/materials/22-strain_typing.md new file mode 100644 index 0000000..f838bf0 --- /dev/null +++ b/materials/22-strain_typing.md @@ -0,0 +1,111 @@ +--- +title: "Typing bacteria using MLST" +--- + +::: {.callout-tip} +## Learning Objectives + +- Describe what multilocus sequence typing (MLST) is and its use for genomic surveillance. +- Understand that closely related Sequence Types can be grouped into Clonal Complexes. +- Perform MLST using command line software. + +::: + +Strain typing of bacteria is a critical process in microbiology that allows for the identification and differentiation of bacterial strains within a species. Traditional lab-based methods for bacterial strain typing include techniques such as serotyping, phage typing, and antibiograms. Serotyping involves the identification of distinct bacterial strains based on their unique surface antigens, while phage typing differentiates strains by their susceptibility to specific bacteriophages. Antibiograms, on the other hand, classify bacteria based on their patterns of antibiotic resistance. Although these methods are valuable and widely used, they can be labor-intensive, time-consuming, and sometimes lack the resolution needed to distinguish closely related strains. + +## Typing bacteria using MLST + +Over the past 25 years, molecular techniques like Multilocus Sequence Typing (MLST) have revolutionized bacterial strain typing. MLST involves sequencing internal fragments of multiple housekeeping genes and assigning a unique allelic profile to each strain based on the sequence variation at these loci. Each unique combination of alleles is called a sequence type (ST) and the [PubMLST project](https://pubmlst.org/) curates and maintains these sequence types in the form of large, comprehensive databases that facilitate global surveillance of bacterial populations and enhance our understanding of bacterial evolution and diversity. Although this method might seem a bit old-fashioned in the age of genomic analysis (as it focuses on only 7 genes), it offers a uniform and comparable way to categorize strains across different labs and locations. + +![Multilocus Sequence Typing (MLST)](images/mlst.png) + +### Clonal complexes + +Clonal complexes (CCs) are groups of related STs that share a high degree of genetic similarity, typically differing by only a single allele. By comparing ST profiles using algorithms such as eBURST or clustering methods, closely related STs are grouped into CCs, reflecting their evolutionary relationships and likely descent from a common ancestor. This assignment of STs to CCs allows researchers to study the population structure, epidemiology, and evolutionary history of bacterial species, facilitating the tracking of disease outbreaks and the identification of major lineages responsible for infections. + +![Common _S. 
aureus_ Clonal Complexes (CCs) composed of closely related STs (Vivoni 2006)](images/clonal-complexes.jpeg) + +## MLST with command line {#sec-mlst-cli} + +We start our analysis by activating our software environment, to make all the necessary tools available: + +```bash +mamba activate mlst +``` + +We're going to run a tool called `mlst` on the assemblies we generated with `assembleBAC`: + +```bash +# create output directory +mkdir -p results/mlst + +# run mlst +mlst --scheme saureus results/assemblebac/assemblies/*.fa > results/mlst/mlst_typing.tsv +``` + +This command outputs a tab-delimited file (TSV), which we can open in spreadsheet software such as _Excel_. +Here is the result for our samples: + +``` +results/assemblebac/assemblies/ERX3876905_ERR3864852_T1_contigs.fa saureus 398 arcC(3) aroE(35) glpF(19) gmk(2) pta(20) tpi(26) yqiL(39) +results/assemblebac/assemblies/ERX3876907_ERR3864854_T1_contigs.fa saureus 34 arcC(8) aroE(2) glpF(2) gmk(2) pta(6) tpi(3) yqiL(2) +results/assemblebac/assemblies/ERX3876908_ERR3864855_T1_contigs.fa saureus 97 arcC(3) aroE(1) glpF(1) gmk(1) pta(1) tpi(5) yqiL(3) +results/assemblebac/assemblies/ERX3876909_ERR3864856_T1_contigs.fa saureus 30 arcC(2) aroE(2) glpF(2) gmk(2) pta(6) tpi(3) yqiL(2) +results/assemblebac/assemblies/ERX3876929_ERR3864876_T1_contigs.fa saureus 45 arcC(10) aroE(14) glpF(8) gmk(6) pta(10) tpi(3) yqiL(2) +``` + +We get a column for each of the 7 genes used for _S. aureus_ sequence typing, with the gene name followed by the allele number in parentheses. +The allele number is an identifier used by PubMLST, and it means that allele has a specific sequence with a certain set of variants ([search for alleles here](https://pubmlst.org/bigsdb?db=pubmlst_saureus_seqdef&page=alleleQuery)). +For example, `arcC(3)` corresponds to [allele 3 of the _arcC_ gene](https://pubmlst.org/bigsdb?db=pubmlst_saureus_seqdef&page=alleleInfo&locus=arcC&allele_id=3). + +The command line version of `mlst` also reports when an allele has an inexact match to the allele in the database, with the following notation (copied from [the README documentation](https://github.com/tseemann/mlst)): + +| Symbol | Meaning | Length | Identity | | ------ | ----------------------------------------- | ----------------- | -------------- | | `n` | Exact intact allele | 100% | 100% | | `~n` | Novel full length allele _similar_ to `n` | 100% | ≥ `--minid` | | `n?` | Partial match to known allele | ≥ `--mincov` | ≥ `--minid` | | `-` | Allele missing | < `--mincov` | < `--minid` | | `n,m` | Multiple alleles | | | + +The third column of the output indicates the Sequence Type (ST) of our samples based on the combination of the 7 alleles identified by `mlst`. + +:::{.callout-exercise} +#### Exercise: MLST with assemblebac + +The sharp-eyed amongst you may have noticed that `assemblebac` runs `mlst` alongside assembly and annotation. + +- Open the `mlst_summary.tsv` file in the `results/assemblebac/metadata` directory. +- Are the assigned alleles and Sequence Types the same as those you obtained by running `mlst` yourself? 
+ +:::{.callout-answer} + +- We opened the `mlst_summary.tsv` file which contained the following: + +``` +ERX3876905_ERR3864852_T1_contigs.fa saureus 398 arcC(3) aroE(35) glpF(19) gmk(2) pta(20) tpi(26) yqiL(39) +ERX3876907_ERR3864854_T1_contigs.fa saureus 34 arcC(8) aroE(2) glpF(2) gmk(2) pta(6) tpi(3) yqiL(2) +ERX3876908_ERR3864855_T1_contigs.fa saureus 97 arcC(3) aroE(1) glpF(1) gmk(1) pta(1) tpi(5) yqiL(3) +ERX3876909_ERR3864856_T1_contigs.fa saureus 30 arcC(2) aroE(2) glpF(2) gmk(2) pta(6) tpi(3) yqiL(2) +ERX3876929_ERR3864876_T1_contigs.fa saureus 45 arcC(10) aroE(14) glpF(8) gmk(6) pta(10) tpi(3) yqiL(2) +``` +- The alleles and STs were the same as those assigned when we ran `mlst` ourselves. + +::: +::: + +## Summary + +::: {.callout-tip} +#### Key Points + +- MLST (Multilocus Sequence Typing) is a genotyping method used to identify and categorize bacterial strains based on the sequences of specific housekeeping genes. +- It aids in tracking and monitoring the spread of bacterial pathogens, understanding their genetic diversity, and detecting outbreaks. +- MLST results reveal the Sequence Types (STs) of bacterial strains, which can help in identifying clonal complexes and their relatedness. +- Dedicated command-line software such as `mlst` allows for automation and gives more detailed output. + +::: + +#### References + +Vivoni AM, Diep BA, de Gouveia Magalhães AC, Santos KR, Riley LW, Sensabaugh GF, Moreira BM. Clonal composition of _Staphylococcus aureus_ isolates at a Brazilian university hospital: identification of international circulating lineages. _J Clin Microbiol._ 2006. [DOI](https://doi.org/10.1128/JCM.44.5.1686-1691.2006) \ No newline at end of file diff --git a/materials/22-pan_genomes.md b/materials/23-pan_genomes.md similarity index 100% rename from materials/22-pan_genomes.md rename to materials/23-pan_genomes.md diff --git a/materials/23-panaroo.md b/materials/24-panaroo.md similarity index 99% rename from materials/23-panaroo.md rename to materials/24-panaroo.md index a489370..08a7a9a 100644 --- a/materials/23-panaroo.md +++ b/materials/24-panaroo.md @@ -133,7 +133,7 @@ This is a bit more advanced, and is included here for interested users. We alrea ::: :::{.callout-exercise} -#### Core genome alignment +#### Exercise: Core genome alignment Using Panaroo, perform a core genome alignment for your assembled sequences. @@ -186,7 +186,7 @@ core_alignment_filtered_header.embl gene_presence_absence.csv ::: :::{.callout-exercise} -#### Tree inference +#### Exercise: Tree inference Because `Panaroo` takes a long time to run, we provide pre-processed results in the folder `preprocessed/panaroo`, which you can use as input to `IQ-TREE` in this exercise. diff --git a/materials/24-pathogenwatch.md b/materials/25-pathogenwatch.md similarity index 99% rename from materials/24-pathogenwatch.md rename to materials/25-pathogenwatch.md index 314678d..02ee283 100644 --- a/materials/24-pathogenwatch.md +++ b/materials/25-pathogenwatch.md @@ -127,7 +127,7 @@ You will see a table containing the drugs that **Pathogenwatch** is able to iden We will add some of this information to our phylogenetic tree in the next section. :::{.callout-exercise} -#### Downloading data from _Pathogenwatch_ +#### Exercise: Downloading data from _Pathogenwatch_ For the next step, [visualising our phylogeny](25-tree_visualization), you will need to download the results of the lineage typing and antibiotic susceptibility from `Pathogenwatch`: @@ -175,7 +175,7 @@ mv ~/Desktop/wbg-2023* . 
::: :::{.callout-exercise} -#### Preparing data for _Microreact_ +#### Exercise: Preparing data for _Microreact_ Now that we have analysed our genomes with `Pathogenwatch` and downloaded the typing and AMR profiles, we need to merge this metadata with the existing information we have for the 30 _S. aureus_ genomes. We could do this with _Excel_. Alternatively, we provide a _Python_ script called `merge_staph_data.py` in the `scripts` directory. diff --git a/materials/25-tree_visualization.md b/materials/26-tree_visualization.md similarity index 95% rename from materials/25-tree_visualization.md rename to materials/26-tree_visualization.md index fd62510..8c640a6 100644 --- a/materials/25-tree_visualization.md +++ b/materials/26-tree_visualization.md @@ -10,7 +10,7 @@ title: "Visualising phylogenies 2" ::: -In the [Visualising phylogenies](12-tree_visualization.md) chapter we used `Microreact` to visualize the phylogenetic tree of Namibian TB isolates. We're going to do the same with the _S.aureus_ phylogeny and metadata but use some of the other features in `Microreact` we didn't use before. +In the [Visualising phylogenies](14-tree_visualization.md) chapter we used `Microreact` to visualize the phylogenetic tree of Namibian TB isolates. We're going to do the same with the _S. aureus_ phylogeny and metadata but use some of the other features in `Microreact` we didn't use before. 1. First, copy `School_Staph.treefile` to the analysis directory so that it's in the same location as `Staph_metadata.tsv`: @@ -40,7 +40,7 @@ In the [Visualising phylogenies](12-tree_visualization.md) chapter we used `Micr 7. Save your project calling it something helpful like "School Staph". :::{.callout-exercise} -#### Add Legend and bar plot to the window +#### Exercise: Add Legend and bar plot to the window - Add a legend to the window so you can identify what each colour corresponds to. - Add a bar chart to the window showing the distribution of STs in each school. diff --git a/materials/26-intro_pneumo.md b/materials/27-intro_pneumo.md similarity index 100% rename from materials/26-intro_pneumo.md rename to materials/27-intro_pneumo.md diff --git a/materials/27-run_bactmap.md b/materials/28-run_bactmap.md similarity index 96% rename from materials/27-run_bactmap.md rename to materials/28-run_bactmap.md index 85cd010..ca9dac2 100644 --- a/materials/27-run_bactmap.md +++ b/materials/28-run_bactmap.md @@ -18,11 +18,11 @@ Remember, the first step of any analysis of a new sequence dataset is to perform :::{.callout-exercise} -#### Running nf-core/bactmap +#### Exercise: Running nf-core/bactmap Your next task is to run the **bactmap** pipeline on the _S. pneumoniae_ data. In the folder `scripts` (within the `S_pneumoniae` analysis directory) you will find a script named `01-run_bactmap.sh`. This script contains the code to run bactmap. -- First, create a `samplesheet.csv` file for `bactmap`. Refer back to [The bacQC pipeline](07-bacqc.md#prepare-a-samplesheet) page for how to do this, if you've forgotten. +- First, create a `samplesheet.csv` file for `bactmap`. Refer back to [The bacQC pipeline](09-bacqc.md#prepare-a-samplesheet) page for how to do this, if you've forgotten. - Edit the `01-run_bactmap.sh` script, adjusting it to fit your input files and the name and location of the reference you're going to map to (Hint: the reference sequence is located in `resources/reference`). 
@@ -92,7 +92,7 @@ The results for all the samples looked really good so we can keep all of them fo ::: :::{.callout-exercise} -#### How much of the reference was mapped? +#### Exercise: How much of the reference was mapped? - Activate the `seqtk` software environment. - Run the `02-pseudogenome_check.sh` script we've provided in the `scripts` folder, which calculates how much missing data there is for each sample using `seqtk comp`. diff --git a/materials/28-recombination.md b/materials/29-recombination.md similarity index 99% rename from materials/28-recombination.md rename to materials/29-recombination.md index af83752..4e11db9 100644 --- a/materials/28-recombination.md +++ b/materials/29-recombination.md @@ -102,7 +102,7 @@ The options we used are: The panel on the left shows the maximum-likelihood phylogeny built from the clonal frame of serotype isolates. The scale below shows the length of branches in base substitutions. The tree is coloured according to the classification of isolates, each of which corresponds to a row in the panel on the right. Each column in this panel is a base in the reference annotation, the annotation of which is shown at the top of the figure. The panel shows the distribution of inferred recombination events, which are coloured blue if they are unique to a single isolate, or red, if they are shared by multiple isolates through common ancestry. :::{.callout-exercise} -#### Run `Gubbins` +#### Exercise: Run `Gubbins` Using Gubbins, create a recombination-masked alignment. @@ -142,7 +142,7 @@ Along with the `Gubbins` outputs the script also created the masked alignment fi ::: :::{.callout-exercise} -#### Build post-Gubbins phylogeny +#### Exercise: Build post-Gubbins phylogeny Now that we have created a recombination-masked alignment, we can extract the variant sites and count of constant sites and use these to build a recombination-free phylogenetic tree with `IQ-TREE`. diff --git a/materials/30-poppunk.md b/materials/30-poppunk.md new file mode 100644 index 0000000..fd11a41 --- /dev/null +++ b/materials/30-poppunk.md @@ -0,0 +1,130 @@ +--- +title: "Typing bacteria using PopPUNK" +--- + +::: {.callout-tip} +## Learning Objectives + +- Understand that there are newer whole genome based methods that are increasingly being used instead of MLST. +- Use PopPUNK to assign Global Pneumococcal Sequence Clusters (GPSCs) to _Streptococcus pneumoniae_ genomes. + +::: + +## Newer methods for strain typing + +Although Multilocus Sequence Typing (MLST) has been highly effective, newer methods leveraging whole-genome sequencing (WGS) are beginning to replace it due to their enhanced resolution and comprehensiveness. WGS provides a complete picture of the bacterial genome, allowing for the identification of single nucleotide polymorphisms (SNPs) across the entire genome rather than just a limited set of housekeeping genes. This approach offers a finer scale of differentiation between strains, which is crucial for detailed epidemiological investigations and understanding transmission dynamics. Additionally, WGS facilitates the detection of horizontal gene transfer, virulence factors, and antibiotic resistance genes, providing a more holistic view of bacterial pathogenicity and evolution. Tools and platforms such as core genome MLST (cgMLST) and SNP-based phylogenetic analysis have been developed to utilize WGS data, offering higher resolution and more accurate strain typing. 
As sequencing technologies become faster and more cost-effective, WGS is increasingly becoming the gold standard for bacterial strain typing, enabling precise and comprehensive bacterial surveillance and research. + +## PopPUNK + +PopPUNK (Population Partitioning Using Nucleotide K-mers) is a bioinformatics tool designed for the rapid and scalable analysis of bacterial population structure using whole-genome sequencing data. It works by comparing genomes based on k-mer frequencies, where k-mers are short, fixed-length sequences of nucleotides. PopPUNK creates a distance matrix by calculating the genetic distances between all pairs of genomes, capturing both core and accessory genome variations. This matrix is then used to cluster genomes into distinct populations or clades using a combination of hierarchical and density-based clustering algorithms. PopPUNK can handle large datasets efficiently, providing insights into the evolutionary relationships, population dynamics, and epidemiological patterns of bacterial species. Its ability to integrate core and accessory genome data makes it a powerful tool for understanding microbial diversity and for tracking the spread of bacterial strains in public health contexts. + +### Global Pneumococcal Sequence Clusters (GPSCs) + +PopPUNK has been instrumental in assigning Global Pneumococcal Sequence Clusters (GPSCs), thereby enhancing our understanding of the worldwide population structure of _Streptococcus pneumoniae_. By analyzing extensive collections of whole-genome sequences from pneumococcal isolates collected globally, PopPUNK clusters these genomes into distinct lineages based on k-mer frequency comparisons. This high-resolution clustering allows for the identification of major lineages and sub-lineages, providing insights into the genetic diversity and evolutionary relationships among pneumococcal strains across different geographic regions. The tool has been used to trace the spread of specific lineages, monitor the impact of pneumococcal conjugate vaccines (PCVs) on the population structure, and identify emerging strains that may pose new public health threats. By integrating global genomic data, PopPUNK facilitates a comprehensive and detailed understanding of pneumococcal epidemiology, which is crucial for devising effective vaccination strategies and managing antibiotic resistance on a global scale. 
+ +![621 Global Pneumococcal Sequence Clusters (GPSCs) assigned with PopPUNK (Gladstone _et al._ 2019)](images/gpscs.png) + +## Running PopPUNK + +We'll start by activating the `poppunk` software environment: + +```bash +mamba activate poppunk +``` + +Now we need to create a tab-delimited file (called `assemblies.txt`) containing the samples we want to assign GPSCs to and where the assemblies are located: + +``` +ERX1265396_ERR1192012_T1 preprocessed/assemblebac/assemblies/ERX1265396_ERR1192012_T1_contigs.fa +ERX1265488_ERR1192104_T1 preprocessed/assemblebac/assemblies/ERX1265488_ERR1192104_T1_contigs.fa +ERX1501202_ERR1430824_T1 preprocessed/assemblebac/assemblies/ERX1501202_ERR1430824_T1_contigs.fa +ERX1501203_ERR1430825_T1 preprocessed/assemblebac/assemblies/ERX1501203_ERR1430825_T1_contigs.fa +ERX1501204_ERR1430826_T1 preprocessed/assemblebac/assemblies/ERX1501204_ERR1430826_T1_contigs.fa +``` + +To run PopPUNK on our assemblies, the following commands can be used: + +```bash +# create output directory +mkdir -p results/poppunk + +# run PopPUNK +poppunk_assign --db GPS_v8_ref --external-clustering GPS_v8_external_clusters.csv --query assemblies.txt --output results/poppunk --threads 8 +``` +The options we used are: + +- `--db` - GPSC reference database. +- `--external-clustering` - GPSC designations. +- `--query` - a 2-column tab-delimited file containing sample names and their assembly paths. +- `--output` - the output directory for the results. +- `--threads 8` - specifies how many CPUs to use. + +:::{.callout-exercise} +#### Exercise: Run `PopPUNK` + +Using `PopPUNK`, assign your Pneumococcal genomes to GPSCs. + +- Activate the software environment: `mamba activate poppunk`. +- Run the script we provide in `scripts` using `bash scripts/05-run_poppunk.sh`. +- When the analysis starts you will get several messages printed on the screen. +- Were any of the assemblies not assigned to an existing GPSC? + +:::{.callout-answer} +We ran the script using `bash scripts/05-run_poppunk.sh`. The script prints a message while it's running: + +```bash +PopPUNK: assign + (with backend: sketchlib v2.1.4 + sketchlib: /rds/user/ajv37/hpc-work/micromamba/envs/poppunk/lib/python3.10/site-packages/pp_sketchlib.cpython-310-x86_64-linux-gnu.so) +Mode: Assigning clusters of query sequences + + +Graph-tools OpenMP parallelisation enabled: with 8 threads +Sketching 5 genomes using 5 thread(s) +Progress (CPU): 5 / 5 +Writing sketches to file +Loading previously refined model +Completed model loading +WARNING: versions of input databases sketches are different, results may not be compatible +Calculating distances using 8 thread(s) +Progress (CPU): 100.0% +Loading network from /home/ajv37/rds/rds-pathogen-CvaldwrLQm4/databases/poppunk/GPS_v8_ref/GPS_v8_ref_graph.gt +Network loaded: 2946 samples + +Done +``` + +In the `results/poppunk` directory we can see the following files: + +``` +poppunk_clusters.csv poppunk.dists.npy poppunk.dists.pkl poppunk_external_clusters.csv poppunk.h5 poppunk_unword_clusters.csv +``` + +We can check the results of the clustering by opening the `poppunk_clusters.csv` file in the `results/poppunk` directory: + +``` +Taxon,Cluster +ERX1501203_ERR1430825_T1,2 +ERX1501202_ERR1430824_T1,2 +ERX1265396_ERR1192012_T1,2 +ERX1265488_ERR1192104_T1,2 +ERX1501204_ERR1430826_T1,2 +``` +- We can see that all the genomes were assigned to GPSC2. 
+ +::: + +::: + +## Summary + +::: {.callout-tip} +#### Key Points + +- There are whole genome based alternatives to MLST such as cgMLST and PopPUNK +- PopPUNK can be used to assign existing GPSCs to new genomes. +::: + +#### References + +Gladstone RA, Lo SW, Lees JA, Croucher NJ, van Tonder AJ, Corander J, Page AJ, Marttinen P, Bentley LJ, Ochoa TJ, Ho PL, du Plessis M, Cornick JE, Kwambana-Adams B, Benisty R, Nzenze SA, Madhi SA, Hawkins PA, Everett DB, Antonio M, Dagan R, Klugman KP, von Gottberg A, McGee L, Breiman RF, Bentley SD; Global Pneumococcal Sequencing Consortium. International genomic definition of pneumococcal lineages, to contextualise disease, antibiotic resistance and vaccine impact. _EBioMedicine_ 2019. [DOI](https://doi.org/10.1016/j.ebiom.2019.04.021) \ No newline at end of file diff --git a/materials/30-tree_visualization.md b/materials/30-tree_visualization.md deleted file mode 100644 index 7164dfb..0000000 --- a/materials/30-tree_visualization.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -title: "Visualising phylogenies 3" ---- - -::: {.callout-tip} -## Learning Objectives - -- Visualise and annotate a _S. pneumoniae_ phylogenetic tree. - -::: - -:::{.callout-exercise} -#### Visualize the Pneumococcal tree and edit the timeline - -- Copy the preprocessed `sero1.treefile` to the analysis directory so that it's in the same location as `pneumo_metadata.tsv` -- Upload the two files to `Microreact`. -- Change the **Colour Column** to `MLST ST (PubMLST)`. -- Edit the tree so the tip labels are shown and reduce the size of the leaf nodes. -- Root the tree with the reference sequence. -- Add the AMR profile as metadata blocks. -- Add a legend to the window so you can identify what each colour corresponds to. -- Edit the Timeline plot so that the data in the bars are normalized and summarised by month. -- Save your project as **Chaguza Serotype 1**. - -:::{.callout-answer} - -- We copied `sero1.treefile` to the analysis directory: - - ```bash - cp preprocessed/iqtree/sero1.treefile . - ``` - -- We uploaded the two files to `Microreact` by clicking **Upload** then the **+** button on the bottom-right corner then **Browse Files** to upload the files. -- We clicked on the **Eye** icon and changed the **Colour Column** to `MLST ST (PubMLST)`. -- We clicked on the left-hand of the two buttons in the phylogeny window then the drop down arrow next to **Nodes & Labels**, clicked the slider next to **Leaf Labels** and moved the slider next to **Leaf Nodes** to reduce the size. -- We right-clicked on the node leading to `NC_018630_1` and clicked **Set as Root (Re-root)** to root the tree. -- We click on the **Metadata blocks** button in the top-right corner and select all of the drugs in the list. -- We clicked on the **Legend** button on the far right-hand side of the window to show the legend. -- We clicked the left-hand of the two buttons in the Timeline window then changed **Type** to **Normalised Bar Chart** and **Unit** to **Month**. -- We click on the **Save** button in the top-right corner and changed the project name to **Chaguza Serotype 1**. 
- -![](images/microreact_pneumo.png) - -::: -::: diff --git a/materials/29-pathogenwatch.md b/materials/31-pathogenwatch.md similarity index 95% rename from materials/29-pathogenwatch.md rename to materials/31-pathogenwatch.md index 1d7659a..5d3f4b0 100644 --- a/materials/29-pathogenwatch.md +++ b/materials/31-pathogenwatch.md @@ -10,7 +10,7 @@ title: "Pathogenwatch 2" ::: :::{.callout-exercise} -#### Analysing Pneumococcal genomes with Pathogenwatch +#### Exercise: Analysing Pneumococcal genomes with Pathogenwatch Whilst you can upload FASTQ files to Pathogenwatch, it's quicker if we work with already assembled genomes. We've provided pre-processed results for the _S. pneumoniae_ data generated with `assembleBAC` in the `preprocessed/` directory which you can use to upload to Pathogenwatch in this exercise. @@ -22,7 +22,7 @@ Whilst you can upload FASTQ files to Pathogenwatch, it's quicker if we work with :::{.callout-hint} -Refer back to [Pathogenwatch](29-pathogenwatch.md) if you need a reminder on how to perform these tasks. +Refer back to [Pathogenwatch](25-pathogenwatch.md) if you need a reminder on how to perform these tasks. ::: diff --git a/materials/32-tree_visualization.md b/materials/32-tree_visualization.md new file mode 100644 index 0000000..2a3c621 --- /dev/null +++ b/materials/32-tree_visualization.md @@ -0,0 +1,53 @@ +--- +title: "Visualising phylogenies with ggtree" +--- + +::: {.callout-tip} +## Learning Objectives + +- Visualise and annotate a _S. pneumoniae_ phylogenetic tree with the R library ggtree. + +::: + +## Plotting phylogenetic trees with R + +Plotting phylogenetic trees with `ggtree` in R offers a powerful and flexible approach for visualizing evolutionary relationships. The `ggtree` package, built on the ggplot2 framework, enables the creation of complex and highly customizable phylogenetic tree plots. With `ggtree`, users can easily manipulate tree layouts, annotate nodes and branches, and incorporate additional data layers such as heatmaps or bar charts. The package supports various tree formats, including Newick, Nexus, and PhyloXML, making it versatile for different datasets. Researchers can enhance their plots with a wide array of aesthetic options, such as color coding by clade, adding tip labels, and highlighting specific evolutionary events or traits. This flexibility makes `ggtree` an essential tool for evolutionary biologists and bioinformaticians, facilitating the clear and informative presentation of phylogenetic data. + +:::{.callout-exercise} +#### Exercise: Plot the Pneumococcal tree with `ggtree` + +- Copy the preprocessed `sero1.treefile` to the analysis directory so that it's in the same location as `pneumo_metadata.tsv`. +- Open the script `06-plot_phylogeny.R` in the `scripts` directory in RStudio. +- Edit the script so it points to the `sero1.treefile` and `pneumo_metadata.tsv` files in the analysis directory. +- Run the script to generate the annotated phylogenetic tree. + +:::{.callout-answer} + +- We copied `sero1.treefile` to the analysis directory: + +```bash +cp preprocessed/iqtree/sero1.treefile . 
+``` + +- We edited the script to point to the location of the metadata and tree files: + +```R +metadata <- read_tsv("../pneumo_metadata.tsv") + +pneumo_phylogeny <- read.tree("../sero1.treefile") +``` + +- We ran the script to generate a rooted phylogenetic tree: + +![Serotype 1 phylogenetic tree generated with ggtree](images/sero1_tree.png) + +::: +::: + +## Summary + +::: {.callout-tip} +#### Key Points + +- ggtree is a highly customisable R package for generating publication quality images of phylogenetic trees. +::: \ No newline at end of file diff --git a/materials/31-intro_amr.md b/materials/33-intro_amr.md similarity index 100% rename from materials/31-intro_amr.md rename to materials/33-intro_amr.md diff --git a/materials/32-command_line_amr.md b/materials/34-command_line_amr.md similarity index 98% rename from materials/32-command_line_amr.md rename to materials/34-command_line_amr.md index d102179..2b0205f 100644 --- a/materials/32-command_line_amr.md +++ b/materials/34-command_line_amr.md @@ -93,15 +93,15 @@ The options we used are: - `--arg_skip_deeparg` - this skips a step in the analysis which uses the software _DeepARG_. We did this simply because this software takes a very long time to run. But in a real analysis you may want to remove this option so that _DeepARG_ is included. :::{.callout-exercise} -#### Running nf-core/funcscan +#### Exercise: Running funcscan -Your next task is to run the **funcscan** pipeline on your data. In the folder `scripts` (within your analysis directory) you will find a script named `05-run_funcscan.sh`. This script contains the code to run `funcscan`. +Your next task is to run the **funcscan** pipeline on your data. In the folder `scripts` (within your analysis directory) you will find a script named `07-run_funcscan.sh`. This script contains the code to run `funcscan`. - Edit this script, adjusting it to fit your input files and the name of your output directory. - Activate the `nextflow` software environment. -- Run the script using `bash scripts/05-run_funcscan.sh`. +- Run the script using `bash scripts/07-run_funcscan.sh`. While the pipeline runs, you will get progress printed on the screen, and then a message once it finishes. @@ -128,7 +128,7 @@ nextflow run nf-core/funcscan \ We ran the script as instructed using: ```bash -bash scripts/05-run_funcscan.sh +bash scripts/07-run_funcscan.sh ``` While it was running it printed a message on the screen: diff --git a/materials/33-funcscan_pathogenwatch.md b/materials/35-funcscan_pathogenwatch.md similarity index 89% rename from materials/33-funcscan_pathogenwatch.md rename to materials/35-funcscan_pathogenwatch.md index b919703..adcd021 100644 --- a/materials/33-funcscan_pathogenwatch.md +++ b/materials/35-funcscan_pathogenwatch.md @@ -36,9 +36,9 @@ Ultimately, the safest way to assess AMR is with **experimental validation**, by However, computational analysis such as what we did can help inform these experiments and treatment decisions. :::{.callout-exercise} -#### AMR with _Pathogenwatch_ +#### Exercise: AMR with _Pathogenwatch_ -Following from the _Pathogenwatch_ exercise in [Analysing Pneumococcal genomes with Pathogenwatch](29-pathogenwatch.md), open the "Chaguza Serotype 1" collection that you created and answer the following questions: +Following from the _Pathogenwatch_ exercise in [Analysing Pneumococcal genomes with Pathogenwatch](31-pathogenwatch.md), open the "Chaguza Serotype 1" collection that you created and answer the following questions: - Open the antibiotics summary table. 
- Do all your samples have evidence for antibiotic resistance? @@ -53,10 +53,6 @@ We can open the "Antibiotics" table from the top-left dropdown menu, as shown in We can see that _Pathogenwatch_ identified resistance to several antibiotics. We can see that there are mainly two distinct AMR-profiles in our samples. The first group is resistant to Tetracycline, Trimethoprim, Sulfamethoxazole and Co-Trimoxazole and susceptible to the other drugs. The second group is resistant to Fluoroquinolones and Sulfamethoxazole and only intermediate resistant to Co-Trimoxazole. Additionally, sample ERX1501242_ERR1430864 is only resistant to Tetracycline, and sample ERX1501229_ERR1430851 is resistant to Tetracycline and Sulfamethoxazole and intermediate resistant to Co-Trimoxazole. -If you go back to the [_Microreact_ visualization](30-tree_visualization.md) of the "Chaguza Serotype 1" dataset and look at the distribution of the AMR profiles across the tree, you will see that there is a strong phylogenetic association. The first group that includes Tetracycline resistance forms a distinct clade at the bottom of the tree whilst the second group, susceptible to Tetracycline forms a second, larger clade in the middle of the tree. - -![](images/microreact_pneumo.png) - If we look at the _funcscan_ results we can see that it identified Tetracycline resistance in the same 12 samples as _Pathogenwatch_. ::: diff --git a/materials/36-outbreak.md b/materials/36-outbreak.md new file mode 100644 index 0000000..36a3d41 --- /dev/null +++ b/materials/36-outbreak.md @@ -0,0 +1,31 @@ +--- +title: "OUTBREAK ALERT!" +--- + +We've been contacted by the World Health Organization who are dealing with an outbreak of Acute Watery Diarrhoea in an undisclosed country. They need some urgent bioinformatics analysis done on isolates collected as part of the outbreak and they’ve provided the raw data for us to analyse. 
+ +![](images/who-logo1.jpeg) + +## Analyse the data + +- Perform QC on the sequence data to identify the causative agent and check that the data is good enough for further analysis + +- Have a look at the “Know Your Bug” Flowchart and decide what the best analysis approach is + +- Use what you have learned this week to analyse the isolates and provide useful information that will help the WHO deal with the outbreak effectively + +- Have a think about what key information the WHO will want to know + +- Feel free to consult your colleagues if you need some help + +## Location of the data + +The data can be found in the following directory: + +```bash +outbreak/data +``` + + + + diff --git a/materials/34-group_exercise_2.md b/materials/37-group_exercise_2.md similarity index 95% rename from materials/34-group_exercise_2.md rename to materials/37-group_exercise_2.md index 82564eb..6074531 100644 --- a/materials/34-group_exercise_2.md +++ b/materials/37-group_exercise_2.md @@ -11,7 +11,7 @@ title: "Group Exercise 2" ## Final presentation -The final exercise we're going to get you to do this week is to prepare a 5 minute presentation to be presented to one of the audiences we used in [Group Exercise 1](13-group_exercise_1.md): +The final exercise we're going to get you to do this week is to prepare a 5 minute presentation to be presented to one of the audiences we used in [Group Exercise 1](15-group_exercise_1.md): - Field epidemiologist - Head of a public health lab diff --git a/materials/_chapters.yml b/materials/_chapters.yml index bbaa871..56e7370 100644 --- a/materials/_chapters.yml +++ b/materials/_chapters.yml @@ -3,55 +3,63 @@ book: # Edit this to include links to markdowns, which may be organised in different parts - part: "Introduction" chapters: - - materials/02-know_your_bug.md - - materials/01-intro_ngs.md - - materials/03-nextflow.md - - materials/04-nf_core.md + - materials/01-know_your_bug.md + - materials/02-preparing_data.md + - materials/03-intro_ngs.md + - materials/04-nextflow.md + - materials/05-nf_core.md + - materials/06-downloading.md - part: "Sequence quality control" chapters: - - materials/05-intro_tb.md - - materials/06-intro_qc.md - - materials/07-bacqc.md + - materials/07-intro_tb.md + - materials/08-intro_qc.md + - materials/09-bacqc.md - part: "Reference-based assembly" chapters: - - materials/08-mapping.md - - materials/09-bactmap.md + - materials/10-mapping.md + - materials/11-bactmap.md - part: "Phylogenetics" chapters: - - materials/10-phylogenetics.md - - materials/11-tb_profiler.md - - materials/12-tree_visualization.md - - materials/13-group_exercise_1.md + - materials/12-phylogenetics.md + - materials/13-tb_profiler.md + - materials/14-tree_visualization.md + - materials/15-group_exercise_1.md + - materials/16-dating.md - part: "Transmission" chapters: - - materials/14-transmission.md + - materials/17-transmission.md - part: "De-novo assembly and annotation" chapters: - - materials/17-intro_staph.md - - materials/18-assembly_annotation.md + - materials/18-intro_staph.md + - materials/19-assembly_annotation.md - materials/20-assemblebac.md - materials/21-assembly_qc.md + - materials/22-strain_typing.md - part: "Pan-genome analysis" chapters: - - materials/22-pan_genomes.md - - materials/23-panaroo.md - - materials/24-pathogenwatch.md - - materials/25-tree_visualization.md + - materials/23-pan_genomes.md + - materials/24-panaroo.md + - materials/25-pathogenwatch.md + - materials/26-tree_visualization.md - part: "Recombination" 
chapters: - - materials/26-intro_pneumo.md - - materials/27-run_bactmap.md - - materials/28-recombination.md - - materials/29-pathogenwatch.md - - materials/30-tree_visualization.md + - materials/27-intro_pneumo.md + - materials/28-run_bactmap.md + - materials/29-recombination.md + - materials/30-poppunk.md + - materials/31-pathogenwatch.md + - materials/32-tree_visualization.md - part: "Antimicrobial Resistance" chapters: - - materials/31-intro_amr.md - - materials/32-command_line_amr.md - - materials/33-funcscan_pathogenwatch.md + - materials/33-intro_amr.md + - materials/34-command_line_amr.md + - materials/35-funcscan_pathogenwatch.md + - part: "Outbreak!" + chapters: + - materials/36-outbreak.md - part: "Reporting" chapters: - - materials/34-group_exercise_2.md + - materials/37-group_exercise_2.md appendices: - materials/appendices/01-file_formats.md - materials/appendices/02-course_software.md \ No newline at end of file diff --git a/materials/appendices/02-course_software.md b/materials/appendices/02-course_software.md index 09e2ed6..18f9e01 100644 --- a/materials/appendices/02-course_software.md +++ b/materials/appendices/02-course_software.md @@ -3,91 +3,108 @@ title: Course Software number-sections: false --- -This page lists the tools used during this course (listed alphabetically). -The heading of each file links to a page with more details about each tool. +This page lists the tools used during this course (listed alphabetically). The heading of each file links to a page with more details about each tool. -### [Bakta](https://github.com/oschwengers/bakta) +### [`Bakta`](https://github.com/oschwengers/bakta) -annotation +`Bakta` is a software tool designed for the rapid and accurate annotation of bacterial genomes. It automates the process of identifying and categorizing genes and other functional elements within bacterial DNA sequences. -### [BCFtools](http://samtools.github.io/bcftools/bcftools.html) +### [`BCFtools`](http://samtools.github.io/bcftools/bcftools.html) -calls and filters variants +`BCFtools` is a set of utilities that manipulate variant calls in the Variant Call Format (VCF) and its binary counterpart, BCF. It is widely used for filtering, querying, merging, and annotating genomic variants, making it an essential tool in bioinformatics pipelines for genomic data analysis. -### [Bracken](https://ccb.jhu.edu/software/bracken/) +### [`Bracken`](https://ccb.jhu.edu/software/bracken/) -refines `Kraken2` assignments +`Bracken` (Bayesian Reestimation of Abundance with KrakEN) is a bioinformatics tool designed to improve the accuracy of species abundance estimation in metagenomic samples. It refines the results from Kraken by using Bayesian statistics to re-estimate the abundance of species, providing more precise and reliable microbial community profiles. -### [BWA](https://github.com/lh3/bwa) +### [`BWA`](https://github.com/lh3/bwa) -indexes reference fasta file +`BWA` (Burrows-Wheeler Aligner) is a fast and efficient software tool for aligning short DNA sequences to a reference genome. It supports various alignment algorithms, including BWA-MEM for high-quality alignments of sequences ranging from a few base pairs to hundreds of kilobases, making it a widely used tool in next-generation sequencing analysis. 
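As a minimal sketch of how `BWA` is typically used together with `SAMtools` (the file names here are placeholders, not files from the course materials):

```bash
# build the FM-index for the reference (only needed once per reference)
bwa index reference.fasta

# align paired-end reads and sort the alignments into an indexed BAM file
bwa mem -t 8 reference.fasta sample_1.fastq.gz sample_2.fastq.gz \
    | samtools sort -o sample.sorted.bam -
samtools index sample.sorted.bam
```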
-### [CheckM2](https://github.com/chklovski/CheckM2) +### [`CheckM2`](https://github.com/chklovski/CheckM2) -assembly completeness +`CheckM2` is an advanced bioinformatics tool used for assessing the quality and completeness of microbial genomes and metagenome-assembled genomes (MAGs). It provides accurate estimates of genome completeness and contamination by leveraging a curated set of marker genes, which helps ensure the reliability of genomic data for downstream analyses. -### [fastp](https://github.com/OpenGene/fastp) +### [`fastp`](https://github.com/OpenGene/fastp) -performs adapter/quality trimming on sequencing reads +`fastp` is a versatile and high-performance tool designed for preprocessing high-throughput sequencing data. It performs quality control, filtering, trimming, and adapter removal with a focus on speed and accuracy, making it an essential component of modern genomic data analysis pipelines. -### [fastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +### [`fastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) -A quality control tool for high throughput sequence data. +`FastQC` is a widely used bioinformatics tool that provides a comprehensive overview of the quality of high-throughput sequencing data. It generates detailed reports on various quality metrics, such as per-base sequence quality, GC content, and sequence duplication levels, helping researchers identify and address potential issues in their sequencing datasets. -### [fastq-scan](https://github.com/rpetit3/fastq-scan) +### [`fastq-scan`](https://github.com/rpetit3/fastq-scan) -calculates FASTQ summary statistics +`fastq-scan` is a lightweight and efficient tool designed to quickly summarize basic quality metrics from FASTQ files. It provides essential statistics, such as read length distribution and GC content, without the computational overhead of more comprehensive tools, making it ideal for initial quality assessments of sequencing data. -### [Gubbins](https://sanger-pathogens.github.io/gubbins/) +### [`Gubbins`](https://sanger-pathogens.github.io/gubbins/) -identifies recombinant regions (Optional) +`Gubbins` (Genealogies Unbiased By recomBinations In Nucleotide Sequences) is a bioinformatics tool used to detect and account for recombination in bacterial whole-genome sequence data. By identifying recombinant regions and reconstructing the underlying phylogeny, Gubbins helps ensure accurate evolutionary analyses and inferences. -### [IQ-TREE](http://www.iqtree.org/) +### [`IQ-TREE`](http://www.iqtree.org/) -### [Kraken2](https://ccb.jhu.edu/software/kraken2/) +`IQ-TREE` is a fast and efficient software tool for constructing phylogenetic trees based on maximum likelihood estimation. It supports a wide range of evolutionary models and provides advanced features such as ultrafast bootstrap approximation and model selection, making it a popular choice for high-quality phylogenetic analysis. -assigns taxonomic labels to reads +### [`Kraken 2`](https://ccb.jhu.edu/software/kraken2/) -### [mash](https://mash.readthedocs.io/en/latest/index.html) +`Kraken 2` is a powerful bioinformatics tool used for taxonomic classification of sequencing reads from metagenomic or genomic datasets. It rapidly assigns taxonomic labels to reads based on k-mer matches to a user-defined reference database, facilitating the characterization of microbial communities and the detection of pathogens in complex samples. 
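For example, a typical paired-end classification run looks like this (a sketch with placeholder paths; a `Kraken 2` database has to be downloaded or built separately):

```bash
# classify read pairs against a Kraken 2 database and write a summary report
kraken2 --db /path/to/kraken2_db \
    --paired sample_1.fastq.gz sample_2.fastq.gz \
    --report sample.kraken2.report.txt \
    --output sample.kraken2.out.txt
```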
-estimates genome size if not provided +### [`mash`](https://mash.readthedocs.io/en/latest/index.html) -### [mlst](https://github.com/tseemann/mlst) +`mash` is a computational tool used for fast genome and metagenome distance estimation based on k-mer similarity. It enables rapid comparison of large genomic datasets by compressing sequences into sketch files, allowing efficient estimation of pairwise distances between genomes or metagenomes. -Sequence Type assignment +### [`mlst`](https://github.com/tseemann/mlst) -### [MultiQC](https://multiqc.info/) +`mlst` facilitates the analysis of Multi-Locus Sequence Typing data by automating allele calling and sequence comparison across bacterial isolates. It generates profiles that define the sequence types (STs) of each isolate based on allele variations in designated loci, crucial for studying bacterial population dynamics and genetic diversity. -summarises and creates visualizations for outputs from `fastQC`, `fastp` and `MultiQC` +### [`MultiQC`](https://multiqc.info/) -### [Nextflow](https://www.nextflow.io/) +`MultiQC` is a versatile tool used for aggregating and visualizing quality control metrics from multiple bioinformatics analyses in a single comprehensive report. It simplifies the assessment of data quality across diverse sequencing experiments and facilitates rapid identification of potential issues or trends in large datasets. + +### [`Nextflow`](https://www.nextflow.io/) + +`Nextflow` is a data-driven workflow management system designed for scalable and reproducible scientific workflows. It simplifies the deployment and execution of complex computational pipelines across different computing environments, enhancing collaboration and facilitating the integration of diverse bioinformatics tools and data sources. ### [pairsnp](https://github.com/gtonkinhill/pairsnp) -### [Panaroo](https://gtonkinhill.github.io/panaroo/) +`pairsnp` is a bioinformatics tool that rapidly computes a pairwise SNP distance matrix from a multiple sequence alignment. It provides a fast and memory-efficient way to quantify how many SNPs separate each pair of sequences, aiding in the study of genetic variation and evolutionary relationships among bacterial or viral isolates. + +### [`Panaroo`](https://gtonkinhill.github.io/panaroo/) + +`Panaroo` is a bioinformatics tool used for pan-genome analysis, which identifies core and accessory genes across multiple genomes of related organisms. It efficiently clusters and annotates genes to reveal the genomic diversity within a species or strain collection, aiding in understanding evolutionary dynamics and functional differences among microbial populations. + +### [`QUAST`](https://quast.sourceforge.net/) + +`QUAST` (Quality Assessment Tool for Genome Assemblies) is a software tool used for evaluating the quality of genome assemblies. It provides comprehensive metrics and graphical reports to assess key aspects such as contiguity, accuracy, and completeness, aiding researchers in optimizing genome assembly strategies and comparing different assembly methods. + +### [`Rasusa`](https://github.com/mbhall88/rasusa) + +`Rasusa` is a bioinformatics tool for randomly subsampling (downsampling) sequencing reads to a specified depth of coverage or number of bases. Reducing very deep datasets to a target coverage, such as 100x, shortens runtimes for downstream analyses while retaining enough data for accurate results.
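As an illustration of the downsampling just described, a `Rasusa` command might look like the following; the file names are hypothetical and the exact interface varies between versions, so check `rasusa --help` on your installation:

```bash
# randomly subsample paired-end reads to ~100x coverage of a 4.3 Mb genome
rasusa --input deep_1.fastq.gz deep_2.fastq.gz \
    --coverage 100 \
    --genome-size 4.3mb \
    --output sub_1.fastq.gz sub_2.fastq.gz
```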
+ +### [`remove_blocks_from_aln.py`](https://github.com/sanger-pathogens/remove_blocks_from_aln) -### [Quast](https://quast.sourceforge.net/) +`remove_blocks_from_aln.py` is a Python script that removes or masks regions of a multiple sequence alignment (MSA) based on coordinates supplied in a tab-delimited or BED file. It is typically used to strip repetitive, recombinant, or otherwise problematic regions from a whole-genome alignment, improving downstream phylogenetic or evolutionary analyses. -assembly metrics +### [`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/) -### [Rasusa](https://github.com/mbhall88/rasusa) +`SAMtools` is a widely used software suite for manipulating sequencing alignment files in the SAM/BAM format. It provides essential functions such as file format conversion, sorting, indexing, and variant calling, making it indispensable in genomics research and variant analysis workflows. -downsamples fastq files to 100X by default (Optional) +### [`seqtk`](https://github.com/lh3/seqtk) -### [remove_blocks_from_aln.py](https://github.com/sanger-pathogens/remove_blocks_from_aln) +`seqtk` is a versatile toolkit for processing sequences in FASTA and FASTQ formats. It includes tools for subsetting sequences, extracting specific regions, filtering by quality, and converting between different sequence formats, facilitating various bioinformatics analyses and preprocessing tasks. -### [SAMtools](https://sourceforge.net/projects/samtools/files/samtools/) +### [`Shovill`](https://github.com/tseemann/shovill) -sorts and indexes alignments +`Shovill` is a bioinformatics tool designed for fast and accurate _de novo_ assembly of microbial genomes from Illumina sequencing data. It automates the assembly process, including read trimming, error correction, and scaffolding, providing comprehensive genome assemblies suitable for downstream genomic analysis and comparative genomics studies. -### [seqtk](https://github.com/lh3/seqtk) +### [`SNP-sites`](https://github.com/sanger-pathogens/snp-sites) -### [Shovill](https://github.com/tseemann/shovill) +`SNP-sites` is a software tool used for identifying and extracting variable sites (SNPs) from a multiple sequence alignment (MSA). It efficiently detects polymorphic positions across aligned sequences, essential for population genetics, phylogenetics, and evolutionary studies based on genomic data. -*de novo* genome assembly +### [`TB-Profiler`](https://github.com/jodyphelan/TBProfiler) -### [SNP-sites](https://github.com/sanger-pathogens/snp-sites) +`TB-Profiler` is a bioinformatics tool used for analyzing _Mycobacterium tuberculosis_ genomes to detect mutations associated with drug resistance and lineage classification. It provides a comprehensive analysis of resistance profiles and strain lineages based on genomic data, aiding in clinical diagnostics and epidemiological surveillance of tuberculosis. -extracts variant sites from whole genome alignment +### [`TreeTime`](https://treetime.readthedocs.io/en/latest/index.html) -### [TB-Profiler](https://github.com/jodyphelan/TBProfiler) \ No newline at end of file +`TreeTime` is a tool used for molecular clock phylogenetic analysis, which estimates the evolutionary timescale of genetic sequences by integrating sequence data with a phylogenetic tree. It performs ancestral reconstruction and divergence dating, providing insights into the timing of evolutionary events within microbial populations or viral lineages.
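To give a flavour of `TreeTime` usage, a time-scaled tree can be inferred from a phylogeny, an alignment, and sampling dates; this is a minimal sketch with hypothetical file names:

```bash
# estimate a time tree from a tree, an alignment and a CSV of sampling dates
treetime --tree tree.nwk \
    --aln alignment.fasta \
    --dates dates.csv \
    --outdir timetree_results
```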
\ No newline at end of file diff --git a/materials/images/Bos_MTBC_2014.jpg b/materials/images/Bos_MTBC_2014.jpg new file mode 100644 index 0000000..488319e Binary files /dev/null and b/materials/images/Bos_MTBC_2014.jpg differ diff --git a/materials/images/MTBC.png b/materials/images/MTBC.png new file mode 100644 index 0000000..18d517a Binary files /dev/null and b/materials/images/MTBC.png differ diff --git a/materials/images/clonal-complexes.jpeg b/materials/images/clonal-complexes.jpeg new file mode 100644 index 0000000..62e5ac8 Binary files /dev/null and b/materials/images/clonal-complexes.jpeg differ diff --git a/materials/images/gpscs.png b/materials/images/gpscs.png new file mode 100644 index 0000000..7004fbd Binary files /dev/null and b/materials/images/gpscs.png differ diff --git a/materials/images/mlst.png b/materials/images/mlst.png new file mode 100644 index 0000000..140d891 Binary files /dev/null and b/materials/images/mlst.png differ diff --git a/materials/images/nf-core-fetchngs_metro_map_grey.png b/materials/images/nf-core-fetchngs_metro_map_grey.png new file mode 100644 index 0000000..eb4b49b Binary files /dev/null and b/materials/images/nf-core-fetchngs_metro_map_grey.png differ diff --git a/materials/images/r2t_plot.png b/materials/images/r2t_plot.png new file mode 100644 index 0000000..ef5143c Binary files /dev/null and b/materials/images/r2t_plot.png differ diff --git a/materials/images/root_to_tip_regression.pdf b/materials/images/root_to_tip_regression.pdf new file mode 100644 index 0000000..c7a6b37 Binary files /dev/null and b/materials/images/root_to_tip_regression.pdf differ diff --git a/materials/images/sero1_tree.png b/materials/images/sero1_tree.png new file mode 100644 index 0000000..2d42116 Binary files /dev/null and b/materials/images/sero1_tree.png differ diff --git a/materials/images/timetree.pdf b/materials/images/timetree.pdf new file mode 100644 index 0000000..42e46dd Binary files /dev/null and b/materials/images/timetree.pdf differ diff --git a/materials/images/timetree_figtree_1.png b/materials/images/timetree_figtree_1.png new file mode 100644 index 0000000..5ac57c7 Binary files /dev/null and b/materials/images/timetree_figtree_1.png differ diff --git a/materials/images/timetree_figtree_2.png b/materials/images/timetree_figtree_2.png new file mode 100644 index 0000000..4e553fe Binary files /dev/null and b/materials/images/timetree_figtree_2.png differ diff --git a/materials/images/timetree_figtree_3.png b/materials/images/timetree_figtree_3.png new file mode 100644 index 0000000..6952a27 Binary files /dev/null and b/materials/images/timetree_figtree_3.png differ diff --git a/materials/images/who-logo1.jpeg b/materials/images/who-logo1.jpeg new file mode 100644 index 0000000..472586a Binary files /dev/null and b/materials/images/who-logo1.jpeg differ diff --git a/setup.md b/setup.md index 5a870e5..1d86fcf 100644 --- a/setup.md +++ b/setup.md @@ -119,6 +119,30 @@ For convenience, we recommend installing the popular Pandas package in the base mamba install -n base pandas ``` +#### Bakta + +```bash +mamba create -n bakta bakta +``` + +#### Gubbins + +```bash +mamba create -n gubbins gubbins +``` + +#### IQ-Tree + +```bash +mamba create -n iqtree iqtree snp-sites biopython +``` + +#### mlst + +```bash +mamba create -n mlst mlst +``` + #### Nextflow ```bash @@ -153,58 +177,49 @@ docker { " >> $HOME/.nextflow/config ```
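A quick way to confirm that any of these environments was created correctly is to activate it and call the tool; a minimal check, shown here for the `bakta` environment (the same pattern applies to the others):

```bash
# activate an environment created above and confirm its tool is on the PATH
mamba activate bakta
bakta --version
mamba deactivate
```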
- -#### Seqtk - -```bash -mamba create -n seqtk seqtk pandas -``` - -#### remove_blocks +#### pairsnp ```bash -mamba create -n remove_blocks python=2.7 -$HOME/miniforge3/envs/remove_blocks/bin/pip git+https://github.com/sanger-pathogens/remove_blocks_from_aln.git +mamba create -n pairsnp pairsnp ``` -#### IQ-Tree +#### Panaroo ```bash -mamba create -n iqtree iqtree snp-sites +mamba create -n panaroo python=3.9 'panaroo>=1.3' snp-sites ``` -#### TB-Profiler +#### PopPUNK ```bash -mamba create -n tb-profiler tb-profiler pandas +mamba create -n poppunk python=3.10 poppunk ``` - -#### Panaroo +#### remove_blocks_from_aln ```bash -mamba create -n panaroo python=3.9 panaroo>=1.3 snp-sites +mamba create -n remove_blocks python=2.7 +$HOME/miniforge3/envs/remove_blocks/bin/pip install git+https://github.com/sanger-pathogens/remove_blocks_from_aln.git ``` -#### Gubbins +#### Seqtk ```bash -mamba create -n gubbins gubbins +mamba create -n seqtk seqtk pandas ``` -#### pairsnp +#### TB-Profiler ```bash -mamba create -n pairsnp pairsnp +mamba create -n tb-profiler tb-profiler pandas ``` -#### Bakta +#### TreeTime ```bash -mamba create -n bakta bakta +mamba create -n treetime treetime seqkit biopython ``` - ### R and RStudio _R_ and _RStudio_ are available for all major operating systems. @@ -226,7 +241,7 @@ After installing R, you will need to install a few packages. Open _RStudio_ and on the console type the following command: ```r -install.packages(c("tidyverse", "tidygraph", "ggraph", "igraph")) +install.packages(c("tidyverse", "tidygraph", "ggraph", "igraph", "ggnewscale", "BiocManager")) +BiocManager::install("ggtree") ``` @@ -353,4 +368,13 @@ amrfinder_update --force_update --database db-light/amrfinderplus-db/ ```bash wget https://zenodo.org/records/5571251/files/checkm2_database.tar.gz?download=1 tar -xzf checkm2_database.tar.gz +``` + +#### GPSCs + +```bash +wget https://gps-project.cog.sanger.ac.uk/GPS_v8_ref.tar.gz +tar -xzf GPS_v8_ref.tar.gz + +wget https://gps-project.cog.sanger.ac.uk/GPS_v8_external_clusters.csv ``` \ No newline at end of file
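The GPSC reference database and external clusters file downloaded above are typically used with PopPUNK's assignment mode. A hedged sketch, where `qfile.txt` (a hypothetical tab-separated list of sample names and assembly paths) and the output directory name are assumptions:

```bash
# assign query assemblies to Global Pneumococcal Sequence Clusters (GPSCs)
mamba activate poppunk
poppunk_assign --db GPS_v8_ref \
    --external-clustering GPS_v8_external_clusters.csv \
    --query qfile.txt \
    --output gpsc_assignment
```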