This document is a walkthrough of the methods and code for comparative analysis of the bee gut restricted bacteria, Gilliamella and Snodgrassella genomes. In the paper, we (1) annotated the genomes; (2) performed ortholog assignment and constructed phylogenetic trees; (3) measured gene flow based on PopCOGenT, and (4) performed functional enrichment analysis using anvi'o. Genomes and results of major steps are available on Zenodo: https://doi.org/10.5281/zenodo.5209528
# Genomes can be download from Zenodo https://doi.org/10.5281/zenodo.5209528
# Install anvio version 7
# Make a list of genome names
ls *.fna | perl -ne '@a=split/\./; print "$a[0]\n";' > fna.name.list
# make anvio dbs and gene annotation
cat fna.name.list | while read i
do
echo $i
echo "step a: rename fasta files"
anvi-script-reformat-fasta $i\.fna -o $i\-fixed.fa -l 0 --simplify-names
mv $i\-fixed.fa $i\.fa
echo "step b: contigs -> database; predict ORF"
anvi-gen-contigs-database -f $i\.fa -o $i\.db -n "Snodgrassella" # or Gilliamella for Gilliamella genomes
echo "step c: run HMMs"
anvi-run-hmms -c $i\.db --num-threads 20
echo "step d: run NCBI COGs"
anvi-run-ncbi-cogs -c $i\.db --num-threads 20 --sensitive
echo "step e: export gff files"
anvi-get-sequences-for-gene-calls -c $i\.db --export-gff3 -o $i\.gff
echo "step f: annotate 16S rRNA genes"
anvi-get-sequences-for-hmm-hits -c $i\.db --hmm-source Ribosomal_RNAs --gene Bacterial_16S_rRNA -o $i\.16SrRNA
echo "step g: export the protein sequences"
anvi-get-sequences-for-gene-calls -c $i\.db --get-aa-sequences -o $i\.protein-sequences.fa
echo "step h: annotate gene functions using EGGNOG"
emapper.py -i $i\.protein-sequences.fa --output $i\.protein-sequences.fa_maNOG --data_dir /PATH_TO_EGGNOG/eggnog-mapper_v2.1.2/ --cpu 32 --override
echo "step i: annotate gene functions using KEGG"
anvi-run-kegg-kofams -c $i\.db --kegg-data-dir /PATH_TO_KEGG_DB_FROM_ANVIO/KEGG_db -T 12
# KEGG annotations following the tutorial on anvio
# https://merenlab.org/2018/01/17/importing-ghostkoala-annotations/
done
cat fna.name.list | while read i | while read i
do
echo $i
#singularity exec -B $PWD CrisprCasFinder.simg perl /usr/local/CRISPRCasFinder/CRISPRCasFinder.pl -so /usr/local/CRISPRCasFinder/sel392v2.so -cf /usr/local/CRISPRCasFinder/CasFinder-2.0.3 -drpt /usr/local/CRISPRCasFinder/supplementary_files/repeatDirection.tsv -rpts /usr/local/CRISPRCasFinder/supplementary_files/Repeat_List.csv -cas -def G -out ./crispr_out/$i -in $i\.fa -cpuMacSyFinder 8 1> ./crispr_out/$i.1.log 2> ./crispr_out/$i.2.log
done
# note that the ortholog.all.protein.fasta is from step 2 ortholog assignment
python run_dbcan.py ortholog.all.protein.fasta protein --out_dir output --dia_cpu 30 --hmm_cpu 30 --tf_cpu 30 --dia_cpu 30 --hotpep_cpu 30
# the create external genome file with path to each genome db (details here: https://merenlab.org/software/anvio/help/main/artifacts/external-genomes/)
anvi-gen-genomes-storage -e external-genomes.txt -o GGENOMES.db
# ortholog assignment
anvi-pan-genome -g GGENOMES.db --project-name "G_genomes" --output-dir GGENOMES --num-threads 50 --mcl-inflation 10
# export all annotated coding sequences and protein sequences:
anvi-get-sequences-for-gene-clusters -p GGENOMES/G_genomes-PAN.db -g GGENOMES.db -o ortholog.all.dna.fasta --report-DNA-sequences --max-num-genes-from-each-genome 1000
anvi-get-sequences-for-gene-clusters -p GGENOMES/G_genomes-PAN.db -g GGENOMES.db -o ortholog.all.protein.fasta --max-num-genes-from-each-genome 1000
# export ortholog assignment and functions in a table (these files are available on Zenodo: https://doi.org/10.5281/zenodo.5209528)
anvi-script-add-default-collection -p GGENOMES/G_genomes-PAN.db
anvi-summarize -p GGENOMES/G_genomes-PAN.db -g GGENOMES.db -C DEFAULT -o PAN_SUMMARY
For Snodgrassella:
/PATH_TO_IQ_TREE/iqtree-2.1.2-Linux/bin/iqtree2 -s concate.list.fasta -o Kingella_denitrificans_NA,Neisseria_meningitidis_NA -T AUTO -B 1000
For Gilliamella:
/PATH_TO_IQ_TREE/iqtree-2.1.2-Linux/bin/iqtree2 -s outgroups_concate.list.fasta -o Frischella_Ac13_Acerana,Frischella_DSM104328_Amellifera,Frischella_ESL0167_Amellifera,Frischella_PEB0191_Amellifera,Orbus_IPMB12_Zatratus,Orbus_hercynius_Sscrofa,Schmidhempelia_bombi_Bimpatiens -T 50 -B 1000
PopCOGenT can be installed from here: https://github.com/philarevalo/PopCOGenT
Results can be downloaded from Zenodo (https://doi.org/10.5281/zenodo.5209528)
# extract the distance information from PopCOGenT
cut -f 1,2,3 Gilliamella.length_bias.txt | perl -ne 'chomp; s/\t/ /; print "$_\n"; ' > RAxML_distances.dist
# create a list of genomes
less -S Gilliamella.length_bias.txt | perl -ne '@a=split; print "$a[0]\n$a[1]\n";' | sort | uniq > sample.txt
# convert distance in column format to a Phylip distance matrix (using this script https://github.com/lyy005/conspecifix_arbovirus/blob/master/step3_conSpeciFix/simulation_master_script_v1.0_beta/RAxML2phylip.py)
python RAxML2phylip.py
# make a tree based on the distance matrix using BIONJ_linux (http://www.atgc-montpellier.fr/bionj/binaries.php)
# use seq-gen to simulate sequences based on the tree without recombination
# GC content and sequence length are based on Snodgrassella wkB2
seq-gen -mGTR -l2527978 -n1 -f0.294,0.207,0.206,0.293 -of < matrix_sample.tree > simulation.fasta
# GC content and sequence length are based on Gilliamella wkB1
seq-gen -mGTR -l3139412 -n1 -f0.333,0.167,0.169,0.331 -of < matrix_sample.tree > simulation.fasta
anvi-get-enriched-functions-per-pan-group -p GGENOMES/G_genomes-PAN.db -g GGENOMES.db --category-variable AmAc_apis_apicola --annotation-source KeggGhostKoala -o funcEnrich.AmAcApisApicola_KeggGhostKoala.txt --functional-occurrence-table-output funcEnrich.AmAcApisApicola_KeggGhostKoala.frequency.txt --exclude-ungrouped
anvi-get-enriched-functions-per-pan-group -p GGENOMES/G_genomes-PAN.db -g GGENOMES.db --category-variable AmAc_apis_apicola --annotation-source 'IDENTITY' --include-gc-identity-as-function -o funcEnrich.AmAcApisApicola_orthologs.txt --functional-occurrence-table-output funcEnrich.AmAcApisApicola_orthologs.frequency.txt --exclude-ungrouped
AmB_shared_genes.list is the list of genes only shared by A. mellifera- and Bombus-derived Snodgrassella genomes. Apis_shared_genes.list is the list of genes only shared by A. mellifera- and other Apis-derived Snodgrassella genomes.
# extract cds sequences based on gene IDs from Powell et al. 2016 PNAS https://doi.org/10.1073/pnas.1610856113
perl pick_sequences_on_list_v2.pl GCF_000600005.1_ASM60000v1_cds_from_genomic.fna gene_list_essential.txt gene_list_essential.fasta
# BLAST annotated wkB2 genes in this study to the genes from NCBI
blastn -db gene_list_essential.fasta -query S_wkB2_Amellifera.fasta -out blast_essential.out -evalue 1e-10 -outfmt "6 qlen slen qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" -num_alignments 1 -perc_identity 99
# removed short fragments in the alignment
awk '($6 > 88){print}' blast_essential.out > blast_essential.out.filter_88bp
# combine the gene IDs and the essential gene list
perl combine_GC_geneID_powellID.pl S_wkB2_Amellifera.GC_geneID.list blast_essential.out.filter_88bp S_wkB2_Amellifera.GC_geneID_powellID.essential_list
# combine the gene functions
perl prep_for_heatmap_v2_essential.pl S_wkB2_Amellifera.GC_geneID_powellID.essential_list ortholog.table.col.sort powellID_essential.heatmap.input
# the useful output with gene family and functional information is: S_wkB2_Amellifera.GC_geneID_powellID.essential_list
# extract cds sequences based on gene IDs from Powell et al. 2016 PNAS https://doi.org/10.1073/pnas.1610856113
perl pick_sequences_on_list_v2.pl GCF_000600005.1_ASM60000v1_cds_from_genomic.fna gene_list_beneficial.txt gene_list_beneficial.fasta
# BLAST annotated wkB2 genes in this study to the genes from NCBI
blastn -db gene_list_beneficial.fasta -query S_wkB2_Amellifera.fasta -out blast_beneficial.out -evalue 1e-10 -outfmt "6 qlen slen qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore" -num_alignments 1 -perc_identity 99
# removed short fragments in the alignment
awk '($6 > 88){print}' blast_beneficial.out > blast_beneficial.out.filter_88bp
# combine the gene IDs and the beneficial gene list
perl combine_GC_geneID_powellID.pl S_wkB2_Amellifera.GC_geneID.list blast_beneficial.out.filter_88bp S_wkB2_Amellifera.GC_geneID_powellID.beneficial_list
# combine the gene functions
perl prep_for_heatmap_v2_beneficial.pl S_wkB2_Amellifera.GC_geneID_powellID.beneficial_list ortholog.table.col.sort powellID_beneficial.heatmap.input
# the useful output with gene family and functional information is: S_wkB2_Amellifera.GC_geneID_powellID.beneficial_list
Li Y., Leonard S.P., Powell J.E. Moran N.A., 2021.