diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..496ee2ca --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store \ No newline at end of file diff --git a/hic/FI1.hic b/hic/FI1.hic new file mode 100644 index 00000000..5b5315a9 Binary files /dev/null and b/hic/FI1.hic differ diff --git a/hic/FI1.html b/hic/FI1.html new file mode 100644 index 00000000..94f07f6e --- /dev/null +++ b/hic/FI1.html @@ -0,0 +1,93 @@ + +
+ + ++ AssemblyQC is a Nextflow pipeline which evaluates assembly quality with well-established tools and presents + the results in a unified HTML report. +
+Reference:
++ Rashid, U., Wu, C., Shiller, J., Smith, K., Crowhurst, R., Davy, M., Chen, T.-H., Thomson, S., & Deng, C. + (2024). AssemblyQC: A NextFlow pipeline for evaluating assembly quality (1.4). Zenodo. + 10.5281/zenodo.10647870. GitHub. + https://github.com/Plant-Food-Research-Open/assemblyqc +
+Only displaying parameters that differ from the pipeline defaults.
{
+ "runName": "condescending_koch",
+ "containerEngine": "apptainer",
+ "launchDir": "/powerplant/workspace/hrauxr/assemblyqc",
+ "workDir": "/powerplant/workspace/hrauxr/assemblyqc/work",
+ "projectDir": "/powerplant/workspace/hrauxr/assemblyqc",
+ "userName": "hrauxr",
+ "profile": "pfr,apptainer,test_full",
+ "input": "https://raw.githubusercontent.com/plant-food-research-open/assemblyqc/dev/assets/assemblysheet.csv",
+ "ncbi_fcs_adaptor_skip": "false",
+ "ncbi_fcs_adaptor_empire": "euk",
+ "ncbi_fcs_gx_skip": "false",
+ "ncbi_fcs_gx_tax_id": "35717",
+ "ncbi_fcs_gx_db_path": "/workspace/ComparativeDataSources/NCBI/FCS/GX/r2023-01-24",
+ "busco_skip": "false",
+ "busco_mode": "geno",
+ "busco_lineage_datasets": "fungi_odb10 hypocreales_odb10",
+ "tidk_skip": "false",
+ "tidk_repeat_seq": "TTTGGG",
+ "lai_skip": "false",
+ "kraken2_skip": "false",
+ "kraken2_db_path": "/workspace/ComparativeDataSources/kraken2db/k2_pluspfp_20230314",
+ "hic": "test_data/SRR8238190_R{1,2}.fastq.gz",
+ "synteny_skip": "false",
+ "synteny_xref_assemblies": "https://raw.githubusercontent.com/plant-food-research-open/assemblyqc/dev/assets/xrefsheet.csv",
+ "config_profile_name": "Plant&Food profile",
+ "config_profile_description": "Plant&Food profile using SLURM in combination with Apptainer"
+}
+
Following is a non-exhaustive list of tools used to generate this report.
+{
+ "KronaTools": "2.7.1",
+ "LTR_FINDER_parallel": "v1.1",
+ "LTR_HARVEST_parallel": "v1.1",
+ "LTR_retriever": "v2.9.9",
+ "Nextflow": "23.04.4",
+ "assemblathon_stats": "github/PlantandFoodResearch/assemblathon2-analysis/a93cba2",
+ "av_screen_x": "0.4",
+ "awk": "1.3.4 20200120",
+ "biopython": "1.75",
+ "bundlelinks": "24Sep2013",
+ "busco": "5.6.1",
+ "bwa": "0.7.17-r1188",
+ "circos": "v0.69-8",
+ "dnadiff": "1.3",
+ "fastp": "0.23.4",
+ "fastqc": "0.12.1",
+ "fcs_gx": "0.5",
+ "genometools": "1.6.5",
+ "grep": "(GNU grep) 3.4",
+ "gunzip": "1.10",
+ "hic_qc.py": "0+unknown",
+ "juicebox_scripts": "0.1.0",
+ "kraken2": "2.1.2",
+ "lai": "beta3.2",
+ "ltr_finder": "v1.07",
+ "matlock": "20181227",
+ "nucmer": "4.0.0rc1",
+ "pandas": "2.1.1",
+ "perl": "5.32.1",
+ "pigz": "2.3.4",
+ "plant-food-research-open/assemblyqc": "1.4",
+ "py_fasta_validator": "0.6",
+ "python": "3.8.13",
+ "run-assembly-visualizer.sh": "18 July 2016",
+ "samblaster": "0.1.26",
+ "samtools": "1.16.1",
+ "sed": "(GNU sed) 4.7",
+ "seqkit": "v2.6.1",
+ "sort": "8.30",
+ "tidk": "0.2.41",
+ "ubuntu": "20.04.6l",
+ "yaml": "5.4.1"
+}
+
FCS-adaptor detects adaptor and vector contamination in genome sequences.
+Reference:
++ https://github.com/ncbi/fcs +
+Version: 0.4
+No contamination detected.
FCS-GX detects contamination from foreign organisms in genome sequences.
+Reference:
++ Alexander Astashyn, Eric S Tvedte, Deacon Sweeney, Victor Sapojnikov, Nathan Bouk, Victor Joukov, Eyal + Mozes, Pooja K Strope, Pape M Sylla, Lukas Wagner, Shelby L Bidwell, Karen Clark, Emily W Davis, Brian + Smith-White, Wratko Hlavina, Kim D Pruitt, Valerie A Schneider, Terence D Murphy bioRxiv 2023.06.02.543519; + doi: + 10.1101/2023.06.02.543519, GitHub: + https://github.com/ncbi/fcs +
++ Version: 0.5 +
++ DB Version: 2023-01-24 +
+ + +No contamination detected.
+Bogus, Repeat, Low-coverage and Inconclusive results are labelled as No hits.
+A script to calculate a basic set of metrics from a genome assembly.
+Reference:
++ https://github.com/KorfLab/Assemblathon +
++ Version: github/PlantandFoodResearch/assemblathon2-analysis/a93cba2 +
+Warning:
++ Contig-related stats are based on the assumption that the assemblathon_stats_n_limit (100) parameter is specified correctly. If you + are not certain of the value of the n_limit parameter, please ignore the contig-related stats. +
+Stat | Value |
---|---|
Assembly | GCA_003814445.1_ASM381444v1_genomic.fna |
Number of scaffolds | 8 |
Total size of scaffolds | 35023690 |
Longest scaffold | 7872678 |
Shortest scaffold | 52960 |
Number of scaffolds > 1K nt | 8 |
Percentage of scaffolds > 1K nt | 100.0 |
Number of scaffolds > 10K nt | 8 |
Percentage of scaffolds > 10K nt | 100.0 |
Number of scaffolds > 100K nt | 7 |
Percentage of scaffolds > 100K nt | 87.5 |
Number of scaffolds > 1M nt | 7 |
Percentage of scaffolds > 1M nt | 87.5 |
Number of scaffolds > 10M nt | 0 |
Percentage of scaffolds > 10M nt | 0.0 |
Mean scaffold size | 4377961 |
Median scaffold size | 3434925 |
N50 scaffold length | 6201951 |
L50 scaffold count | 3 |
scaffold %A | 28.15 |
scaffold %C | 21.88 |
scaffold %G | 21.83 |
scaffold %T | 28.15 |
scaffold %N | 0.0 |
scaffold %non-ACGTN | 0.0 |
Number of scaffold non-ACGTN nt | 0 |
Percentage of assembly in scaffolded contigs | 0.0 |
Percentage of assembly in unscaffolded contigs | 100.0 |
Average number of contigs per scaffold | 1.0 |
Mean length of breaks (>=100Ns) between contigs in scaffold | 0 |
Number of contigs | 8 |
Number of contigs in scaffolds | 0 |
Number of contigs not in scaffolds | 8 |
Total size of contigs | 35023690 |
Longest contig | 7872678 |
Shortest contig | 52960 |
Number of contigs > 1K nt | 8 |
Percentage of contigs > 1K nt | 100.0 |
Number of contigs > 10K nt | 8 |
Percentage of contigs > 10K nt | 100.0 |
Number of contigs > 100K nt | 7 |
Percentage of contigs > 100K nt | 87.5 |
Number of contigs > 1M nt | 7 |
Percentage of contigs > 1M nt | 87.5 |
Number of contigs > 10M nt | 0 |
Percentage of contigs > 10M nt | 0.0 |
Mean contig size | 4377961 |
Median contig size | 3434925 |
N50 contig length | 6201951 |
L50 contig count | 3 |
contig %A | 28.15 |
contig %C | 21.88 |
contig %G | 21.83 |
contig %T | 28.15 |
contig %N | 0.0 |
contig %non-ACGTN | 0.0 |
Number of contig non-ACGTN nt | 0 |
+ A tool to calculate a basic set of statistics about features contained in GFF3 files. +
+Reference:
++ Gremme G, Steinbiss S, Kurtz S. GenomeTools: a comprehensive software library for efficient processing of + structured genome annotations. IEEE/ACM Trans Comput Biol Bioinform. 2013 May-Jun;10(3):645-56. doi: + 10.1109/TCBB.2013.68. PMID: 24091398. +
+Version: 1.6.5
+Stat | Value |
---|---|
parsed genome node DAGs | 7165 |
sequence regions | 8 (total length: 35023690) |
multi-features | 5951 |
genes | 7137 |
protein-coding genes | 7034 |
mRNAs | 7034 |
protein-coding mRNAs | 7034 |
exons | 20368 |
CDSs | 20265 |
introns | 13231 |
rRNAs | 3 |
regions | 8 |
tRNAs | 98 |
transcripts | 2 |
+ BUSCO estimates the completeness and redundancy of processed genomic data based on universal single-copy + orthologs. +
+Reference:
++ Manni M., Berkeley M.R., Seppey M., Simao F.A., Zdobnov E.M. 2021. BUSCO update: novel and streamlined + workflows along with broader and deeper phylogenetic coverage for scoring of eukaryotic, prokaryotic, and + viral genomes. arXiv:2106.11799 [q-bio] [Internet]. Available from: + arxiv.org/abs/2106.11799 +
+Version: 5.6.1
+Assembly | +Lineage | +Percentages | +
---|---|---|
+ FI1 + | ++ fungi_odb10 + | +C:98.4%[S:97.9%,D:0.5%],F:0.1%,M:1.5%,n:758 | +
+ FI1 + | ++ hypocreales_odb10 + | +C:96.3%[S:96.2%,D:0.1%],F:0.5%,M:3.2%,n:4494 | +
Event | +Value | +
---|---|
Search Percentages | +C:98.4%[S:97.9%,D:0.5%],F:0.1%,M:1.5%,n:758 | +
Event | Frequency |
---|---|
Complete BUSCOs (C) | 746 |
Complete and single-copy BUSCOs (S) | 742 |
Complete and duplicated BUSCOs (D) | 4 |
Fragmented BUSCOs (F) | 1 |
Missing BUSCOs (M) | 11 |
Total BUSCO groups searched | 758 |
Parameter | +Value | +
---|---|
Version | +5.6.1 | +
Lineage created on | +2024-01-08 | +
mode | +euk_genome_met | +
predictor | +metaeuk | +
Dependency | Version |
---|---|
hmmsearch | 3.1 |
bbtools | 39.01 |
metaeuk | 6.a5d39d9 |
Event | +Value | +
---|---|
Search Percentages | +C:96.3%[S:96.2%,D:0.1%],F:0.5%,M:3.2%,n:4494 | +
Event | Frequency |
---|---|
Complete BUSCOs (C) | 4325 |
Complete and single-copy BUSCOs (S) | 4321 |
Complete and duplicated BUSCOs (D) | 4 |
Fragmented BUSCOs (F) | 24 |
Missing BUSCOs (M) | 145 |
Total BUSCO groups searched | 4494 |
Parameter | +Value | +
---|---|
Version | +5.6.1 | +
Lineage created on | +2024-01-08 | +
mode | +euk_genome_met | +
predictor | +metaeuk | +
Dependency | Version |
---|---|
hmmsearch | 3.1 |
bbtools | 39.01 |
metaeuk | 6.a5d39d9 |
+ A toolkit to identify and visualise telomeric repeats for the Darwin Tree of Life genomes. +
+Reference:
++ https://github.com/tolkit/telomeric-identifier +
+Version: 0.2.41
++ Searched sequence: + AACCCTAACCCTAACCCTAACCCT +
++ Searched sequence: + TTTGGG +
++ LTR Assembly Index (LAI) is a reference-free genome metric that evaluates assembly continuity using LTR-RTs. + LTR retrotransposons (LTR-RTs) are the predominant interspersed repeat that is poorly assembled in draft + genomes. Correcting for LTR-RT amplification dynamics, LAI is independent of genome size, genomic LTR-RT + content, and gene space evaluation metrics such as BUSCO. LAI = Raw LAI + 2.8138 × (94 – whole genome LTR + identity). The LAI is set to 0 when raw LAI = 0 or the adjustment produces a negative value. Raw LAI = + (Intact LTR element length / Total LTR sequence length) * 100 +
+Reference:
++ Shujun Ou, Jinfeng Chen, Ning Jiang, Assessing genome assembly quality using the LTR Assembly Index (LAI), + Nucleic Acids Research, Volume 46, Issue 21, 30 November 2018, Page e126, + 10.1093/nar/gky730 +
+Version: beta3.2
+Assembly | +Results | +
---|---|
FI1 | ++ Intact: 0.0113, Total: 0.2065, Raw LAI: 5.50, LAI: 4.84 + | +
+ Kraken2 assigns taxonomic labels to sequencing reads for metagenomics projects. +
+Reference:
++ Wood, D.E., Lu, J. & Langmead, B. Improved metagenomic analysis with Kraken 2. Genome Biol 20, 257 (2019). + 10.1186/s13059-019-1891-0 +
+Version: 2.1.2
++ Hi-C contact mapping experiments measure the frequency of physical contact between loci in the genome. The + resulting dataset, called a “contact map,” is represented using a two-dimensional heatmap where the + intensity of each pixel indicates the frequency of contact between a pair of loci. +
+Reference:
++ Robinson JT, Turner D, Durand NC, Thorvaldsdóttir H, Mesirov JP, Aiden EL. Juicebox.js Provides a + Cloud-Based Visualization System for Hi-C Data. Cell Syst. 2018 Feb 28;6(2):256-258.e1. + 10.1016/j.cels.2018.01.001. Epub + 2018 Feb 7. PMID: 29428417; PMCID: PMC6047755. +
+Version: 2.4.3
++ Circos facilitates the identification and analysis of similarities and differences arising from comparisons + of genomes. The genome-wide alignments are performed with MUMMER. +
+References:
++ Krzywinski, M., Schein, J., Birol, I., Connors, J., Gascoyne, R., Horsman, D., ... & Marra, M. A. (2009). + Circos: an information aesthetic for comparative genomics. Genome research, 19(9), 1639-1645. + 10.1101/gr.092759.109 +
++ Marçais G, Delcher AL, Phillippy AM, Coston R, Salzberg SL, Zimin A. MUMmer4: A fast and versatile genome + alignment system. PLoS Comput Biol. 2018 Jan 26;14(1):e1005944. + 10.1371/journal.pcbi.1005944 +
++ Versions: v0.69-8 (CIRCOS), 4.0.0rc1 (MUMMER) +
+Notes:
+