diff --git a/_freeze/chapters/chapter_05/execute-results/html.json b/_freeze/chapters/chapter_05/execute-results/html.json index 759bcc3..31f1dee 100644 --- a/_freeze/chapters/chapter_05/execute-results/html.json +++ b/_freeze/chapters/chapter_05/execute-results/html.json @@ -1,8 +1,8 @@ { - "hash": "078e4ba8906e32b3de602a2ec43e1a3c", + "hash": "f0a90b71b235da71dc6d6f48390c91dc", "result": { "engine": "knitr", - "markdown": "---\nexecute: \n echo: true\n eval: false\n warning: false\n---\n\n\n# Runtime benchmark\n\nHere, we will perform a runtime benchmark for functions related to duplicate \nclassification and substitution rates calculation using model organisms.\n\nTo start, let's load the required data and packages.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nset.seed(123) # for reproducibility\n\n# Load required packages\nlibrary(doubletrouble)\nlibrary(here)\nlibrary(tidyverse)\nlibrary(patchwork)\n\nsource(here(\"code\", \"utils.R\"))\n\n# Load sample metadata for Ensembl instances\nload(here(\"products\", \"result_files\", \"metadata_all.rda\"))\n```\n:::\n\n::: {.cell}\n\n:::\n\n\n## Benchmark 1: `classify_gene_pairs()`\n\nHere, we will benchmark the performance of `classify_gene_pairs()`\nwith model organisms.\n\nFirst, let's get the genome and annotation data.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Create a data frame with names of model species and their Ensembl instances\nmodel_species <- data.frame(\n species = c(\n \"arabidopsis_thaliana\", \"caenorhabditis_elegans\", \n \"homo_sapiens\", \"saccharomyces_cerevisiae\",\n \"drosophila_melanogaster\", \"danio_rerio\"\n ),\n instance = c(\n \"plants\", \"metazoa\", \"ensembl\", \"fungi\", \"metazoa\", \"ensembl\"\n )\n)\n\n# For each organism, download data, and identify and classify duplicates\nmodel_duplicates <- lapply(seq_len(nrow(model_species)), function(x) {\n \n species <- model_species$species[x]\n instance <- model_species$instance[x]\n \n # Get annotation\n annot <- get_annotation(model_species[x, ], instance)\n \n # Get proteome and keep only primary transcripts\n seq <- get_proteomes(model_species[x, ], instance)\n seq <- filter_sequences(seq, annot)\n \n # Process data\n pdata <- syntenet::process_input(seq, annot, filter_annotation = TRUE)\n \n # Perform DIAMOND search\n outdir <- file.path(tempdir(), paste0(species, \"_intra\"))\n diamond <- syntenet::run_diamond(\n seq = pdata$seq,\n compare = \"intraspecies\", \n outdir = outdir,\n threads = 4,\n ... = \"--sensitive\"\n )\n \n fs::dir_delete(outdir)\n \n # Classify duplicates - standard mode\n start <- Sys.time()\n duplicate_pairs <- classify_gene_pairs(\n blast_list = diamond,\n annotation = pdata$annotation,\n scheme = \"standard\"\n )[[1]]\n end <- Sys.time()\n runtime <- end - start\n \n return(runtime)\n})\nnames(model_duplicates) <- gsub(\"_\", \" \", str_to_title(model_species$species))\n\n# Summarize results in a table\nbenchmark_classification <- data.frame(\n species = names(model_duplicates),\n time_seconds = as.numeric(unlist(model_duplicates))\n)\n\n# Save results\nsave(\n benchmark_classification, compress = \"xz\",\n file = here(\"products\", \"result_files\", \"benchmark_classification.rda\")\n)\n```\n:::\n\n\n## Benchmark 2: `pairs2kaks()`\n\nNext, we will benchmark the performance of `pairs2kaks()` for\nduplicate pairs in the *Saccharomyces cerevisiae* genome.\nWe will do it using a single thread, and using parallelization (with\n4 and 8 threads).\n\nFirst of all, let's get the required data for `pairs2kaks()`.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Load duplicate pairs for S. cerevisiae\nload(here(\"products\", \"result_files\", \"fungi_duplicates.rda\"))\nscerevisiae_pairs <- fungi_duplicates[\"saccharomyces_cerevisiae\"]\n\n# Get CDS for S. cerevisiae\nscerevisiae_cds <- get_cds_ensembl(\"saccharomyces_cerevisiae\", \"fungi\")\n```\n:::\n\n\nNow, we can do the benchmark. \n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Parallel back-end: SerialParam (1 thread)\nstart <- Sys.time()\nkaks <- pairs2kaks(\n scerevisiae_pairs, \n scerevisiae_cds,\n bp_param = BiocParallel::SerialParam()\n)\nend <- Sys.time()\nruntime_serial <- end - start\n\n# Parallel back-end: SnowParam, 4 threads\nstart <- Sys.time()\nkaks <- pairs2kaks(\n scerevisiae_pairs, \n scerevisiae_cds,\n bp_param = BiocParallel::SnowParam(workers = 4)\n)\nend <- Sys.time()\nruntime_snow4 <- end - start\n\n# Parallel back-end: SnowParam, 8 threads\nstart <- Sys.time()\nkaks <- pairs2kaks(\n scerevisiae_pairs, \n scerevisiae_cds,\n bp_param = BiocParallel::SnowParam(workers = 8)\n)\nend <- Sys.time()\nruntime_snow8 <- end - start\n\n# Summarize results in a table\nbenchmark_kaks <- data.frame(\n `Back-end` = c(\"Serial\", \"Snow, 4 threads\", \"Snow, 8 threads\"),\n Time_minutes = as.numeric(c(runtime_serial, runtime_snow4, runtime_snow8))\n) |>\n dplyr::mutate(\n Pairs_per_minute = nrow(scerevisiae_pairs[[1]]) / Time_minutes,\n Pairs_per_second = nrow(scerevisiae_pairs[[1]]) / (Time_minutes * 60)\n )\n\nsave(\n benchmark_kaks, compress = \"xz\",\n file = here(\"products\", \"result_files\", \"benchmark_kaks.rda\")\n)\n```\n:::\n\n\n## Benchmark 3: **doubletrouble** vs __DupGen_finder__\n\nHere, we will classify duplicate pairs in the *A. thaliana* genome \nusing __doubletrouble__ and DupGen_finder to assess if they produce the same \nresults, and compare their runtimes. \n\nFirst, let's get all data we need (proteomes, annotation, and DIAMOND tables).\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Get annotation\nsmeta <- data.frame(\n species = c(\"arabidopsis_thaliana\", \"amborella_trichopoda\"),\n instance = \"plants\"\n)\nannot <- get_annotation(smeta, \"plants\")\n\n# Get proteome and keep only primary transcripts\nseq <- get_proteomes(smeta, \"plants\")\nseq <- filter_sequences(seq, annot)\n \n# Process data\npdata <- syntenet::process_input(seq, annot, filter_annotation = TRUE)\n \n# Run intraspecies DIAMOND search\noutdir <- file.path(tempdir(), \"benchmark3_intra\")\ndiamond_intra <- syntenet::run_diamond(\n seq = pdata$seq,\n compare = \"intraspecies\", \n outdir = outdir,\n threads = 4,\n ... = \"--sensitive\"\n)\nfs::dir_delete(outdir)\n\n# Run interspecies DIAMOND search\noutdir2 <- file.path(tempdir(), \"benchmark3_inter\")\ncompare_df <- data.frame(\n query = \"arabidopsis.thaliana\", target = \"amborella.trichopoda\"\n)\n\ndiamond_inter <- syntenet::run_diamond(\n seq = pdata$seq,\n compare = compare_df, \n outdir = outdir2,\n threads = 4,\n ... = \"--sensitive\"\n)\nfs::dir_delete(outdir2)\n```\n:::\n\n\nNow, let's classify duplicates with __doubletrouble__.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Classify duplicates with doubletrouble\nstart1 <- Sys.time()\ndups1 <- classify_gene_pairs(\n blast_list = diamond_intra[2],\n annotation = pdata$annotation,\n blast_inter = diamond_inter,\n scheme = \"extended\",\n collinearity_dir = here(\"products\")\n)\nend1 <- Sys.time()\nruntime1 <- end1 - start1\n```\n:::\n\n\nNext, we will classify duplicates with DupGen_finder. For that, we will first\nexport input data in the required format.\n\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\n# Export data\n## .blast files\nb1 <- diamond_intra$arabidopsis.thaliana_arabidopsis.thaliana |>\n mutate(\n query = str_replace_all(query, \"^ara_\", \"\"),\n db = str_replace_all(db, \"^ara_\", \"\")\n )\n\nb2 <- diamond_inter$arabidopsis.thaliana_amborella.trichopoda |>\n mutate(\n query = str_replace_all(query, \"^ara_\", \"\"),\n db = str_replace_all(db, \"^amb_\", \"\")\n )\n\nwrite_tsv(b1, file = file.path(tempdir(), \"Ath.blast\"), col_names = FALSE)\nwrite_tsv(b2, file = file.path(tempdir(), \"Ath_Atr.blast\"), col_names = FALSE)\n\n## .gff files\ngff1 <- pdata$annotation$arabidopsis.thaliana |>\n as.data.frame() |>\n mutate(gene = str_replace_all(gene, \"^ara_\", \"\")) |>\n mutate(seqnames = str_replace_all(seqnames, \"ara_\", \"ara-\")) |>\n dplyr::select(seqnames, gene, start, end)\n\ngff2 <- pdata$annotation$amborella.trichopoda |>\n as.data.frame() |>\n mutate(gene = str_replace_all(gene, \"^amb_\", \"\")) |>\n mutate(seqnames = str_replace_all(seqnames, \"^amb_\", \"amb-\")) |>\n dplyr::select(seqnames, gene, start, end)\n\ngff2 <- bind_rows(gff1, gff2)\n\nwrite_tsv(gff1, file = file.path(tempdir(), \"Ath.gff\"), col_names = FALSE)\nwrite_tsv(gff2, file = file.path(tempdir(), \"Ath_Atr.gff\"), col_names = FALSE)\n\n# Classify duplicates with DupGen_finder\nargs = c(\n \"-i\", tempdir(), \"-t Ath -c Atr\",\n \"-o\", file.path(tempdir(), \"results\"),\n \"-e 1e-10\"\n)\n\nstart2 <- Sys.time()\nsystem2(\"DupGen_finder.pl\", args = args)\nend2 <- Sys.time()\nruntime2 <- end2 - start2\n```\n:::\n\n\nNow, let's compare both algorithms in terms of runtime and results.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Load `DupGen_finder.pl` results\nfiles <- c(\n \"Ath.wgd.pairs\", \"Ath.tandem.pairs\", \"Ath.proximal.pairs\",\n \"Ath.transposed.pairs\", \"Ath.dispersed.pairs\"\n)\nfiles <- file.path(tempdir(), \"results\", files)\nnames(files) <- c(\"SD\", \"TD\", \"PD\", \"TRD\", \"DD\")\ndups2 <- Reduce(rbind, lapply(seq_along(files), function(x) {\n d <- read_tsv(files[x], show_col_types = FALSE) |>\n mutate(type = names(files)[x]) |>\n select(1, 3, type) |>\n as.data.frame()\n \n names(d)[c(1,2)] <- c(\"dup1\", \"dup2\")\n \n return(d)\n}))\n\n# Compare runtime\ncomp_runtime <- data.frame(doubletrouble = runtime1, DupGen_finder = runtime2)\n\n# Compare number of gene pairs per category\ncomp_results <- inner_join(\n count(dups1$arabidopsis.thaliana, type) |> \n dplyr::rename(n_doubletrouble = n),\n count(dups2, type) |>\n dplyr::rename(n_DupGen_finder = n),\n by = \"type\"\n)\ncomp_results <- bind_rows(\n comp_results, \n data.frame(\n type = \"Total\", \n n_doubletrouble = sum(comp_results$n_doubletrouble),\n n_DupGen_finder = sum(comp_results$n_DupGen_finder)\n )\n)\n```\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nlist(runtime = comp_runtime, results = comp_results)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n$runtime\n doubletrouble DupGen_finder\n1 3.71474 secs 3.373682 secs\n\n$results\n type n_doubletrouble n_DupGen_finder\n1 SD 4329 4355\n2 TD 2075 2131\n3 PD 2789 896\n4 TRD 6703 5941\n5 DD 31589 17834\n6 Total 47485 31157\n```\n\n\n:::\n:::\n\n\nOverall, results are similar, but there are important differences. In terms\nof runtime, there is no significant difference (this is the runtime of a single\nrun, so there's some stochasticity). In terms of results, the numbers\nof SD, TD, and TRD pairs are similar, but there are more pronounced differences\nfor PD and DD pairs. In particular, the total number of paralogous pairs differs\nbetween algorithms. When we remove rows from the DIAMOND output based on \nthe E-value threshold and remove self (e.g., \"gene1-gene1\") and redundant \nhits (e.g., \"gene1-gene2\" and \"gene2-gene1\"), we get the following total \nnumber of pairs:\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Get total number of paralog pairs from DIAMOND output\nevalue <- 1e-10\ndmd <- diamond_intra$arabidopsis.thaliana_arabidopsis.thaliana\n\nall_paralogs <- lapply(list(dmd), function(x) {\n fpair <- x[x$evalue <= evalue, c(1, 2)]\n fpair <- fpair[fpair[, 1] != fpair[, 2], ]\n fpair <- fpair[!duplicated(t(apply(fpair, 1, sort))), ]\n names(fpair) <- c(\"dup1\", \"dup2\")\n return(fpair)\n})[[1]]\n```\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnrow(all_paralogs)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 47485\n```\n\n\n:::\n:::\n\n\nThe total number of paralogous pairs in the DIAMOND output is the same as\nsum of classes for __doubletrouble__, but greater than the sum of classes\nfor __DupGen_finder__, indicating that the latter probably does some additional\n(undocumented) filtering before classifying gene pairs, or it could be a bug.\n\nFinally, since the number of PD pairs identified by __doubletrouble__ is much\ngreater than the number of PD pairs identified by __DupGen_finder__, we will \nexplore a few of these PD pairs so check whether __doubletrouble__ \nmisclassified them.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npd1 <- dups1$arabidopsis.thaliana |>\n filter(type == \"PD\")\n\npd2 <- dups2 |>\n filter(type == \"PD\")\n\n# Get all pairs in `pd1` and `pd2`\np1 <- t(apply(pd1[, 1:2], 1, sort)) |>\n as.data.frame() |>\n mutate(V1 = str_replace_all(V1, \"^ara_\", \"\")) |>\n mutate(V2 = str_replace_all(V2, \"^ara_\", \"\")) |>\n mutate(pair_string = str_c(V1, V2, sep = \"-\")) |>\n pull(pair_string)\n\np2 <- t(apply(pd2[, 1:2], 1, sort)) |>\n as.data.frame() |>\n mutate(pair_string = str_c(V1, V2, sep = \"-\")) |>\n pull(pair_string)\n\n# Sample PD pairs from doubletrouble that are not PD pairs in DupGen_finder\nexamples <- p1[!p1 %in% p2] |> head(n = 5)\n\n# Check whether they are PD pairs or not\nath_annot <- pdata$annotation$arabidopsis.thaliana\nrange_examples <- lapply(examples, function(x) {\n g1 <- paste0(\"ara_\", strsplit(x, \"-\")[[1]][1])\n g2 <- paste0(\"ara_\", strsplit(x, \"-\")[[1]][2])\n ranges <- ath_annot[ath_annot$gene %in% c(g1, g2)]\n return(ranges)\n})\n```\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nrange_examples\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[[1]]\nGRanges object with 2 ranges and 1 metadata column:\n seqnames ranges strand | gene\n | \n 128 ara_1 428650-430720 - | ara_AT1G02220\n 130 ara_1 437860-439559 - | ara_AT1G02250\n -------\n seqinfo: 7 sequences from an unspecified genome; no seqlengths\n\n[[2]]\nGRanges object with 2 ranges and 1 metadata column:\n seqnames ranges strand | gene\n | \n 127 ara_1 427548-427811 - | ara_AT1G02210\n 130 ara_1 437860-439559 - | ara_AT1G02250\n -------\n seqinfo: 7 sequences from an unspecified genome; no seqlengths\n\n[[3]]\nGRanges object with 2 ranges and 1 metadata column:\n seqnames ranges strand | gene\n | \n 17117 ara_4 656407-659178 - | ara_AT4G01520\n 17120 ara_4 673862-676445 - | ara_AT4G01550\n -------\n seqinfo: 7 sequences from an unspecified genome; no seqlengths\n\n[[4]]\nGRanges object with 2 ranges and 1 metadata column:\n seqnames ranges strand | gene\n | \n 15329 ara_3 17882465-17884515 + | ara_AT3G48290\n 15332 ara_3 17887996-17889942 + | ara_AT3G48310\n -------\n seqinfo: 7 sequences from an unspecified genome; no seqlengths\n\n[[5]]\nGRanges object with 2 ranges and 1 metadata column:\n seqnames ranges strand | gene\n | \n 25845 ara_5 21378702-21379744 + | ara_AT5G52720\n 25847 ara_5 21382482-21383432 + | ara_AT5G52740\n -------\n seqinfo: 7 sequences from an unspecified genome; no seqlengths\n```\n\n\n:::\n:::\n\n\nIn these first five examples of pairs that are classified as PD pairs by\n__doubletrouble__, but not by __DupGen_finder__, we can see based on the\nnumbers in row names that they are indeed very close, separated by only a few \ngenes (<10). Thus, they are true PD pairs that __DupGen_finder__ failed \nto classify as PD pairs, or removed in their undocumented filtering (described\nabove).\n\n## Session info {.unnumbered}\n\nThis document was created under the following conditions:\n\n\n::: {.cell}\n::: {.cell-output .cell-output-stdout}\n\n```\n─ Session info ───────────────────────────────────────────────────────────────\n setting value\n version R version 4.3.2 (2023-10-31)\n os Ubuntu 22.04.3 LTS\n system x86_64, linux-gnu\n ui X11\n language (EN)\n collate en_US.UTF-8\n ctype en_US.UTF-8\n tz Europe/Brussels\n date 2024-04-18\n pandoc 3.1.1 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)\n\n─ Packages ───────────────────────────────────────────────────────────────────\n package * version date (UTC) lib source\n abind 1.4-5 2016-07-21 [1] CRAN (R 4.3.2)\n ade4 1.7-22 2023-02-06 [1] CRAN (R 4.3.2)\n AnnotationDbi 1.64.1 2023-11-03 [1] Bioconductor\n ape 5.7-1 2023-03-13 [1] CRAN (R 4.3.2)\n Biobase 2.62.0 2023-10-24 [1] Bioconductor\n BiocFileCache 2.10.1 2023-10-26 [1] Bioconductor\n BiocGenerics 0.48.1 2023-11-01 [1] Bioconductor\n BiocIO 1.12.0 2023-10-24 [1] Bioconductor\n BiocParallel 1.37.0 2024-01-19 [1] Github (Bioconductor/BiocParallel@79a1b2d)\n biomaRt 2.58.2 2024-01-30 [1] Bioconductor 3.18 (R 4.3.2)\n Biostrings 2.70.2 2024-01-28 [1] Bioconductor 3.18 (R 4.3.2)\n bit 4.0.5 2022-11-15 [1] CRAN (R 4.3.2)\n bit64 4.0.5 2020-08-30 [1] CRAN (R 4.3.2)\n bitops 1.0-7 2021-04-24 [1] CRAN (R 4.3.2)\n blob 1.2.4 2023-03-17 [1] CRAN (R 4.3.2)\n cachem 1.0.8 2023-05-01 [1] CRAN (R 4.3.2)\n cli 3.6.2 2023-12-11 [1] CRAN (R 4.3.2)\n coda 0.19-4.1 2024-01-31 [1] CRAN (R 4.3.2)\n codetools 0.2-19 2023-02-01 [4] CRAN (R 4.2.2)\n colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.3.2)\n crayon 1.5.2 2022-09-29 [1] CRAN (R 4.3.2)\n curl 5.2.0 2023-12-08 [1] CRAN (R 4.3.2)\n DBI 1.2.1 2024-01-12 [1] CRAN (R 4.3.2)\n dbplyr 2.4.0 2023-10-26 [1] CRAN (R 4.3.2)\n DelayedArray 0.28.0 2023-10-24 [1] Bioconductor\n digest 0.6.34 2024-01-11 [1] CRAN (R 4.3.2)\n doParallel 1.0.17 2022-02-07 [1] CRAN (R 4.3.2)\n doubletrouble * 1.3.4 2024-02-05 [1] Bioconductor\n dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.2)\n evaluate 0.23 2023-11-01 [1] CRAN (R 4.3.2)\n fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.2)\n fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.2)\n filelock 1.0.3 2023-12-11 [1] CRAN (R 4.3.2)\n forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.2)\n foreach 1.5.2 2022-02-02 [1] CRAN (R 4.3.2)\n generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.2)\n GenomeInfoDb 1.38.6 2024-02-08 [1] Bioconductor 3.18 (R 4.3.2)\n GenomeInfoDbData 1.2.11 2023-12-21 [1] Bioconductor\n GenomicAlignments 1.38.2 2024-01-16 [1] Bioconductor 3.18 (R 4.3.2)\n GenomicFeatures 1.54.3 2024-01-31 [1] Bioconductor 3.18 (R 4.3.2)\n GenomicRanges 1.54.1 2023-10-29 [1] Bioconductor\n ggnetwork 0.5.13 2024-02-14 [1] CRAN (R 4.3.2)\n ggplot2 * 3.5.0 2024-02-23 [1] CRAN (R 4.3.2)\n glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.2)\n gtable 0.3.4 2023-08-21 [1] CRAN (R 4.3.2)\n here * 1.0.1 2020-12-13 [1] CRAN (R 4.3.2)\n hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.2)\n htmltools 0.5.7 2023-11-03 [1] CRAN (R 4.3.2)\n htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.3.2)\n httr 1.4.7 2023-08-15 [1] CRAN (R 4.3.2)\n igraph 2.0.1.1 2024-01-30 [1] CRAN (R 4.3.2)\n intergraph 2.0-4 2024-02-01 [1] CRAN (R 4.3.2)\n IRanges 2.36.0 2023-10-24 [1] Bioconductor\n iterators 1.0.14 2022-02-05 [1] CRAN (R 4.3.2)\n jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.3.2)\n KEGGREST 1.42.0 2023-10-24 [1] Bioconductor\n knitr 1.45 2023-10-30 [1] CRAN (R 4.3.2)\n lattice 0.22-5 2023-10-24 [4] CRAN (R 4.3.1)\n lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.2)\n lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.3.2)\n magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.2)\n MASS 7.3-60 2023-05-04 [4] CRAN (R 4.3.1)\n Matrix 1.6-3 2023-11-14 [4] CRAN (R 4.3.2)\n MatrixGenerics 1.14.0 2023-10-24 [1] Bioconductor\n matrixStats 1.2.0 2023-12-11 [1] CRAN (R 4.3.2)\n mclust 6.0.1 2023-11-15 [1] CRAN (R 4.3.2)\n memoise 2.0.1 2021-11-26 [1] CRAN (R 4.3.2)\n MSA2dist 1.7.3 2024-03-14 [1] gitlab (mpievolbio-it/msa2dist@7558ab2)\n munsell 0.5.0 2018-06-12 [1] CRAN (R 4.3.2)\n network 1.18.2 2023-12-05 [1] CRAN (R 4.3.2)\n networkD3 0.4 2017-03-18 [1] CRAN (R 4.3.2)\n nlme 3.1-163 2023-08-09 [4] CRAN (R 4.3.1)\n patchwork * 1.2.0 2024-01-08 [1] CRAN (R 4.3.2)\n pheatmap 1.0.12 2019-01-04 [1] CRAN (R 4.3.2)\n pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.2)\n pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.2)\n png 0.1-8 2022-11-29 [1] CRAN (R 4.3.2)\n prettyunits 1.2.0 2023-09-24 [1] CRAN (R 4.3.2)\n progress 1.2.3 2023-12-06 [1] CRAN (R 4.3.2)\n purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.3.2)\n R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.2)\n rappdirs 0.3.3 2021-01-31 [1] CRAN (R 4.3.2)\n RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.3.2)\n Rcpp 1.0.12 2024-01-09 [1] CRAN (R 4.3.2)\n RCurl 1.98-1.14 2024-01-09 [1] CRAN (R 4.3.2)\n readr * 2.1.5 2024-01-10 [1] CRAN (R 4.3.2)\n restfulr 0.0.15 2022-06-16 [1] CRAN (R 4.3.2)\n rjson 0.2.21 2022-01-09 [1] CRAN (R 4.3.2)\n rlang 1.1.3 2024-01-10 [1] CRAN (R 4.3.2)\n rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.3.2)\n rprojroot 2.0.4 2023-11-05 [1] CRAN (R 4.3.2)\n Rsamtools 2.18.0 2023-10-24 [1] Bioconductor\n RSQLite 2.3.5 2024-01-21 [1] CRAN (R 4.3.2)\n rstudioapi 0.15.0 2023-07-07 [1] CRAN (R 4.3.2)\n rtracklayer 1.62.0 2023-10-24 [1] Bioconductor\n S4Arrays 1.2.0 2023-10-24 [1] Bioconductor\n S4Vectors 0.40.2 2023-11-23 [1] Bioconductor 3.18 (R 4.3.2)\n scales 1.3.0 2023-11-28 [1] CRAN (R 4.3.2)\n seqinr 4.2-36 2023-12-08 [1] CRAN (R 4.3.2)\n sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.2)\n SparseArray 1.2.4 2024-02-11 [1] Bioconductor 3.18 (R 4.3.2)\n statnet.common 4.9.0 2023-05-24 [1] CRAN (R 4.3.2)\n stringi 1.8.3 2023-12-11 [1] CRAN (R 4.3.2)\n stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.3.2)\n SummarizedExperiment 1.32.0 2023-10-24 [1] Bioconductor\n syntenet 1.4.0 2023-10-24 [1] Bioconductor\n tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.2)\n tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.3.2)\n tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.3.2)\n tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.2)\n timechange 0.3.0 2024-01-18 [1] CRAN (R 4.3.2)\n tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.2)\n utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.2)\n vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.2)\n withr 3.0.0 2024-01-16 [1] CRAN (R 4.3.2)\n xfun 0.42 2024-02-08 [1] CRAN (R 4.3.2)\n XML 3.99-0.16.1 2024-01-22 [1] CRAN (R 4.3.2)\n xml2 1.3.6 2023-12-04 [1] CRAN (R 4.3.2)\n XVector 0.42.0 2023-10-24 [1] Bioconductor\n yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.2)\n zlibbioc 1.48.0 2023-10-24 [1] Bioconductor\n\n [1] /home/faalm/R/x86_64-pc-linux-gnu-library/4.3\n [2] /usr/local/lib/R/site-library\n [3] /usr/lib/R/site-library\n [4] /usr/lib/R/library\n\n──────────────────────────────────────────────────────────────────────────────\n```\n\n\n:::\n:::\n", + "markdown": "---\nexecute: \n echo: true\n eval: false\n warning: false\n---\n\n\n# Runtime benchmark\n\nHere, we will perform a runtime benchmark for functions related to duplicate \nclassification and substitution rates calculation using model organisms.\n\nTo start, let's load the required data and packages.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nset.seed(123) # for reproducibility\n\n# Load required packages\nlibrary(doubletrouble)\nlibrary(here)\nlibrary(tidyverse)\nlibrary(patchwork)\n\nsource(here(\"code\", \"utils.R\"))\n\n# Load sample metadata for Ensembl instances\nload(here(\"products\", \"result_files\", \"metadata_all.rda\"))\n```\n:::\n\n::: {.cell}\n\n:::\n\n\n## Benchmark 1: `classify_gene_pairs()`\n\nHere, we will benchmark the performance of `classify_gene_pairs()`\nwith model organisms.\n\nFirst, let's get the genome and annotation data.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Create a data frame with names of model species and their Ensembl instances\nmodel_species <- data.frame(\n species = c(\n \"arabidopsis_thaliana\", \"caenorhabditis_elegans\", \n \"homo_sapiens\", \"saccharomyces_cerevisiae\",\n \"drosophila_melanogaster\", \"danio_rerio\"\n ),\n instance = c(\n \"plants\", \"metazoa\", \"ensembl\", \"fungi\", \"metazoa\", \"ensembl\"\n )\n)\n\n# For each organism, download data, and identify and classify duplicates\nmodel_duplicates <- lapply(seq_len(nrow(model_species)), function(x) {\n \n species <- model_species$species[x]\n instance <- model_species$instance[x]\n \n # Get annotation\n annot <- get_annotation(model_species[x, ], instance)\n \n # Get proteome and keep only primary transcripts\n seq <- get_proteomes(model_species[x, ], instance)\n seq <- filter_sequences(seq, annot)\n \n # Process data\n pdata <- syntenet::process_input(seq, annot, filter_annotation = TRUE)\n \n # Perform DIAMOND search\n outdir <- file.path(tempdir(), paste0(species, \"_intra\"))\n diamond <- syntenet::run_diamond(\n seq = pdata$seq,\n compare = \"intraspecies\", \n outdir = outdir,\n threads = 4,\n ... = \"--sensitive\"\n )\n \n fs::dir_delete(outdir)\n \n # Classify duplicates - standard mode\n start <- Sys.time()\n duplicate_pairs <- classify_gene_pairs(\n blast_list = diamond,\n annotation = pdata$annotation,\n scheme = \"standard\"\n )[[1]]\n end <- Sys.time()\n runtime <- end - start\n \n return(runtime)\n})\nnames(model_duplicates) <- gsub(\"_\", \" \", str_to_title(model_species$species))\n\n# Summarize results in a table\nbenchmark_classification <- data.frame(\n species = names(model_duplicates),\n time_seconds = as.numeric(unlist(model_duplicates))\n)\n\n# Save results\nsave(\n benchmark_classification, compress = \"xz\",\n file = here(\"products\", \"result_files\", \"benchmark_classification.rda\")\n)\n```\n:::\n\n\n## Benchmark 2: `pairs2kaks()`\n\nNext, we will benchmark the performance of `pairs2kaks()` for\nduplicate pairs in the *Saccharomyces cerevisiae* genome.\nWe will do it using a single thread, and using parallelization (with\n4 and 8 threads).\n\nFirst of all, let's get the required data for `pairs2kaks()`.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Load duplicate pairs for S. cerevisiae\nload(here(\"products\", \"result_files\", \"fungi_duplicates.rda\"))\nscerevisiae_pairs <- fungi_duplicates[\"saccharomyces_cerevisiae\"]\n\n# Get CDS for S. cerevisiae\nscerevisiae_cds <- get_cds_ensembl(\"saccharomyces_cerevisiae\", \"fungi\")\n```\n:::\n\n\nNow, we can do the benchmark. \n\n\n::: {.cell}\n\n```{.r .cell-code}\n# 1 thread\nstart <- Sys.time()\nkaks <- pairs2kaks(\n scerevisiae_pairs, \n scerevisiae_cds,\n threads = 1\n)\nend <- Sys.time()\nruntime1 <- end - start\n\n# 4 threads\nstart <- Sys.time()\nkaks <- pairs2kaks(\n scerevisiae_pairs, \n scerevisiae_cds,\n threads = 4\n)\nend <- Sys.time()\nruntime4 <- end - start\n\n# 8 threads\nstart <- Sys.time()\nkaks <- pairs2kaks(\n scerevisiae_pairs, \n scerevisiae_cds,\n threads = 8\n)\nend <- Sys.time()\nruntime8 <- end - start\n\n# Summarize results in a table\nbenchmark_kaks <- data.frame(\n Threads = factor(c(1, 4, 8)),\n Time_minutes = as.numeric(c(runtime1, runtime4, runtime8))\n) |>\n dplyr::mutate(\n Pairs_per_minute = nrow(scerevisiae_pairs[[1]]) / Time_minutes,\n Pairs_per_second = nrow(scerevisiae_pairs[[1]]) / (Time_minutes * 60)\n )\n\nsave(\n benchmark_kaks, compress = \"xz\",\n file = here(\"products\", \"result_files\", \"benchmark_kaks.rda\")\n)\n```\n:::\n\n\n## Benchmark 3: **doubletrouble** vs __DupGen_finder__\n\nHere, we will classify duplicate pairs in the *A. thaliana* genome \nusing __doubletrouble__ and DupGen_finder to assess if they produce the same \nresults, and compare their runtimes. \n\nFirst, let's get all data we need (proteomes, annotation, and DIAMOND tables).\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Get annotation\nsmeta <- data.frame(\n species = c(\"arabidopsis_thaliana\", \"amborella_trichopoda\"),\n instance = \"plants\"\n)\nannot <- get_annotation(smeta, \"plants\")\n\n# Get proteome and keep only primary transcripts\nseq <- get_proteomes(smeta, \"plants\")\nseq <- filter_sequences(seq, annot)\n \n# Process data\npdata <- syntenet::process_input(seq, annot, filter_annotation = TRUE)\n \n# Run intraspecies DIAMOND search\noutdir <- file.path(tempdir(), \"benchmark3_intra\")\ndiamond_intra <- syntenet::run_diamond(\n seq = pdata$seq,\n compare = \"intraspecies\", \n outdir = outdir,\n threads = 4,\n ... = \"--sensitive\"\n)\nfs::dir_delete(outdir)\n\n# Run interspecies DIAMOND search\noutdir2 <- file.path(tempdir(), \"benchmark3_inter\")\ncompare_df <- data.frame(\n query = \"arabidopsis.thaliana\", target = \"amborella.trichopoda\"\n)\n\ndiamond_inter <- syntenet::run_diamond(\n seq = pdata$seq,\n compare = compare_df, \n outdir = outdir2,\n threads = 4,\n ... = \"--sensitive\"\n)\nfs::dir_delete(outdir2)\n```\n:::\n\n\nNow, let's classify duplicates with __doubletrouble__.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Classify duplicates with doubletrouble\nstart1 <- Sys.time()\ndups1 <- classify_gene_pairs(\n blast_list = diamond_intra[2],\n annotation = pdata$annotation,\n blast_inter = diamond_inter,\n scheme = \"extended\",\n collinearity_dir = here(\"products\")\n)\nend1 <- Sys.time()\nruntime1 <- end1 - start1\n```\n:::\n\n\nNext, we will classify duplicates with DupGen_finder. For that, we will first\nexport input data in the required format.\n\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\n# Export data\n## .blast files\nb1 <- diamond_intra$arabidopsis.thaliana_arabidopsis.thaliana |>\n mutate(\n query = str_replace_all(query, \"^ara_\", \"\"),\n db = str_replace_all(db, \"^ara_\", \"\")\n )\n\nb2 <- diamond_inter$arabidopsis.thaliana_amborella.trichopoda |>\n mutate(\n query = str_replace_all(query, \"^ara_\", \"\"),\n db = str_replace_all(db, \"^amb_\", \"\")\n )\n\nwrite_tsv(b1, file = file.path(tempdir(), \"Ath.blast\"), col_names = FALSE)\nwrite_tsv(b2, file = file.path(tempdir(), \"Ath_Atr.blast\"), col_names = FALSE)\n\n## .gff files\ngff1 <- pdata$annotation$arabidopsis.thaliana |>\n as.data.frame() |>\n mutate(gene = str_replace_all(gene, \"^ara_\", \"\")) |>\n mutate(seqnames = str_replace_all(seqnames, \"ara_\", \"ara-\")) |>\n dplyr::select(seqnames, gene, start, end)\n\ngff2 <- pdata$annotation$amborella.trichopoda |>\n as.data.frame() |>\n mutate(gene = str_replace_all(gene, \"^amb_\", \"\")) |>\n mutate(seqnames = str_replace_all(seqnames, \"^amb_\", \"amb-\")) |>\n dplyr::select(seqnames, gene, start, end)\n\ngff2 <- bind_rows(gff1, gff2)\n\nwrite_tsv(gff1, file = file.path(tempdir(), \"Ath.gff\"), col_names = FALSE)\nwrite_tsv(gff2, file = file.path(tempdir(), \"Ath_Atr.gff\"), col_names = FALSE)\n\n# Classify duplicates with DupGen_finder\nargs = c(\n \"-i\", tempdir(), \"-t Ath -c Atr\",\n \"-o\", file.path(tempdir(), \"results\"),\n \"-e 1e-10\"\n)\n\nstart2 <- Sys.time()\nsystem2(\"DupGen_finder.pl\", args = args)\nend2 <- Sys.time()\nruntime2 <- end2 - start2\n```\n:::\n\n\nNow, let's compare both algorithms in terms of runtime and results.\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Load `DupGen_finder.pl` results\nfiles <- c(\n \"Ath.wgd.pairs\", \"Ath.tandem.pairs\", \"Ath.proximal.pairs\",\n \"Ath.transposed.pairs\", \"Ath.dispersed.pairs\"\n)\nfiles <- file.path(tempdir(), \"results\", files)\nnames(files) <- c(\"SD\", \"TD\", \"PD\", \"TRD\", \"DD\")\ndups2 <- Reduce(rbind, lapply(seq_along(files), function(x) {\n d <- read_tsv(files[x], show_col_types = FALSE) |>\n mutate(type = names(files)[x]) |>\n select(1, 3, type) |>\n as.data.frame()\n \n names(d)[c(1,2)] <- c(\"dup1\", \"dup2\")\n \n return(d)\n}))\n\n# Compare runtime\ncomp_runtime <- data.frame(doubletrouble = runtime1, DupGen_finder = runtime2)\n\n# Compare number of gene pairs per category\ncomp_results <- inner_join(\n count(dups1$arabidopsis.thaliana, type) |> \n dplyr::rename(n_doubletrouble = n),\n count(dups2, type) |>\n dplyr::rename(n_DupGen_finder = n),\n by = \"type\"\n)\ncomp_results <- bind_rows(\n comp_results, \n data.frame(\n type = \"Total\", \n n_doubletrouble = sum(comp_results$n_doubletrouble),\n n_DupGen_finder = sum(comp_results$n_DupGen_finder)\n )\n)\n```\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nlist(runtime = comp_runtime, results = comp_results)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n$runtime\n doubletrouble DupGen_finder\n1 3.71474 secs 3.373682 secs\n\n$results\n type n_doubletrouble n_DupGen_finder\n1 SD 4329 4355\n2 TD 2075 2131\n3 PD 2789 896\n4 TRD 6703 5941\n5 DD 31589 17834\n6 Total 47485 31157\n```\n\n\n:::\n:::\n\n\nOverall, results are similar, but there are important differences. In terms\nof runtime, there is no significant difference (this is the runtime of a single\nrun, so there's some stochasticity). In terms of results, the numbers\nof SD, TD, and TRD pairs are similar, but there are more pronounced differences\nfor PD and DD pairs. In particular, the total number of paralogous pairs differs\nbetween algorithms. When we remove rows from the DIAMOND output based on \nthe E-value threshold and remove self (e.g., \"gene1-gene1\") and redundant \nhits (e.g., \"gene1-gene2\" and \"gene2-gene1\"), we get the following total \nnumber of pairs:\n\n\n::: {.cell}\n\n```{.r .cell-code}\n# Get total number of paralog pairs from DIAMOND output\nevalue <- 1e-10\ndmd <- diamond_intra$arabidopsis.thaliana_arabidopsis.thaliana\n\nall_paralogs <- lapply(list(dmd), function(x) {\n fpair <- x[x$evalue <= evalue, c(1, 2)]\n fpair <- fpair[fpair[, 1] != fpair[, 2], ]\n fpair <- fpair[!duplicated(t(apply(fpair, 1, sort))), ]\n names(fpair) <- c(\"dup1\", \"dup2\")\n return(fpair)\n})[[1]]\n```\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nnrow(all_paralogs)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[1] 47485\n```\n\n\n:::\n:::\n\n\nThe total number of paralogous pairs in the DIAMOND output is the same as\nsum of classes for __doubletrouble__, but greater than the sum of classes\nfor __DupGen_finder__, indicating that the latter probably does some additional\n(undocumented) filtering before classifying gene pairs, or it could be a bug.\n\nFinally, since the number of PD pairs identified by __doubletrouble__ is much\ngreater than the number of PD pairs identified by __DupGen_finder__, we will \nexplore a few of these PD pairs so check whether __doubletrouble__ \nmisclassified them.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npd1 <- dups1$arabidopsis.thaliana |>\n filter(type == \"PD\")\n\npd2 <- dups2 |>\n filter(type == \"PD\")\n\n# Get all pairs in `pd1` and `pd2`\np1 <- t(apply(pd1[, 1:2], 1, sort)) |>\n as.data.frame() |>\n mutate(V1 = str_replace_all(V1, \"^ara_\", \"\")) |>\n mutate(V2 = str_replace_all(V2, \"^ara_\", \"\")) |>\n mutate(pair_string = str_c(V1, V2, sep = \"-\")) |>\n pull(pair_string)\n\np2 <- t(apply(pd2[, 1:2], 1, sort)) |>\n as.data.frame() |>\n mutate(pair_string = str_c(V1, V2, sep = \"-\")) |>\n pull(pair_string)\n\n# Sample PD pairs from doubletrouble that are not PD pairs in DupGen_finder\nexamples <- p1[!p1 %in% p2] |> head(n = 5)\n\n# Check whether they are PD pairs or not\nath_annot <- pdata$annotation$arabidopsis.thaliana\nrange_examples <- lapply(examples, function(x) {\n g1 <- paste0(\"ara_\", strsplit(x, \"-\")[[1]][1])\n g2 <- paste0(\"ara_\", strsplit(x, \"-\")[[1]][2])\n ranges <- ath_annot[ath_annot$gene %in% c(g1, g2)]\n return(ranges)\n})\n```\n:::\n\n::: {.cell}\n\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nrange_examples\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n[[1]]\nGRanges object with 2 ranges and 1 metadata column:\n seqnames ranges strand | gene\n | \n 128 ara_1 428650-430720 - | ara_AT1G02220\n 130 ara_1 437860-439559 - | ara_AT1G02250\n -------\n seqinfo: 7 sequences from an unspecified genome; no seqlengths\n\n[[2]]\nGRanges object with 2 ranges and 1 metadata column:\n seqnames ranges strand | gene\n | \n 127 ara_1 427548-427811 - | ara_AT1G02210\n 130 ara_1 437860-439559 - | ara_AT1G02250\n -------\n seqinfo: 7 sequences from an unspecified genome; no seqlengths\n\n[[3]]\nGRanges object with 2 ranges and 1 metadata column:\n seqnames ranges strand | gene\n | \n 17117 ara_4 656407-659178 - | ara_AT4G01520\n 17120 ara_4 673862-676445 - | ara_AT4G01550\n -------\n seqinfo: 7 sequences from an unspecified genome; no seqlengths\n\n[[4]]\nGRanges object with 2 ranges and 1 metadata column:\n seqnames ranges strand | gene\n | \n 15329 ara_3 17882465-17884515 + | ara_AT3G48290\n 15332 ara_3 17887996-17889942 + | ara_AT3G48310\n -------\n seqinfo: 7 sequences from an unspecified genome; no seqlengths\n\n[[5]]\nGRanges object with 2 ranges and 1 metadata column:\n seqnames ranges strand | gene\n | \n 25845 ara_5 21378702-21379744 + | ara_AT5G52720\n 25847 ara_5 21382482-21383432 + | ara_AT5G52740\n -------\n seqinfo: 7 sequences from an unspecified genome; no seqlengths\n```\n\n\n:::\n:::\n\n\nIn these first five examples of pairs that are classified as PD pairs by\n__doubletrouble__, but not by __DupGen_finder__, we can see based on the\nnumbers in row names that they are indeed very close, separated by only a few \ngenes (<10). Thus, they are true PD pairs that __DupGen_finder__ failed \nto classify as PD pairs, or removed in their undocumented filtering (described\nabove).\n\n## Session info {.unnumbered}\n\nThis document was created under the following conditions:\n\n\n::: {.cell}\n::: {.cell-output .cell-output-stdout}\n\n```\n─ Session info ───────────────────────────────────────────────────────────────\n setting value\n version R version 4.4.1 (2024-06-14)\n os Ubuntu 22.04.4 LTS\n system x86_64, linux-gnu\n ui X11\n language (EN)\n collate en_US.UTF-8\n ctype en_US.UTF-8\n tz Europe/Brussels\n date 2024-07-29\n pandoc 3.1.1 @ /usr/lib/rstudio/resources/app/bin/quarto/bin/tools/ (via rmarkdown)\n\n─ Packages ───────────────────────────────────────────────────────────────────\n package * version date (UTC) lib source\n abind 1.4-5 2016-07-21 [1] CRAN (R 4.4.1)\n ade4 1.7-22 2023-02-06 [1] CRAN (R 4.4.1)\n AnnotationDbi 1.66.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.1)\n ape 5.8 2024-04-11 [1] CRAN (R 4.4.1)\n Biobase 2.64.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n BiocGenerics 0.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n BiocIO 1.14.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n BiocParallel 1.38.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n Biostrings 2.72.1 2024-06-02 [1] Bioconductor 3.19 (R 4.4.1)\n bit 4.0.5 2022-11-15 [1] CRAN (R 4.4.1)\n bit64 4.0.5 2020-08-30 [1] CRAN (R 4.4.1)\n bitops 1.0-7 2021-04-24 [1] CRAN (R 4.4.1)\n blob 1.2.4 2023-03-17 [1] CRAN (R 4.4.1)\n cachem 1.1.0 2024-05-16 [1] CRAN (R 4.4.1)\n cli 3.6.3 2024-06-21 [1] CRAN (R 4.4.1)\n coda 0.19-4.1 2024-01-31 [1] CRAN (R 4.4.1)\n codetools 0.2-20 2024-03-31 [1] CRAN (R 4.4.1)\n colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.4.1)\n crayon 1.5.3 2024-06-20 [1] CRAN (R 4.4.1)\n curl 5.2.1 2024-03-01 [1] CRAN (R 4.4.1)\n DBI 1.2.3 2024-06-02 [1] CRAN (R 4.4.1)\n DelayedArray 0.30.1 2024-05-07 [1] Bioconductor 3.19 (R 4.4.1)\n digest 0.6.36 2024-06-23 [1] CRAN (R 4.4.1)\n doParallel 1.0.17 2022-02-07 [1] CRAN (R 4.4.1)\n doubletrouble * 1.4.1 2024-07-25 [1] Bioconductor\n dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.4.1)\n evaluate 0.24.0 2024-06-10 [1] CRAN (R 4.4.1)\n fansi 1.0.6 2023-12-08 [1] CRAN (R 4.4.1)\n fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.4.1)\n forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.4.1)\n foreach 1.5.2 2022-02-02 [1] CRAN (R 4.4.1)\n generics 0.1.3 2022-07-05 [1] CRAN (R 4.4.1)\n GenomeInfoDb 1.40.1 2024-05-24 [1] Bioconductor 3.19 (R 4.4.1)\n GenomeInfoDbData 1.2.12 2024-07-24 [1] Bioconductor\n GenomicAlignments 1.40.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n GenomicFeatures 1.56.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n GenomicRanges 1.56.1 2024-06-12 [1] Bioconductor 3.19 (R 4.4.1)\n ggnetwork 0.5.13 2024-02-14 [1] CRAN (R 4.4.1)\n ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.4.1)\n glue 1.7.0 2024-01-09 [1] CRAN (R 4.4.1)\n gtable 0.3.5 2024-04-22 [1] CRAN (R 4.4.1)\n here * 1.0.1 2020-12-13 [1] CRAN (R 4.4.1)\n hms 1.1.3 2023-03-21 [1] CRAN (R 4.4.1)\n htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.4.1)\n htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.4.1)\n httr 1.4.7 2023-08-15 [1] CRAN (R 4.4.1)\n igraph 2.0.3 2024-03-13 [1] CRAN (R 4.4.1)\n intergraph 2.0-4 2024-02-01 [1] CRAN (R 4.4.1)\n IRanges 2.38.1 2024-07-03 [1] Bioconductor 3.19 (R 4.4.1)\n iterators 1.0.14 2022-02-05 [1] CRAN (R 4.4.1)\n jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.4.1)\n KEGGREST 1.44.1 2024-06-19 [1] Bioconductor 3.19 (R 4.4.1)\n knitr 1.48 2024-07-07 [1] CRAN (R 4.4.1)\n lattice 0.22-6 2024-03-20 [1] CRAN (R 4.4.1)\n lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.4.1)\n lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.4.1)\n magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.4.1)\n MASS 7.3-61 2024-06-13 [1] CRAN (R 4.4.1)\n Matrix 1.7-0 2024-04-26 [1] CRAN (R 4.4.1)\n MatrixGenerics 1.16.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n matrixStats 1.3.0 2024-04-11 [1] CRAN (R 4.4.1)\n mclust 6.1.1 2024-04-29 [1] CRAN (R 4.4.1)\n memoise 2.0.1 2021-11-26 [1] CRAN (R 4.4.1)\n MSA2dist 1.8.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n munsell 0.5.1 2024-04-01 [1] CRAN (R 4.4.1)\n network 1.18.2 2023-12-05 [1] CRAN (R 4.4.1)\n networkD3 0.4 2017-03-18 [1] CRAN (R 4.4.1)\n nlme 3.1-165 2024-06-06 [1] CRAN (R 4.4.1)\n patchwork * 1.2.0 2024-01-08 [1] CRAN (R 4.4.1)\n pheatmap 1.0.12 2019-01-04 [1] CRAN (R 4.4.1)\n pillar 1.9.0 2023-03-22 [1] CRAN (R 4.4.1)\n pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.4.1)\n png 0.1-8 2022-11-29 [1] CRAN (R 4.4.1)\n purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.4.1)\n pwalign 1.0.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n R6 2.5.1 2021-08-19 [1] CRAN (R 4.4.1)\n RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.4.1)\n Rcpp 1.0.13 2024-07-17 [1] CRAN (R 4.4.1)\n RCurl 1.98-1.16 2024-07-11 [1] CRAN (R 4.4.1)\n readr * 2.1.5 2024-01-10 [1] CRAN (R 4.4.1)\n restfulr 0.0.15 2022-06-16 [1] CRAN (R 4.4.1)\n rjson 0.2.21 2022-01-09 [1] CRAN (R 4.4.1)\n rlang 1.1.4 2024-06-04 [1] CRAN (R 4.4.1)\n rmarkdown 2.27 2024-05-17 [1] CRAN (R 4.4.1)\n rprojroot 2.0.4 2023-11-05 [1] CRAN (R 4.4.1)\n Rsamtools 2.20.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n RSQLite 2.3.7 2024-05-27 [1] CRAN (R 4.4.1)\n rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.4.1)\n rtracklayer 1.64.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n S4Arrays 1.4.1 2024-05-20 [1] Bioconductor 3.19 (R 4.4.1)\n S4Vectors 0.42.1 2024-07-03 [1] Bioconductor 3.19 (R 4.4.1)\n scales 1.3.0 2023-11-28 [1] CRAN (R 4.4.1)\n seqinr 4.2-36 2023-12-08 [1] CRAN (R 4.4.1)\n sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.4.1)\n SparseArray 1.4.8 2024-05-24 [1] Bioconductor 3.19 (R 4.4.1)\n statnet.common 4.9.0 2023-05-24 [1] CRAN (R 4.4.1)\n stringi 1.8.4 2024-05-06 [1] CRAN (R 4.4.1)\n stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.4.1)\n SummarizedExperiment 1.34.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.1)\n syntenet 1.6.0 2024-05-01 [1] Bioconductor 3.19 (R 4.4.1)\n tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.4.1)\n tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.4.1)\n tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.4.1)\n tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.4.1)\n timechange 0.3.0 2024-01-18 [1] CRAN (R 4.4.1)\n tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.4.1)\n UCSC.utils 1.0.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n utf8 1.2.4 2023-10-22 [1] CRAN (R 4.4.1)\n vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.4.1)\n withr 3.0.0 2024-01-16 [1] CRAN (R 4.4.1)\n xfun 0.46 2024-07-18 [1] CRAN (R 4.4.1)\n XML 3.99-0.17 2024-06-25 [1] CRAN (R 4.4.1)\n XVector 0.44.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n yaml 2.3.9 2024-07-05 [1] CRAN (R 4.4.1)\n zlibbioc 1.50.0 2024-04-30 [1] Bioconductor 3.19 (R 4.4.1)\n\n [1] /home/faalm/R/x86_64-pc-linux-gnu-library/4.4\n [2] /usr/local/lib/R/site-library\n [3] /usr/lib/R/site-library\n [4] /usr/lib/R/library\n\n──────────────────────────────────────────────────────────────────────────────\n```\n\n\n:::\n:::\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua"