diff --git a/.gitignore b/.gitignore index ff2d0761..4ed146d4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # External binaries scripts/assembly_go -scripts/assembly_cpp +scripts/assemblycpp-v5 # Generated by Cargo # will have compiled files and executables diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..8fce5a72 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,68 @@ +cff-version: "1.2.0" +authors: +- family-names: Vimal + given-names: Devansh + orcid: "https://orcid.org/0009-0006-2794-8995" +- family-names: Parzych + given-names: Garrett + orcid: "https://orcid.org/0009-0008-4789-9603" +- family-names: Smith + given-names: Olivia M. + orcid: "https://orcid.org/0009-0004-2299-3522" +- family-names: Parkar + given-names: Devendra + orcid: "https://orcid.org/0009-0009-0133-8875" +- family-names: Bergen + given-names: Holly + orcid: "https://orcid.org/0009-0004-3570-5120" +- family-names: Daymude + given-names: Joshua J. + orcid: "https://orcid.org/0000-0001-7294-5626" +- family-names: Mathis + given-names: Cole + orcid: "https://orcid.org/0000-0001-8424-9169" +contact: +- family-names: Mathis + given-names: Cole + orcid: "https://orcid.org/0000-0001-8424-9169" +doi: 10.5281/zenodo.16764412 +message: If you use this software, please cite our article in the + Journal of Open Source Software. +preferred-citation: + authors: + - family-names: Vimal + given-names: Devansh + orcid: "https://orcid.org/0009-0006-2794-8995" + - family-names: Parzych + given-names: Garrett + orcid: "https://orcid.org/0009-0008-4789-9603" + - family-names: Smith + given-names: Olivia M. + orcid: "https://orcid.org/0009-0004-2299-3522" + - family-names: Parkar + given-names: Devendra + orcid: "https://orcid.org/0009-0009-0133-8875" + - family-names: Bergen + given-names: Holly + orcid: "https://orcid.org/0009-0004-3570-5120" + - family-names: Daymude + given-names: Joshua J. 
+ orcid: "https://orcid.org/0000-0001-7294-5626" + - family-names: Mathis + given-names: Cole + orcid: "https://orcid.org/0000-0001-8424-9169" + date-published: 2026-01-06 + doi: 10.21105/joss.09318 + issn: 2475-9066 + issue: 117 + journal: Journal of Open Source Software + publisher: + name: Open Journals + start: 9318 + title: "assembly-theory: Open, Reproducible Calculation of Assembly + Indices" + type: article + url: "https://joss.theoj.org/papers/10.21105/joss.09318" + volume: 11 +title: "assembly-theory: Open, Reproducible Calculation of Assembly + Indices" diff --git a/README-crate.md b/README-crate.md index 93b4c7e1..679a2b00 100644 --- a/README-crate.md +++ b/README-crate.md @@ -36,10 +36,23 @@ See [the documentation](https://docs.rs/assembly-theory) for a complete list of ## Citation -If you use this crate in your own scientific work, please consider citing us: +If you use `assembly-theory` in your own scientific work, please consider citing us! +On [GitHub](https://github.com/DaymudeLab/assembly-theory), you can use the "Cite this repository" dropdown in the About section to get APA and BibTeX citations; this is also directly compatible with the Zotero browser plugin. +Otherwise, you can use the following BibTeX entry: ```bibtex -Coming soon! +@article{Vimal2026-assemblytheory, + title = {{assembly-theory: Open, Reproducible Calculation of Assembly Indices}}, + author = {Vimal, Devansh and Parzych, Garrett and Smith, Olivia M. and Parkar, Devendra and Bergen, Holly and Daymude, Joshua J. 
and Mathis, Cole}, + journal = {Journal of Open Source Software}, + volume = {11}, + number = {117}, + pages = {9318}, + month = jan, + year = 2026, + doi = {10.21105/joss.09318}, + url = {https://joss.theoj.org/papers/10.21105/joss.09318}, +} ``` diff --git a/README.md b/README.md index acbc06cb..065b857d 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,8 @@ [![crates.io](https://img.shields.io/crates/v/assembly-theory)](https://crates.io/crates/assembly-theory) [![PyPI](https://img.shields.io/pypi/v/assembly-theory)](https://pypi.org/project/assembly-theory/) [![docs.rs](https://docs.rs/assembly-theory/badge.svg)](https://docs.rs/assembly-theory) -[![Zenodo](https://zenodo.org/badge/DOI/10.5281/zenodo.16764413.svg)](https://doi.org/10.5281/zenodo.16764413) +[![JOSS](https://joss.theoj.org/papers/704f4c6eba7224d413819cf889c95091/status.svg)](https://joss.theoj.org/papers/704f4c6eba7224d413819cf889c95091) +[![Zenodo](https://zenodo.org/badge/DOI/10.5281/zenodo.16764412.svg)](https://doi.org/10.5281/zenodo.16764412) `assembly-theory` is an open-source, high-performance library for computing *assembly indices* of molecular structures (see, e.g., [Sharma et al., 2023](https://doi.org/10.1038/s41586-023-06600-9); [Walker et al., 2024](https://doi.org/10.1098/rsif.2024.0367)). It is implemented in Rust and is available as a [Rust crate](https://crates.io/crates/assembly-theory), [Python package](https://pypi.org/project/assembly-theory/), and standalone executable. @@ -157,17 +158,30 @@ Your PR will not be reviewed unless it passes all GitHub Actions (compilation, f ## Governance -`assembly-theory` is maintained by Devansh Vimal ([@AgentElement](https://github.com/AgentElement)), Garrett Parzych ([@Garrett-Pz](https://github.com/Garrett-Pz)), Joshua J. 
Daymude ([@jdaymude](https://github.com/jdaymude)), and Cole Mathis ([@colemathis](https://github.com/colemathis)) with support from other members of the [Biodesign Center for Biocomputing, Security and Society](https://biodesign.asu.edu/biocomputing-security-and-society/) at Arizona State University including Olivia M. Smith ([@omsmith161](https://github.com/omsmith161)), Devendra Parkar ([@devrz45](https://github.com/devrz45)), and Sean Bergen ([@ARandomCl0wn](https://github.com/ARandomCl0wn)). +`assembly-theory` is maintained by Devansh Vimal ([@AgentElement](https://github.com/AgentElement)), Garrett Parzych ([@Garrett-Pz](https://github.com/Garrett-Pz)), Joshua J. Daymude ([@jdaymude](https://github.com/jdaymude)), and Cole Mathis ([@colemathis](https://github.com/colemathis)) with support from other members of the [Biodesign Center for Biocomputing, Security and Society](https://biodesign.asu.edu/biocomputing-security-and-society/) at Arizona State University including Olivia M. Smith ([@omsmith161](https://github.com/omsmith161)), Devendra Parkar ([@devrz45](https://github.com/devrz45)), and Holly Bergen ([@ARandomCl0wn](https://github.com/ARandomCl0wn)). The maintainers govern the project using the committee model: high-level decisions about the project's direction require maintainer consensus, major code changes require majority approval, hotfixes and patches require just one maintainer approval, new maintainers can be added by unanimous decision of the existing maintainers, and existing maintainers can step down with advance notice. ## Citation -If you use this crate in your own scientific work, please consider citing us: +If you use `assembly-theory` in your own scientific work, please consider citing us! +On [GitHub](https://github.com/DaymudeLab/assembly-theory), you can use the "Cite this repository" dropdown in the About section to get APA and BibTeX citations; this is also directly compatible with the Zotero browser plugin. 
+Otherwise, you can use the following BibTeX entry: ```bibtex -Coming soon! +@article{Vimal2026-assemblytheory, + title = {{assembly-theory: Open, Reproducible Calculation of Assembly Indices}}, + author = {Vimal, Devansh and Parzych, Garrett and Smith, Olivia M. and Parkar, Devendra and Bergen, Holly and Daymude, Joshua J. and Mathis, Cole}, + journal = {Journal of Open Source Software}, + volume = {11}, + number = {117}, + pages = {9318}, + month = jan, + year = 2026, + doi = {10.21105/joss.09318}, + url = {https://joss.theoj.org/papers/10.21105/joss.09318}, +} ``` diff --git a/python/README.md b/python/README.md index c766d29d..484bc443 100644 --- a/python/README.md +++ b/python/README.md @@ -92,10 +92,23 @@ See the [`assembly_theory::python` documentation](https://docs.rs/assembly-theor ## Citation -If you use this package in your own scientific work, please consider citing us: +If you use `assembly-theory` in your own scientific work, please consider citing us! +On [GitHub](https://github.com/DaymudeLab/assembly-theory), you can use the "Cite this repository" dropdown in the About section to get APA and BibTeX citations; this is also directly compatible with the Zotero browser plugin. +Otherwise, you can use the following BibTeX entry: ```bibtex -Coming soon! +@article{Vimal2026-assemblytheory, + title = {{assembly-theory: Open, Reproducible Calculation of Assembly Indices}}, + author = {Vimal, Devansh and Parzych, Garrett and Smith, Olivia M. and Parkar, Devendra and Bergen, Holly and Daymude, Joshua J. 
and Mathis, Cole}, + journal = {Journal of Open Source Software}, + volume = {11}, + number = {117}, + pages = {9318}, + month = jan, + year = 2026, + doi = {10.21105/joss.09318}, + url = {https://joss.theoj.org/papers/10.21105/joss.09318}, +} ``` diff --git a/scripts/README.md b/scripts/README.md index 144ac2dc..f35e66c5 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -60,12 +60,11 @@ Again, enter a number to choose: ```shell 1) assembly_go (Jirasek et al., 2024) -2) assembly_cpp (Seet et al., 2024) +2) assemblycpp-v5 (Seet et al., 2025) 3) assembly-theory Calculate assembly indices using: ``` -[`assembly_go`](https://github.com/croningp/assembly_go) is existing, open-source software for calculating assembly indices. -We do not package its source code or its executable with our library, but it can be obtained [on GitHub](https://github.com/croningp/assembly_go) if non-self-referential ground truth is desired. -[`assembly_cpp`] is the current state-of-the-art algorithm by Seet et al. (2024) and was provided to us by its authors on the condition that it remains private and is used only for this ground-truth generation. +Both [`assembly_go`](https://github.com/croningp/assembly_go) and [`assemblycpp-v5`](https://github.com/croningp/assemblycpp-v5) are open-source software for calculating assembly indices. +We do not package their source code or executables with our library, but they can be obtained from GitHub if non-self-referential ground truth is desired. Otherwise, a release build of `assembly-theory` is created and used. 
diff --git a/scripts/dataset_curation.ipynb b/scripts/dataset_curation.ipynb index 5b02e3c3..92480fb6 100644 --- a/scripts/dataset_curation.ipynb +++ b/scripts/dataset_curation.ipynb @@ -267,7 +267,7 @@ "\n", "This dataset contains 55 natural products sampled from the COCONUT database (Sorokina et al., 2021), [accessed in late 2024](https://zenodo.org/records/13897048) prior to COCONUT 2.0 (Chandrasekhar et al., 2025).\n", "Natural products (or secondary metabolites) are a rich source of evolved chemical complexity, often exhibiting drug-like properties.\n", - "Subsets of this database were used to benchmark recent algorithmic progress in (Seet et al., 2024).\n", + "Subsets of this database were used to benchmark recent algorithmic progress in (Seet et al., 2025).\n", "Our sample includes 5 randomly sampled molecules for each number of heavy atoms from 15–25.\n", "\n", "COCONUT downloads as a single `.sdf` file containing all molecules in the database.\n", @@ -388,7 +388,7 @@ "\n", "Ruddigkeit, L., Van Deursen, R., Blum, L. C., & Reymond, J.-L. (2012). Enumeration of 166 Billion Organic Small Molecules in the Chemical Universe Database GDB-17. *Journal of Chemical Information and Modeling*, *52*(11), 2864–2875. https://doi.org/10.1021/ci300415d\n", "\n", - "Seet, I., Patarroyo, K. Y., Siebert, G., Walker, S. I., & Cronin, L. (2024). *Rapid Computation of the Assembly Index of Molecular Graphs* (No. 2410.09100). arXiv. https://doi.org/10.48550/arXiv.2410.09100\n", + "Seet, I., Patarroyo, K. Y., Siebert, G., Walker, S. I., & Cronin, L. (2025). Rapid Exploration of the Assembly Chemical Space of Molecular Graphs. *Journal of Chemical Information and Modeling*, *65*(24), 13203–13214. https://doi.org/10.1021/acs.jcim.5c01964\n", "\n", "Sorokina, M., Merseburger, P., Rajan, K., Yirik, M. A., & Steinbeck, C. (2021). COCONUT online: Collection of Open Natural Products database. *Journal of Cheminformatics*, *13*(1), 2. 
https://doi.org/10.1186/s13321-020-00478-9" ] diff --git a/scripts/generate-ma-index.sh b/scripts/generate-ma-index.sh index 7f2ae5c9..c245e22a 100755 --- a/scripts/generate-ma-index.sh +++ b/scripts/generate-ma-index.sh @@ -16,7 +16,7 @@ done # Let the user choose which executable should generate ground truth. PS3="Calculate assembly indices using: " -select exec_choice in "assembly_go (Jirasek et al., 2024)" "assembly_cpp (Seet et al., 2024)" "assembly-theory" +select exec_choice in "assembly_go (Jirasek et al., 2024)" "assemblycpp-v5 (Seet et al., 2025)" "assembly-theory" do case $REPLY in 1) @@ -29,12 +29,12 @@ do break ;; 2) - if [ ! -f "assembly_cpp" ]; then - echo -n "ERROR: Missing ./assembly_cpp executable " - echo "(provided privately by Seet et al.)." + if [ ! -f "assemblycpp-v5" ]; then + echo -n "ERROR: Missing ./assemblycpp-v5 executable " + echo "(https://github.com/croningp/assemblycpp-v5)." exit 1 fi - executable="./assembly_cpp" + executable="./assemblycpp-v5" break ;; 3) @@ -60,12 +60,12 @@ do molfile=$(basename "$direntry") echo -ne "\r\e[K$exec_choice: Calculating assembly index of $molfile..." - # assembly_go and assembly-theory expect "<molpath>.mol" but assembly_cpp - # expects only "<molpath>" with the ".mol" part stripped off. Also, - # assembly_cpp prints a ton of unnecessary information requiring some - # parsing to get just the assembly index. Lastly, assembly_cpp generates - # auxiliary output files that need to be removed. - if [ $executable = "./assembly_cpp" ]; then + # assembly_go and assembly-theory take "<molpath>.mol" as input but + # assemblycpp-v5 takes only "<molpath>" with the ".mol" part stripped off. + # Also, assemblycpp-v5 prints a ton of unnecessary information requiring + # some parsing to get just the assembly index. Lastly, assemblycpp-v5 + # generates auxiliary output files that need to be removed. 
+ if [ $executable = "./assemblycpp-v5" ]; then molpath_stripped=$(echo "$direntry" | sed -e "s/.mol//g") maindex=$("$executable" "$molpath_stripped" -pathway=0 | tail -n 1 | awk '{print $NF}') rm "${molpath_stripped}Out" diff --git a/src/bounds.rs b/src/bounds.rs index d98eb5d8..658ec269 100644 --- a/src/bounds.rs +++ b/src/bounds.rs @@ -45,7 +45,7 @@ pub enum Bound { Log, /// An improvement over `Log` that also uses the size of the "largest /// duplicatable subgraph" for this state in an integer addition chain; see - /// [Seet et al. (2024)](https://arxiv.org/abs/2410.09100). + /// [Seet et al. (2025)](https://doi.org/10.1021/acs.jcim.5c01964). Int, /// Uses the types of bonds in the molecule to bound the number of assembly /// steps remaining. The first time a unique bond type is added to the diff --git a/src/matches.rs b/src/matches.rs index 0c5f1a90..a41c74e1 100644 --- a/src/matches.rs +++ b/src/matches.rs @@ -28,13 +28,14 @@ struct DagNode { /// Structural information on "matches" in a molecular graph, i.e., pairs of /// edge-disjoint, isomorphic subgraphs. pub struct Matches { - /// Seet et al. (2024) perform match enumeration by constructing a directed - /// acyclic graph (DAG). Each node in this DAG is a fragment (i.e., a - /// connected molecular subgraph) that is duplicatable (i.e., there exists - /// some other edge-disjoint fragment it is isomorphic to). If there is an - /// edge from u to v, then fragment v is fragment u with one added edge - /// (note: this new edge may be between two existing nodes in u or "extend" - /// from one existing node in u to a new node). + /// [Seet et al. (2025)](https://doi.org/10.1021/acs.jcim.5c01964) perform + /// match enumeration by constructing a directed acyclic graph (DAG). Each + /// node in this DAG is a fragment (i.e., a connected molecular subgraph) + /// that is duplicatable (i.e., there exists some other edge-disjoint + /// fragment it is isomorphic to). 
If there is an edge from u to v, then + /// fragment v is fragment u with one added edge (note: this new edge may + /// be between two existing nodes in u or "extend" from one existing node + /// in u to a new node). dag: Vec, /// All possible matches (i.e., pairs of edge-disjoint, isomorphic /// fragments) stored as pairs of fragment (i.e., DAG node) indices.