Update documentation #20

Merged: 13 commits, Sep 30, 2024
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -3,4 +3,5 @@ myst-nb
numpydoc
nbsphinx
sphinx-book-theme
sphinxcontrib.bibtex
sphinxcontrib.mermaid
13 changes: 13 additions & 0 deletions docs/source/conf.py
@@ -37,9 +37,22 @@
"numpydoc",
"myst_nb",
"sphinx_book_theme",
"sphinxcontrib.bibtex",
"sphinxcontrib.mermaid",
]
myst_enable_extensions = [
"amsmath", # needed for LaTeX math environments
"colon_fence",
"dollarmath", # needed for $ and $$ math
"html_image",
"replacements",
"strikethrough",
"tasklist",
]
nb_execution_mode = "off"
bibtex_bibfiles = ["literature.bib"]
bibtex_default_style = "plain"
bibtex_reference_style = "label"

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
6 changes: 6 additions & 0 deletions docs/source/index.rst
@@ -34,6 +34,12 @@ The documentation features various notebooks that demonstrate the usage and inve
.. toctree::
:maxdepth: 1

markdown/Installation
markdown/Peak_model_composition
markdown/PeakPerformance_validation
markdown/PeakPerformance_workflow
markdown/Diagnostic_plots
markdown/How_to_adapt_PeakPerformance_to_your_data
notebooks/Ex1_Simple_Pipeline.ipynb
notebooks/Ex2_Custom_Use_of_PeakPerformance.ipynb
notebooks/Ex3_Pipeline_with_larger_example_dataset.ipynb
214 changes: 214 additions & 0 deletions docs/source/literature.bib
@@ -0,0 +1,214 @@
@misc{nutpie,
author = {Seyboldt, Adrian and {PyMC Developers}},
keywords = {Software},
license = {MIT},
title = {{nutpie}},
url = {https://github.com/pymc-devs/nutpie}
}

@article{scipy,
author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and
Haberland, Matt and Reddy, Tyler and Cournapeau, David and
Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and
Bright, Jonathan and {van der Walt}, St{\'e}fan J. and
Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and
Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and
Kern, Robert and Larson, Eric and Carey, C J and
Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and
{VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and
Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and
Harris, Charles R. and Archibald, Anne M. and
Ribeiro, Ant{\^o}nio H. and Pedregosa, Fabian and
{van Mulbregt}, Paul and {SciPy 1.0 Contributors}},
title = {{{SciPy} 1.0: {F}undamental Algorithms for Scientific Computing in {P}ython}},
journal = {Nature Methods},
year = {2020},
volume = {17},
pages = {261--272},
adsurl = {https://rdcu.be/b08Wh},
doi = {10.1038/s41592-019-0686-2}
}

@article{matplotlib,
author = {Hunter, J. D.},
title = {Matplotlib: A 2D graphics environment},
journal = {Computing in Science \& Engineering},
volume = {9},
number = {3},
pages = {90--95},
abstract = {Matplotlib is a 2D graphics package used for Python for
application development, interactive scripting, and publication-quality
image generation across user interfaces and operating systems.},
publisher = {IEEE COMPUTER SOC},
doi = {10.1109/MCSE.2007.55},
year = 2007
}

@misc{matplotlibzenodo,
author = {{The Matplotlib Development Team}},
title = {Matplotlib: Visualization with Python},
keywords = {software},
month = may,
year = 2024,
publisher = {Zenodo},
version = {v3.9.0},
doi = {10.5281/zenodo.11201097},
url = {https://doi.org/10.5281/zenodo.11201097}
}

@article{RN173,
author = {Hoffman, Matthew D. and Gelman, Andrew},
title = {The No-U-Turn Sampler: Adaptively Setting Path Lengths in Hamiltonian Monte Carlo},
journal = {Journal of Machine Learning Research},
volume = {15},
year = {2014},
type = {Journal Article}
}

@article{RN150,
author = {Abril-Pla, O. and Andreani, V. and Carroll, C. and Dong, L. and Fonnesbeck, C. J. and Kochurov, M. and Kumar, R. and Lao, J. and Luhmann, C. C. and Martin, O. A. and Osthege, M. and Vieira, R. and Wiecki, T. and Zinkov, R.},
title = {{PyMC}: a modern, and comprehensive probabilistic programming framework in Python},
journal = {PeerJ Comput Sci},
volume = {9},
pages = {e1516},
issn = {2376-5992 (Electronic), 2376-5992 (Linking)},
doi = {10.7717/peerj-cs.1516},
url = {https://www.ncbi.nlm.nih.gov/pubmed/37705656},
year = {2023},
type = {Journal Article}
}

@book{RN162,
author = {Kruschke, John K.},
title = {Doing Bayesian Data Analysis},
edition = {1st Edition},
publisher={Academic Press},
isbn = {9780123814852},
year = {2010},
type = {Book}
}

@article{RN144,
author = {Azzalini, A.},
title = {A class of distributions which includes the normal ones},
journal = {Scand. J. Statist.},
volume = {12},
pages = {171--178},
year = {1985},
type = {Journal Article}
}


@article{RN152,
author = {Gelman, Andrew and Rubin, Donald B.},
title = {Inference from Iterative Simulation Using Multiple Sequences},
journal = {Statistical Science},
volume = {7},
number = {4},
year = {1992},
type = {Journal Article}
}

@article{RN153,
author = {Grushka, E.},
title = {Characterization of exponentially modified Gaussian peaks in chromatography},
journal = {Anal Chem},
volume = {44},
number = {11},
pages = {1733--1738},
issn = {0003-2700 (Print), 0003-2700 (Linking)},
doi = {10.1021/ac60319a011},
url = {https://www.ncbi.nlm.nih.gov/pubmed/22324584},
year = {1972},
type = {Journal Article}
}

@article{RN149,
author = {Hemmerich, J. and Noack, S. and Wiechert, W. and Oldiges, M.},
title = {Microbioreactor Systems for Accelerated Bioprocess Development},
journal = {Biotechnol J},
volume = {13},
number = {4},
pages = {e1700141},
issn = {1860-7314 (Electronic), 1860-6768 (Linking)},
doi = {10.1002/biot.201700141},
url = {https://www.ncbi.nlm.nih.gov/pubmed/29283217},
year = {2018},
type = {Journal Article}
}

@article{RN148,
author = {Kostov, Y. and Harms, P. and Randers-Eichhorn, L. and Rao, G.},
title = {Low-cost microbioreactor for high-throughput bioprocessing},
journal = {Biotechnol Bioeng},
volume = {72},
number = {3},
pages = {346--352},
issn = {0006-3592 (Print), 0006-3592 (Linking)},
doi = {10.1002/1097-0290(20010205)72:3<346::aid-bit12>3.0.co;2-x},
url = {https://www.ncbi.nlm.nih.gov/pubmed/11135205},
year = {2001},
type = {Journal Article}
}

@article{RN145,
author = {Vehtari, Aki and Gelman, Andrew and Gabry, Jonah},
title = {Practical Bayesian model evaluation using leave-one-out cross-validation and WAIC},
journal = {Statistics and Computing},
volume = {27},
number = {5},
pages = {1413--1432},
issn = {0960-3174, 1573-1375},
doi = {10.1007/s11222-016-9696-4},
year = {2016},
type = {Journal Article}
}

@article{RN146,
author = {Watanabe, Sumio},
title = {Asymptotic Equivalence of Bayes Cross Validation and Widely Applicable Information Criterion in Singular Learning Theory},
journal = {Journal of Machine Learning Research},
volume = {11},
pages = {3571--3594},
year = {2010},
type = {Journal Article}
}

@article{RN147,
author = {Kumar, Ravin and Carroll, Colin and Hartikainen, Ari and Martin, Osvaldo},
title = {ArviZ a unified library for exploratory analysis of Bayesian models in Python},
journal = {Journal of Open Source Software},
volume = {4},
number = {33},
issn = {2475-9066},
doi = {10.21105/joss.01143},
year = {2019},
type = {Journal Article}
}

@article{harris2020array,
title = {Array programming with {NumPy}},
author = {Harris, C. R. and Millman, K. J. and
{van der Walt}, S. J. and Gommers, R. and Virtanen, P. and
Cournapeau, D. and Wieser, E. and Taylor, J. and
Berg, S. and Smith, N. J. and Kern, R. and Picus, M.
and Hoyer, S. and {van Kerkwijk}, M. H. and
Brett, M. and Haldane, A. and del R{\'{i}}o, J. F. and Wiebe, M. and Peterson, P. and
G{\'{e}}rard-Marchant, P. and Sheppard, K. and Reddy, T. and
Weckesser, W. and Abbasi, H. and Gohlke, C. and
Oliphant, T. E.},
year = {2020},
month = sep,
journal = {Nature},
volume = {585},
number = {7825},
pages = {357--362},
doi = {10.1038/s41586-020-2649-2},
publisher = {Springer Science and Business Media {LLC}},
url = {https://doi.org/10.1038/s41586-020-2649-2}
}
14 changes: 14 additions & 0 deletions docs/source/markdown/Diagnostic_plots.md
@@ -0,0 +1,14 @@
# Diagnostic plots

An important feature of `PeakPerformance` is the easy access to diagnostic metrics for extensive quality control.
Using the data stored in the inference data object of a fit, the user can employ the ArviZ package to generate various diagnostic plots.
A particularly useful one is the cumulative posterior predictive plot portrayed in Figure 1.
This plot enables users to judge the quality of a fit and identify instances of lack-of-fit.
As can be seen in the left plot, some predicted intensity values in the lowest quantile of the single peak example show a minimal lack-of-fit.
Importantly, such a deviation can be observed, judged, and quantified, which in itself represents a large improvement over the status quo.
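
A minimal sketch of how such a plot can be generated with ArviZ is shown below; the file name is hypothetical, and it is assumed that the inference data of the fit (including a posterior predictive group) was saved to a NetCDF file.

```python
import arviz as az

# Load the inference data object of a finished fit
# (hypothetical file name; any InferenceData with a posterior_predictive group works).
idata = az.from_netcdf("single_peak_idata.nc")

# Cumulative posterior predictive plot as in Figure 1.
az.plot_ppc(idata, kind="cumulative")
```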

```{figure-md} fig_d1
![](./Fig5_ppc.png)

__Figure 1:__ Cumulative posterior predictive plots created with the ArviZ package and pertaining to the example data of the single His peak (left) and the double Leu and Ile peak (right). The empirical cumulative distribution function (black) is in good agreement with the median posterior predictive (orange) and lies within the predicted variance (blue band), visualizing that the model provides an adequate prediction irrespective of the intensity value.
```
Binary file added docs/source/markdown/Fig1_model_single_peak.png
Binary file added docs/source/markdown/Fig2_model_double_peak.png
Binary file added docs/source/markdown/Fig3_PP-standalone.png
Binary file added docs/source/markdown/Fig4_peak_results.png
Binary file added docs/source/markdown/Fig5_ppc.png
Binary file added docs/source/markdown/Fig6_PP-validation.png
10 changes: 9 additions & 1 deletion Installation.md → docs/source/markdown/Installation.md
@@ -2,7 +2,15 @@
It is highly recommended to follow these steps:
1. Install the package manager [Mamba](https://github.com/conda-forge/miniforge/releases).
Choose the latest installer at the top of the page, click on "show all assets", and download an installer named "Mambaforge-&lt;version number&gt;-&lt;name of your OS&gt;.exe", e.g. "Mambaforge-23.3.1-1-Windows-x86_64.exe" for a 64-bit Windows operating system. Then, execute the installer to install Mamba and activate the option "Add Mambaforge to my PATH environment variable".
(⚠ __WARNING__ ⚠: If you have already installed Miniconda, you can install Mamba on top of it but there are compatibility issues with Anaconda. The newest conda version should also work, just replace `mamba` with `conda` in step 2.)

```{caution}
If you have already installed Miniconda, you can install Mamba on top of it, but there are compatibility issues with Anaconda.
```

```{note}
The newest conda version should also work; just replace `mamba` with `conda` in step 2.
```

2. Create a new Python environment (replace "name_of_environment" with your desired name) in the command line via
```
mamba create -c conda-forge -n name_of_environment pymc nutpie arviz jupyter matplotlib openpyxl "python=3.10"
82 changes: 82 additions & 0 deletions docs/source/markdown/PeakPerformance_validation.md
@@ -0,0 +1,82 @@
# Validation of `PeakPerformance`

## Materials and Methods
Several stages of validation were employed to demonstrate the suitability of `PeakPerformance` for chromatographic peak data analysis.
The goals were to showcase the efficacy of `PeakPerformance` on noisy synthetic data, to investigate cases where a peak could reasonably be fit with either of the single peak models, and finally to use experimental data to compare results obtained with `PeakPerformance` to those from the commercial vendor software Sciex MultiQuant.

For the first test, 500 random data sets were generated with the NumPy random module {cite}`harris2020array` by drawing from the normal distributions detailed in [Table 1](#tab_v1), except for the mean parameter, which was held constant at a value of 6.
Subsequently, normally distributed random noise ($\mathcal{N}(0, 0.6)$, or $\mathcal{N}(0, 1.2)$ for data sets with the tag "higher noise") was added to each data point.
The density of data points over time was chosen based on an LC-MS/MS method routinely used by the authors and accordingly set to one data point per 1.8 s.

(tab_v1)=
:::{table} __Table 1:__ Normal distributions from which parameters were drawn randomly to create synthetic data sets for the validation of `PeakPerformance`.

| **parameter** | **distribution (1st test)** | **distribution (2nd test)** |
| ------------------ | ----------------------- | ----------------------- |
| area | $\mathcal{N}(8, 0.5)$ | - |
| standard deviation | $\mathcal{N}(0.5, 0.1)$ | $\mathcal{N}(0.5, 0.1)$ |
| skewness | $\mathcal{N}(0, 2)$ | - |
| baseline intercept | $\mathcal{N}(25, 1)$ | $\mathcal{N}(25, 1)$ |
| baseline slope | $\mathcal{N}(0, 1)$ | $\mathcal{N}(0, 1)$ |
:::
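
The sketch below illustrates how such a synthetic data set could be generated; the retention time window, the baseline parameterization, and the use of `scipy.stats.skewnorm` are assumptions for illustration rather than the exact validation script.

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(seed=123)

# One data point per 1.8 s on an illustrative retention time window (in minutes).
time = np.arange(3.0, 9.0, 1.8 / 60.0)

# Draw peak parameters from the distributions in Table 1 (first test);
# the mean is held constant at 6.
mean = 6.0
area = rng.normal(8.0, 0.5)
std = rng.normal(0.5, 0.1)
skewness = rng.normal(0.0, 2.0)
intercept = rng.normal(25.0, 1.0)
slope = rng.normal(0.0, 1.0)

# Skew normal peak scaled to the drawn area, on top of a linear baseline
# (the loc/scale parameterization and the baseline form are illustrative assumptions).
peak = area * stats.skewnorm.pdf(time, a=skewness, loc=mean, scale=std)
baseline = intercept + slope * (time - time[0])

# Add normally distributed noise; sigma = 1.2 for the "higher noise" data sets.
noisy_intensity = peak + baseline + rng.normal(0.0, 0.6, size=time.size)
```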

In marginal cases where the shape of a single peak had a slight skew, the automated model selection would at times settle on either a normal or a skew normal model.
Therefore, it was relevant to investigate whether this choice would lead to a significant discrepancy in the estimated peak parameters.
Accordingly, for the second test, synthetic data sets were generated with the NumPy random module according to [Table 1](#tab_v1) and noise was added as described before.
The remaining parameters were held constant, i.e. the mean was fixed to 6, the area to 8, and the skewness parameter $\alpha$ to 1.

For the third and final test, experimental peak data was analyzed with both `PeakPerformance` (version 0.7.0) and Sciex MultiQuant (version 3.0.3) with human supervision, i.e. the results were visually inspected and corrected if necessary.
The data set consisted of 192 signals, comprising 123 single peaks, 50 peaks that were part of double peaks, and 19 noise signals.


## Results and Discussion
In the first stage of validation, peak fitting with normal and skew normal peak models was tested regarding the ability to reproduce the ground truth of randomly generated noisy synthetic data sets.
The arithmetic means portrayed in [Figure 1a](#fig_v1) were calculated based on a measure of similarity

$$
F_{y / \hat{y}} = \frac{y}{\hat{y}}
$$ (eqn:F_yy)

where $y$ represents the estimated parameter value and $\hat{y}$ the corresponding ground truth.
Since these ratios are close to 1, the estimates are nearly identical to the ground truth.
Additionally, the normal-shaped peak model was paired with skew normally distributed noisy data and vice versa.
In both cases, $\sigma$ was not reproduced well, especially by the normal-shaped model.
Nevertheless, the peak area and height were still identified correctly with the skew normal model and merely slightly underestimated by the normal model.

```{figure-md} fig_v1
![Validation of results from PeakPerformance.](./Fig6_PP-validation.png)

__Figure 1:__ Validation of results from `PeakPerformance`.
**a)** Noisy synthetic data was randomly generated from one of the implemented distributions and the program's ability to infer the ground truth was observed. Portrayed are the fractions of estimated parameter to ground truth. **b)** The influence of model choice between normal and skew normal model in marginal cases with little to no skew was tested and the ratios between results from both models are plotted. **c)** Lastly, experimental data was analyzed with `PeakPerformance` version 0.7.0 and compared to results achieved with the commercial software Sciex MultiQuant version 3.0.3.
```

In the second stage, marginal cases in the form of slightly skewed peaks were investigated to observe whether their estimation with a normal- or skew normal-shaped intensity function would result in significant differences in terms of peak area and height.
Here, a slight skew was defined as an $\alpha$ parameter of 1, resulting in peak shapes that are not visibly discernible as clearly normal or skew normal.
With a sample size of 100 noisy, randomly generated data sets, we show that nearly identical estimates for peak area and height, as well as their respective uncertainties, are obtained regardless of the model used ([Fig. 1b](#fig_v1)).
The reported mean values are based on ratios of the key peak parameters area and height between results obtained with a normal and a skew normal model, defined as

$$
F_{\mathrm{n} / \mathrm{sn}} = \frac{A_{\mathrm{normal}}}{A_{\mathrm{skew \ normal}}}
$$ (eqn:F_nsn)

where $A_{\mathrm{normal}}$ and $A_{\mathrm{skew \ normal}}$ are the estimated areas with normal and skew normal models, respectively.
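
A rough sketch of how such a ratio could be computed from two finished fits is given below; the file names and the posterior variable name `area` are assumptions that depend on how the fits were set up and saved.

```python
import arviz as az

# Inference data from fitting the same synthetic peak with both models
# (hypothetical file names).
idata_normal = az.from_netcdf("peak_normal.nc")
idata_skew = az.from_netcdf("peak_skew_normal.nc")

# Posterior mean of the area parameter from each model
# (the variable name "area" is an assumption).
area_normal = idata_normal.posterior["area"].mean().item()
area_skew = idata_skew.posterior["area"].mean().item()

F_n_sn = area_normal / area_skew
print(f"F_n/sn = {F_n_sn:.3f}")
```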

In the third stage, experimental peak data was analyzed with both `PeakPerformance` (version 0.7.0) and Sciex MultiQuant (version 3.0.3) and the fraction of the obtained areas was determined as

$$
F_{\mathrm{MQ} / \mathrm{PP}} = \frac{A_{\mathrm{MQ}}}{A_{\mathrm{PP}}}
$$ (eqn:F_mqpp)

where $A_{\mathrm{MQ}}$ denotes the area obtained with MultiQuant and $A_{\mathrm{PP}}$ the area from `PeakPerformance`.
Beyond the comparability of the resulting peak area ratio means portrayed in [Figure 1c](#fig_v1), it is relevant to state that 103 signals from MultiQuant (54 % of total signals) were manually modified.
Of these, 31 % were false positives and 69 % were manually re-integrated.
These figures are the result of a relatively high share of double peaks in the test sample, which generally give far more cause for manual intervention than single peaks.
In contrast, the `PeakPerformance` pipeline was started only once, and merely two single peaks and one double peak were fit again with a different model and/or an increased sample size after the original pipeline batch run had finished.
Among the 192 signals of the test data set, there were 7 noisy, low-intensity signals without a clear peak that were recognized as a peak by only one of the two programs and were hence omitted from this comparison.
By showing not only the mean area ratio of all peaks but also the ones for the single and double peak subgroups, it is evident that the variance is significantly higher for double peaks.
In the case of this data set, two low-quality double peaks in particular inflated the variance significantly, which may not be representative of other data sets.
It also has to be stated that the prevalence of manual re-integration of double peaks in MultiQuant might have introduced a user-specific bias, thereby increasing the final variance.
Nevertheless, `PeakPerformance` was shown to yield peak area results comparable to those of a commercially available vendor software.

```{bibliography}
```