diff --git a/docs/requirements.txt b/docs/requirements.txt
index ffaa18c..728f4e1 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -3,4 +3,5 @@ myst-nb
 numpydoc
 nbsphinx
 sphinx-book-theme
+sphinxcontrib.bibtex
 sphinxcontrib.mermaid
diff --git a/docs/source/conf.py b/docs/source/conf.py
index aacae2d..47f6c77 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -37,9 +37,22 @@
     "numpydoc",
     "myst_nb",
     "sphinx_book_theme",
+    "sphinxcontrib.bibtex",
     "sphinxcontrib.mermaid",
 ]
+myst_enable_extensions = [
+    "amsmath",  # needed for LaTeX math environments
+    "colon_fence",
+    "dollarmath",  # needed for $ and $$ math
+    "html_image",
+    "replacements",
+    "strikethrough",
+    "tasklist",
+]
 nb_execution_mode = "off"
+bibtex_bibfiles = ["literature.bib"]
+bibtex_default_style = "plain"
+bibtex_reference_style = "label"
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
diff --git a/docs/source/index.rst b/docs/source/index.rst
index cf0494d..5091b78 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -34,6 +34,12 @@ The documentation features various notebooks that demonstrate the usage and inve
 .. toctree::
    :maxdepth: 1
 
+   markdown/Installation
+   markdown/Peak_model_composition
+   markdown/PeakPerformance_validation
+   markdown/PeakPerformance_workflow
+   markdown/Diagnostic_plots
+   markdown/How_to_adapt_PeakPerformance_to_your_data
    notebooks/Ex1_Simple_Pipeline.ipynb
    notebooks/Ex2_Custom_Use_of_PeakPerformance.ipynb
    notebooks/Ex3_Pipeline_with_larger_example_dataset.ipynb
diff --git a/docs/source/literature.bib b/docs/source/literature.bib
new file mode 100644
index 0000000..e21e6b8
--- /dev/null
+++ b/docs/source/literature.bib
@@ -0,0 +1,214 @@
+@misc{nutpie,
+  author = {Seyboldt, Adrian and {PyMC Developers}},
+  keywords = {Software},
+  license = {MIT},
+  title = {{nutpie}},
+  url = {https://github.com/pymc-devs/nutpie}
+}
+
+@article{scipy,
+  author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and
+    Haberland, Matt and Reddy, Tyler and Cournapeau, David and
+    Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and
+    Bright, Jonathan and {van der Walt}, St{\'e}fan J. and
+    Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and
+    Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and
+    Kern, Robert and Larson, Eric and Carey, C J and
+    Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and
+    {VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and
+    Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and
+    Harris, Charles R. and Archibald, Anne M. and
+    Ribeiro, Ant{\^o}nio H. and Pedregosa, Fabian and
+    {van Mulbregt}, Paul and {SciPy 1.0 Contributors}},
+  title = {{{SciPy} 1.0: {F}undamental Algorithms for Scientific Computing in {P}ython}},
+  journal = {Nature Methods},
+  year = {2020},
+  volume = {17},
+  pages = {261--272},
+  adsurl = {https://rdcu.be/b08Wh},
+  doi = {10.1038/s41592-019-0686-2}
+}
+
+@article{matplotlib,
+  author = {Hunter, J.
D.},
+  title = {Matplotlib: A 2D graphics environment},
+  journal = {Computing in Science \& Engineering},
+  volume = {9},
+  number = {3},
+  pages = {90--95},
+  abstract = {Matplotlib is a 2D graphics package used for Python for
+    application development, interactive scripting, and publication-quality
+    image generation across user interfaces and operating systems.},
+  publisher = {IEEE COMPUTER SOC},
+  doi = {10.1109/MCSE.2007.55},
+  year = 2007
+}
+
+@misc{matplotlibzenodo,
+  author = {{The Matplotlib Development Team}},
+  title = {Matplotlib: Visualization with Python},
+  keywords = {software},
+  month = may,
+  year = 2024,
+  publisher = {Zenodo},
+  version = {v3.9.0},
+  doi = {10.5281/zenodo.11201097},
+  url = {https://doi.org/10.5281/zenodo.11201097}
+}
+
+@article{RN173,
+  author = {Hoffman, Matthew D. and Gelman, Andrew},
+  title = {The No-U-Turn Sampler: Adaptively Setting Path Lengths in Hamiltonian Monte Carlo},
+  journal = {Journal of Machine Learning Research},
+  volume = {15},
+  pages = {1593--1623},
+  year = {2014},
+  type = {Journal Article}
+}
+
+@article{RN150,
+  author = {Abril-Pla, O. and Andreani, V. and Carroll, C. and Dong, L. and Fonnesbeck, C. J. and Kochurov, M. and Kumar, R. and Lao, J. and Luhmann, C. C. and Martin, O. A. and Osthege, M. and Vieira, R. and Wiecki, T. and Zinkov, R.},
+  title = {{PyMC}: a modern, and comprehensive probabilistic programming framework in Python},
+  journal = {PeerJ Comput Sci},
+  volume = {9},
+  pages = {e1516},
+  issn = {2376-5992 (Electronic) 2376-5992 (Linking)},
+  doi = {10.7717/peerj-cs.1516},
+  url = {https://www.ncbi.nlm.nih.gov/pubmed/37705656},
+  year = {2023},
+  type = {Journal Article}
+}
+
+@book{RN162,
+  author = {Kruschke, John K.},
+  title = {Doing Bayesian Data Analysis},
+  edition = {1st Edition},
+  publisher = {Academic Press},
+  isbn = {9780123814852},
+  year = {2010},
+  type = {Book}
+}
+
+@article{RN144,
+  author = {Azzalini, A.},
+  title = {A class of distributions which includes the normal ones},
+  journal = {Scand. J. Statist.},
+  volume = {12},
+  pages = {171--178},
+  year = {1985},
+  type = {Journal Article}
+}
+
+@article{RN152,
+  author = {Gelman, Andrew and Rubin, Donald B.},
+  title = {Inference from Iterative Simulation Using Multiple Sequences},
+  journal = {Statistical Science},
+  volume = {7},
+  number = {4},
+  pages = {457--472},
+  year = {1992},
+  type = {Journal Article}
+}
+
+@article{RN153,
+  author = {Grushka, E.},
+  title = {Characterization of exponentially modified Gaussian peaks in chromatography},
+  journal = {Anal Chem},
+  volume = {44},
+  number = {11},
+  pages = {1733--1738},
+  issn = {0003-2700 (Print) 0003-2700 (Linking)},
+  doi = {10.1021/ac60319a011},
+  url = {https://www.ncbi.nlm.nih.gov/pubmed/22324584},
+  year = {1972},
+  type = {Journal Article}
+}
+
+@article{RN149,
+  author = {Hemmerich, J. and Noack, S. and Wiechert, W. and Oldiges, M.},
+  title = {Microbioreactor Systems for Accelerated Bioprocess Development},
+  journal = {Biotechnol J},
+  volume = {13},
+  number = {4},
+  pages = {e1700141},
+  issn = {1860-7314 (Electronic) 1860-6768 (Linking)},
+  doi = {10.1002/biot.201700141},
+  url = {https://www.ncbi.nlm.nih.gov/pubmed/29283217},
+  year = {2018},
+  type = {Journal Article}
+}
+
+@article{RN148,
+  author = {Kostov, Y. and Harms, P. and Randers-Eichhorn, L. and Rao, G.},
+  title = {Low-cost microbioreactor for high-throughput bioprocessing},
+  journal = {Biotechnol Bioeng},
+  volume = {72},
+  number = {3},
+  pages = {346--352},
+  issn = {0006-3592 (Print) 0006-3592 (Linking)},
+  doi = {10.1002/1097-0290(20010205)72:3<346::aid-bit12>3.0.co;2-x},
+  url = {https://www.ncbi.nlm.nih.gov/pubmed/11135205},
+  year = {2001},
+  type = {Journal Article}
+}
+
+@article{RN145,
+  author = {Vehtari, Aki and Gelman, Andrew and Gabry, Jonah},
+  title = {Practical Bayesian model evaluation using leave-one-out cross-validation and WAIC},
+  journal = {Statistics and Computing},
+  volume = {27},
+  number = {5},
+  pages = {1413--1432},
+  issn = {0960-3174 1573-1375},
+  doi = {10.1007/s11222-016-9696-4},
+  year = {2016},
+  type = {Journal Article}
+}
+
+@article{RN146,
+  author = {Watanabe, Sumio},
+  title = {Asymptotic Equivalence of Bayes Cross Validation and Widely Applicable Information Criterion in Singular Learning Theory},
+  journal = {Journal of Machine Learning Research},
+  volume = {11},
+  pages = {3571--3594},
+  year = {2010},
+  type = {Journal Article}
+}
+
+@article{RN147,
+  author = {Kumar, Ravin and Carroll, Colin and Hartikainen, Ari and Martin, Osvaldo},
+  title = {ArviZ a unified library for exploratory analysis of Bayesian models in Python},
+  journal = {Journal of Open Source Software},
+  volume = {4},
+  number = {33},
+  issn = {2475-9066},
+  doi = {10.21105/joss.01143},
+  year = {2019},
+  type = {Journal Article}
+}
+
+@article{harris2020array,
+  title = {Array programming with {NumPy}},
+  author = {Harris, C. R. and Millman, K. J. and
+    {van der Walt}, S. J. and Gommers, R. and Virtanen, P. and
+    Cournapeau, D. and Wieser, E. and Taylor, J. and
+    Berg, S. and Smith, N. J. and Kern, R. and Picus, M. and
+    Hoyer, S. and {van Kerkwijk}, M. H. and
+    Brett, M. and Haldane, A. and del R{\'{i}}o, J. F. and Wiebe, M. and Peterson, P. and
+    G{\'{e}}rard-Marchant, P. and Sheppard, K. and Reddy, T. and
+    Weckesser, W. and Abbasi, H. and Gohlke, C. and
+    Oliphant, T. E.},
+  year = {2020},
+  month = sep,
+  journal = {Nature},
+  volume = {585},
+  number = {7825},
+  pages = {357--362},
+  doi = {10.1038/s41586-020-2649-2},
+  publisher = {Springer Science and Business Media {LLC}},
+  url = {https://doi.org/10.1038/s41586-020-2649-2}
+}
diff --git a/docs/source/markdown/Diagnostic_plots.md b/docs/source/markdown/Diagnostic_plots.md
new file mode 100644
index 0000000..13aadce
--- /dev/null
+++ b/docs/source/markdown/Diagnostic_plots.md
@@ -0,0 +1,14 @@
+# Diagnostic plots
+
+An important feature of `PeakPerformance` is its easy access to diagnostic metrics for extensive quality control.
+Using the data stored in the inference data object of a fit, the user can utilize the ArviZ package to generate various diagnostic plots.
+A particularly useful one is the cumulative posterior predictive plot portrayed in Figure 1.
+This plot enables users to judge the quality of a fit and identify instances of lack-of-fit.
+As can be seen in the left plot, some predicted intensity values in the lowest quantile of the single peak example show a minimal lack-of-fit.
+Importantly, such a deviation can be observed, judged, and quantified, which in itself represents a large improvement over the status quo.
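+Such a plot can be created directly from a saved fit, e.g. as in the following sketch (the file name is hypothetical; `az.plot_ppc()` requires the posterior predictive samples included in the inference data):
+
+```python
+import arviz as az
+
+# load the inference data object of a finished PeakPerformance fit
+idata = az.from_netcdf("peak_fit.nc")
+
+# cumulative posterior predictive plot as in Figure 1
+az.plot_ppc(idata, kind="cumulative")
+```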
+
+```{figure-md} fig_d1
+![](./Fig5_ppc.png)
+
+__Figure 1:__ Cumulative posterior predictive plots created with the ArviZ package and pertaining to the example data of the single His peak (left) and the double Leu and Ile peak (right).
+The empirical cumulative distribution function (black) is in good agreement with the median posterior predictive (orange) and lies within the predicted variance (blue band), visualizing that the model provides an adequate prediction irrespective of the intensity value.
+```
diff --git a/docs/source/markdown/Fig1_model_single_peak.png b/docs/source/markdown/Fig1_model_single_peak.png
new file mode 100644
index 0000000..82b9a1b
Binary files /dev/null and b/docs/source/markdown/Fig1_model_single_peak.png differ
diff --git a/docs/source/markdown/Fig2_model_double_peak.png b/docs/source/markdown/Fig2_model_double_peak.png
new file mode 100644
index 0000000..de6507e
Binary files /dev/null and b/docs/source/markdown/Fig2_model_double_peak.png differ
diff --git a/docs/source/markdown/Fig3_PP-standalone.png b/docs/source/markdown/Fig3_PP-standalone.png
new file mode 100644
index 0000000..349ce13
Binary files /dev/null and b/docs/source/markdown/Fig3_PP-standalone.png differ
diff --git a/docs/source/markdown/Fig4_peak_results.png b/docs/source/markdown/Fig4_peak_results.png
new file mode 100644
index 0000000..2f82bff
Binary files /dev/null and b/docs/source/markdown/Fig4_peak_results.png differ
diff --git a/docs/source/markdown/Fig5_ppc.png b/docs/source/markdown/Fig5_ppc.png
new file mode 100644
index 0000000..3669790
Binary files /dev/null and b/docs/source/markdown/Fig5_ppc.png differ
diff --git a/docs/source/markdown/Fig6_PP-validation.png b/docs/source/markdown/Fig6_PP-validation.png
new file mode 100644
index 0000000..a367f08
Binary files /dev/null and b/docs/source/markdown/Fig6_PP-validation.png differ
diff --git a/How to adapt PeakPerformance to your data.md b/docs/source/markdown/How_to_adapt_PeakPerformance_to_your_data.md
similarity index 100%
rename from How to adapt PeakPerformance to your data.md
rename to docs/source/markdown/How_to_adapt_PeakPerformance_to_your_data.md
diff --git a/Installation.md b/docs/source/markdown/Installation.md
similarity index 85%
rename from Installation.md
rename to docs/source/markdown/Installation.md
index 3be08a3..0477d10 100644
--- a/Installation.md
+++ b/docs/source/markdown/Installation.md
@@ -2,7 +2,15 @@ It is highly recommended to follow these steps:
 1. Install the package manager [Mamba](https://github.com/conda-forge/miniforge/releases). Choose the latest installer at the top of the page, click on "show all assets", and download an installer denominated by "Mambaforge-version number-name of your OS.exe", e.g. "Mambaforge-23.3.1-1-Windows-x86_64.exe" for a Windows 64 bit operating system. Then, execute the installer to install mamba and activate the option "Add Mambaforge to my PATH environment variable".
-(⚠ __WARNING__ ⚠: If you have already installed Miniconda, you can install Mamba on top of it but there are compatibility issues with Anaconda. The newest conda version should also work, just replace `mamba` with `conda` in step 2.)
+
+```{caution}
+If you have already installed Miniconda, you can install Mamba on top of it, but there are compatibility issues with Anaconda.
+```
+
+```{note}
+The newest conda version should also work; just replace `mamba` with `conda` in step 2.
+```
+
 2. Create a new Python environment (replace "name_of_environment" with your desired name) in the command line via
 ```
 mamba create -c conda-forge -n name_of_environment pymc nutpie arviz jupyter matplotlib openpyxl "python=3.10"
 ```
diff --git a/docs/source/markdown/PeakPerformance_validation.md b/docs/source/markdown/PeakPerformance_validation.md
new file mode 100644
index 0000000..87369df
--- /dev/null
+++ b/docs/source/markdown/PeakPerformance_validation.md
@@ -0,0 +1,82 @@
+# Validation of `PeakPerformance`
+
+## Materials and Methods
+Several stages of validation were employed to demonstrate the suitability of `PeakPerformance` for chromatographic peak data analysis.
+The goals were to showcase the efficacy of `PeakPerformance` utilizing noisy synthetic data, to investigate cases where a peak could reasonably be fit with either of the single peak models, and finally to use experimental data to compare results obtained with `PeakPerformance` to those from the commercial vendor software Sciex MultiQuant.
+
+For the first test, 500 random data sets were generated with the NumPy random module {cite}`harris2020array` by drawing from the normal distributions detailed in [Table 1](#tab_v1), except for the mean parameter, which was held constant at a value of 6.
+Subsequently, normally distributed random noise ($\mathcal{N}(0, 0.6)$ or $\mathcal{N}(0, 1.2)$ for data sets with the tag "higher noise") was added to each data point.
+The temporal density of data points was chosen based on an LC-MS/MS method routinely utilized by the authors and accordingly set to one data point per 1.8 s.
+
+(tab_v1)=
+:::{table} __Table 1:__ Normal distributions from which parameters were drawn randomly to create synthetic data sets for the validation of `PeakPerformance`.
+
+| **parameter**      | **model (1st test)**    | **model (2nd test)**    |
+| ------------------ | ----------------------- | ----------------------- |
+| area               | $\mathcal{N}(8, 0.5)$   | -                       |
+| standard deviation | $\mathcal{N}(0.5, 0.1)$ | $\mathcal{N}(0.5, 0.1)$ |
+| skewness           | $\mathcal{N}(0, 2)$     | -                       |
+| baseline intercept | $\mathcal{N}(25, 1)$    | $\mathcal{N}(25, 1)$    |
+| baseline slope     | $\mathcal{N}(0, 1)$     | $\mathcal{N}(0, 1)$     |
+:::
+
+In marginal cases when the shape of a single peak had a slight skew, the automated model selection would at times settle on a normal or a skew normal model.
+Therefore, it was relevant to investigate whether this choice would lead to a significant discrepancy in estimated peak parameters.
+Accordingly, for the second test, synthetic data sets were generated with the NumPy random module according to [Table 1](#tab_v1) and noise was added as described before.
+The remaining parameters were held constant, i.e. the mean was fixed to 6, the area to 8, and the skewness parameter $\alpha$ to 1.
+
+For the third and final test, experimental peak data was analyzed with both `PeakPerformance` (version 0.7.0) and Sciex MultiQuant (version 3.0.3) with human supervision, i.e. the results were visually inspected and corrected if necessary.
+The data set consisted of 192 signals comprising 123 single peaks, 50 peaks as part of double peaks, and 19 noise signals.
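+
+As an illustration, the first test's data generation could be sketched as follows (the seed and the exact time window are made up for this example; the original scripts may differ):
+
+```python
+import numpy as np
+from scipy import stats
+
+rng = np.random.default_rng(seed=123)  # hypothetical seed
+
+# one data point per 1.8 s, roughly centered on the fixed mean of 6
+time = np.arange(3.0, 9.0, 1.8 / 60)
+
+# draw peak parameters from the distributions in Table 1 (first test)
+area = rng.normal(8, 0.5)
+std = rng.normal(0.5, 0.1)
+alpha = rng.normal(0, 2)
+intercept = rng.normal(25, 1)
+slope = rng.normal(0, 1)
+
+# skew normal peak scaled by its area on top of a linear baseline
+intensity = (
+    intercept
+    + slope * time
+    + area * stats.skewnorm.pdf(time, a=alpha, loc=6, scale=std)
+)
+
+# add normally distributed noise (sigma=1.2 for the "higher noise" sets)
+intensity += rng.normal(0, 0.6, size=time.size)
+```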
+
+
+## Results and Discussion
+In the first stage of validation, peak fitting with normal and skew normal peak models was tested regarding the ability to reproduce the ground truth of randomly generated noisy synthetic data sets.
+The arithmetic means portrayed in [Figure 1a](#fig_v1) were calculated based on a measure of similarity
+
+$$
+F_{y / \hat{y}} = \frac{y}{\hat{y}}
+$$ (eqn:F_yy)
+
+where $y$ represents the estimated parameter value and $\hat{y}$ its pertaining ground truth.
+As they exhibit values close to 1, this demonstrates a near identity between estimation and ground truth.
+Additionally, the normal-shaped peak model was paired with skew normally distributed noisy data and vice versa.
+In both cases, $\sigma$ was not reproduced well, especially by the normal-shaped model.
+Nevertheless, the peak area and height were still identified correctly with the skew normal model and merely slightly underestimated by the normal model.
+
+```{figure-md} fig_v1
+![Validation of results from PeakPerformance.](./Fig6_PP-validation.png)
+
+__Figure 1:__ Validation of results from `PeakPerformance`.
+**a)** Noisy synthetic data was randomly generated from one of the implemented distributions and the program's ability to infer the ground truth was observed. Portrayed are the fractions of estimated parameter to ground truth. **b)** The influence of model choice between normal and skew normal model in marginal cases with little to no skew was tested and the ratios between results from both models are plotted. **c)** Lastly, experimental data was analyzed with `PeakPerformance` version 0.7.0 and compared to results achieved with the commercial software Sciex MultiQuant version 3.0.3.
+```
+
+In the second stage, marginal cases in the form of slightly skewed peaks were investigated to observe whether their estimation with a normal- or skew normal-shaped intensity function would result in significant differences in terms of peak area and height.
+Here, a slight skew was defined as an $\alpha$ parameter of 1, resulting in peak shapes not visibly discernible as clearly normal or skew normal.
+With a sample size of 100 noisy, randomly generated data sets, we show that nearly identical estimates for peak area and height, as well as their respective uncertainties, are obtained regardless of the utilized model ([Fig. 1b](#fig_v1)).
+The exhibited mean values are based on fractions of the key peak parameters area and height between results obtained with a normal and a skew normal model, which were defined as
+
+$$
+F_{n / \mathrm{sn}} = \frac{A_{\mathrm{normal}}}{A_{\mathrm{skew \ normal}}}
+$$ (eqn:F_nsn)
+
+where $A_{\mathrm{normal}}$ and $A_{\mathrm{skew \ normal}}$ are the estimated areas with normal and skew normal models, respectively.
+
+In the third stage, experimental peak data was analyzed with both `PeakPerformance` (version 0.7.0) and Sciex MultiQuant (version 3.0.3) and the fraction of the obtained areas was determined as
+
+$$
+F_{\mathrm{MQ} / \mathrm{PP}} = \frac{A_{\mathrm{MQ}}}{A_{\mathrm{PP}}}
+$$ (eqn:F_mqpp)
+
+where $A_{\mathrm{MQ}}$ denominates the area yielded by MultiQuant and $A_{\mathrm{PP}}$ the area from `PeakPerformance`.
+Beyond the comparability of the resulting peak area ratio means portrayed in [Figure 1c](#fig_v1), it is relevant to state that 103 signals from MultiQuant (54 % of total signals) were manually modified.
+Of these, 31 % were false positives and 69 % were manually re-integrated.
+These figures are the result of a relatively high share of double peaks in the test sample, which generally give far more cause for manual intervention than single peaks.
+In contrast, the `PeakPerformance` pipeline was only started once; merely two single peaks and one double peak were fitted again with a different model and/or an increased sample size after the original pipeline batch run had finished.
+Among the 192 signals of the test data set, there were 7 noisy, low-intensity signals without a clear peak which were recognized as a peak by only one of the two programs and were hence omitted from this comparison.
+By showing not only the mean area ratio of all peaks but also the ones for the single and double peak subgroups, it is evident that the variance is significantly higher for double peaks.
+In the case of this data set, two low-quality double peaks in particular inflated the variance significantly, which may not be representative of other data sets.
+It has to be stated, too, that the prevalence of manual re-integration of double peaks in MultiQuant might have introduced a user-specific bias, thereby increasing the final variance.
+Nevertheless, it could be shown that `PeakPerformance` yields peak area results comparable to those of a commercially available vendor software.
+
+```{bibliography}
+```
diff --git a/docs/source/markdown/PeakPerformance_workflow.md b/docs/source/markdown/PeakPerformance_workflow.md
new file mode 100644
index 0000000..8e65b90
--- /dev/null
+++ b/docs/source/markdown/PeakPerformance_workflow.md
@@ -0,0 +1,60 @@
+# PeakPerformance workflow
+`PeakPerformance` accommodates the use of a pre-manufactured data pipeline for standard applications as well as the creation of custom data pipelines using only its core functions.
+The provided data analysis pipeline was designed in a user-friendly way and requires minimal programming knowledge ([Fig. 1](#fig_w1)).
+As portrayed in an example notebook in the code repository, only a few simple Python commands need to be executed.
+Instead of relying on these convenience functions, experienced users can also directly access the core functions of `PeakPerformance` for a more flexible application, which is demonstrated in yet another example notebook.
+
+```{figure-md} fig_w1
+![](./Fig3_PP-standalone.png)
+
+__Figure 1:__ Overview of the pre-manufactured data analysis pipeline featured in `PeakPerformance`.
+```
+
+Before using `PeakPerformance`, the user has to supply raw data files containing a NumPy array with time in the first and intensity in the second dimension.
+For each peak, such a file has to be provided according to the naming convention specified in `PeakPerformance`'s documentation and gathered in one directory.
+If a complete time series of a 30-90 min LC-MS/MS run were to be submitted to the program, however, the target peak would make up an extremely small portion of this data.
+Additionally, other peaks with the same mass and fragmentation pattern may have been observed at different retention times.
+Therefore, it was decided from the outset that in order to enable proper peak fitting, only a fraction of such a time series with a range of 3-5 times the peak width and roughly centered on the target peak would be accepted as an input.
+This guarantees that there is a sufficient number of data points at the beginning and end of the time frame for estimating the baseline and noise level, as well.
+The provided data pipeline starts by defining a path to this raw data directory and one to a local clone of the `PeakPerformance` code repository.
+Using the `prepare_model_selection()` function, an Excel template file ("Template.xlsx") for inputting user information is prepared and stored in the raw data directory, as shown in the sketch below.
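+
+These first steps might look like the following minimal sketch (the directory paths are hypothetical and the exact call signature may differ; see the example notebooks in the repository):
+
+```python
+from pathlib import Path
+
+from peak_performance import pipeline as pl
+
+# hypothetical locations of the raw data and the cloned repository
+raw_data_dir = Path("/data/lcms_run_01")
+repo_dir = Path("/home/user/peak-performance")
+
+# creates "Template.xlsx" in the raw data directory
+pl.prepare_model_selection(raw_data_dir)
+```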
+It is then the user's task to select the settings for the pipeline within the file itself.
+Accordingly, the file contains detailed explanations of all parameters, and the parsing functions of the software feature clear error messages in case mandatory entries are missing or filled out incorrectly.
+
+Since targeted LC-MS/MS analyses essentially cycle through a list of mass traces for every sample, a model type has to be assigned to each mass trace.
+Preferably, this is done by the user, which is of course only possible when the model choice is self-evident.
+If this is not the case, an optional automated model selection step can be performed, where one exemplary peak per mass trace is analyzed with all models to identify the most appropriate one.
+It is then assumed that within one batch run, all instances of a mass trace across all acquisitions can be fitted with the same type of model.
+For this purpose, the user must provide the name of an acquisition, i.e. sample, where a clear and representative peak for the given mass trace was observed.
+If e.g. a standard mixture containing all targets was measured, this would be considered a prime candidate.
+An additional feature lets the user exclude specific model types to save computation time and improve the accuracy of model selection, for example by excluding double peak models when a single peak was observed.
+Upon provision of the required information, the automated model selection can be started using the `model_selection()` function from the pipeline module and will be performed successively for each mass trace.
+Essentially, every model type which has not been excluded by the user is instantiated and sampled, and its log-likelihood is calculated.
+Subsequently, the results for each model are ranked with the `compare()` function of the ArviZ package based on Pareto-smoothed importance sampling leave-one-out cross-validation (PSIS-LOO) {cite}`RN146,RN145`.
+This function returns a DataFrame showing the results of the models in order of their placement on the ranking, which is decided by the expected log pointwise predictive density.
+The best model for each mass trace is then written to the Excel template file.
+
+After a model has been chosen either manually or automatically for each mass trace, the peak analysis pipeline can be started with the function `pipeline()` from the `pipeline` module.
+The first step consists of parsing the information from the Excel sheet.
+Since the pipeline, just like model selection, acts successively, a time series is read from its data file next, and the information contained in the file name according to the naming convention is parsed.
+All this information is combined in an instance of `PeakPerformance`'s `UserInput` class, acting as a centralized source of data for the program.
+Depending on whether the "pre-filtering" option was selected, an optional filtering step will be executed to reject signals where clearly no peak is present before sampling, thus saving computation time.
+This filtering step uses the `find_peaks()` function from the SciPy package {cite}`scipy`, which simply checks for data points directly neighboured by points with lower intensity values.
+If no data points within a certain range around the expected retention time of an analyte fulfill this most basic requirement of a peak, the signal is rejected.
+Furthermore, if none of the candidate data points exceed a signal-to-noise ratio threshold defined by the user, the signal will also be discarded (a sketch of such a pre-filter follows below).
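+
+The following is a minimal sketch of this kind of pre-filter; the helper function and its arguments are hypothetical, not `PeakPerformance`'s actual implementation:
+
+```python
+import numpy as np
+from scipy.signal import find_peaks
+
+def prefilter(time, intensity, t_ret, t_tol, noise_level, sn_min):
+    """Reject signals without a plausible peak near the expected retention time."""
+    # candidate maxima: data points neighboured by lower intensity values
+    candidates, _ = find_peaks(intensity)
+    # keep only candidates close to the expected retention time
+    near = candidates[np.abs(time[candidates] - t_ret) <= t_tol]
+    if near.size == 0:
+        return False
+    # require at least one candidate above the signal-to-noise threshold
+    return bool(np.any(intensity[near] / noise_level >= sn_min))
+```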
+Depending on the origin of the samples, this step may reject a great many signals before sampling, potentially saving hours of computation time across a batch run of the `PeakPerformance` pipeline.
+For instance, in bioreactor cultivations, a product might be quantified, but if it is only produced during the stationary growth phase, it will not show up in early samples.
+Another pertinent example of such a use case are isotopic labeling experiments, for which every theoretically achievable mass isotopomer needs to be investigated, yet depending on the input labeling mixture, the majority of them might not actually be present.
+Upon passing the first filter, a Markov chain Monte Carlo (MCMC) simulation is conducted using a No-U-Turn Sampler (NUTS) {cite}`RN173`, preferably, if installed in the Python environment, the nutpie sampler {cite}`nutpie` due to its much higher performance compared to the default sampler of PyMC.
+Before sampling from the posterior distribution, a prior predictive check is performed, the results of which can be accessed and evaluated after the fact.
+When a posterior distribution has been obtained, the main filtering step is next in line.
+The first criterion consists of checking the convergence of the Markov chains towards a common solution for the posterior, represented by the potential scale reduction factor {cite}`RN152`, also referred to as the $\hat{R}$ statistic or Gelman-Rubin diagnostic.
+If this factor is above 1.05 for any parameter, convergence was not reached and the sampling will be repeated once with a much higher number of tuning samples.
+If the filter is not passed a second time, the pertaining signal is rejected.
+Harnessing the advantages of the uncertainty quantification, a second criterion calculates the ratio of the resulting standard deviation of a peak parameter to its mean and discards signals exceeding a threshold.
+Usually, false positives passing the first criterion are rather noisy signals where a fit was achieved but the uncertainty on the peak parameters is extremely high.
+These signals will then be rejected by the second criterion, ultimately reducing the number of false positive peaks significantly, if not eliminating them.
+If a signal was accepted as a peak, the final simulation step is a posterior predictive check, which is added to the inference data object resulting from the model simulation.
+Both filtering criteria can be evaluated directly from the sampling results, as sketched below.
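+
+A compact sketch of the two acceptance criteria (the relative uncertainty threshold used here is hypothetical; only the $\hat{R}$ cutoff of 1.05 is stated above):
+
+```python
+import arviz as az
+
+def passes_filters(idata, rhat_max=1.05, rel_sd_max=0.3):
+    """Check convergence and relative parameter uncertainty of a fit."""
+    summary = az.summary(idata)  # DataFrame with mean, sd and r_hat per parameter
+    # criterion 1: all Markov chains converged (R-hat at most 1.05)
+    if (summary["r_hat"] > rhat_max).any():
+        return False
+    # criterion 2: standard deviation relative to the mean below a threshold
+    rel_sd = summary["sd"] / summary["mean"].abs()
+    return bool((rel_sd <= rel_sd_max).all())
+```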
+
+```{bibliography}
+```
diff --git a/docs/source/markdown/Peak_model_composition.md b/docs/source/markdown/Peak_model_composition.md
new file mode 100644
index 0000000..3100bf9
--- /dev/null
+++ b/docs/source/markdown/Peak_model_composition.md
@@ -0,0 +1,134 @@
+# Composition and assumptions of peak models
+
+Peak models in `PeakPerformance` require the definition of prior probability distributions (priors) for their parameters as well as the choice of an intensity function and a likelihood function.
+Generally, priors are derived from a given time series and assigned a weakly informative parametrization, such that the resulting inferences of parameters like the peak height are primarily based on the data.
+While defining priors in a data-dependent manner is generally to be avoided, it is clearly not tenable to define legitimate priors for all kinds of different peaks with heights and areas varying by multiple orders of magnitude and retention times, i.e. mean values, scattered across the whole run time of the LC-MS/MS method.
+In order to flexibly build models for all these peaks in an automated manner, embedded in a standardized data pipeline, some parameter priors had to be based on the raw data.
+If specific distributions or their parameters had to be restricted to certain value ranges, error handling was incorporated.
+For example, when only positive values were acceptable or when 0 was not a permissible value, a lower bound was defined using NumPy's clip function.
+
+Regarding shared model elements across all intensity functions, one such component of all models presented hereafter is the likelihood function
+
+$$
+L \sim \mathcal{N}(y, \mathrm{noise})
+$$ (eqn:likelihood)
+
+with $y$ as the predicted intensity and $\mathrm{noise}$ as the free parameter describing the standard deviation of measurement noise.
+This definition encodes the assumption that observed intensities are the result of normally distributed noise around the true intensity values of a peak.
+In turn, the noise parameter is defined as
+
+$$
+\mathrm{noise} \sim \mathrm{LogNormal}(\log_{10} \mathrm{max}(10, \mathrm{noise}_{\mathrm{guess}}), 1)
+$$ (eqn:noise)
+
+The log-normal distribution, where the logarithm of the random variable follows a normal distribution, was chosen partly to exclude negative values from the solution space and partly due to its shape, which attributes a higher fraction of the probability mass to lower values provided the standard deviation is defined sufficiently high.
+This prior is defined in a raw data-dependent manner since $\mathrm{noise}_{\mathrm{guess}}$ amounts to the standard deviation of the differences between the first and final 15 % of intensity values included in a given time frame and their respective mean values.
+
+The intensity function itself is defined as the sum of a linear baseline function and a peak intensity function, the latter of which is composed of a given distribution's probability density function (PDF) scaled up to the peak size by the area or height parameter.
+The linear baseline
+
+$$
+y_{\mathrm{baseline}}(t) = at+b
+$$ (eqn:baseline)
+
+features the slope and intercept parameters $a$ and $b$, respectively, both of which were assigned a normally distributed prior.
+The data-dependent guesses for these priors are obtained by constructing a line through the means of the first and last three data points of a given intensity data set, which oftentimes already resulted in a good fit.
+Hence, the determined values for slope ($a_{\mathrm{guess}}$) and intercept ($b_{\mathrm{guess}}$) are used as the means of their pertaining priors, and the standard deviations are defined as fractions of them with minima set to 0.5 and 0.05, respectively.
+Here, the exact definition of the standard deviations was less important than simply obtaining an uninformative prior which, while based on the rough fit for the baseline, possesses a sufficient degree of independence from it, thus allowing deviations by the Bayesian parameter estimation.
+
+$$
+  a \sim
+  \begin{cases}
+    \mathcal{N}(a_{\mathrm{guess}}, \frac{|a_{\mathrm{guess}}|}{5}) & \mathrm{if}\ \frac{|a_{\mathrm{guess}}|}{5}\geq0.5\\
+    \mathcal{N}(a_{\mathrm{guess}}, 0.5) & \mathrm{otherwise}\\
+  \end{cases}
+$$ (eqn:guess_a)
+
+$$
+  b \sim
+  \begin{cases}
+    \mathcal{N}(b_{\mathrm{guess}}, \frac{|b_{\mathrm{guess}}|}{6}) & \mathrm{if}\ \frac{|b_{\mathrm{guess}}|}{6}\geq0.05\\
+    \mathcal{N}(b_{\mathrm{guess}}, 0.05) & \mathrm{otherwise}\\
+  \end{cases}
+$$ (eqn:guess_b)
+
+The initial guesses $\mathrm{noise}_{\mathrm{guess}}$, $a_{\mathrm{guess}}$, and $b_{\mathrm{guess}}$ are calculated from raw time and intensity by the `initial_guesses()` function from the `models` submodule.
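+
+The described procedure might be sketched as follows (a simplified stand-in for the actual `initial_guesses()` implementation):
+
+```python
+import numpy as np
+
+def initial_guesses(time, intensity):
+    """Rough baseline and noise estimates from a peak time frame."""
+    # line through the means of the first and last three data points
+    t0, t1 = np.mean(time[:3]), np.mean(time[-3:])
+    y0, y1 = np.mean(intensity[:3]), np.mean(intensity[-3:])
+    a_guess = (y1 - y0) / (t1 - t0)
+    b_guess = y0 - a_guess * t0
+    # noise: std. dev. of the deviations of the first and final 15 %
+    # of intensity values from their respective means
+    n = max(1, int(0.15 * len(intensity)))
+    first, last = intensity[:n], intensity[-n:]
+    deviations = np.concatenate([first - first.mean(), last - last.mean()])
+    noise_guess = np.std(deviations)
+    return noise_guess, a_guess, b_guess
+```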
+ +$$ + a \sim + \begin{cases} + \mathcal{N}(a_{\mathrm{guess}}, \frac{|a_{\mathrm{guess}}|}{5}) & \mathrm{if}\ \frac{|a_{guess}|}{5}\geq0.5\\ + \mathcal{N}(a_{\mathrm{guess}}, 0.5) & \mathrm{otherwise}\\ + \end{cases} +$$ (eqn:guess_a) + +$$ + b \sim + \begin{cases} + \mathcal{N}(b_{\mathrm{guess}}, \frac{|b_{\mathrm{guess}}|}{6}) & \mathrm{if}\ \frac{|b_{guess}|}{6}\geq0.05\\ + \mathcal{N}(b_{\mathrm{guess}}, 0.05) & \mathrm{otherwise}\\ + \end{cases} +$$ (eqn:guess_b) + +The initial guesses $\mathrm{noise}_{\mathrm{guess}}$, $a_{\mathrm{guess}}$, and $b_{\mathrm{guess}}$ are calculated from raw time and intensity by the `initial_guesses()` function from the `models` submodule. +Beyond this point, it is sensible to categorize models into single and double peak models since these subgroups share a larger common basis. +Starting with single peak models, the normal-shaped model ([Figure 1a](fig_c1)) requires only three additional parameters for defining its intensity function. + +```{figure-md} fig_c1 +![](./Fig1_model_single_peak.png) + +__Figure 1:__ The intensity functions of normal (**a**) and skew normal peak models (**b**) as well as the prior probability distributions of their parameters are shown in the style of a Kruschke diagram {cite}`RN162`. Connections with $\sim$ imply stochastic and with $=$ deterministic relationships. In case of variables with multiple occurrences in one formula, the prior was only connected to one such instance to preserve visual clarity. The variables $M_{i}$ and $O_{i}$ describe mean values and $T_{i}$, $R$, and $S$ standard deviations. +``` + +The mean value $\mu$ has a normally distributed prior with the center of the selected time frame $\mathrm{min}(t) + \frac{\Delta t}{2}$ as its mean and $\frac{\Delta t}{2}$ as the standard deviation where $\Delta t$ corresponds to the length of the time frame. +Accordingly, the resulting prior is rather compressed and weakly informative. +The prior for the standard deviation of the normal-shaped peak model was defined with a half-normal distribution, once again to avoid values equaling or below 0. +As a half normal distribution only features a standard deviation, this was set to $\frac{\Delta t}{3}$. +The final parameter is the peak height used for scaling up the distribution to match the size of the peak. +Here, a rather uninformative half-normal distribution with a scale amounting to 95 % of the highest intensity value in the time frame was selected. + +The second featured single peak model is based on the skew normal distribution ([Figure 1b](fig_c1)) which has an additional skewness parameter $\alpha$ enabling a one-sided distortion of the peak or resulting in identity to the normal-shaped peak model when $\alpha=0$. +Hence, the prior of $\alpha$ is constituted by a normal distribution centered on 0 with a standard deviation of 3.5 to allow for a sufficiently large range of possible values for $\alpha$ and thus a realistic skew. +Instead of the peak height, the peak area was utilized to scale the distribution, albeit with an identical prior. + +The double peak models ([Figure 2](fig_c2)) featured many of the same variables as their single peak counterparts so only the differences will be highlighted here. + +All variables pertaining to the actual peak were represented as vectors with two entries labeled with 0 and 1 by adding a named dimension to that effect. +Aside from that, their priors remained unaltered except for the peak mean $\mu$. 
+
+The double peak models ([Figure 2](fig_c2)) feature many of the same variables as their single peak counterparts, so only the differences will be highlighted here.
+All variables pertaining to the actual peak were represented as vectors with two entries labeled 0 and 1 by adding a named dimension to that effect.
+Aside from that, their priors remained unaltered except for the peak mean $\mu$.
+To provide a flexible solution for finding double peak means across the whole time frame, the implementation of additional parameters proved indispensable.
+More precisely, the mean of both peaks, or group mean, was introduced as a hyperprior ({eq}`eqn:param_mu`) with a broad normal prior, which enabled it to vary across the time frame as needed.
+
+$$
+\mu_{\mu} \sim \mathcal{N}\biggl(\mathrm{min}(t) + \frac{\Delta t}{2}, \frac{\Delta t}{6}\biggr)
+$$ (eqn:param_mu)
+
+By defining a separation parameter representing the distance between the sub-peaks of a double peak
+
+$$
+\mathrm{separation} \sim \mathrm{Gamma}\biggl(\frac{\Delta t}{6}, \frac{\Delta t}{12}\biggr)
+$$ (eqn:param_separation)
+
+the offset of each peak's mean parameter from the group mean is calculated as
+
+$$
+\delta = \begin{bmatrix} - \frac{\mathrm{separation}}{2}\\\frac{\mathrm{separation}}{2}\end{bmatrix}
+$$ (eqn:param_delta)
+
+The mean parameters of each subpeak were then defined in terms of $\mu_{\mu}$ and $\delta$ as
+
+$$
+\mu = \mu_{\mu} + \delta
+$$ (eqn:param_mumu)
+
+```{figure-md} fig_c2
+![](./Fig2_model_double_peak.png)
+
+__Figure 2:__ The intensity functions of double normal (**a**) and double skew normal peak models (**b**) as well as the prior probability distributions of their parameters are shown in the style of a Kruschke diagram {cite}`RN162`. Connections with $\sim$ imply stochastic and with $=$ deterministic relationships. In case of variables with multiple occurrences in one formula, the prior was only connected to one such instance to preserve visual clarity. The variables $M_{i}$ and $O_{i}$ describe mean values and $T_{i}$, $S_{i}$, $P_{i}$, and $V_{i}$ standard deviations.
+```
+
+While all aforementioned parameters are necessary for the models, not all are of equal relevance for the user.
+A user's primary interest for consecutive data analysis generally lies in obtaining mean values, peak areas and perhaps, usually to a much lesser degree, peak heights.
+Since only one of the latter two parameters is strictly required for scaling purposes, different models as shown in [Figure 1](fig_c1) and [Figure 2](fig_c2) will feature either one or the other.
+Nonetheless, both peak area and peak height should be supplied to the user; hence, the missing one was included as a deterministic model variable and is thus equally accessible by the user.
+In case of the normal and double normal models, the peak height $h$ was used for scaling and the area $A$ was calculated by
+
+$$
+A = h \sigma \sqrt{2\pi}
+$$ (eqn:area)
+
+For skew normal and double skew normal models, the scaling parameter was the peak area.
+Since the mode and mean of a skewed distribution are, in contrast to normal distributions, distinct, the calculation of the height was nontrivial and ultimately a numerical approximation was added to the skewed models.
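+
+The double peak mean construction above might be sketched in PyMC as follows (the mean/standard deviation parametrization of the Gamma is an assumption, since {eq}`eqn:param_separation` does not name its parameters):
+
+```python
+import numpy as np
+import pymc as pm
+import pytensor.tensor as pt
+
+t = np.linspace(20.0, 24.0, 120)  # placeholder time frame
+dt = t.max() - t.min()
+
+with pm.Model(coords={"subpeak": [0, 1]}) as double_peak:
+    # group mean hyperprior across the time frame (eqn:param_mu)
+    mu_mu = pm.Normal("mu_mu", t.min() + dt / 2, dt / 6)
+    # distance between the two sub-peaks (eqn:param_separation)
+    separation = pm.Gamma("separation", mu=dt / 6, sigma=dt / 12)
+    # offsets of both sub-peaks from the group mean (eqn:param_delta)
+    delta = pt.stack([-separation / 2, separation / 2])
+    # per-peak mean parameters (eqn:param_mumu)
+    mu = pm.Deterministic("mu", mu_mu + delta, dims="subpeak")
+```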
+
+Beyond these key peak parameters, all PyMC models created by `PeakPerformance` contain additional constant data variables and deterministic model variables.
+For example, the time series, i.e. the analyzed raw data, as well as the initial guesses for noise, baseline slope, and baseline intercept are kept as constant data variables to facilitate debugging and reproducibility.
+Examples of deterministic model variables in addition to peak area or height are the predicted intensity values and the signal-to-noise ratio, defined here as
+
+$$
+\mathrm{sn} = \frac{h}{\mathrm{noise}}
+$$ (eqn:sn)
+
+```{bibliography}
+```
diff --git a/docs/source/markdown/summary_joint.pdf b/docs/source/markdown/summary_joint.pdf
new file mode 100644
index 0000000..89c93c6
Binary files /dev/null and b/docs/source/markdown/summary_joint.pdf differ
diff --git a/docs/source/markdown/summary_joint.png b/docs/source/markdown/summary_joint.png
new file mode 100644
index 0000000..a9c9497
Binary files /dev/null and b/docs/source/markdown/summary_joint.png differ
diff --git a/docs/source/markdown/summary_joint.svg b/docs/source/markdown/summary_joint.svg
new file mode 100644
index 0000000..f7bf037
--- /dev/null
+++ b/docs/source/markdown/summary_joint.svg
@@ -0,0 +1,1902 @@
[SVG markup omitted: figure with ArviZ posterior summary tables (mean, sd, hdi_3%, hdi_97%) for the single skew normal model and the double normal model.]