diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..680036d --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,8 @@ +^inst/\.quarto$ +^\.png$ +^\.quarto$ +^\.github$ +^LICENSE\.md$ +^NOTES\.md$ +^README\.Rmd$ +^Dockerfile$ diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..655e9e9 --- /dev/null +++ b/.github/CODE_OF_CONDUCT.md @@ -0,0 +1,17 @@ +The Bioconductor community values + +* an open approach to science that promotes the sharing of ideas, code, and expertise +* collaboration +* diversity and inclusivity +* a kind and welcoming environment +* community contributions + +In line with these values, Bioconductor is dedicated to providing a welcoming, supportive, collegial, experience free of harassment, intimidation, and bullying regardless of: + +* identity: gender, gender identity and expression, sexual orientation, disability, physical appearance, ethnicity, body size, race, age, religion, etc. +* intellectual position: approaches to data analysis, software preferences, coding style, scientific perspective, etc. +* stage of career + +In order to uphold these values, members of the Bioconductor community are required to follow the Code of Conduct.The latest version of Bioconductor project Code of Conduct is available at http://bioconductor.org/about/code-of-conduct/. Please read the Code of Conduct before contributing to this project. + +Thank you! diff --git a/.github/workflows/biocbook.yml b/.github/workflows/biocbook.yml new file mode 100644 index 0000000..723be33 --- /dev/null +++ b/.github/workflows/biocbook.yml @@ -0,0 +1,111 @@ +name: biocbook + +on: + push: + branches: + - devel + - RELEASE_** + +jobs: + build-push: + runs-on: ubuntu-latest + name: build-book (${{ github.ref_name }}) + permissions: + contents: write + packages: write + + steps: + + - name: 🧾 Checkout repository + uses: actions/checkout@v3 + + - name: ⏳ Collect Workflow Telemetry + uses: runforesight/workflow-telemetry-action@v1 + + - name: 🐳 Set up QEMU + uses: docker/setup-qemu-action@v2 + - name: 🐳 Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: 📝 Get book info + id: info + env: + OWNER: ${{ github.repository_owner }} + run: | + Pkgname=$(grep -m1 -E '^Package: +' DESCRIPTION | sed -E 's/.*: +//') + echo Pkgname=${Pkgname} >> "${GITHUB_ENV}" + pkgname=${Pkgname,,} + echo pkgname=${pkgname} >> "${GITHUB_ENV}" + owner=${OWNER,,} + echo owner=${owner} >> "${GITHUB_ENV}" + echo pkgversion=$(grep -m1 -E '^Version: +' DESCRIPTION | sed -E 's/.*: +//') >> "${GITHUB_ENV}" + + - name: 🔐 Log in to the Github Container registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ env.owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: 🏷 Get metadata for Docker + id: meta + uses: docker/metadata-action@v4 + with: + images: ghcr.io/${{ env.owner }}/${{ env.pkgname }} + tags: | + ${{ github.ref_name }} + ${{ env.pkgversion }} + type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'devel') }} + + - name: 📦 Install, build and check package in local Docker image + id: docker + uses: docker/build-push-action@v4 + with: + context: . + load: true + tags: ${{ steps.meta.outputs.tags }} + build-args: | + BIOC_VERSION=${{ github.ref_name }} + + - name: 🚀 Push local Docker image to ghcr.io + uses: docker/build-push-action@v4 + with: + context: . 
+ push: true + tags: ${{ steps.meta.outputs.tags }} + build-args: | + BIOC_VERSION=${{ github.ref_name }} + + - name: 📚 Recover pkg artifacts generated during build in local Docker container (pkg bundle and book) + env: + IMG: ${{ steps.docker.outputs.ImageID }} + run: | + SHA=$(docker container create ${{ env.IMG }}) + docker container cp ${SHA}:/${{ env.Pkgname }}_${{ env.pkgversion }}.tar.gz . + tar --extract --gzip --file ${{ env.Pkgname }}_${{ env.pkgversion }}.tar.gz + echo bundle_path=${{ env.Pkgname }}_${{ env.pkgversion }}.tar.gz >> "${GITHUB_ENV}" + echo book_path=${{ env.Pkgname }}/inst/doc/book/ >> "${GITHUB_ENV}" + + - name: 🏷 Get gh-branch directory to deploy to + run: | + echo target_folder=$(echo ${{ github.ref_name }} | sed 's,RELEASE_,,' | tr '_' '.') >> "${GITHUB_ENV}" + + - name: 🚀 Deploy book to Github Pages on versioned branch + uses: JamesIves/github-pages-deploy-action@v4.4.3 + with: + folder: ${{ env.book_path }}/ + target-folder: docs/${{ env.target_folder }}/ + branch: gh-pages + clean: false + + - name: 💾 Upload package bundle artifact + uses: actions/upload-artifact@v3 + with: + name: bundle + path: ${{ env.bundle_path }} + + - name: 💾 Upload book artifact + uses: actions/upload-artifact@v3 + with: + name: book + path: ${{ env.book_path }} diff --git a/.github/workflows/rworkflows.yml b/.github/workflows/rworkflows.yml new file mode 100644 index 0000000..0df685c --- /dev/null +++ b/.github/workflows/rworkflows.yml @@ -0,0 +1,53 @@ +## Adapted from neurogenomics/rworkflows: rworkflows::use_workflow() + +name: rworkflows +'on': + push: + branches: + - devel + - RELEASE_** + pull_request: + branches: + - devel + - RELEASE_** + +jobs: + rworkflows: + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + container: ${{ matrix.config.cont }} + strategy: + fail-fast: ${{ false }} + matrix: + config: + - os: ubuntu-latest + bioc: devel + r: auto + cont: ghcr.io/bioconductor/bioconductor:devel + rspm: https://packagemanager.rstudio.com/cran/__linux__/focal/release + # - os: macOS-latest + # bioc: release + # r: auto + # cont: ~ + # rspm: ~ + # - os: windows-latest + # bioc: release + # r: auto + # cont: ~ + # rspm: ~ + steps: + - uses: neurogenomics/rworkflows@master + with: + run_bioccheck: ${{ false }} + run_rcmdcheck: ${{ true }} + as_cran: ${{ false }} + run_vignettes: ${{ false }} + has_testthat: ${{ true }} + run_covr: ${{ false }} + run_pkgdown: ${{ false }} + has_runit: ${{ false }} + has_latex: ${{ false }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run_docker: ${{ false }} + runner_os: ${{ runner.os }} + cache_version: cache-v1 diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..2c39be1 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,61 @@ +Package: R4MS +Title: R for mass spectrometry +Description: | + This repository provides documentation and teaching material + focus on the analysis of mass spectrometry data for proteomics and metabolomics + using the [R for Mass Spectrometry](https://www.rformassspectrometry.org/) + software infrastructure. 
+Version: 0.98.0 +Date: `r date()` +Authors@R: c(person(given = "Laurent", family = "Gatto", + comment = c(ORCID = "0000-0002-1520-2268"), + email = "laurent.gatto@uclouvain.be", + role = c("aut","cre")), + person(given = "Johannes", family = "Rainer", + email = "Johannes.Rainer@eurac.edu", + role = "aut", + comment = c(ORCID = "0000-0002-6977-7147")), + person(given = "Sebastian", family = "Gibb", + email = "mail@sebastiangibb.de", + role = "aut", + comment = c(ORCID = "0000-0001-7406-4443"))) +URL: https://github.com/js2264/R4MS +BugReports: https://github.com/js2264/R4MS +biocViews: + Book +Depends: + R (>= 4.3) +Imports: + tidyverse, + factoextra, + msdata, + mzR, + rhdf5, + rpx, + MsCoreUtils, + QFeatures, + Spectra, + ProtGenerics, + PSMatch, + pheatmap, + limma, + gplots, + patchwork, + MSnID +Suggests: + BiocManager, + BiocVersion, + BiocStyle, + BiocCheck, + rcmdcheck, + glue, + sessioninfo, + knitr, + quarto, + BiocBook +Encoding: UTF-8 +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.2.3 +BiocType: Book +VignetteBuilder: knitr +License: GPL (>= 3) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9e9a308 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,10 @@ +ARG BIOC_VERSION +FROM bioconductor/bioconductor_docker:${BIOC_VERSION} +COPY . /opt/pkg + +# Install book package +RUN Rscript -e 'repos <- BiocManager::repositories() ; remotes::install_local(path = "/opt/pkg/", repos=repos, dependencies=TRUE, build_vignettes=FALSE, upgrade=TRUE) ; sessioninfo::session_info(installed.packages()[,"Package"], include_base = TRUE)' + +## Build/install using same approach than BBS +RUN R CMD INSTALL /opt/pkg +RUN R CMD build --keep-empty-dirs --no-resave-data /opt/pkg diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..175443c --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,595 @@ +GNU General Public License +========================== + +_Version 3, 29 June 2007_ +_Copyright © 2007 Free Software Foundation, Inc. <>_ + +Everyone is permitted to copy and distribute verbatim copies of this license +document, but changing it is not allowed. + +## Preamble + +The GNU General Public License is a free, copyleft license for software and other +kinds of works. + +The licenses for most software and other practical works are designed to take away +your freedom to share and change the works. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change all versions of a +program--to make sure it remains free software for all its users. We, the Free +Software Foundation, use the GNU General Public License for most of our software; it +applies also to any other work released this way by its authors. You can apply it to +your programs, too. + +When we speak of free software, we are referring to freedom, not price. Our General +Public Licenses are designed to make sure that you have the freedom to distribute +copies of free software (and charge for them if you wish), that you receive source +code or can get it if you want it, that you can change the software or use pieces of +it in new free programs, and that you know you can do these things. + +To protect your rights, we need to prevent others from denying you these rights or +asking you to surrender the rights. Therefore, you have certain responsibilities if +you distribute copies of the software, or if you modify it: responsibilities to +respect the freedom of others. 
+ +For example, if you distribute copies of such a program, whether gratis or for a fee, +you must pass on to the recipients the same freedoms that you received. You must make +sure that they, too, receive or can get the source code. And you must show them these +terms so they know their rights. + +Developers that use the GNU GPL protect your rights with two steps: **(1)** assert +copyright on the software, and **(2)** offer you this License giving you legal permission +to copy, distribute and/or modify it. + +For the developers' and authors' protection, the GPL clearly explains that there is +no warranty for this free software. For both users' and authors' sake, the GPL +requires that modified versions be marked as changed, so that their problems will not +be attributed erroneously to authors of previous versions. + +Some devices are designed to deny users access to install or run modified versions of +the software inside them, although the manufacturer can do so. This is fundamentally +incompatible with the aim of protecting users' freedom to change the software. The +systematic pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we have designed +this version of the GPL to prohibit the practice for those products. If such problems +arise substantially in other domains, we stand ready to extend this provision to +those domains in future versions of the GPL, as needed to protect the freedom of +users. + +Finally, every program is threatened constantly by software patents. States should +not allow patents to restrict development and use of software on general-purpose +computers, but in those that do, we wish to avoid the special danger that patents +applied to a free program could make it effectively proprietary. To prevent this, the +GPL assures that patents cannot be used to render the program non-free. + +The precise terms and conditions for copying, distribution and modification follow. + +## TERMS AND CONDITIONS + +### 0. Definitions + +“This License” refers to version 3 of the GNU General Public License. + +“Copyright” also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + +“The Program” refers to any copyrightable work licensed under this +License. Each licensee is addressed as “you”. “Licensees” and +“recipients” may be individuals or organizations. + +To “modify” a work means to copy from or adapt all or part of the work in +a fashion requiring copyright permission, other than the making of an exact copy. The +resulting work is called a “modified version” of the earlier work or a +work “based on” the earlier work. + +A “covered work” means either the unmodified Program or a work based on +the Program. + +To “propagate” a work means to do anything with it that, without +permission, would make you directly or secondarily liable for infringement under +applicable copyright law, except executing it on a computer or modifying a private +copy. Propagation includes copying, distribution (with or without modification), +making available to the public, and in some countries other activities as well. + +To “convey” a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through a computer +network, with no transfer of a copy, is not conveying. 
+ +An interactive user interface displays “Appropriate Legal Notices” to the +extent that it includes a convenient and prominently visible feature that **(1)** +displays an appropriate copyright notice, and **(2)** tells the user that there is no +warranty for the work (except to the extent that warranties are provided), that +licensees may convey the work under this License, and how to view a copy of this +License. If the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + +### 1. Source Code + +The “source code” for a work means the preferred form of the work for +making modifications to it. “Object code” means any non-source form of a +work. + +A “Standard Interface” means an interface that either is an official +standard defined by a recognized standards body, or, in the case of interfaces +specified for a particular programming language, one that is widely used among +developers working in that language. + +The “System Libraries” of an executable work include anything, other than +the work as a whole, that **(a)** is included in the normal form of packaging a Major +Component, but which is not part of that Major Component, and **(b)** serves only to +enable use of the work with that Major Component, or to implement a Standard +Interface for which an implementation is available to the public in source code form. +A “Major Component”, in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system (if any) on which +the executable work runs, or a compiler used to produce the work, or an object code +interpreter used to run it. + +The “Corresponding Source” for a work in object code form means all the +source code needed to generate, install, and (for an executable work) run the object +code and to modify the work, including scripts to control those activities. However, +it does not include the work's System Libraries, or general-purpose tools or +generally available free programs which are used unmodified in performing those +activities but which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for the work, and +the source code for shared libraries and dynamically linked subprograms that the work +is specifically designed to require, such as by intimate data communication or +control flow between those subprograms and other parts of the work. + +The Corresponding Source need not include anything that users can regenerate +automatically from other parts of the Corresponding Source. + +The Corresponding Source for a work in source code form is that same work. + +### 2. Basic Permissions + +All rights granted under this License are granted for the term of copyright on the +Program, and are irrevocable provided the stated conditions are met. This License +explicitly affirms your unlimited permission to run the unmodified Program. The +output from running a covered work is covered by this License only if the output, +given its content, constitutes a covered work. This License acknowledges your rights +of fair use or other equivalent, as provided by copyright law. + +You may make, run and propagate covered works that you do not convey, without +conditions so long as your license otherwise remains in force. 
You may convey covered +works to others for the sole purpose of having them make modifications exclusively +for you, or provide you with facilities for running those works, provided that you +comply with the terms of this License in conveying all material for which you do not +control copyright. Those thus making or running the covered works for you must do so +exclusively on your behalf, under your direction and control, on terms that prohibit +them from making any copies of your copyrighted material outside their relationship +with you. + +Conveying under any other circumstances is permitted solely under the conditions +stated below. Sublicensing is not allowed; section 10 makes it unnecessary. + +### 3. Protecting Users' Legal Rights From Anti-Circumvention Law + +No covered work shall be deemed part of an effective technological measure under any +applicable law fulfilling obligations under article 11 of the WIPO copyright treaty +adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention +of such measures. + +When you convey a covered work, you waive any legal power to forbid circumvention of +technological measures to the extent such circumvention is effected by exercising +rights under this License with respect to the covered work, and you disclaim any +intention to limit operation or modification of the work as a means of enforcing, +against the work's users, your or third parties' legal rights to forbid circumvention +of technological measures. + +### 4. Conveying Verbatim Copies + +You may convey verbatim copies of the Program's source code as you receive it, in any +medium, provided that you conspicuously and appropriately publish on each copy an +appropriate copyright notice; keep intact all notices stating that this License and +any non-permissive terms added in accord with section 7 apply to the code; keep +intact all notices of the absence of any warranty; and give all recipients a copy of +this License along with the Program. + +You may charge any price or no price for each copy that you convey, and you may offer +support or warranty protection for a fee. + +### 5. Conveying Modified Source Versions + +You may convey a work based on the Program, or the modifications to produce it from +the Program, in the form of source code under the terms of section 4, provided that +you also meet all of these conditions: + +* **a)** The work must carry prominent notices stating that you modified it, and giving a +relevant date. +* **b)** The work must carry prominent notices stating that it is released under this +License and any conditions added under section 7. This requirement modifies the +requirement in section 4 to “keep intact all notices”. +* **c)** You must license the entire work, as a whole, under this License to anyone who +comes into possession of a copy. This License will therefore apply, along with any +applicable section 7 additional terms, to the whole of the work, and all its parts, +regardless of how they are packaged. This License gives no permission to license the +work in any other way, but it does not invalidate such permission if you have +separately received it. +* **d)** If the work has interactive user interfaces, each must display Appropriate Legal +Notices; however, if the Program has interactive interfaces that do not display +Appropriate Legal Notices, your work need not make them do so. 
+ +A compilation of a covered work with other separate and independent works, which are +not by their nature extensions of the covered work, and which are not combined with +it such as to form a larger program, in or on a volume of a storage or distribution +medium, is called an “aggregate” if the compilation and its resulting +copyright are not used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work in an aggregate +does not cause this License to apply to the other parts of the aggregate. + +### 6. Conveying Non-Source Forms + +You may convey a covered work in object code form under the terms of sections 4 and +5, provided that you also convey the machine-readable Corresponding Source under the +terms of this License, in one of these ways: + +* **a)** Convey the object code in, or embodied in, a physical product (including a +physical distribution medium), accompanied by the Corresponding Source fixed on a +durable physical medium customarily used for software interchange. +* **b)** Convey the object code in, or embodied in, a physical product (including a +physical distribution medium), accompanied by a written offer, valid for at least +three years and valid for as long as you offer spare parts or customer support for +that product model, to give anyone who possesses the object code either **(1)** a copy of +the Corresponding Source for all the software in the product that is covered by this +License, on a durable physical medium customarily used for software interchange, for +a price no more than your reasonable cost of physically performing this conveying of +source, or **(2)** access to copy the Corresponding Source from a network server at no +charge. +* **c)** Convey individual copies of the object code with a copy of the written offer to +provide the Corresponding Source. This alternative is allowed only occasionally and +noncommercially, and only if you received the object code with such an offer, in +accord with subsection 6b. +* **d)** Convey the object code by offering access from a designated place (gratis or for +a charge), and offer equivalent access to the Corresponding Source in the same way +through the same place at no further charge. You need not require recipients to copy +the Corresponding Source along with the object code. If the place to copy the object +code is a network server, the Corresponding Source may be on a different server +(operated by you or a third party) that supports equivalent copying facilities, +provided you maintain clear directions next to the object code saying where to find +the Corresponding Source. Regardless of what server hosts the Corresponding Source, +you remain obligated to ensure that it is available for as long as needed to satisfy +these requirements. +* **e)** Convey the object code using peer-to-peer transmission, provided you inform +other peers where the object code and Corresponding Source of the work are being +offered to the general public at no charge under subsection 6d. + +A separable portion of the object code, whose source code is excluded from the +Corresponding Source as a System Library, need not be included in conveying the +object code work. + +A “User Product” is either **(1)** a “consumer product”, which +means any tangible personal property which is normally used for personal, family, or +household purposes, or **(2)** anything designed or sold for incorporation into a +dwelling. 
In determining whether a product is a consumer product, doubtful cases +shall be resolved in favor of coverage. For a particular product received by a +particular user, “normally used” refers to a typical or common use of +that class of product, regardless of the status of the particular user or of the way +in which the particular user actually uses, or expects or is expected to use, the +product. A product is a consumer product regardless of whether the product has +substantial commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + +“Installation Information” for a User Product means any methods, +procedures, authorization keys, or other information required to install and execute +modified versions of a covered work in that User Product from a modified version of +its Corresponding Source. The information must suffice to ensure that the continued +functioning of the modified object code is in no case prevented or interfered with +solely because modification has been made. + +If you convey an object code work under this section in, or with, or specifically for +use in, a User Product, and the conveying occurs as part of a transaction in which +the right of possession and use of the User Product is transferred to the recipient +in perpetuity or for a fixed term (regardless of how the transaction is +characterized), the Corresponding Source conveyed under this section must be +accompanied by the Installation Information. But this requirement does not apply if +neither you nor any third party retains the ability to install modified object code +on the User Product (for example, the work has been installed in ROM). + +The requirement to provide Installation Information does not include a requirement to +continue to provide support service, warranty, or updates for a work that has been +modified or installed by the recipient, or for the User Product in which it has been +modified or installed. Access to a network may be denied when the modification itself +materially and adversely affects the operation of the network or violates the rules +and protocols for communication across the network. + +Corresponding Source conveyed, and Installation Information provided, in accord with +this section must be in a format that is publicly documented (and with an +implementation available to the public in source code form), and must require no +special password or key for unpacking, reading or copying. + +### 7. Additional Terms + +“Additional permissions” are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. Additional +permissions that are applicable to the entire Program shall be treated as though they +were included in this License, to the extent that they are valid under applicable +law. If additional permissions apply only to part of the Program, that part may be +used separately under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + +When you convey a copy of a covered work, you may at your option remove any +additional permissions from that copy, or from any part of it. (Additional +permissions may be written to require their own removal in certain cases when you +modify the work.) You may place additional permissions on material, added by you to a +covered work, for which you have or can give appropriate copyright permission. 
+ +Notwithstanding any other provision of this License, for material you add to a +covered work, you may (if authorized by the copyright holders of that material) +supplement the terms of this License with terms: + +* **a)** Disclaiming warranty or limiting liability differently from the terms of +sections 15 and 16 of this License; or +* **b)** Requiring preservation of specified reasonable legal notices or author +attributions in that material or in the Appropriate Legal Notices displayed by works +containing it; or +* **c)** Prohibiting misrepresentation of the origin of that material, or requiring that +modified versions of such material be marked in reasonable ways as different from the +original version; or +* **d)** Limiting the use for publicity purposes of names of licensors or authors of the +material; or +* **e)** Declining to grant rights under trademark law for use of some trade names, +trademarks, or service marks; or +* **f)** Requiring indemnification of licensors and authors of that material by anyone +who conveys the material (or modified versions of it) with contractual assumptions of +liability to the recipient, for any liability that these contractual assumptions +directly impose on those licensors and authors. + +All other non-permissive additional terms are considered “further +restrictions” within the meaning of section 10. If the Program as you received +it, or any part of it, contains a notice stating that it is governed by this License +along with a term that is a further restriction, you may remove that term. If a +license document contains a further restriction but permits relicensing or conveying +under this License, you may add to a covered work material governed by the terms of +that license document, provided that the further restriction does not survive such +relicensing or conveying. + +If you add terms to a covered work in accord with this section, you must place, in +the relevant source files, a statement of the additional terms that apply to those +files, or a notice indicating where to find the applicable terms. + +Additional terms, permissive or non-permissive, may be stated in the form of a +separately written license, or stated as exceptions; the above requirements apply +either way. + +### 8. Termination + +You may not propagate or modify a covered work except as expressly provided under +this License. Any attempt otherwise to propagate or modify it is void, and will +automatically terminate your rights under this License (including any patent licenses +granted under the third paragraph of section 11). + +However, if you cease all violation of this License, then your license from a +particular copyright holder is reinstated **(a)** provisionally, unless and until the +copyright holder explicitly and finally terminates your license, and **(b)** permanently, +if the copyright holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + +Moreover, your license from a particular copyright holder is reinstated permanently +if the copyright holder notifies you of the violation by some reasonable means, this +is the first time you have received notice of violation of this License (for any +work) from that copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + +Termination of your rights under this section does not terminate the licenses of +parties who have received copies or rights from you under this License. 
If your +rights have been terminated and not permanently reinstated, you do not qualify to +receive new licenses for the same material under section 10. + +### 9. Acceptance Not Required for Having Copies + +You are not required to accept this License in order to receive or run a copy of the +Program. Ancillary propagation of a covered work occurring solely as a consequence of +using peer-to-peer transmission to receive a copy likewise does not require +acceptance. However, nothing other than this License grants you permission to +propagate or modify any covered work. These actions infringe copyright if you do not +accept this License. Therefore, by modifying or propagating a covered work, you +indicate your acceptance of this License to do so. + +### 10. Automatic Licensing of Downstream Recipients + +Each time you convey a covered work, the recipient automatically receives a license +from the original licensors, to run, modify and propagate that work, subject to this +License. You are not responsible for enforcing compliance by third parties with this +License. + +An “entity transaction” is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an organization, or +merging organizations. If propagation of a covered work results from an entity +transaction, each party to that transaction who receives a copy of the work also +receives whatever licenses to the work the party's predecessor in interest had or +could give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if the predecessor +has it or can get it with reasonable efforts. + +You may not impose any further restrictions on the exercise of the rights granted or +affirmed under this License. For example, you may not impose a license fee, royalty, +or other charge for exercise of rights granted under this License, and you may not +initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging +that any patent claim is infringed by making, using, selling, offering for sale, or +importing the Program or any portion of it. + +### 11. Patents + +A “contributor” is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The work thus +licensed is called the contributor's “contributor version”. + +A contributor's “essential patent claims” are all patent claims owned or +controlled by the contributor, whether already acquired or hereafter acquired, that +would be infringed by some manner, permitted by this License, of making, using, or +selling its contributor version, but do not include claims that would be infringed +only as a consequence of further modification of the contributor version. For +purposes of this definition, “control” includes the right to grant patent +sublicenses in a manner consistent with the requirements of this License. + +Each contributor grants you a non-exclusive, worldwide, royalty-free patent license +under the contributor's essential patent claims, to make, use, sell, offer for sale, +import and otherwise run, modify and propagate the contents of its contributor +version. + +In the following three paragraphs, a “patent license” is any express +agreement or commitment, however denominated, not to enforce a patent (such as an +express permission to practice a patent or covenant not to sue for patent +infringement). 
To “grant” such a patent license to a party means to make +such an agreement or commitment not to enforce a patent against the party. + +If you convey a covered work, knowingly relying on a patent license, and the +Corresponding Source of the work is not available for anyone to copy, free of charge +and under the terms of this License, through a publicly available network server or +other readily accessible means, then you must either **(1)** cause the Corresponding +Source to be so available, or **(2)** arrange to deprive yourself of the benefit of the +patent license for this particular work, or **(3)** arrange, in a manner consistent with +the requirements of this License, to extend the patent license to downstream +recipients. “Knowingly relying” means you have actual knowledge that, but +for the patent license, your conveying the covered work in a country, or your +recipient's use of the covered work in a country, would infringe one or more +identifiable patents in that country that you have reason to believe are valid. + +If, pursuant to or in connection with a single transaction or arrangement, you +convey, or propagate by procuring conveyance of, a covered work, and grant a patent +license to some of the parties receiving the covered work authorizing them to use, +propagate, modify or convey a specific copy of the covered work, then the patent +license you grant is automatically extended to all recipients of the covered work and +works based on it. + +A patent license is “discriminatory” if it does not include within the +scope of its coverage, prohibits the exercise of, or is conditioned on the +non-exercise of one or more of the rights that are specifically granted under this +License. You may not convey a covered work if you are a party to an arrangement with +a third party that is in the business of distributing software, under which you make +payment to the third party based on the extent of your activity of conveying the +work, and under which the third party grants, to any of the parties who would receive +the covered work from you, a discriminatory patent license **(a)** in connection with +copies of the covered work conveyed by you (or copies made from those copies), or **(b)** +primarily for and in connection with specific products or compilations that contain +the covered work, unless you entered into that arrangement, or that patent license +was granted, prior to 28 March 2007. + +Nothing in this License shall be construed as excluding or limiting any implied +license or other defenses to infringement that may otherwise be available to you +under applicable patent law. + +### 12. No Surrender of Others' Freedom + +If conditions are imposed on you (whether by court order, agreement or otherwise) +that contradict the conditions of this License, they do not excuse you from the +conditions of this License. If you cannot convey a covered work so as to satisfy +simultaneously your obligations under this License and any other pertinent +obligations, then as a consequence you may not convey it at all. For example, if you +agree to terms that obligate you to collect a royalty for further conveying from +those to whom you convey the Program, the only way you could satisfy both those terms +and this License would be to refrain entirely from conveying the Program. + +### 13. 
Use with the GNU Affero General Public License + +Notwithstanding any other provision of this License, you have permission to link or +combine any covered work with a work licensed under version 3 of the GNU Affero +General Public License into a single combined work, and to convey the resulting work. +The terms of this License will continue to apply to the part which is the covered +work, but the special requirements of the GNU Affero General Public License, section +13, concerning interaction through a network will apply to the combination as such. + +### 14. Revised Versions of this License + +The Free Software Foundation may publish revised and/or new versions of the GNU +General Public License from time to time. Such new versions will be similar in spirit +to the present version, but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Program specifies that +a certain numbered version of the GNU General Public License “or any later +version” applies to it, you have the option of following the terms and +conditions either of that numbered version or of any later version published by the +Free Software Foundation. If the Program does not specify a version number of the GNU +General Public License, you may choose any version ever published by the Free +Software Foundation. + +If the Program specifies that a proxy can decide which future versions of the GNU +General Public License can be used, that proxy's public statement of acceptance of a +version permanently authorizes you to choose that version for the Program. + +Later license versions may give you additional or different permissions. However, no +additional obligations are imposed on any author or copyright holder as a result of +your choosing to follow a later version. + +### 15. Disclaimer of Warranty + +THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM “AS IS” WITHOUT WARRANTY OF ANY KIND, EITHER +EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE +QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE +DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + +### 16. Limitation of Liability + +IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY +COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS +PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, +INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE +OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE +WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + +### 17. Interpretation of Sections 15 and 16 + +If the disclaimer of warranty and limitation of liability provided above cannot be +given local legal effect according to their terms, reviewing courts shall apply local +law that most closely approximates an absolute waiver of all civil liability in +connection with the Program, unless a warranty or assumption of liability accompanies +a copy of the Program in return for a fee. 
+ +_END OF TERMS AND CONDITIONS_ + +## How to Apply These Terms to Your New Programs + +If you develop a new program, and you want it to be of the greatest possible use to +the public, the best way to achieve this is to make it free software which everyone +can redistribute and change under these terms. + +To do so, attach the following notices to the program. It is safest to attach them +to the start of each source file to most effectively state the exclusion of warranty; +and each file should have at least the “copyright” line and a pointer to +where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + +If the program does terminal interaction, make it output a short notice like this +when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type 'show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type 'show c' for details. + +The hypothetical commands `show w` and `show c` should show the appropriate parts of +the General Public License. Of course, your program's commands might be different; +for a GUI interface, you would use an “about box”. + +You should also get your employer (if you work as a programmer) or school, if any, to +sign a “copyright disclaimer” for the program, if necessary. For more +information on this, and how to apply and follow the GNU GPL, see +<>. + +The GNU General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may consider it +more useful to permit linking proprietary applications with the library. If this is +what you want to do, use the GNU Lesser General Public License instead of this +License. But first, please read +<>. 
diff --git a/Makefile b/Makefile deleted file mode 100644 index 02e3755..0000000 --- a/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -all: - make book - -book: - R -e 'bookdown::render_book(".")' - diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..6ae9268 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,2 @@ +# Generated by roxygen2: do not edit by hand + diff --git a/README.md b/README.md index ca88f65..f980b29 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,16 @@ -# R for Mass Spectrometry documentation + +📦 [Repo](https://github.com/js2264/R4MS) [![rworkflows](https://img.shields.io/github/actions/workflow/status/js2264/R4MS/rworkflows.yml?label=Package%20check)](https://github.com/js2264/R4MS/actions/workflows/rworkflows.yml) +📖 [Book](https://js2264.github.io/R4MS/) [![deployment](https://img.shields.io/github/actions/workflow/status/js2264/R4MS/pages/pages-build-deployment?label=Book%20deployment)](https://github.com/js2264/R4MS/actions/workflows/pages/pages-build-deployment) +🐳 [Docker](https://github.com/js2264/R4MS/pkgs/container/R4MS) [![biocbook](https://img.shields.io/github/actions/workflow/status/js2264/R4MS/biocbook.yml?label=Docker%20image)](https://github.com/js2264/R4MS/actions/workflows/biocbook.yml) + -This repository provides documentation and teaching material focus on -the analysis of mass spectrometry data for proteomics and metabolomics -using the [R for Mass -Spectrometry](https://www.rformassspectrometry.org/) software -infrastructure. +This is the [BiocBook](https://www.bioconductor.org/packages/release/bioc/html/BiocBook.html) version of the original [R for mass spectrometry book](https://rformassspectrometry.github.io/book). +Original authors of the *R for mass spectrometry* book are: + +- Laurent Gatto +- Sebastian Gibb +- Johannes Rainer Go to http://rformassspectrometry.github.io/book to browse the material online. diff --git a/_bookdown.yml b/_bookdown.yml deleted file mode 100644 index 5d919b6..0000000 --- a/_bookdown.yml +++ /dev/null @@ -1,6 +0,0 @@ -book_filename: "R4MS" -delete_merged_file: true -output_dir: "docs" -language: - ui: - chapter_name: "Chapter " diff --git a/docs/404.html b/docs/404.html deleted file mode 100644 index 236f113..0000000 --- a/docs/404.html +++ /dev/null @@ -1,206 +0,0 @@ - - - - - - - - - - - - - - - - - - - - -Page not found | R for Mass Spectrometry - - - - - - - - - - - - - - - - - - - - - - -
[... remaining markup of deleted docs/404.html omitted: "Page not found" notice ("The page you requested cannot be found (perhaps it was moved or renamed). You may want to try searching to find the page's new location, or use the table of contents to find the page you are looking for.") and footer "Page built: 2023-09-06 using R version 4.3.1 Patched (2023-07-10 r84676)" ...]
- - - - - diff --git a/docs/R4MS_files/figure-html/answid1-1.png b/docs/R4MS_files/figure-html/answid1-1.png deleted file mode 100644 index 90aea6a..0000000 Binary files a/docs/R4MS_files/figure-html/answid1-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/ex_raw-1.png b/docs/R4MS_files/figure-html/ex_raw-1.png deleted file mode 100644 index c5c6b8e..0000000 Binary files a/docs/R4MS_files/figure-html/ex_raw-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/ex_raw-2.png b/docs/R4MS_files/figure-html/ex_raw-2.png deleted file mode 100644 index 3aaf3a1..0000000 Binary files a/docs/R4MS_files/figure-html/ex_raw-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/ex_raw2-1.png b/docs/R4MS_files/figure-html/ex_raw2-1.png deleted file mode 100644 index 036c7da..0000000 Binary files a/docs/R4MS_files/figure-html/ex_raw2-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/ex_raw2-2.png b/docs/R4MS_files/figure-html/ex_raw2-2.png deleted file mode 100644 index 2ed9d49..0000000 Binary files a/docs/R4MS_files/figure-html/ex_raw2-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/featuresplot-1.png b/docs/R4MS_files/figure-html/featuresplot-1.png deleted file mode 100644 index 8b5a86e..0000000 Binary files a/docs/R4MS_files/figure-html/featuresplot-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/idqc1-1.png b/docs/R4MS_files/figure-html/idqc1-1.png deleted file mode 100644 index 3e58c7e..0000000 Binary files a/docs/R4MS_files/figure-html/idqc1-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/idqc2-1.png b/docs/R4MS_files/figure-html/idqc2-1.png deleted file mode 100644 index 60350c6..0000000 Binary files a/docs/R4MS_files/figure-html/idqc2-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/imagena-1.png b/docs/R4MS_files/figure-html/imagena-1.png deleted file mode 100644 index 2346a20..0000000 Binary files a/docs/R4MS_files/figure-html/imagena-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/miximp-1.png b/docs/R4MS_files/figure-html/miximp-1.png deleted file mode 100644 index 8872ca9..0000000 Binary files a/docs/R4MS_files/figure-html/miximp-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/nSequencePlot-1.png b/docs/R4MS_files/figure-html/nSequencePlot-1.png deleted file mode 100644 index 0f95474..0000000 Binary files a/docs/R4MS_files/figure-html/nSequencePlot-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/na2-1.png b/docs/R4MS_files/figure-html/na2-1.png deleted file mode 100644 index 7b50f5c..0000000 Binary files a/docs/R4MS_files/figure-html/na2-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/naex3-1.png b/docs/R4MS_files/figure-html/naex3-1.png deleted file mode 100644 index 90a7766..0000000 Binary files a/docs/R4MS_files/figure-html/naex3-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/nasetdist-1.png b/docs/R4MS_files/figure-html/nasetdist-1.png deleted file mode 100644 index 604faf3..0000000 Binary files a/docs/R4MS_files/figure-html/nasetdist-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/plotdens-1.png b/docs/R4MS_files/figure-html/plotdens-1.png deleted file mode 100644 index 10c9ae3..0000000 Binary files a/docs/R4MS_files/figure-html/plotdens-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/plotpca-1.png b/docs/R4MS_files/figure-html/plotpca-1.png deleted file mode 100644 index eec5818..0000000 Binary files 
a/docs/R4MS_files/figure-html/plotpca-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/plotqf-1.png b/docs/R4MS_files/figure-html/plotqf-1.png deleted file mode 100644 index c8deef0..0000000 Binary files a/docs/R4MS_files/figure-html/plotqf-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/plotqf2-1.png b/docs/R4MS_files/figure-html/plotqf2-1.png deleted file mode 100644 index fb5ad34..0000000 Binary files a/docs/R4MS_files/figure-html/plotqf2-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-11-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-11-1.png deleted file mode 100644 index efe0dc0..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-11-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-12-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-12-1.png deleted file mode 100644 index efe0dc0..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-12-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-13-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-13-1.png deleted file mode 100644 index 5fae6c1..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-13-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-14-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-14-1.png deleted file mode 100644 index 5fae6c1..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-14-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-15-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-15-1.png deleted file mode 100644 index dafc790..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-15-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-16-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-16-1.png deleted file mode 100644 index 78c33a0..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-16-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-17-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-17-1.png deleted file mode 100644 index 43f406a..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-17-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-18-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-18-1.png deleted file mode 100644 index e1acd0b..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-18-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-19-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-19-1.png deleted file mode 100644 index e1acd0b..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-19-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-20-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-20-1.png deleted file mode 100644 index 980e04a..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-20-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-21-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-21-1.png deleted file mode 100644 index a1580a5..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-21-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-22-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-22-1.png deleted file mode 100644 index 08da677..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-22-1.png and /dev/null differ diff --git 
a/docs/R4MS_files/figure-html/unnamed-chunk-23-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-23-1.png deleted file mode 100644 index 42d1ce7..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-23-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-24-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-24-1.png deleted file mode 100644 index fa90b83..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-24-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-25-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-25-1.png deleted file mode 100644 index 2265bdc..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-25-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-41-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-41-1.png deleted file mode 100644 index bc5231b..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-41-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-42-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-42-1.png deleted file mode 100644 index bc5231b..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-42-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-43-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-43-1.png deleted file mode 100644 index f8af815..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-43-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-45-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-45-1.png deleted file mode 100644 index f8af815..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-45-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-49-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-49-1.png deleted file mode 100644 index 0044c98..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-49-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-51-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-51-1.png deleted file mode 100644 index 158545f..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-51-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-52-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-52-1.png deleted file mode 100644 index 158545f..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-52-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-53-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-53-1.png deleted file mode 100644 index 102964c..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-53-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-54-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-54-1.png deleted file mode 100644 index 102964c..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-54-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-55-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-55-1.png deleted file mode 100644 index bd92623..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-55-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-56-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-56-1.png deleted file mode 100644 index bd92623..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-56-1.png and /dev/null differ diff --git 
a/docs/R4MS_files/figure-html/unnamed-chunk-57-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-57-1.png deleted file mode 100644 index 72026b9..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-57-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-57-2.png b/docs/R4MS_files/figure-html/unnamed-chunk-57-2.png deleted file mode 100644 index 7902420..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-57-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-58-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-58-1.png deleted file mode 100644 index e0707a0..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-58-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-59-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-59-1.png deleted file mode 100644 index 06c9ac6..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-59-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-59-2.png b/docs/R4MS_files/figure-html/unnamed-chunk-59-2.png deleted file mode 100644 index 362f25f..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-59-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-59-3.png b/docs/R4MS_files/figure-html/unnamed-chunk-59-3.png deleted file mode 100644 index b2b861a..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-59-3.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-60-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-60-1.png deleted file mode 100644 index e913f2e..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-60-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-60-2.png b/docs/R4MS_files/figure-html/unnamed-chunk-60-2.png deleted file mode 100644 index c7fc839..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-60-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-60-3.png b/docs/R4MS_files/figure-html/unnamed-chunk-60-3.png deleted file mode 100644 index b2c2e11..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-60-3.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-61-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-61-1.png deleted file mode 100644 index e913f2e..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-61-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-61-2.png b/docs/R4MS_files/figure-html/unnamed-chunk-61-2.png deleted file mode 100644 index 362f25f..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-61-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-61-3.png b/docs/R4MS_files/figure-html/unnamed-chunk-61-3.png deleted file mode 100644 index b2b861a..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-61-3.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-62-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-62-1.png deleted file mode 100644 index b7835ed..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-62-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-62-2.png b/docs/R4MS_files/figure-html/unnamed-chunk-62-2.png deleted file mode 100644 index b7e8684..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-62-2.png and /dev/null differ diff --git 
a/docs/R4MS_files/figure-html/unnamed-chunk-62-3.png b/docs/R4MS_files/figure-html/unnamed-chunk-62-3.png deleted file mode 100644 index b2b861a..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-62-3.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-63-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-63-1.png deleted file mode 100644 index 1ff202c..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-63-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-63-2.png b/docs/R4MS_files/figure-html/unnamed-chunk-63-2.png deleted file mode 100644 index c55452a..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-63-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-64-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-64-1.png deleted file mode 100644 index e4c4f03..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-64-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-64-2.png b/docs/R4MS_files/figure-html/unnamed-chunk-64-2.png deleted file mode 100644 index 981908f..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-64-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-65-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-65-1.png deleted file mode 100644 index e4c4f03..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-65-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-65-2.png b/docs/R4MS_files/figure-html/unnamed-chunk-65-2.png deleted file mode 100644 index 981908f..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-65-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-66-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-66-1.png deleted file mode 100644 index 3417715..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-66-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-66-2.png b/docs/R4MS_files/figure-html/unnamed-chunk-66-2.png deleted file mode 100644 index 75099b0..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-66-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-67-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-67-1.png deleted file mode 100644 index 8df003a..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-67-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-67-2.png b/docs/R4MS_files/figure-html/unnamed-chunk-67-2.png deleted file mode 100644 index bb8705f..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-67-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-68-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-68-1.png deleted file mode 100644 index 8df003a..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-68-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-68-2.png b/docs/R4MS_files/figure-html/unnamed-chunk-68-2.png deleted file mode 100644 index a7a6f34..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-68-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-69-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-69-1.png deleted file mode 100644 index f482722..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-69-1.png and /dev/null differ diff --git 
a/docs/R4MS_files/figure-html/unnamed-chunk-69-2.png b/docs/R4MS_files/figure-html/unnamed-chunk-69-2.png deleted file mode 100644 index bb8705f..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-69-2.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/unnamed-chunk-71-1.png b/docs/R4MS_files/figure-html/unnamed-chunk-71-1.png deleted file mode 100644 index a42ffae..0000000 Binary files a/docs/R4MS_files/figure-html/unnamed-chunk-71-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/vis-1.png b/docs/R4MS_files/figure-html/vis-1.png deleted file mode 100644 index 8b50cd4..0000000 Binary files a/docs/R4MS_files/figure-html/vis-1.png and /dev/null differ diff --git a/docs/R4MS_files/figure-html/vp-1.png b/docs/R4MS_files/figure-html/vp-1.png deleted file mode 100644 index 716e430..0000000 Binary files a/docs/R4MS_files/figure-html/vp-1.png and /dev/null differ diff --git a/docs/identification-data.html b/docs/identification-data.html deleted file mode 100644 index f6ab029..0000000 --- a/docs/identification-data.html +++ /dev/null @@ -1,1758 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - -Chapter 4 Identification data | R for Mass Spectrometry - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - -
-

-Chapter 4 Identification data

-
-

-4.1 Identification data.frame
-

-

Let’s use the identification data from msdata:

-
idf <- msdata::ident(full.names = TRUE)
-basename(idf)
-
## [1] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid"
-

The easiest way to read identification data in mzIdentML format (often abbreviated as mzid) into R is with the readPSMs() function from the PSM package. The function will parse the file and return a DataFrame.

-
library(PSM)
-id <- readPSMs(idf)
-dim(id)
-
## [1] 5802   35
-
names(id)
-
##  [1] "sequence"                 "spectrumID"              
-##  [3] "chargeState"              "rank"                    
-##  [5] "passThreshold"            "experimentalMassToCharge"
-##  [7] "calculatedMassToCharge"   "peptideRef"              
-##  [9] "modNum"                   "isDecoy"                 
-## [11] "post"                     "pre"                     
-## [13] "start"                    "end"                     
-## [15] "DatabaseAccess"           "DBseqLength"             
-## [17] "DatabaseSeq"              "DatabaseDescription"     
-## [19] "scan.number.s."           "acquisitionNum"          
-## [21] "spectrumFile"             "idFile"                  
-## [23] "MS.GF.RawScore"           "MS.GF.DeNovoScore"       
-## [25] "MS.GF.SpecEValue"         "MS.GF.EValue"            
-## [27] "MS.GF.QValue"             "MS.GF.PepQValue"         
-## [29] "modPeptideRef"            "modName"                 
-## [31] "modMass"                  "modLocation"             
-## [33] "subOriginalResidue"       "subReplacementResidue"   
-## [35] "subLocation"
-
-

-► Question -

-
-

Verify that this table contains 5802 matches for 5343 scans and 4938 peptide sequences.

-

- -

-
-
-
-

-► Solution -

- -
-
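One way to check these numbers, sketched here under the assumption that id is the DataFrame returned by readPSMs() above (the collapsed solution may differ):

nrow(id)                       ## number of matches (PSMs)
length(unique(id$spectrumID))  ## number of scans
length(unique(id$sequence))    ## number of peptide sequences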

The PSM data are read as is, without any filtering. As we can see below, we still have all the hits from the forward and reverse (decoy) databases.

-
table(id$isDecoy)
-
## 
-## FALSE  TRUE 
-##  2906  2896
-
-
-

-4.2 Keeping all matches
-

-

The data also contains multiple matches for several spectra. The table below shows the number of spectra that have 1, 2, … up to 5 matches.

-
table(table(id$spectrumID))
-
## 
-##    1    2    3    4    5 
-## 4936  369   26   10    2
-

Below, we can see that scan 1774 has 4 matches, all to the sequence RTRYQAEVR, which itself matches 4 different proteins:

-
i <- which(id$spectrumID == "controllerType=0 controllerNumber=1 scan=1774")
-id[i, 1:5]
-
## PSM with 4 rows and 5 columns.
-## names(5): sequence spectrumID ... rank passThreshold
-

If the goal is to keep all the matches, but arranged by scan/spectrum, one can reduce the DataFrame object by the spectrumID variable, so that each scan corresponds to a single row that still stores all values (the rownames aren’t needed here and are removed to shorten the output of the next code chunks that display parts of id2):

-
id2 <- QFeatures::reduceDataFrame(id, id$spectrumID)
-
## Warning: The dim() method for DataFrameList objects is deprecated. Please use
-##   dims() on these objects instead.
-
## Warning: The nrow() method for DataFrameList objects is deprecated. Please use
-##   nrows() on these objects instead.
-
## Warning: The ncol() method for CompressedSplitDataFrameList objects is
-##   deprecated. Please use ncols() on these objects instead.
-
rownames(id2) <- NULL ## rownames not needed here
-dim(id2)
-
## [1] 5343   35
-

The resulting object contains a single entry for scan 1774, with the information for the multiple matches stored as lists within the cells.

-
j <- which(id2$spectrumID == "controllerType=0 controllerNumber=1 scan=1774")
-id2[j, ]
-
## DataFrame with 1 row and 35 columns
-##                            sequence    spectrumID chargeState          rank
-##                     <CharacterList>   <character>   <integer> <IntegerList>
-## 1 RTRYQAEVR,RTRYQAEVR,RTRYQAEVR,... controller...           2     1,1,1,...
-##   passThreshold experimentalMassToCharge      calculatedMassToCharge
-##       <logical>                <numeric>               <NumericList>
-## 1          TRUE                  589.821 589.823,589.823,589.823,...
-##                    peptideRef        modNum               isDecoy
-##               <CharacterList> <IntegerList>         <LogicalList>
-## 1 Pep1890,Pep1890,Pep1890,...     0,0,0,... FALSE,FALSE,FALSE,...
-##              post             pre         start           end
-##   <CharacterList> <CharacterList> <IntegerList> <IntegerList>
-## 1       P,P,P,...       R,R,R,...  89,99,89,... 97,107,97,...
-##                DatabaseAccess     DBseqLength DatabaseSeq
-##               <CharacterList>   <IntegerList> <character>
-## 1 ECA2104,ECA2867,ECA3427,... 675,619,678,...            
-##                             DatabaseDescription scan.number.s. acquisitionNum
-##                                 <CharacterList>      <numeric>      <numeric>
-## 1 ECA2104 Vg...,ECA2867 pu...,ECA3427 co...,...           1774           1774
-##    spectrumFile        idFile MS.GF.RawScore MS.GF.DeNovoScore MS.GF.SpecEValue
-##     <character>   <character>      <numeric>         <numeric>        <numeric>
-## 1 TMT_Erwini... TMT_Erwini...              0                96      3.69254e-06
-##                  MS.GF.EValue MS.GF.QValue                MS.GF.PepQValue
-##                 <NumericList>    <numeric>                  <NumericList>
-## 1 10.5388,10.5388,10.5388,...            1 0.990816,0.990816,0.990816,...
-##     modPeptideRef         modName       modMass   modLocation
-##   <CharacterList> <CharacterList> <NumericList> <IntegerList>
-## 1    NA,NA,NA,...    NA,NA,NA,...  NA,NA,NA,...  NA,NA,NA,...
-##   subOriginalResidue subReplacementResidue subLocation
-##          <character>           <character>   <integer>
-## 1                 NA                    NA          NA
-
id2[j, "DatabaseAccess"]
-
## CharacterList of length 1
-## [["controllerType=0 controllerNumber=1 scan=1774"]] ECA2104 ECA2867 ECA3427 ECA4142
-

This is the type of complete identification table that could be used to annotate a raw mass spectrometry Spectra object, as shown below.

-
-
-

-4.3 Filtering data
-

-

Often, the PSM data is filtered to only retain reliable matches. The MSnID package can be used to set thresholds to attain user-defined PSM, peptide or protein-level FDRs. Here, we will simply filter out unreliable identifications manually.

-

Here, the filter() function from the dplyr package comes in very handy. We will thus start by converting the DataFrame to a tibble.

-
library("dplyr")
-id_tbl <- tidyr::as_tibble(id)
-id_tbl
-
## # A tibble: 5,802 × 35
-##    sequence     spectrumID     chargeState  rank passThreshold experimentalMass…
-##    <chr>        <chr>                <int> <int> <lgl>                     <dbl>
-##  1 RQCRTDFLNYLR controllerTyp…           3     1 TRUE                       548.
-##  2 ESVALADQVTC… controllerTyp…           2     1 TRUE                      1288.
-##  3 KELLCLAMQIIR controllerTyp…           2     1 TRUE                       744.
-##  4 QRMARTSDKQQ… controllerTyp…           3     1 TRUE                       913.
-##  5 KDEGSTEPLKV… controllerTyp…           3     1 TRUE                       927.
-##  6 DGGPAIYGHER… controllerTyp…           3     1 TRUE                       969.
-##  7 QRMARTSDKQQ… controllerTyp…           2     1 TRUE                      1369.
-##  8 CIDRARHVEVQ… controllerTyp…           3     1 TRUE                      1285.
-##  9 CIDRARHVEVQ… controllerTyp…           3     1 TRUE                      1285.
-## 10 VGRCRPIINYL… controllerTyp…           2     1 TRUE                      1102.
-## # … with 5,792 more rows, and 29 more variables: calculatedMassToCharge <dbl>,
-## #   peptideRef <chr>, modNum <int>, isDecoy <lgl>, post <chr>, pre <chr>,
-## #   start <int>, end <int>, DatabaseAccess <chr>, DBseqLength <int>,
-## #   DatabaseSeq <chr>, DatabaseDescription <chr>, scan.number.s. <dbl>,
-## #   acquisitionNum <dbl>, spectrumFile <chr>, idFile <chr>,
-## #   MS.GF.RawScore <dbl>, MS.GF.DeNovoScore <dbl>, MS.GF.SpecEValue <dbl>,
-## #   MS.GF.EValue <dbl>, MS.GF.QValue <dbl>, MS.GF.PepQValue <dbl>, …
-
-

-► Question -

-
-
  • Remove decoy hits

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-
  • Keep first rank matches

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-
  • Remove non-proteotypic peptides. Start by identifying scans that match different proteins. For example, scan 4884 matches proteins XXX_ECA3406 and ECA3415, and scan 4099 matches XXX_ECA4416_1, XXX_ECA4416_2 and XXX_ECA4416_3. Then remove the scans that match any of these proteins.

- -

-
-
-
-

-► Solution -

- -
-

Which leaves us with 2666 PSMs.
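For reference, one possible manual route with dplyr is sketched below, assuming the id_tbl tibble created above; the collapsed solutions may differ in detail.

## remove decoy hits and keep only first-rank matches
id_tbl <- id_tbl |>
    filter(!isDecoy) |>
    filter(rank == 1)
## identify spectra whose matches point to more than one protein and
## remove them (non-proteotypic peptides)
mltm <- id_tbl |>
    group_by(spectrumID) |>
    mutate(nProts = length(unique(DatabaseAccess))) |>
    filter(nProts > 1) |>
    pull(spectrumID)
id_tbl <- id_tbl |>
    filter(!spectrumID %in% mltm)
nrow(id_tbl) ## should correspond to the 2666 PSMs quoted above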

-

This can also be achieved with the filterPSMs() function:

-
id_filtered <- filterPSMs(id)
-
## Starting with 5802 PSMs:
-
##  removed 2896 decoy hits
-
##  removed 155 PSMs with rank > 1
-
##  removed 85 non-proteotypic peptides
-
## 2666 PSMs left.
-
-

-► Question -

-
-

Compare the distributions of the raw identification scores of the decoy and non-decoy hits. Interpret the figure.

-

- -

-
-
-
-

-► Solution -

- -
-
-
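A possible way to visualise this, assuming the unfiltered id object read at the beginning of the chapter (the collapsed solution may use a different plot):

library("ggplot2")
ggplot(tidyr::as_tibble(id),
       aes(x = MS.GF.RawScore, colour = isDecoy)) +
    geom_density()

Decoy hits are expected to concentrate at low scores, while target hits extend towards higher scores, which is what makes score-based filtering possible.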

-► Question -

-
-

The tidyverse tools are fit for data wrangling with identification data. Using the above identification data frame, calculate the length of each peptide (you can use nchar on the peptide sequence variable) and the number of peptides for each protein (defined by DatabaseDescription). Plot the length of the proteins against their respective number of peptides.

-

- -

-
-
-
-

-► Solution -

- -
-
-
-
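One possible sketch, assuming the id_filtered table produced by filterPSMs() above and using DBseqLength as the protein length (the collapsed solution may differ):

library("ggplot2")
tidyr::as_tibble(id_filtered) |>
    mutate(peplen = nchar(sequence)) |>  ## peptide lengths, as asked (not used in the plot)
    group_by(DatabaseDescription, DBseqLength) |>
    summarise(npep = n_distinct(sequence), .groups = "drop") |>
    ggplot(aes(x = DBseqLength, y = npep)) +
    geom_point()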

-4.4 Low level access to id data (optional)
-

-

There are two packages that can be used to parse mzIdentML files, namely mzR (that we have already used for raw data) and mzID. The major difference is that the former leverages C++ code from ProteoWizard and is hence faster than the latter (which uses the XML R package). They both work in similar ways.

-
|Data type      |File format |Data structure |Package |
-|:--------------|:-----------|:--------------|:-------|
-|Identification |mzIdentML   |mzRident       |mzR     |
-|Identification |mzIdentML   |mzID           |mzID    |
-

Which of these packages is used by readPSMs() can be specified with its parser argument, as sketched below.
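For instance, assuming the same idf file as above (check ?readPSMs for the exact argument values, which are assumed here):

id_mzr  <- readPSMs(idf, parser = "mzR")   ## parse with the mzR backend
id_mzid <- readPSMs(idf, parser = "mzID")  ## parse with the mzID backend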

-
-

-mzID
-

-

The main functions are mzID to read the data into a dedicated data -class and flatten to transform it into a data.frame.

-
idf
-
## [1] "/home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/ident/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid"
-
library("mzID")
-
## 
-## Attaching package: 'mzID'
-
## The following object is masked from 'package:dplyr':
-## 
-##     id
-
id <- mzID(idf)
-
## reading TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid...
-
## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
-## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
-## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
-## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
-## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
-## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
-## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
-## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
-## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
-## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
-## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
-## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
-## Warning in type.convert.default(...): 'as.is' should be specified by the caller;
-## using TRUE
-
##  DONE!
-
id
-
## An mzID object
-## 
-## Software used:   MS-GF+ (version: Beta (v10072))
-## 
-## Rawfile:         /home/lg390/dev/01_svn/workflows/proteomics/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-## 
-## Database:        /home/lg390/dev/01_svn/workflows/proteomics/erwinia_carotovora.fasta
-## 
-## Number of scans: 5343
-## Number of PSM's: 5656
-

Various data can be extracted from the mzID object, using one of the accessor functions such as database, software, scans, peptides, … The object can also be converted into a data.frame using the flatten function.

-
head(flatten(id))
-
##                                      spectrumid scan number(s) acquisitionnum
-## 1 controllerType=0 controllerNumber=1 scan=5782           5782           5782
-## 2 controllerType=0 controllerNumber=1 scan=6037           6037           6037
-## 3 controllerType=0 controllerNumber=1 scan=5235           5235           5235
-##   passthreshold rank calculatedmasstocharge experimentalmasstocharge
-## 1          TRUE    1               1080.232                 1080.233
-## 2          TRUE    1               1002.212                 1002.209
-## 3          TRUE    1               1189.280                 1189.284
-##   chargestate ms-gf:denovoscore ms-gf:evalue ms-gf:pepqvalue ms-gf:qvalue
-## 1           3               174 1.086033e-20               0            0
-## 2           3               245 1.988774e-19               0            0
-## 3           3               264 5.129649e-19               0            0
-##   ms-gf:rawscore ms-gf:specevalue assumeddissociationmethod isotopeerror
-## 1            147     3.764831e-27                       HCD            0
-## 2            214     6.902626e-26                       HCD            0
-## 3            211     1.778789e-25                       HCD            0
-##   isdecoy post pre end start accession length
-## 1   FALSE    S   R  84    50   ECA1932    155
-## 2   FALSE    R   K 315   288   ECA1147    434
-## 3   FALSE    A   R 224   192   ECA0013    295
-##                          description                              pepseq
-## 1         outer membrane lipoprotein PVQIQAGEDSNVIGALGGAVLGGFLGNTIGGGSGR
-## 2                     trigger factor        TQVLDGLINANDIEVPVALIDGEIDVLR
-## 3 ribose-binding periplasmic protein   TKGLNVMQNLLTAHPDVQAVFAQNDEMALGALR
-##   modified modification
-## 1    FALSE         <NA>
-## 2    FALSE         <NA>
-## 3    FALSE         <NA>
-##                                                                idFile
-## 1 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid
-## 2 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid
-## 3 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid
-##                                                          spectrumFile
-## 1 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-## 2 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-## 3 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-##               databaseFile
-## 1 erwinia_carotovora.fasta
-## 2 erwinia_carotovora.fasta
-## 3 erwinia_carotovora.fasta
-##  [ reached 'max' / getOption("max.print") -- omitted 3 rows ]
-
-
-

-mzR
-

-

The mzR package provides a similar interface. It is however much faster, as it does not read all the data into memory and only extracts relevant data on demand. It also has accessor functions such as softwareInfo, mzidInfo, … (use showMethods(classes = "mzRident", where = "package:mzR") to see all available methods).

-
library("mzR")
-id2 <- openIDfile(idf)
-id2
-
## Identification file handle.
-## Filename:  TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid 
-## Number of psms:  5759
-
softwareInfo(id2)
-
## [1] "MS-GF+ Beta (v10072) "                      
-## [2] "ProteoWizard MzIdentML 3.0.501 ProteoWizard"
-

The identification data can be accessed as a data.frame with the -psms accessor.

-
head(psms(id2))
-
##                                      spectrumID chargeState rank passThreshold
-## 1 controllerType=0 controllerNumber=1 scan=5782           3    1          TRUE
-## 2 controllerType=0 controllerNumber=1 scan=6037           3    1          TRUE
-## 3 controllerType=0 controllerNumber=1 scan=5235           3    1          TRUE
-## 4 controllerType=0 controllerNumber=1 scan=5397           3    1          TRUE
-## 5 controllerType=0 controllerNumber=1 scan=6075           3    1          TRUE
-##   experimentalMassToCharge calculatedMassToCharge
-## 1                1080.2325              1080.2321
-## 2                1002.2089              1002.2115
-## 3                1189.2836              1189.2800
-## 4                 960.5365               960.5365
-## 5                1264.3409              1264.3419
-##                              sequence peptideRef modNum isDecoy post pre start
-## 1 PVQIQAGEDSNVIGALGGAVLGGFLGNTIGGGSGR       Pep1      0   FALSE    S   R    50
-## 2        TQVLDGLINANDIEVPVALIDGEIDVLR       Pep2      0   FALSE    R   K   288
-## 3   TKGLNVMQNLLTAHPDVQAVFAQNDEMALGALR       Pep3      0   FALSE    A   R   192
-## 4         SQILQQAGTSVLSQANQVPQTVLSLLR       Pep4      0   FALSE    -   R   264
-## 5 PIIGDNPFVVVLPDVVLDESTADQTQENLALLISR       Pep5      0   FALSE    F   R   119
-##   end DatabaseAccess DBseqLength DatabaseSeq
-## 1  84        ECA1932         155            
-## 2 315        ECA1147         434            
-## 3 224        ECA0013         295            
-## 4 290        ECA1731         290            
-## 5 153        ECA1443         298            
-##                                    DatabaseDescription scan.number.s.
-## 1                   ECA1932 outer membrane lipoprotein           5782
-## 2                               ECA1147 trigger factor           6037
-## 3           ECA0013 ribose-binding periplasmic protein           5235
-## 4                                    ECA1731 flagellin           5397
-## 5 ECA1443 UTP--glucose-1-phosphate uridylyltransferase           6075
-##   acquisitionNum
-## 1           5782
-## 2           6037
-## 3           5235
-## 4           5397
-## 5           6075
-##  [ reached 'max' / getOption("max.print") -- omitted 1 rows ]
-
-
- -
-

-4.6 Adding identification data to raw data
-

-

We are going to use the sp object created in the previous chapter and the id_filtered variable generated above.

-

Identification data (as a DataFrame) can be merged into raw data (as a Spectra object) by adding new spectra variables to the appropriate MS2 spectra. Scans and peptide-spectrum matches can be matched by their spectrum identifiers.

-
-

-► Question -

-
-

Identify the spectrum identifier columns in the sp and id_filtered variables.

-

- -

-
-
-
-

-► Solution -

- -
-

These two tables can thus simply be joined using:

-
sp <- joinSpectraData(sp, id_filtered,
-                      by.x = "spectrumId",
-                      by.y = "spectrumID")
-spectraVariables(sp)
-
##  [1] "msLevel"                  "rtime"                   
-##  [3] "acquisitionNum"           "scanIndex"               
-##  [5] "dataStorage"              "dataOrigin"              
-##  [7] "centroided"               "smoothed"                
-##  [9] "polarity"                 "precScanNum"             
-## [11] "precursorMz"              "precursorIntensity"      
-## [13] "precursorCharge"          "collisionEnergy"         
-## [15] "isolationWindowLowerMz"   "isolationWindowTargetMz" 
-## [17] "isolationWindowUpperMz"   "peaksCount"              
-## [19] "totIonCurrent"            "basePeakMZ"              
-## [21] "basePeakIntensity"        "ionisationEnergy"        
-## [23] "lowMZ"                    "highMZ"                  
-## [25] "mergedScan"               "mergedResultScanNum"     
-## [27] "mergedResultStartScanNum" "mergedResultEndScanNum"  
-## [29] "injectionTime"            "filterString"            
-## [31] "spectrumId"               "ionMobilityDriftTime"    
-## [33] "scanWindowLowerLimit"     "scanWindowUpperLimit"    
-## [35] "sequence"                 "chargeState"             
-## [37] "rank"                     "passThreshold"           
-## [39] "experimentalMassToCharge" "calculatedMassToCharge"  
-## [41] "peptideRef"               "modNum"                  
-## [43] "isDecoy"                  "post"                    
-## [45] "pre"                      "start"                   
-## [47] "end"                      "DatabaseAccess"          
-## [49] "DBseqLength"              "DatabaseSeq"             
-## [51] "DatabaseDescription"      "scan.number.s."          
-## [53] "acquisitionNum.y"         "spectrumFile"            
-## [55] "idFile"                   "MS.GF.RawScore"          
-## [57] "MS.GF.DeNovoScore"        "MS.GF.SpecEValue"        
-## [59] "MS.GF.EValue"             "MS.GF.QValue"            
-## [61] "MS.GF.PepQValue"          "modPeptideRef"           
-## [63] "modName"                  "modMass"                 
-## [65] "modLocation"              "subOriginalResidue"      
-## [67] "subReplacementResidue"    "subLocation"
-
-

-► Question -

-
-

Verify that the identification data has been added to the correct -spectra.

-

- -

-
-
-
-

-► Solution -

- -
-
-
-
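As a quick sanity check (a sketch, assuming the sp object joined above; the collapsed solution may use a different check), the identification variables should only be populated for MS2 scans that have a retained PSM:

table(msLevel(sp), !is.na(sp$sequence))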

-4.7 Visualising peptide-spectrum matches
-

-

Let’s choose an MS2 spectrum with a high identification score and plot it.

-
i <- which(sp$MS.GF.RawScore > 100)[1]
-plotSpectra(sp[i])
-

-

We have seen above that we can add labels to each peak using the -labels argument in plotSpectra(). The addFragments() function -takes a spectrum as input (that is a Spectra object of length 1) and -annotates its peaks.

-
addFragments(sp[i])
-
##   [1] NA    NA    NA    "b1"  NA    NA    NA    NA    NA    NA    NA    NA   
-##  [13] NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA   
-##  [25] NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA   
-##  [37] NA    NA    NA    NA    NA    NA    NA    "y1_" NA    NA    NA    NA   
-##  [49] NA    "y1"  NA    NA    NA    NA    NA    NA    NA    NA    NA    NA   
-##  [61] NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA   
-##  [73] NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA   
-##  [85] NA    NA    "b2"  NA    NA    NA    NA    NA    NA    NA    NA    NA   
-##  [97] NA    NA    NA    NA   
-##  [ reached getOption("max.print") -- omitted 227 entries ]
-

It can be directly used with plotSpectra():

-
plotSpectra(sp[i], labels = addFragments,
-            labelPos = 3, labelCol = "steelblue")
-

-

When a precursor peptide ion is fragmented in a CID cell, it breaks at -specific bonds, producing sets of peaks (a, b, c and x, y, -z) that can be predicted.

-
-

Figure: Peptide fragmentation.
-

The annotation of spectra is obtained by simulating fragmentation of a -peptide and matching observed peaks to fragments:

-
sp[i]$sequence
-
## [1] "THSQEEMQHMQR"
-
calculateFragments(sp[i]$sequence)
-
## Modifications used: C=57.02146
-
##           mz ion type pos z         seq
-## 1   102.0550  b1    b   1 1           T
-## 2   239.1139  b2    b   2 1          TH
-## 3   326.1459  b3    b   3 1         THS
-## 4   454.2045  b4    b   4 1        THSQ
-## 5   583.2471  b5    b   5 1       THSQE
-## 6   712.2897  b6    b   6 1      THSQEE
-## 7   843.3301  b7    b   7 1     THSQEEM
-## 8   971.3887  b8    b   8 1    THSQEEMQ
-## 9  1108.4476  b9    b   9 1   THSQEEMQH
-## 10 1239.4881 b10    b  10 1  THSQEEMQHM
-## 11 1367.5467 b11    b  11 1 THSQEEMQHMQ
-## 12  175.1190  y1    y   1 1           R
-## 13  303.1775  y2    y   2 1          QR
-## 14  434.2180  y3    y   3 1         MQR
-## 15  571.2769  y4    y   4 1        HMQR
-## 16  699.3355  y5    y   5 1       QHMQR
-##  [ reached 'max' / getOption("max.print") -- omitted 42 rows ]
-
-
-

-4.8 Comparing spectra
-

-

The compareSpectra() function can be used to compare spectra (by default, computing the normalised dot product), as illustrated below.

-
-
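As a minimal illustration of the interface, assuming the annotated sp object from above (the indices below are arbitrary):

sel <- sp[which(msLevel(sp) == 2L)[1:3]]  ## three arbitrary MS2 spectra
compareSpectra(sel)                       ## pairwise normalised dot products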

-► Question -

-
-
  1. Create a new Spectra object containing the MS2 spectra with sequences "SQILQQAGTSVLSQANQVPQTVLSLLR" and "TKGLNVMQNLLTAHPDVQAVFAQNDEMALGALR".

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-
  1. Calculate the 5 by 5 distance matrix between all spectra using compareSpectra(). See the ?Spectra man page for details. Draw a heatmap of that distance matrix.

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-
  1. Compare the spectra with the plotting function seen previously.

- -

-
-
-
-

-► Solution -

- -
-
-
-

-4.9 Summary exercise
-

-
-

-► Question -

-
-

Download the first 3 mzML and mzID files from the PXD022816 project (Morgenstern, Barzilay, and Levin 2021).

-

- -

-
-
-
-

-► Solution -

- -
-
-
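One possible starting point for the download uses the rpx package installed in the Preamble; the file name matching below is illustrative and may need adapting to the actual file names in the project:

library("rpx")
px <- PXDataset("PXD022816")
fls <- pxfiles(px)
mzmls <- pxget(px, grep("mzML", fls, value = TRUE)[1:3])
mzids <- pxget(px, grep("mzid", fls, ignore.case = TRUE, value = TRUE)[1:3])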

-► Question -

-
-

Generate a Spectra object and a table of filtered PSMs. Visualise the total ion chromatograms and check the quality of the identification data by comparing the densities of the decoy and target PSM identification scores for each file.

-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-

Join the raw and identification data. Beware though that the joining must now be performed by spectrum identifiers and by file.

-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-

Extract the PSMs that have been matched to peptides from protein O43175 and compare and cluster the scans. Hint: once you have created the smaller Spectra object with the scans of interest, switch to an in-memory backend to speed up the calculations.

-

- -

-
-
-
-

-► Solution -

- -
-
-
-
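A sketch of the subsetting and backend switch (assuming the joined sp object; the accession matching and the clustering step itself will depend on the actual data and are not shown):

## scans whose PSM was matched to protein O43175
sp_prot <- sp[which(grepl("O43175", sp$DatabaseAccess))]
## switch to an in-memory backend to speed up repeated peak access
sp_prot <- setBackend(sp_prot, MsBackendDataFrame())
compareSpectra(sp_prot)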

-4.10 Exploration and Assessment of Identifications using MSnID -
-

-

The MSnID package extracts MS/MS ID data from mzIdentML (leveraging -the mzID package) or text files. After collating the search results -from multiple datasets it assesses their identification quality and -optimises filtering criteria to achieve the maximum number of -identifications while not exceeding a specified false discovery -rate. It also contains a number of utilities to explore the MS/MS -results and assess missed and irregular enzymatic cleavages, mass -measurement accuracy, etc.

-
-

-4.10.1 Step-by-step work-flow
-

-

Let’s reproduce parts of the analysis described in the MSnID vignette. You can explore it further with

-
vignette("msnid_vignette", package = "MSnID")
-

The MSnID package can be used for post-search filtering of MS/MS identifications. One starts with the construction of an MSnID object that is populated with identification results, which can be imported from a data.frame or from mzIdentML files. Here, we will use the example identification data provided with the package.

-
mzids <- system.file("extdata", "c_elegans.mzid.gz", package="MSnID")
-basename(mzids)
-
## [1] "c_elegans.mzid.gz"
-

We start by loading the package, initialising the MSnID object, and adding the identification results from our mzid file (there could of course be more than one).

-
library("MSnID")
-
## 
-## Attaching package: 'MSnID'
-
## The following object is masked from 'package:ProtGenerics':
-## 
-##     peptides
-
msnid <- MSnID(".")
-
## Note, the anticipated/suggested columns in the
-## peptide-to-spectrum matching results are:
-## -----------------------------------------------
-## accession
-## calculatedMassToCharge
-## chargeState
-## experimentalMassToCharge
-## isDecoy
-## peptide
-## spectrumFile
-## spectrumID
-
msnid <- read_mzIDs(msnid, mzids)
-
## Loaded cached data
-
show(msnid)
-
## MSnID object
-## Working directory: "."
-## #Spectrum Files:  1 
-## #PSMs: 12263 at 36 % FDR
-## #peptides: 9489 at 44 % FDR
-## #accessions: 7414 at 76 % FDR
-

Printing the MSnID object returns some basic information such as

-
  • Working directory.
  • Number of spectrum files used to generate data.
  • Number of peptide-to-spectrum matches and corresponding FDR.
  • Number of unique peptide sequences and corresponding FDR.
  • Number of unique proteins or amino acid sequence accessions and corresponding FDR.

The package then enables one to define, optimise and apply filtering based, for example, on missed cleavages, identification scores, precursor mass errors, etc., and to assess PSM, peptide and protein FDR levels. To function properly, it expects to have access to the following data

-
## [1] "accession"                "calculatedMassToCharge"  
-## [3] "chargeState"              "experimentalMassToCharge"
-## [5] "isDecoy"                  "peptide"                 
-## [7] "spectrumFile"             "spectrumID"
-

which are indeed present in our data:

-
names(msnid)
-
##  [1] "spectrumID"                "scan number(s)"           
-##  [3] "acquisitionNum"            "passThreshold"            
-##  [5] "rank"                      "calculatedMassToCharge"   
-##  [7] "experimentalMassToCharge"  "chargeState"              
-##  [9] "MS-GF:DeNovoScore"         "MS-GF:EValue"             
-## [11] "MS-GF:PepQValue"           "MS-GF:QValue"             
-## [13] "MS-GF:RawScore"            "MS-GF:SpecEValue"         
-## [15] "AssumedDissociationMethod" "IsotopeError"             
-## [17] "isDecoy"                   "post"                     
-## [19] "pre"                       "end"                      
-## [21] "start"                     "accession"                
-## [23] "length"                    "description"              
-## [25] "pepSeq"                    "modified"                 
-## [27] "modification"              "idFile"                   
-## [29] "spectrumFile"              "databaseFile"             
-## [31] "peptide"
-

Here, we summarise a few steps and redirect the reader to the -package’s vignette for more details:

-
-
-

-4.10.2 Analysis of peptide sequences
-

-

We clean irregular cleavages at the termini of the peptides and missed cleavage sites within the peptide sequences. The following two function calls create the new numMissCleavages and numIrregCleavages columns in the MSnID object

-
msnid <- assess_termini(msnid, validCleavagePattern="[KR]\\.[^P]")
-msnid <- assess_missed_cleavages(msnid, missedCleavagePattern="[KR](?=[^P$])")
-
-
-

-4.10.3 Trimming the data
-

-

Now, we can use the apply_filter function to effectively apply filters. The strings passed to the function represent expressions that will be evaluated, thus keeping only PSMs that have 0 irregular cleavages and 2 or fewer missed cleavages.

-
msnid <- apply_filter(msnid, "numIrregCleavages == 0")
-msnid <- apply_filter(msnid, "numMissCleavages <= 2")
-show(msnid)
-
## MSnID object
-## Working directory: "."
-## #Spectrum Files:  1 
-## #PSMs: 7838 at 17 % FDR
-## #peptides: 5598 at 23 % FDR
-## #accessions: 3759 at 53 % FDR
-
-
-

-4.10.4 Parent ion mass errors
-

-

Using "calculatedMassToCharge" and "experimentalMassToCharge", the -mass_measurement_error function calculates the parent ion mass -measurement error in parts per million.

-
summary(mass_measurement_error(msnid))
-
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-## -2184.0640    -0.6992     0.0000    17.6146     0.7512  2012.5178
-

We then filter out any matches that do not fit within a +/- 20 ppm tolerance:

-
msnid <- apply_filter(msnid, "abs(mass_measurement_error(msnid)) < 20")
-summary(mass_measurement_error(msnid))
-
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-## -19.7797  -0.5866   0.0000  -0.2970   0.5713  19.6758
-
-
-

-4.10.5 Filtering criteria
-

-

Filtering of the identification data will rely on

-
    -
  • the -log10 transformed MS-GF+ Spectrum E-value, reflecting the goodness of the match between experimental and theoretical fragmentation patterns
  • -
-
msnid$msmsScore <- -log10(msnid$`MS-GF:SpecEValue`)
-
    -
  • the absolute mass measurement error (in ppm units) of the parent ion
  • -
-
msnid$absParentMassErrorPPM <- abs(mass_measurement_error(msnid))
-
-
-

-4.10.6 Setting filters
-

-

MS2 filters are handled by a dedicated MSnIDFilter class, where individual filters are set by a name (which must be present in names(msnid)), a comparison operator (>, <, =, …) defining whether hits above or below the threshold should be retained, and finally the threshold value itself.

-
filtObj <- MSnIDFilter(msnid)
-filtObj$absParentMassErrorPPM <- list(comparison="<", threshold=10.0)
-filtObj$msmsScore <- list(comparison=">", threshold=10.0)
-show(filtObj)
-
## MSnIDFilter object
-## (absParentMassErrorPPM < 10) & (msmsScore > 10)
-

We can then evaluate the filter on the identification data object, which returns the false discovery rate and the number of retained identifications for the filtering criteria at hand.

-
evaluate_filter(msnid, filtObj)
-
##           fdr    n
-## PSM         0 3807
-## peptide     0 2455
-## accession   0 1009
-
-
-

-4.10.7 Filter optimisation
-

-

Rather than setting filtering values by hand, as shown above, these -can be set automatically to meet a specific false discovery rate.

-
filtObj.grid <- optimize_filter(filtObj, msnid, fdr.max=0.01,
-                                method="Grid", level="peptide",
-                                n.iter=500)
-show(filtObj.grid)
-
## MSnIDFilter object
-## (absParentMassErrorPPM < 3) & (msmsScore > 7.4)
-
evaluate_filter(msnid, filtObj.grid)
-
##                   fdr    n
-## PSM       0.004097561 5146
-## peptide   0.006447651 3278
-## accession 0.021996616 1208
-

Filters can then be applied (rather than just evaluated) using the apply_filter function.

-
msnid <- apply_filter(msnid, filtObj.grid)
-show(msnid)
-
## MSnID object
-## Working directory: "."
-## #Spectrum Files:  1 
-## #PSMs: 5146 at 0.41 % FDR
-## #peptides: 3278 at 0.64 % FDR
-## #accessions: 1208 at 2.2 % FDR
-

And finally, identifications that matched decoy and contaminant -protein sequences are removed

-
msnid <- apply_filter(msnid, "isDecoy == FALSE")
-msnid <- apply_filter(msnid, "!grepl('Contaminant',accession)")
-show(msnid)
-
## MSnID object
-## Working directory: "."
-## #Spectrum Files:  1 
-## #PSMs: 5117 at 0 % FDR
-## #peptides: 3251 at 0 % FDR
-## #accessions: 1179 at 0 % FDR
-
-
-

-4.10.8 Export MSnID data
-

-

The resulting filtered identification data can be exported to a data.frame (or to a dedicated MSnSet data structure from the MSnbase package, used for quantitative MS data) and further processed and analysed using appropriate statistical tests.

-
head(psms(msnid))
-
##   spectrumID scan number(s) acquisitionNum passThreshold rank
-## 1 index=7151           8819           7151          TRUE    1
-## 2 index=8520          10419           8520          TRUE    1
-##   calculatedMassToCharge experimentalMassToCharge chargeState MS-GF:DeNovoScore
-## 1               1270.318                 1270.318           3               287
-## 2               1426.737                 1426.739           3               270
-##   MS-GF:EValue MS-GF:PepQValue MS-GF:QValue MS-GF:RawScore MS-GF:SpecEValue
-## 1 1.709082e-24               0            0            239     1.007452e-31
-## 2 3.780745e-24               0            0            230     2.217275e-31
-##   AssumedDissociationMethod IsotopeError isDecoy post pre end start accession
-## 1                       CID            0   FALSE    A   K 283   249   CE02347
-## 2                       CID            0   FALSE    A   K 182   142   CE07055
-##   length
-## 1    393
-## 2    206
-##                                                                                                                           description
-## 1 WBGene00001993; locus:hpd-1; 4-hydroxyphenylpyruvate dioxygenase; status:Confirmed; UniProt:Q22633; protein_id:CAA90315.1; T21C12.2
-## 2           WBGene00001755; locus:gst-7; glutathione S-transferase; status:Confirmed; UniProt:P91253; protein_id:AAB37846.1; F11G11.2
-##                                      pepSeq modified modification
-## 1       AISQIQEYVDYYGGSGVQHIALNTSDIITAIEALR    FALSE         <NA>
-## 2 SAGSGYLVGDSLTFVDLLVAQHTADLLAANAALLDEFPQFK    FALSE         <NA>
-##              idFile                                   spectrumFile
-## 1 c_elegans.mzid.gz c_elegans_A_3_1_21Apr10_Draco_10-03-04_dta.txt
-## 2 c_elegans.mzid.gz c_elegans_A_3_1_21Apr10_Draco_10-03-04_dta.txt
-##               databaseFile                                       peptide
-## 1 ID_004174_E48C5B52.fasta       K.AISQIQEYVDYYGGSGVQHIALNTSDIITAIEALR.A
-## 2 ID_004174_E48C5B52.fasta K.SAGSGYLVGDSLTFVDLLVAQHTADLLAANAALLDEFPQFK.A
-##   numIrregCleavages numMissCleavages msmsScore absParentMassErrorPPM
-## 1                 0                0  30.99678             0.3843772
-## 2                 0                0  30.65418             1.3689451
-##  [ reached 'max' / getOption("max.print") -- omitted 4 rows ]
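Assuming the coercion method advertised in the MSnID vignette is available, the export to an MSnSet could be sketched as:

library("MSnbase")
msnset <- as(msnid, "MSnSet")  ## coercion method assumed from the MSnID vignette
msnset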
- -
-
-
-

References
-

-
-
-Morgenstern, David, Rotem Barzilay, and Yishai Levin. 2021. “RawBeans: A Simple, Vendor-Independent, Raw-Data Quality-Control Tool.” Journal of Proteome Research. https://doi.org/10.1021/acs.jproteome.0c00956.
-
- - -

- - -

-

Page built: -2021-08-31 - using -R version 4.1.0 (2021-05-18) -

-
-
- - - - - diff --git a/docs/index.html b/docs/index.html deleted file mode 100644 index 45ecb9b..0000000 --- a/docs/index.html +++ /dev/null @@ -1,325 +0,0 @@ - - - - - - - - - - - - - - - - - - - - -Chapter 1 Preamble | R for Mass Spectrometry - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-

-Chapter 1 Preamble

-

The aim of the R for Mass -Spectrometry initiative is to -provide efficient, thoroughly documented, tested and flexible R -software for the analysis and interpretation of high throughput mass -spectrometry assays, including proteomics and metabolomics -experiments. The project formalises the longtime collaborative -development efforts of its core members under the RforMassSpectrometry -organisation to facilitate dissemination and accessibility of their -work.

-

Figure 1.1: The R for Mass Spectrometry initiative sticker, designed by Johannes Rainer.

-

This material introduces participants to the analysis and exploration -of mass spectrometry (MS) based proteomics data using R and -Bioconductor. The course will cover all levels of MS data, from raw -data to identification and quantitation data, up to the statistical -interpretation of a typical shotgun MS experiment and will focus on -hands-on tutorials. At the end of this course, the participants will -be able to manipulate MS data in R and use existing packages for their -exploratory and statistical proteomics data analysis.

-
-

Targeted audience and assumed background
-

-

The course material is targeted at proteomics practitioners or data analysts/bioinformaticians who would like to learn how to use R and Bioconductor to analyse proteomics data. Familiarity with MS or proteomics in general is desirable, but not essential, as we will walk through and describe typical MS data as part of learning about the tools. For approachable introductions to sample preparation, mass spectrometry, data interpretation and analysis, readers are redirected to:

-
  • A beginner’s guide to mass spectrometry–based proteomics (Sinha and Mann 2020, The Biochemist, https://doi.org/10.1042/BIO20200057)
  • The ABC’s (and XYZ’s) of peptide sequencing (Steen and Mann 2004, Nat. Rev. Mol. Cell Biol. 5 (9): 699–711)
  • How do shotgun proteomics algorithms identify proteins? (Marcotte 2007, Nat. Biotechnol. 25 (7): 755–57)
  • An Introduction to Mass Spectrometry-Based Proteomics (Shuken 2023, J. Proteome Res.)

A working knowledge of R (R syntax, commonly used functions, basic -data structures such as data frames, vectors, matrices, … and their -manipulation) is required. Familiarity with other Bioconductor omics -data classes and the tidyverse syntax is useful, but not necessary.

-
-
-

Setup
-

-

This material uses the latest versions of the R for Mass Spectrometry packages and their dependencies. It might thus be possible that even the latest stable Bioconductor version isn’t recent enough.

-

To install all the necessary packages, please use the latest release of R and execute:

-
if (!requireNamespace("BiocManager", quietly = TRUE))
-    install.packages("BiocManager")
-
-BiocManager::install("tidyverse")
-BiocManager::install("factoextra")
-BiocManager::install("msdata")
-BiocManager::install("mzR")
-BiocManager::install("rhdf5")
-BiocManager::install("rpx")
-BiocManager::install("MsCoreUtils")
-BiocManager::install("QFeatures")
-BiocManager::install("Spectra")
-BiocManager::install("ProtGenerics")
-BiocManager::install("PSMatch")
-BiocManager::install("pheatmap")
-BiocManager::install("limma")
-BiocManager::install("MSnID")
-BiocManager::install("RforMassSpectrometry/SpectraVis")
-

Follow the instructions in this script to install the packages and download some of the data used in the following chapters. All software versions used to generate this document are recorded at the end of the book, in Chapter 7.

-

To compile and render the teaching material, you will also need the BiocStyle package and the (slightly modified) Modern Statistics for Model Biology (msmb) HTML Book Style by Mike Smith:

-
BiocManager::install(c("bookdown", "BiocStyle", "lgatto/msmbstyle"))
-

Run the installation -script -by executing the line below to install all requirements to compile the -book:

-
source("https://raw.githubusercontent.com/rformassspectrometry/docs/main/install_docs_deps.R")
-
-
-

Acknowledgments
-

-

Thank you to Charlotte Soneson for -fixing many typos in a previous version of this book.

-
-
-

License
-

-

Creative Commons Licence
This material is licensed under a Creative Commons -Attribution-ShareAlike 4.0 International License. You are free to -share (copy and redistribute the material in any medium or format) -and adapt (remix, transform, and build upon the material) for any -purpose, even commercially, as long as you give appropriate credit and -distribute your contributions under the same license as the original.

- -
-
- -

- -

-

Page built: -2023-09-06 - using -R version 4.3.1 Patched (2023-07-10 r84676) -

-
-
- - - - - diff --git a/docs/libs/accessible-code-block-0.0.1/empty-anchor.js b/docs/libs/accessible-code-block-0.0.1/empty-anchor.js deleted file mode 100644 index ca349fd..0000000 --- a/docs/libs/accessible-code-block-0.0.1/empty-anchor.js +++ /dev/null @@ -1,15 +0,0 @@ -// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> -// v0.0.1 -// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. - -document.addEventListener('DOMContentLoaded', function() { - const codeList = document.getElementsByClassName("sourceCode"); - for (var i = 0; i < codeList.length; i++) { - var linkList = codeList[i].getElementsByTagName('a'); - for (var j = 0; j < linkList.length; j++) { - if (linkList[j].innerHTML === "") { - linkList[j].setAttribute('aria-hidden', 'true'); - } - } - } -}); diff --git a/docs/libs/header-attrs-2.10/header-attrs.js b/docs/libs/header-attrs-2.10/header-attrs.js deleted file mode 100644 index dd57d92..0000000 --- a/docs/libs/header-attrs-2.10/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/libs/header-attrs-2.11/header-attrs.js b/docs/libs/header-attrs-2.11/header-attrs.js deleted file mode 100644 index dd57d92..0000000 --- a/docs/libs/header-attrs-2.11/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/libs/header-attrs-2.12/header-attrs.js b/docs/libs/header-attrs-2.12/header-attrs.js deleted file mode 100644 index dd57d92..0000000 --- a/docs/libs/header-attrs-2.12/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/libs/header-attrs-2.7/header-attrs.js b/docs/libs/header-attrs-2.7/header-attrs.js deleted file mode 100644 index dd57d92..0000000 --- a/docs/libs/header-attrs-2.7/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). 
-document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/libs/header-attrs-2.9/header-attrs.js b/docs/libs/header-attrs-2.9/header-attrs.js deleted file mode 100644 index dd57d92..0000000 --- a/docs/libs/header-attrs-2.9/header-attrs.js +++ /dev/null @@ -1,12 +0,0 @@ -// Pandoc 2.9 adds attributes on both header and div. We remove the former (to -// be compatible with the behavior of Pandoc < 2.8). -document.addEventListener('DOMContentLoaded', function(e) { - var hs = document.querySelectorAll("div.section[class*='level'] > :first-child"); - var i, h, a; - for (i = 0; i < hs.length; i++) { - h = hs[i]; - if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6 - a = h.attributes; - while (a.length > 0) h.removeAttribute(a[0].name); - } -}); diff --git a/docs/libs/msmb-css-0/msmb.css b/docs/libs/msmb-css-0/msmb.css deleted file mode 100644 index 096e139..0000000 --- a/docs/libs/msmb-css-0/msmb.css +++ /dev/null @@ -1,452 +0,0 @@ -@import 'https://fonts.googleapis.com/css?family=Istok+Web|Oxygen|Source+Sans+Pro'; -@import 'https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.1/css/all.min.css'; -body { - font-family: 'Source Sans Pro', Arial, Helvetica, sans-serif; - background-color: #fefefe; - margin-top: 0; -} - -.title { - font-size: 2.5rem; -} - -.author { - font-size: 1.5rem; - padding-top: 10px; -} - -.dedication { - font-style: italic; - text-align: center; -} - -.proposition, .question-begin, .question-end, .exercise { - font-size: 1.4rem; - font-weight: bold; - margin-top: 1.2rem; - color: green; -} - -.solution, .solution-begin, .solution-end { - font-size: 1.4rem; - font-weight: bold; - color: red; -} - -/* we can remove this eventually -its only here while we sort out the book*/ -.solution-body { - font-weight: normal; - color: black; -} - -.solution-end, .question-end { -/* width: 55%; */ -/* text-align: right; */ -/* float: right; */ -/* margin-top: -30px; */ - height: 1em; -} - -.solution-icon { - float: right; -} - -.clickable { - cursor: pointer; -} - -.margintab { - margin: 0; - width: 30%; - position: relative; - right: 10%; -} - -.MJXc-display { - width: 55%; - font-size: 1.1rem; -} - -.math { - font-size: 1.1rem; -} - -h1 { - color: var(--main-bg-color); - width: 55%; - border-bottom-color: var(--main-bg-color); - border-bottom-style: solid; - padding-bottom: 0.5rem; -} - -h2 { - color: var(--main-bg-color); - border-bottom-color: var(--main-bg-color); - border-bottom-style: solid; - width: 55%; - padding-bottom: 0.5rem; - margin-top: 3em; - font-weight: bold; -} - -.tooltip>.internal-link-btn { - display: none; - float: none; - vertical-align: middle; - padding: 0; - margin-left: 10px; - border: none; - background: none; - color: #1881c2; -} -h2:hover>.tooltip>.internal-link-btn { display: inline } -h3:hover>.tooltip>.internal-link-btn { display: inline } - -.internal-link>a:link { - text-decoration: none; - background-size: 0; -} - -h3 { - color: var(--main-bg-color); - font-size: 1.5rem; -} - -h4 { - font-size: 1.4rem; -} - -li p { - margin-bottom: 0; -} - -video { - width: 100%; -} - -.shaka-video-container { - width: 55%; - max-width: 100%; -} - - -/* header bar */ - -ul.navbar { - list-style-type: none; - margin: 0; - 
padding: 0; - overflow: hidden; - background-color: var(--main-bg-color); - top: 0; -} - -li .dropbtn { - display: inline-block; - color: white; - text-align: center; - text-decoration: none; - text-shadow: none; - line-height: 2.0rem; - margin-right: 16px; - padding-top: 55px; - padding-bottom: 15px; -} - -li .dropdown:hover .dropbtn { - background-color: red; -} - -li.dropdown { - display: inline-block; - color: white; - padding: 0; -} - - -li .marginnote,li .sidenote { - width: 61%; - margin-right: -84.4%; -} - -blockquote { - width: 45%; - -webkit-padding-start: 5%; - -webkit-padding-end: 5%; - -moz-padding-start: 5%; - -moz-padding-end: 5%; - display: block; - margin-block-end: 0; - margin-block-start: 0; - margin-inline-start: 0; - scroll-margin-inline-end: 0; -} - -blockquote p { - width: 100%; - background-color: #F7F7F7; - font-size: 1.3rem; - font-style: italic; -} - -blockquote .marginnote, blockquote .sidenote { - width: 61%; - margin-right: -84.4%; -} - -.msmb { - display: inline-block; - color: white; - padding: 14px 16px; -} - -.dropdown-content { - display: none; - position: absolute; - background-color: #f9f9f9; - width: 27.5%; - z-index: 1; - margin-right: 140px; - /*modify margin as window resized */ - right: calc((87.5% - 1400px)/2); - max-width: 440px; - font-size: 1.2rem; -} - -/* keeps the dropdown menu aligned with header -when window exceeds 1615px */ -@media screen and (max-width: 1615px) { - .dropdown-content { - margin-right: 0; - right: 8.75%; - } -} - -.dropdown-content a { - color: black; - padding: 4px 16px; - text-decoration: none; - display: block; - text-align: left; - text-shadow: none; -} - -.dropdown-content a:hover { - background-color: rgb(24,129,194,0.4); -} - -.dropdown:hover .dropdown-content { - display: block; -} - -.dropdown a:link { - background-position-y: 0; - text-shadow: none; - background-size: 0; -} - -#active-page { - background-color: var(--main-bg-color); - color: white; -} - -.toc-sections { - font-size: 0.7rem; - padding-left: 0; - padding-right: 0; - background-color: rgb(24,129,194,0.3); - list-style-type: none; -} - -li.toc { - padding: 0; - font-size: 1.1rem; -} - -li.toc a { - padding-left: 10%; - padding-top: 1px; - padding-bottom: 1px; -} - -/* formatting tables */ -.console { - width: 80%; - table-layout: fixed; - border: 0; - font-family: monospace; -} - -.kable_wrapper { - border: 0; - table-layout: fixed; -} - -.code { - font-size: 0.9em; -} - -div.sourceCode { - width: 56%; - /*padding-left: 1%;*/ -} - -pre { - width: auto; - padding-top: 2px; - padding-bottom: 2px; - /*padding-left: 1%; - padding-right: 1%;*/ - background-color: #F7F7F7; - margin-bottom: 10px; - margin-top: 3px; - margin-top: 4px; - overflow-x: scroll; -} - -.build-date { - font-size: 0.8em; - float: left; - line-height: 1em; - margin-bottom: 2em; - margin-top: 0; -} - -.tooltip { - position: relative; - display: inline-block; -} - -.tooltip .tooltiptext { - visibility: hidden; - width: 70px; - background-color: #555; - color: #fff; - text-align: center; - border-radius: 6px; - padding: 5px; - position: absolute; - z-index: 1; - bottom: 100%; - left: 50%; - margin-left: -35px; - opacity: 0; - transition: opacity 0.3s; -} - -.tooltip .tooltiptext::after { - content: ""; - position: absolute; - top: 100%; - left: 50%; - margin-left: -5px; - border-width: 5px; - border-style: solid; - border-color: #555 transparent transparent transparent; -} - -.tooltip:hover .tooltiptext { - visibility: visible; - opacity: 1; -} - - -.tooltip-eqn { - position: relative; 
- z-index: 100; -} - -.tooltip-eqn .tooltiptext { - visibility: hidden; - width: 70px; - background-color: #555; - color: #fff; - text-align: center; - border-radius: 6px; - padding: 5px; - position: absolute; - z-index: 1; - bottom: 150%; - margin-left: -35px; - opacity: 0; - transition: opacity 0.3s; -} - -.tooltip-eqn .tooltiptext::after { - content: ""; - position: absolute; - top: 100%; - left: 50%; - margin-left: -5px; - border-width: 5px; - border-style: solid; - border-color: #555 transparent transparent transparent; -} - -.tooltip-eqn:hover .tooltiptext { - visibility: visible; - opacity: 1; -} - -.tooltip-eqn>.internal-link-eqn { - display: none; - float: none; - vertical-align: middle; - padding: 0; - margin-left: 10px; - margin-top: -10px; - border: none; - background: none; - color: black; - position: absolute; -} -.eqn-mouseover:hover>.tooltip-eqn>.internal-link-eqn { display: inline } - -.eqn-mouseover { - display: flex; - align-items: center; -} - -@media (max-width: 760px) { - body { - padding-left: 0; - margin-left: 2%; - margin-right: 2%; - width: auto; - } - p { - width: auto; - font-size: 1.2em; - } - .marginnote, .sidenote { - left: 0; - } - .sourceCode { - display: inline-block; - width: 90%; - } - .title, .author, .dropdown { - text-align: center; - } - .dropdown { - width: 100%; - } - .dropdown-content { - position: inherit; - width: 100%; - } - li .dropbtn { - padding-top: 0; - } - h1, h2, h3 { - width: 100%; - } - h3 { - display: inline-block; - } - #TOC { - max-width: 100%; - } - -} diff --git a/docs/libs/tufte-css-2015.12.29/envisioned.css b/docs/libs/tufte-css-2015.12.29/envisioned.css deleted file mode 100644 index b35e122..0000000 --- a/docs/libs/tufte-css-2015.12.29/envisioned.css +++ /dev/null @@ -1,7 +0,0 @@ -@import 'https://fonts.googleapis.com/css?family=Roboto+Condensed'; -body { - font-family: 'Roboto Condensed', Arial, Helvetica, sans-serif; - background-color: #fefefe; - color: #222; -} -.numeral, .sidenote-number { font-family: "Roboto Condensed"; } diff --git a/docs/libs/tufte-css-2015.12.29/tufte.css b/docs/libs/tufte-css-2015.12.29/tufte.css deleted file mode 100644 index 101c8f2..0000000 --- a/docs/libs/tufte-css-2015.12.29/tufte.css +++ /dev/null @@ -1,223 +0,0 @@ -/* Import ET Book styles - adapted from https://github.com/edwardtufte/et-book/blob/gh-pages/et-book.css */ - -@charset "UTF-8"; - -/* Tufte CSS styles */ -html { font-size: 15px; } - -body { width: 87.5%; - margin-left: auto; - margin-right: auto; - padding-left: 12.5%; - color: #111; - max-width: 1400px; - counter-reset: sidenote-counter; } - -h1.title { font-weight: 400; - font-style: normal; - margin-top: 4rem; - margin-bottom: 1.5rem; - font-size: 3.2rem; - line-height: 1; } - -h1 { - font-weight: 400; - margin-top: 2.1rem; - margin-bottom: 0; - font-size: 2.2rem; - line-height: 1; } - -h2 { - font-weight: 400; - font-size: 1.7rem; - margin-top: 2rem; - margin-bottom: 0; - line-height: 1; } - -h3.subtitle { - font-weight: 400; - margin-top: 1rem; - margin-bottom: 1rem; - font-size: 1.8rem; - display: block; - line-height: 1; } - -h4.author, h4.date { - font-size: 1.4rem; - font-weight: 400; - margin: 1rem auto; - line-height: 1; -} - -.danger { color: red; } - -article { position: relative; - padding: 5rem 0rem; } - -section { padding-top: 1rem; - padding-bottom: 1rem; } - -p, ol, ul { font-size: 1.4rem; } - -p { line-height: 2rem; - margin-top: 1.4rem; - margin-bottom: 1.4rem; - padding-right: 0; - vertical-align: baseline; } - -blockquote { font-size: 1.4rem; } - -blockquote 
p { width: 50%; } - -blockquote footer { width: 50%; - font-size: 1.1rem; - text-align: right; } - -ol, ul { width: 45%; - -webkit-padding-start: 5%; - -webkit-padding-end: 5%; } - -li { padding: 0.5rem 0; } - -table { - border-top: 2px solid #111; - border-bottom: 2px solid #111; - font-size: 1.1rem; -} - -th { - border-bottom: 1px solid #111; -} - -div.figure { - padding: 0; - border: 0; - font-size: 100%; - font: inherit; - vertical-align: baseline; - max-width: 55%; - -webkit-margin-start: 0; - -webkit-margin-end: 0; - margin: 0 0 3em 0; - } - -/* Links: replicate underline that clears descenders */ -a:link, a:visited { color: inherit; } - -a:link { text-decoration: none; - background: -webkit-linear-gradient(#fffff8, #fffff8), -webkit-linear-gradient(#fffff8, #fffff8), -webkit-linear-gradient(#333, #333); - background: linear-gradient(#fffff8, #fffff8), linear-gradient(#fffff8, #fffff8), linear-gradient(#333, #333); - -webkit-background-size: 0.05em 1px, 0.05em 1px, 1px 1px; - -moz-background-size: 0.05em 1px, 0.05em 1px, 1px 1px; - background-size: 0.05em 1px, 0.05em 1px, 1px 1px; - background-repeat: no-repeat, no-repeat, repeat-x; - text-shadow: 0.03em 0 #fffff8, -0.03em 0 #fffff8, 0 0.03em #fffff8, 0 -0.03em #fffff8, 0.06em 0 #fffff8, -0.06em 0 #fffff8, 0.09em 0 #fffff8, -0.09em 0 #fffff8, 0.12em 0 #fffff8, -0.12em 0 #fffff8, 0.15em 0 #fffff8, -0.15em 0 #fffff8; - background-position: 0% 93%, 100% 93%, 0% 93%; } - -@media screen and (-webkit-min-device-pixel-ratio: 0) { a:link { background-position-y: 87%, 87%, 87%; } } - -a:link::selection { text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, -0.15em 0 #b4d5fe; - background: #b4d5fe; } - -a:link::-moz-selection { text-shadow: 0.03em 0 #b4d5fe, -0.03em 0 #b4d5fe, 0 0.03em #b4d5fe, 0 -0.03em #b4d5fe, 0.06em 0 #b4d5fe, -0.06em 0 #b4d5fe, 0.09em 0 #b4d5fe, -0.09em 0 #b4d5fe, 0.12em 0 #b4d5fe, -0.12em 0 #b4d5fe, 0.15em 0 #b4d5fe, -0.15em 0 #b4d5fe; - background: #b4d5fe; } - -/* Sidenotes, margin notes, figures, captions */ -img {max-width: 100%;} - -.marginnote img { display: block; } - -.sidenote, .marginnote { float: right; - clear: right; - margin-right: -60%; - width: 50%; - margin-top: 0; - margin-bottom: 1rem; - font-size: 1.1rem; - line-height: 1.3; - vertical-align: baseline; - position: relative; } - -.sidenote-number { - position: relative; - vertical-align: baseline; } - -.sidenote-number { font-size: 1rem; - top: -0.5rem; - left: 0.1rem; } - -p, footer, table, hr { width: 55%; } -hr { margin-left: 0; } -table table, li p, li pre { width: auto; } -li p, li pre {margin-top: auto; } - -div.fullwidth, table.fullwidth { max-width: 90%; } -div.fullwidth > * { width: auto; } - -#TOC, h1.title { max-width: 90%; } -#TOC ol, #TOC ul { width: auto; } - -div.fullwidth p.caption { - margin-right: 0; - max-width: 33%; -} - -p.caption { text-align: left; } - -@media screen and (max-width: 760px) { p, footer, ol, ul, table, hr { width: 90%; } - pre { width: 87.5%; } - ul { width: 85%; } - figure { max-width: 90%; } - div.fullwidth p.caption { max-width: none; } - blockquote p, blockquote footer { width: 90%; }} - -.sans { font-family: "Gill Sans", "Gill Sans MT", Calibri, sans-serif; - letter-spacing: .03em; } - -code { font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace; - font-size: 1.125rem; - line-height: 1.6; } - -pre code { font-size: 1rem; } - -p code { 
white-space: inherit; } - -h1 code, h2 code, h3 code { font-size: 0.80em; } - -.marginnote code, .sidenote code { font-size: 1rem; } - -pre { width: 52.5%; - overflow-x: auto; } - -.fullwidth { max-width: 90%; - clear:both; } - -span.newthought { font-variant: small-caps; - font-size: 1.2em; } - -input.margin-toggle { display: none; } - -label.sidenote-number { display: inline; } - -label.margin-toggle:not(.sidenote-number) { display: none; } - -@media (max-width: 760px) { label.margin-toggle:not(.sidenote-number) { display: inline; } - .sidenote, .marginnote { display: none; } - .shownote, - .margin-toggle:checked + .sidenote, - .margin-toggle:checked + .marginnote { - display: block; - float: left; - left: 1rem; - clear: both; - width: 95%; - margin: 1rem 2.5%; - vertical-align: baseline; - position: relative; - } - label { cursor: pointer; } - div.figure { max-width: 90%; } - pre { width: 90%; - padding: 0; } - } diff --git a/docs/raw-ms-data.html b/docs/raw-ms-data.html deleted file mode 100644 index 3572809..0000000 --- a/docs/raw-ms-data.html +++ /dev/null @@ -1,1156 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - -Chapter 3 Raw MS data | R for Mass Spectrometry - - - - - - - - - - - - - - - - - - - - - - - - -
-
- - -
-

-Chapter 3 Raw MS data

-

In this section, we will learn how to read raw data in one of the commonly used open formats (mzML, mzXML and netCDF) into R.

-
|Data type  |File format   |Data structure               |Package           |
-|:----------|:-------------|:----------------------------|:-----------------|
-|Raw        |mzXML or mzML |mzRpwiz or mzRramp           |mzR               |
-|Raw        |mzXML or mzML |list of MassSpectrum objects |MALDIquantForeign |
-|Raw        |mzXML or mzML |MSnExp                       |MSnbase           |
-|Peak lists |mgf           |MSnExp                       |MSnbase           |
-|Raw        |several       |Spectra                      |Spectra           |
-
-

-3.1 What is raw data in R
-

-

When we manipulate complex data, we need a way to abstract it.

-

The need for an abstraction saves us from having to know about all the details of that data and its associated metadata. This allows us to rely on a few easy-to-remember conventions to make mundane and repetitive tasks trivial, and to complete more complex things easily. Abstractions provide a smoother approach to handling complex data using common patterns.

-
- -

-Figure 3.1: Schematic representation of what is referred to by raw data: a collection of mass spectra and a table containing spectrum-level annotations along the lines. Raw data are imported from one of the many community-maintained open standards formats (mzML, mzXML, mzData or ANDI-MS/netCDF) (Figure taken from (Gatto, Gibb, and Rainer 2020)). -

-
-

-3.1.1 The Spectra class
-

-

We are going to use the Spectra package as an abstraction for raw mass spectrometry data.

-
library(Spectra)
-

Spectra is part of the R for Mass Spectrometry initiative. It defines the Spectra class that is used as a raw data abstraction to manipulate MS data and metadata. The best way to learn about a data structure is to create one by hand.

-

Let’s create a DataFrame (as defined in the Bioconductor S4Vectors package) containing MS levels, retention times, m/z and intensities for 2 spectra:

-
spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2))
-spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2))
-spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8))
-spd
-
## DataFrame with 2 rows and 4 columns
-##     msLevel     rtime                    mz             intensity
-##   <integer> <numeric>                <list>                <list>
-## 1         1       1.1 100.0,103.2,104.3,... 200.0,400.0, 34.2,...
-## 2         2       1.2      45.6,120.4,190.2        12.3,15.2, 6.8
-

And now convert this DataFrame into a Spectra object:

-
sp0 <- Spectra(spd)
-sp0
-
## MSn data (Spectra) with 2 spectra in a MsBackendDataFrame backend:
-##     msLevel     rtime scanIndex
-##   <integer> <numeric> <integer>
-## 1         1       1.1        NA
-## 2         2       1.2        NA
-##  ... 16 more variables/columns.
-
-

Exercise

-

Explore the newly created object using

-
    -
  • -spectraVariables to extract all the metadata variables.
  • -
  • -spectraData to extract all the metadata.
  • -
  • -peaksData to extract a list containing the raw data.
  • -
  • -[ to create subsets.
  • -
-
-
-
-

-3.1.2 Spectra from mzML files
-

-

Let’s now create a new object using the mzML data previously -downloaded and available in the mzf file.

-
mzf
-
## [1] "/home/lgatto/.cache/R/rpx/b87c573dec94f_TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML"
-
sp <- Spectra(mzf)
-sp
-
## MSn data (Spectra) with 7534 spectra in a MsBackendMzR backend:
-##        msLevel     rtime scanIndex
-##      <integer> <numeric> <integer>
-## 1            1    0.4584         1
-## 2            1    0.9725         2
-## 3            1    1.8524         3
-## 4            1    2.7424         4
-## 5            1    3.6124         5
-## ...        ...       ...       ...
-## 7530         2   3600.47      7530
-## 7531         2   3600.83      7531
-## 7532         2   3601.18      7532
-## 7533         2   3601.57      7533
-## 7534         2   3601.98      7534
-##  ... 33 more variables/columns.
-## 
-## file(s):
-## b87c573dec94f_TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-
-

Exercise

-
    -
  • Repeat the data manipulations above.
  • -
  • Check the number of scans in the object with length().
  • -
  • Note the difference in the first line when showing the object in the -console. We will get back to this idea of backend later.
  • -
-

Mass spectrometry data in Spectra objects can be thought of as a -list of individual spectra, with each spectrum having a set of -variables associated with it. Besides core spectra variables (such -as MS level or retention time) an arbitrary number of optional -variables can be assigned to a spectrum. The core spectra variables -all have their own accessor method and it is guaranteed that a value -is returned by it (or NA if the information is not available). The -core variables and their data type are (alphabetically ordered):

-
    -
  • -acquisitionNum integer(1): the index of acquisition of a -spectrum during a MS run.
  • -
  • -centroided logical(1): whether the spectrum is in profile or -centroid mode.
  • -
  • -collisionEnergy numeric(1): collision energy used to create an -MSn spectrum.
  • -
  • -dataOrigin character(1): the origin of the spectrum’s data, -e.g. the mzML file from which it was read.
  • -
  • -dataStorage character(1): the (current) storage location of the -spectrum data. This value depends on the backend used to handle and -provide the data. For an in-memory backend like the -MsBackendDataFrame this will be "<memory>", for an on-disk -backend such as the MsBackendHdf5Peaks it will be the name of the -HDF5 file where the spectrum’s peak data is stored.
  • -
  • -intensity numeric: intensity values for the spectrum’s peaks.
  • -
  • -isolationWindowLowerMz numeric(1): lower m/z for the isolation -window in which the (MSn) spectrum was measured.
  • -
  • -isolationWindowTargetMz numeric(1): the target m/z for the -isolation window in which the (MSn) spectrum was measured.
  • -
  • -isolationWindowUpperMz numeric(1): upper m/z for the isolation -window in which the (MSn) spectrum was measured.
  • -
  • -msLevel integer(1): the MS level of the spectrum.
  • -
  • -mz numeric: the m/z values for the spectrum’s peaks.
  • -
  • -polarity integer(1): the polarity of the spectrum (0 and 1 -representing negative and positive polarity, respectively).
  • -
  • -precScanNum integer(1): the scan (acquisition) number of the -precursor for an MSn spectrum.
  • -
  • -precursorCharge integer(1): the charge of the precursor of an -MSn spectrum.
  • -
  • -precursorIntensity numeric(1): the intensity of the precursor of -an MSn spectrum.
  • -
  • -precursorMz numeric(1): the m/z of the precursor of an MSn -spectrum.
  • -
  • -rtime numeric(1): the retention time of a spectrum.
  • -
  • -scanIndex integer(1): the index of a spectrum within a (raw) -file.
  • -
  • -smoothed logical(1): whether the spectrum was smoothed.
  • -
-

For details on the individual variables and their getter/setter functions, see the Spectra help page (?Spectra). Note that these variables are suggested, but not required, to characterize a spectrum, and that some only make sense for MSn, but not for MS1, spectra.
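As a brief illustration of these accessors and of the $ notation (a sketch, using the small sp0 object created above):

msLevel(sp0)           ## [1] 1 2
rtime(sp0)             ## [1] 1.1 1.2
sp0$msLevel            ## same as msLevel(sp0)
spectraVariables(sp0)  ## all available spectra variables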

-
-
-

Exercise

-
    -
  • Extract a set of spectra variables using the accessor (for example -msLevel(.)) or using the $ notation (for example .$msLevel).
  • -
  • How many MS levels are there, and how many scans of each level?
  • -
  • Extract the index of the MS2 spectrum with the highest base peak -intensity.
  • -
  • Are the data centroided or in profile mode?
  • -
  • Pick a spectrum of each level and visually check whether it is centroided or in profile mode. You can use the plotSpectra() function to visualise peaks and set the m/z range with the xlim argument.
  • -
-
-
-

Exercise

-

Using the first raw data file starting with MS3TMT10, answer the -following questions:

-
    -
  • How many spectra are there in that file?
  • -
  • How many MS levels, and how many spectra per MS level?
  • -
  • What is the index of the MS2 spectrum with the highest precursor -intensity?
  • -
  • Plot one spectrum of each level. Are they centroided or in profile -mode?
  • -
-

These objects and their manipulations are not limited to single files:

-
(fls <- dir(system.file("sciex", package = "msdata"), full.names = TRUE))
-
## [1] "/home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
-## [2] "/home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/sciex/20171016_POOL_POS_3_105-134.mzML"
-
sp_sciex <- Spectra(fls)
-table(dataOrigin(sp_sciex))
-
## 
-## /home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/sciex/20171016_POOL_POS_1_105-134.mzML 
-##                                                                                          931 
-## /home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/sciex/20171016_POOL_POS_3_105-134.mzML 
-##                                                                                          931
-
-
-
-

-3.1.3 Backends
-

-

Backends allow mass spectrometry data to be stored in different ways, while the Spectra class provides a unified interface to access that data. The Spectra package defines a set of example backends, but any object extending the base MsBackend class could be used instead. The default backends are:

-
    -
  • -MsBackendMzR: this backend keeps only general spectra variables in memory -and relies on the mzR package to read mass peaks (m/z and -intensity values) from the original MS files on-demand.
  • -
-
sp_sciex
-
## MSn data (Spectra) with 1862 spectra in a MsBackendMzR backend:
-##        msLevel     rtime scanIndex
-##      <integer> <numeric> <integer>
-## 1            1     0.280         1
-## 2            1     0.559         2
-## 3            1     0.838         3
-## 4            1     1.117         4
-## 5            1     1.396         5
-## ...        ...       ...       ...
-## 1858         1   258.636       927
-## 1859         1   258.915       928
-## 1860         1   259.194       929
-## 1861         1   259.473       930
-## 1862         1   259.752       931
-##  ... 33 more variables/columns.
-## 
-## file(s):
-## 20171016_POOL_POS_1_105-134.mzML
-## 20171016_POOL_POS_3_105-134.mzML
-
    -
  • -MsBackendDataFrame: the mass spectrometry data is stored (in memory) in a DataFrame. Keeping the data in memory guarantees high performance but also has, depending on the number of mass peaks in each spectrum, a much higher memory footprint.
  • -
-
setBackend(sp_sciex, MsBackendDataFrame())
-
## MSn data (Spectra) with 1862 spectra in a MsBackendDataFrame backend:
-##        msLevel     rtime scanIndex
-##      <integer> <numeric> <integer>
-## 1            1     0.280         1
-## 2            1     0.559         2
-## 3            1     0.838         3
-## 4            1     1.117         4
-## 5            1     1.396         5
-## ...        ...       ...       ...
-## 1858         1   258.636       927
-## 1859         1   258.915       928
-## 1860         1   259.194       929
-## 1861         1   259.473       930
-## 1862         1   259.752       931
-##  ... 33 more variables/columns.
-## Processing:
-##  Switch backend from MsBackendMzR to MsBackendDataFrame [Tue Aug 31 11:35:38 2021]
-
    -
  • -MsBackendHdf5Peaks: similar to MsBackendMzR, this backend reads peak data from disk only on demand, while all other spectra variables are kept in memory. The peak data are stored in HDF5 files, which guarantees scalability.
  • -
-
sp_hdf5 <- setBackend(sp_sciex, MsBackendHdf5Peaks(), hdf5path = tempdir())
-sp_hdf5
-
## MSn data (Spectra) with 1862 spectra in a MsBackendHdf5Peaks backend:
-##        msLevel     rtime scanIndex
-##      <integer> <numeric> <integer>
-## 1            1     0.280         1
-## 2            1     0.559         2
-## 3            1     0.838         3
-## 4            1     1.117         4
-## 5            1     1.396         5
-## ...        ...       ...       ...
-## 1858         1   258.636       927
-## 1859         1   258.915       928
-## 1860         1   259.194       929
-## 1861         1   259.473       930
-## 1862         1   259.752       931
-##  ... 33 more variables/columns.
-## 
-## file(s):
-##  20171016_POOL_POS_1_105-134.h5
-##  20171016_POOL_POS_3_105-134.h5
-## Processing:
-##  Switch backend from MsBackendMzR to MsBackendHdf5Peaks [Tue Aug 31 11:35:44 2021]
-
table(sp_hdf5$dataOrigin)
-
## 
-## /home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/sciex/20171016_POOL_POS_1_105-134.mzML 
-##                                                                                          931 
-## /home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/sciex/20171016_POOL_POS_3_105-134.mzML 
-##                                                                                          931
-
table(sp_hdf5$dataStorage)
-
## 
-## /tmp/RtmpFxKO1C/20171016_POOL_POS_1_105-134.h5 
-##                                            931 
-## /tmp/RtmpFxKO1C/20171016_POOL_POS_3_105-134.h5 
-##                                            931
-

All of the above-mentioned backends support changing all of their spectra variables, except MsBackendMzR, which does not support changing m/z or intensity values for the mass peaks.

-

In the example above, the data were stored using a MsBackendHdf5Peaks backend; the hdf5path parameter of setBackend() allows us to specify the storage location of the HDF5 files.

-

There is also an (under development) SQLite-based backend called -MsBackendSqlDb -that will store all data, i.e. raw and metadata, on disk.

-
-
-
-

-3.2 Under the hood: mzR (optional)
-

-

The mzR package is a direct interface to the proteowizard code base. It includes a substantial proportion of pwiz’s C/C++ code for fast and efficient parsing of these large raw data files.

-

Let’s start by using some raw data files from the msdata package. After loading it, we use the proteomics() function to return the full file names of the example raw data files. We will focus on the TMT Erwinia file, selected by pattern matching below.

-
f <- msdata::proteomics(full.names = TRUE)
-f
-
## [1] "/home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/proteomics/MRM-standmix-5.mzML.gz"                                                
-## [2] "/home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/proteomics/MS3TMT10_01022016_32917-33481.mzML.gz"                                 
-## [3] "/home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/proteomics/MS3TMT11.mzML"                                                         
-## [4] "/home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/proteomics/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML.gz"
-## [5] "/home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/proteomics/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML.gz"
-
(f2 <- grep("20141210", f, value = TRUE))
-
## [1] "/home/lgatto/R/x86_64-pc-linux-gnu-library/4.1/msdata/proteomics/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML.gz"
-

The three main functions of mzR are

-
    -
  • -openMSfile to create a file handle to a raw data file
  • -
  • -header to extract metadata about the spectra contained in the file
  • -
  • -peaks to extract one or multiple spectra of interest.
  • -
-

Other functions, such as instrumentInfo() or runInfo(), can be used to gather general information about a run.

-
library("mzR")
-ms <- openMSfile(f2)
-ms
-
## Mass Spectrometry file handle.
-## Filename:  TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML.gz 
-## Number of scans:  7534
-
hd <- header(ms)
-dim(hd)
-
## [1] 7534   31
-
names(hd)
-
##  [1] "seqNum"                     "acquisitionNum"            
-##  [3] "msLevel"                    "polarity"                  
-##  [5] "peaksCount"                 "totIonCurrent"             
-##  [7] "retentionTime"              "basePeakMZ"                
-##  [9] "basePeakIntensity"          "collisionEnergy"           
-## [11] "ionisationEnergy"           "lowMZ"                     
-## [13] "highMZ"                     "precursorScanNum"          
-## [15] "precursorMZ"                "precursorCharge"           
-## [17] "precursorIntensity"         "mergedScan"                
-## [19] "mergedResultScanNum"        "mergedResultStartScanNum"  
-## [21] "mergedResultEndScanNum"     "injectionTime"             
-## [23] "filterString"               "spectrumId"                
-## [25] "centroided"                 "ionMobilityDriftTime"      
-## [27] "isolationWindowTargetMZ"    "isolationWindowLowerOffset"
-## [29] "isolationWindowUpperOffset" "scanWindowLowerLimit"      
-## [31] "scanWindowUpperLimit"
-
head(peaks(ms, 117))
-
##          [,1] [,2]
-## [1,] 399.9976    0
-## [2,] 399.9991    0
-## [3,] 400.0006    0
-## [4,] 400.0021    0
-## [5,] 400.2955    0
-## [6,] 400.2970    0
-
str(peaks(ms, 1:5))
-
## List of 5
-##  $ : num [1:25800, 1:2] 400 400 400 400 400 ...
-##  $ : num [1:25934, 1:2] 400 400 400 400 400 ...
-##  $ : num [1:26148, 1:2] 400 400 400 400 400 ...
-##  $ : num [1:26330, 1:2] 400 400 400 400 400 ...
-##  $ : num [1:26463, 1:2] 400 400 400 400 400 ...
-
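The instrumentInfo() and runInfo() functions mentioned above can be called directly on the file handle; a minimal sketch (the content of the output depends on the instrument and the run):

runInfo(ms)         ## e.g. number of scans, MS levels, retention time range
instrumentInfo(ms)  ## e.g. manufacturer, model, ionisation, analyser, detector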
-

-► Question -

-
-

Let’s extract the index of the MS2 spectrum with the highest base peak -intensity and plot its spectrum. Is the data centroided or in profile -mode?

-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-

Pick an MS1 spectrum and visually check whether it is centroided or in -profile mode.

-

- -

-
-
-
-

-► Solution -

- -
-
-
-

-3.3 Visualisation of raw MS data
-

-

The importance of flexible access to specialised data becomes visible in the figure below (taken from the RforProteomics visualisation vignette). Not only can we access specific data and understand/visualise them, but we can also traverse all the data and extract/visualise/understand structured slices of it.

-

The figure below is an illustration of how mass spectrometry works:

-
    -
  1. The chromatogram at the top displays the total ion current along the retention time. The vertical line identifies one scan in particular, at retention time 1800.68 seconds (the 2807th scan).

  2. -
  3. The spectra on the second row represent the full MS1 spectrum marked by the red line. The vertical lines identify the 10 precursor ions that were selected for MS2 analysis. The zoom on the right shows one specific precursor peak.

  4. -
  5. The MS2 spectra displayed along the two rows at the bottom are -those resulting from the fragmentation of the 10 precursor peaks -identified by the vertical bars above.

  6. -
-

-

We are going to reproduce the figure above through a set of exercises.

-
-

-► Question -

-
-
    -
  1. The chromatogram can be created by extracting the totIonCurrent -and rtime variables for all MS1 spectra. Annotate the spectrum of -interest.
  2. -
-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-
    -
  1. The filterPrecursorScan() function can be used to retain the parent (MS1) and children (MS2) scans of a spectrum, as defined by its acquisition number. Use it to extract the MS1 scan of interest and all its MS2 children.
  2. -
-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-
    -
  1. Plot the MS1 spectrum of interest and highlight all the peaks that -will be selected for MS2 analysis.
  2. -
-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-
    -
  1. Zoom in on the m/z range between 521.1 and 522.5 to reveal the isotopic envelope of that peak.
  2. -
-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-
    -
  1. The plotSpectra() function is used to plot all 10 MS2 spectra in -one call.
  2. -
-

- -

-
-
-
-

-► Solution -

- -
-

It is possible to label the peaks with the plotSpectra() function. The labels argument is either a character vector of appropriate length (i.e. with a label for each peak) or, as illustrated below, a function that computes the labels.
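Note that the ms_2 object used in the chunk below contains the MS2 scans of the precursor selected above; its creation is part of one of the solutions. A minimal sketch of how it could be obtained, assuming the MS1 scan of interest has acquisition number 2807:

ms_2 <- filterPrecursorScan(sp, 2807)  ## MS1 scan 2807 and its MS2 children
ms_2 <- filterMsLevel(ms_2, 2L)        ## keep only the MS2 scans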

-
mzLabel <- function(z) {
-    z <- peaksData(z)[[1L]]
-    lbls <- format(z[, "mz"], digits = 4)
-    lbls[z[, "intensity"] < 1e5] <- ""
-    lbls
-}
-
-plotSpectra(ms_2[7],
-            xlim = c(126, 132),
-            labels = mzLabel,
-            labelSrt = -30, labelPos = 2,
-            labelOffset = 0.1)
-

-

Spectra can also be compared either by overlay or mirror plotting -using the plotSpectraOverlay() and plotSpectraMirror() functions.
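A minimal usage sketch, assuming i and j are the indices of two spectra one wishes to compare (the questions below apply these functions to actual matching spectra):

plotSpectraOverlay(sp[c(i, j)], col = c("steelblue", "red"))
plotSpectraMirror(sp[i], sp[j])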

-
-

-► Question -

-
-

Filter MS2 level spectra and find any 2 MS2 spectra that have matching -precursor peaks based on the precursor m/z values.

-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-

Visualise the matching pair using the plotSpectraOverlay() and -plotSpectraMirror() functions.

-

- -

-
-
-
-

-► Solution -

- -
-
-
-

-3.4 Raw data processing and manipulation
-

-

Apart from classical subsetting operations such as [ and split, a set of filter functions is defined for Spectra objects (for detailed help, please see ?Spectra); a short usage sketch follows the list below:

-
    -
  • -filterAcquisitionNum: retain spectra with certain acquisition numbers.
  • -
  • -filterDataOrigin: subset to spectra from specific origins.
  • -
  • -filterDataStorage: subset to spectra from certain data storage files.
  • -
  • -filterEmptySpectra: remove spectra without mass peaks.
  • -
  • -filterMzRange: subset spectra keeping only peaks with an m/z within the -provided m/z range.
  • -
  • -filterMzValues: subset spectra keeping only peaks matching provided m/z -value(s).
  • -
  • -filterIsolationWindow: keep spectra with the provided mz in their -isolation window (m/z range).
  • -
  • -filterMsLevel: filter by MS level.
  • -
  • -filterPolarity: filter by polarity.
  • -
  • -filterPrecursorMz: retain (MSn) spectra with a precursor m/z within the -provided m/z range.
  • -
  • -filterPrecursorScan: retain (parent and children) scans of an acquisition -number.
  • -
  • -filterRt: filter based on retention time ranges.
  • -
-
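As a brief illustration of these filters, the sketch below keeps the MS2 scans of the sp object acquired during the first minute of the run (the question that follows applies similar filters to the sp_sciex data):

sp_ms2 <- filterMsLevel(sp, 2L)       ## keep MS2 scans only
sp_ms2 <- filterRt(sp_ms2, c(0, 60))  ## acquired during the first 60 seconds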
-

-► Question -

-
-

Using the sp_sciex data, select all spectra measured in the second -mzML file and subsequently filter them to retain spectra measured -between 175 and 189 seconds in the measurement run.

-

- -

-
-
-
-

-► Solution -

- -
-

As an example of data processing, below we use the pickPeaks() -function to pick peaks:

-
plotSpectra(sp[2807], xlim = c(521.2, 522.5))
-

-
library("magrittr")
-pickPeaks(sp[2807]) %>%
-    filterIntensity(1e7) %>%
-    plotSpectra(xlim = c(521.2, 522.5))
-

-
-
-

-3.5 A note on efficiency
-

-
-

-3.5.1 Backends
-

-
- -

Figure 3.2: (a) Reading time (triplicates, in seconds) and (b) data size in memory (in MB) to read/store 1, 5, and 10 files containing 1431 MS1 (on-disk only) and 6103 MS2 (on-disk and in-memory) spectra. (c) Filtering benchmark assessed over 10 iterations on in-memory and on-disk data containing 6103 MS2 spectra. (d) Access time to spectra for the in-memory (left) and on-disk (right) backends for 1, 10, 100, 1000, 5000, and all 6103 spectra. Benchmarks were performed on a Dell XPS laptop with an Intel i5-8250U processor 1.60 GHz (4 cores, 8 threads), 7.5 GB RAM running Ubuntu 18.04.4 LTS 64-bit, and an SSD drive. The data used for the benchmarking are a TMT 4-plex experiment acquired on a LTQ Orbitrap Velos (Thermo Fisher Scientific) available in the msdata package. (Figure taken from (Gatto, Gibb, and Rainer 2020).)

-
-
-

-3.5.2 Parallel processing
-

-

Most functions on Spectra support (and use) parallel processing out of the box. Peak data access and manipulation methods perform parallel processing on a per-file basis by default (i.e. using the dataStorage variable as splitting factor). Spectra uses BiocParallel for parallel processing, and all functions use the default registered parallel processing setup of that package.
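A minimal sketch of how that setup can be inspected or changed with BiocParallel; registering a serial backend disables parallelisation, which can be useful for debugging:

library(BiocParallel)
registered()             ## show the currently registered parallel back-ends
register(SerialParam())  ## force serial processing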

-
-
-

-3.5.3 Lazy evaluation
-

-

Data manipulations on Spectra objects are not immediately applied to the peak data. They are added to a so-called processing queue, which is applied each time peak data is accessed (with the peaksData, mz or intensity functions). Thanks to this processing queue, data manipulation operations are also possible for read-only backends (e.g. mzML-file or database-based backends). The number of such processing steps can be seen below (next to Lazy evaluation queue).

-
min(intensity(sp_sciex[1]))
-
## [1] 0
-
sp_sciex <- filterIntensity(sp_sciex, intensity = c(10, Inf))
-sp_sciex ## Note the lazy evaluation queue
-
## MSn data (Spectra) with 1862 spectra in a MsBackendMzR backend:
-##        msLevel     rtime scanIndex
-##      <integer> <numeric> <integer>
-## 1            1     0.280         1
-## 2            1     0.559         2
-## 3            1     0.838         3
-## 4            1     1.117         4
-## 5            1     1.396         5
-## ...        ...       ...       ...
-## 1858         1   258.636       927
-## 1859         1   258.915       928
-## 1860         1   259.194       929
-## 1861         1   259.473       930
-## 1862         1   259.752       931
-##  ... 33 more variables/columns.
-## 
-## file(s):
-## 20171016_POOL_POS_1_105-134.mzML
-## 20171016_POOL_POS_3_105-134.mzML
-## Lazy evaluation queue: 1 processing step(s)
-## Processing:
-##  Remove peaks with intensities outside [10, Inf] in spectra of MS level(s) 1. [Tue Aug 31 11:35:51 2021]
-
min(intensity(sp_sciex[1]))
-
## [1] 412
-
sp_sciex@processingQueue
-
## [[1]]
-## Object of class "ProcessingStep"
-##  Function: user-provided function
-##  Arguments:
-##   o intensity = 10Inf
-##   o msLevel = 1
-
sp_sciex <- reset(sp_sciex)
-sp_sciex@processingQueue
-
## list()
-
min(intensity(sp_sciex[1]))
-
## [1] 0
- -
-
-
-

References
-

-
-
Gatto, Laurent, Sebastian Gibb, and Johannes Rainer. 2020. “MSnbase, Efficient and Elegant R-Based Processing and Visualisation of Raw Mass Spectrometry Data.” J. Proteome Res., September.
-
- - -

- - -

-

Page built: -2021-08-31 - using -R version 4.1.0 (2021-05-18) -

-
-
- - - - - diff --git a/docs/reference-keys.txt b/docs/reference-keys.txt deleted file mode 100644 index 99b6fa7..0000000 --- a/docs/reference-keys.txt +++ /dev/null @@ -1,89 +0,0 @@ -fig:sticker -fig:unnamed-chunk-2 -fig:unnamed-chunk-3 -fig:unnamed-chunk-4 -fig:unnamed-chunk-5 -fig:unnamed-chunk-6 -fig:unnamed-chunk-7 -fig:unnamed-chunk-8 -fig:unnamed-chunk-23 -fig:answid1 -fig:sc -fig:itraq -fig:lf -fig:silab -fig:sefig -fig:featuresplot -fig:cptac -fig:imagena -fig:miximp -fig:lazar -fig:nasetdist -fig:plotdens -fig:plotpca -fig:vis -fig:vp -fig:unnamed-chunk-76 -preamble -sec-msintro -how-does-mass-spectrometry-work -accessing-data -sec-raw -what-is-raw-data-in-r -the-spectra-class -spectra-from-mzml-files -backends -visualisation-of-raw-ms-data -raw-data-processing-and-manipulation -a-note-on-efficiency -backends-1 -parallel-processing -lazy-evaluation -sec-id -identification-data.frame -keeping-all-matches -filtering-data -adding-identification-data-to-raw-data -an-identification-annotated-chromatogram -visualising-peptide-spectrum-matches -comparing-spectra -summary-exercise -exploration-and-assessment-of-identifications-using-msnid -step-by-step-work-flow -analysis-of-peptide-sequences -trimming-the-data -parent-ion-mass-errors -filtering-criteria -setting-filters -filter-optimisation -export-msnid-data -sec-quant -quantitation-methodologies -label-free-ms2-spectral-counting -labelled-ms2-isobaric-tagging -label-free-ms1-extracted-ion-chromatograms -labelled-ms1-silac -sec-qf -the-qfeatures-class -feature-aggregation -subsetting-and-filtering -creating-qfeatures-object -analysis-pipeline -missing-values -imputation -identification-quality-control -creating-the-qfeatures-data -filtering-out-contaminants-and-reverse-hits -log-transformation-and-normalisation -aggregation -principal-component-analysis -visualisation -statistical-analysis -summary-exercice -sec-anx -sec-raw2 -sec-id2 -sec-si -additional-materials -questions-and-help -session-information diff --git a/docs/sec-anx.html b/docs/sec-anx.html deleted file mode 100644 index c8e96b8..0000000 --- a/docs/sec-anx.html +++ /dev/null @@ -1,531 +0,0 @@ - - - - - - - - - - - - - - - - - - - - -Chapter 6 Annex | R for Mass Spectrometry - - - - - - - - - - - - - - - - - - - - - - - -
-
- -
-

-Chapter 6 Annex

-
-

-6.1 Raw MS data under the hood: the mzR package
-

-

The mzR package is a direct interface to the -proteowizard code base. It -includes a substantial proportion of pwiz’s C/C++ code for fast and -efficient parsing of these large raw data files.

-

Let’s start by using some raw data files from the msdata package. After loading it, we use the proteomics() function to return the full file names of the example raw data files. We will focus on the TMT Erwinia file, selected by pattern matching below.

-
f <- msdata::proteomics(full.names = TRUE)
-f
-
## [1] "/home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/proteomics/MRM-standmix-5.mzML.gz"                                                
-## [2] "/home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/proteomics/MS3TMT10_01022016_32917-33481.mzML.gz"                                 
-## [3] "/home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/proteomics/MS3TMT11.mzML"                                                         
-## [4] "/home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/proteomics/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML.gz"
-## [5] "/home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/proteomics/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML.gz"
-
(f2 <- grep("20141210", f, value = TRUE))
-
## [1] "/home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/proteomics/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML.gz"
-

The three main functions of mzR are

-
    -
  • -openMSfile to create a file handle to a raw data file
  • -
  • -header to extract metadata about the spectra contained in the file
  • -
  • -peaks to extract one or multiple spectra of interest.
  • -
-

Other functions, such as instrumentInfo() or runInfo(), can be used to gather general information about a run.

-
library("mzR")
-ms <- openMSfile(f2)
-ms
-
## Mass Spectrometry file handle.
-## Filename:  TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML.gz 
-## Number of scans:  7534
-
hd <- header(ms)
-dim(hd)
-
## [1] 7534   31
-
names(hd)
-
##  [1] "seqNum"                     "acquisitionNum"            
-##  [3] "msLevel"                    "polarity"                  
-##  [5] "peaksCount"                 "totIonCurrent"             
-##  [7] "retentionTime"              "basePeakMZ"                
-##  [9] "basePeakIntensity"          "collisionEnergy"           
-## [11] "ionisationEnergy"           "lowMZ"                     
-## [13] "highMZ"                     "precursorScanNum"          
-## [15] "precursorMZ"                "precursorCharge"           
-## [17] "precursorIntensity"         "mergedScan"                
-## [19] "mergedResultScanNum"        "mergedResultStartScanNum"  
-## [21] "mergedResultEndScanNum"     "injectionTime"             
-## [23] "filterString"               "spectrumId"                
-## [25] "centroided"                 "ionMobilityDriftTime"      
-## [27] "isolationWindowTargetMZ"    "isolationWindowLowerOffset"
-## [29] "isolationWindowUpperOffset" "scanWindowLowerLimit"      
-## [31] "scanWindowUpperLimit"
-
head(peaks(ms, 117))
-
##            mz intensity
-## [1,] 399.9976         0
-## [2,] 399.9991         0
-## [3,] 400.0006         0
-## [4,] 400.0021         0
-## [5,] 400.2955         0
-## [6,] 400.2970         0
-
str(peaks(ms, 1:5))
-
## List of 5
-##  $ : num [1:25800, 1:2] 400 400 400 400 400 ...
-##   ..- attr(*, "dimnames")=List of 2
-##   .. ..$ : NULL
-##   .. ..$ : chr [1:2] "mz" "intensity"
-##  $ : num [1:25934, 1:2] 400 400 400 400 400 ...
-##   ..- attr(*, "dimnames")=List of 2
-##   .. ..$ : NULL
-##   .. ..$ : chr [1:2] "mz" "intensity"
-##  $ : num [1:26148, 1:2] 400 400 400 400 400 ...
-##   ..- attr(*, "dimnames")=List of 2
-##   .. ..$ : NULL
-##   .. ..$ : chr [1:2] "mz" "intensity"
-##  $ : num [1:26330, 1:2] 400 400 400 400 400 ...
-##   ..- attr(*, "dimnames")=List of 2
-##   .. ..$ : NULL
-##   .. ..$ : chr [1:2] "mz" "intensity"
-##  $ : num [1:26463, 1:2] 400 400 400 400 400 ...
-##   ..- attr(*, "dimnames")=List of 2
-##   .. ..$ : NULL
-##   .. ..$ : chr [1:2] "mz" "intensity"
-
-

-► Question -

-
-

Let’s extract the index of the MS2 spectrum with the highest base peak -intensity and plot its spectrum. Is the data centroided or in profile -mode?

-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-

Pick an MS1 spectrum and visually check whether it is centroided or in -profile mode.

-

- -

-
-
-
-

-► Solution -

- -
-
-
-

-6.2 PSM data under the hood
-

-

There are two packages that can be used to parse mzIdentML files, -namely mzR (that we have already used for raw data) and mzID. The -major difference is that the former leverages C++ code from -proteowizard and is hence faster than the latter (which uses the -XML R package). They both work in similar ways.

-
|Data type      |File format |Data structure |Package |
-|:--------------|:-----------|:--------------|:-------|
-|Identification |mzIdentML   |mzRident       |mzR     |
-|Identification |mzIdentML   |mzID           |mzID    |
-

Which of these packages is used by PSM() can be defined by the -parser argument, as documented in ?PSM.
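For example, the mzID-based parser could be requested explicitly; a sketch (see ?PSM for the exact argument names and values):

id_mzid <- PSM(idf, parser = "mzID")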

-
-

-mzID
-

-

The main functions are mzID to read the data into a dedicated data -class and flatten to transform it into a data.frame.

-
idf
-
## [1] "/home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/ident/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid"
-
library("mzID")
-
## 
-## Attaching package: 'mzID'
-
## The following object is masked from 'package:purrr':
-## 
-##     flatten
-
## The following object is masked from 'package:dplyr':
-## 
-##     id
-
id <- mzID(idf)
-
## reading TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid... DONE!
-
id
-
## An mzID object
-## 
-## Software used:   MS-GF+ (version: Beta (v10072))
-## 
-## Rawfile:         /home/lg390/dev/01_svn/workflows/proteomics/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-## 
-## Database:        /home/lg390/dev/01_svn/workflows/proteomics/erwinia_carotovora.fasta
-## 
-## Number of scans: 5343
-## Number of PSM's: 5656
-

Various data can be extracted from the mzID object, using one of the -accessor functions such as database, software, scans, peptides, -… The object can also be converted into a data.frame using the -flatten function.
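A couple of these accessors in action (a sketch; they are assumed to return the corresponding sections of the mzIdentML file):

software(id)  ## search engine(s) used, with version
database(id)  ## the sequence database that was searched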

-
head(flatten(id))
-
##                                      spectrumid scan number(s) acquisitionnum
-## 1 controllerType=0 controllerNumber=1 scan=5782           5782           5782
-## 2 controllerType=0 controllerNumber=1 scan=6037           6037           6037
-## 3 controllerType=0 controllerNumber=1 scan=5235           5235           5235
-##   passthreshold rank calculatedmasstocharge experimentalmasstocharge
-## 1          TRUE    1               1080.232                 1080.233
-## 2          TRUE    1               1002.212                 1002.209
-## 3          TRUE    1               1189.280                 1189.284
-##   chargestate ms-gf:denovoscore ms-gf:evalue ms-gf:pepqvalue ms-gf:qvalue
-## 1           3               174 1.086033e-20               0            0
-## 2           3               245 1.988774e-19               0            0
-## 3           3               264 5.129649e-19               0            0
-##   ms-gf:rawscore ms-gf:specevalue assumeddissociationmethod isotopeerror
-## 1            147     3.764831e-27                       HCD            0
-## 2            214     6.902626e-26                       HCD            0
-## 3            211     1.778789e-25                       HCD            0
-##   isdecoy post pre end start accession length
-## 1   FALSE    S   R  84    50   ECA1932    155
-## 2   FALSE    R   K 315   288   ECA1147    434
-## 3   FALSE    A   R 224   192   ECA0013    295
-##                          description                              pepseq
-## 1         outer membrane lipoprotein PVQIQAGEDSNVIGALGGAVLGGFLGNTIGGGSGR
-## 2                     trigger factor        TQVLDGLINANDIEVPVALIDGEIDVLR
-## 3 ribose-binding periplasmic protein   TKGLNVMQNLLTAHPDVQAVFAQNDEMALGALR
-##   modified modification
-## 1    FALSE         <NA>
-## 2    FALSE         <NA>
-## 3    FALSE         <NA>
-##                                                                idFile
-## 1 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid
-## 2 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid
-## 3 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid
-##                                                          spectrumFile
-## 1 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-## 2 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-## 3 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-##               databaseFile
-## 1 erwinia_carotovora.fasta
-## 2 erwinia_carotovora.fasta
-## 3 erwinia_carotovora.fasta
-##  [ reached 'max' / getOption("max.print") -- omitted 3 rows ]
-
-
-

-mzR
-

-

The mzR package provides a similar interface. It is however much faster, as it does not read all the data into memory and only extracts relevant data on demand. It also has accessor functions such as softwareInfo, mzidInfo, … (use showMethods(classes = "mzRident", where = "package:mzR") to see all available methods).

-
library("mzR")
-id2 <- openIDfile(idf)
-id2
-
## Identification file handle.
-## Filename:  TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid 
-## Number of psms:  5759
-
softwareInfo(id2)
-
## [1] "MS-GF+ Beta (v10072) "                        
-## [2] "ProteoWizard MzIdentML 3.0.21263 ProteoWizard"
-

The identification data can be accessed as a data.frame with the -psms accessor.

-
head(psms(id2))
-
##                                      spectrumID chargeState rank passThreshold
-## 1 controllerType=0 controllerNumber=1 scan=5782           3    1          TRUE
-## 2 controllerType=0 controllerNumber=1 scan=6037           3    1          TRUE
-## 3 controllerType=0 controllerNumber=1 scan=5235           3    1          TRUE
-## 4 controllerType=0 controllerNumber=1 scan=5397           3    1          TRUE
-## 5 controllerType=0 controllerNumber=1 scan=6075           3    1          TRUE
-##   experimentalMassToCharge calculatedMassToCharge
-## 1                1080.2325              1080.2321
-## 2                1002.2089              1002.2115
-## 3                1189.2836              1189.2800
-## 4                 960.5365               960.5365
-## 5                1264.3409              1264.3419
-##                              sequence peptideRef modNum isDecoy post pre start
-## 1 PVQIQAGEDSNVIGALGGAVLGGFLGNTIGGGSGR       Pep1      0   FALSE    S   R    50
-## 2        TQVLDGLINANDIEVPVALIDGEIDVLR       Pep2      0   FALSE    R   K   288
-## 3   TKGLNVMQNLLTAHPDVQAVFAQNDEMALGALR       Pep3      0   FALSE    A   R   192
-## 4         SQILQQAGTSVLSQANQVPQTVLSLLR       Pep4      0   FALSE    -   R   264
-## 5 PIIGDNPFVVVLPDVVLDESTADQTQENLALLISR       Pep5      0   FALSE    F   R   119
-##   end DatabaseAccess DBseqLength DatabaseSeq
-## 1  84        ECA1932         155            
-## 2 315        ECA1147         434            
-## 3 224        ECA0013         295            
-## 4 290        ECA1731         290            
-## 5 153        ECA1443         298            
-##                                    DatabaseDescription scan.number.s.
-## 1                   ECA1932 outer membrane lipoprotein           5782
-## 2                               ECA1147 trigger factor           6037
-## 3           ECA0013 ribose-binding periplasmic protein           5235
-## 4                                    ECA1731 flagellin           5397
-## 5 ECA1443 UTP--glucose-1-phosphate uridylyltransferase           6075
-##   acquisitionNum
-## 1           5782
-## 2           6037
-## 3           5235
-## 4           5397
-## 5           6075
-##  [ reached 'max' / getOption("max.print") -- omitted 1 rows ]
- -
-
-
- -

- - -

-

Page built: -2023-09-06 - using -R version 4.3.1 Patched (2023-07-10 r84676) -

-
-
- - - - - diff --git a/docs/sec-id.html b/docs/sec-id.html deleted file mode 100644 index 7a0d6ae..0000000 --- a/docs/sec-id.html +++ /dev/null @@ -1,1735 +0,0 @@ - - - - - - - - - - - - - - - - - - - - -Chapter 4 Identification data | R for Mass Spectrometry - - - - - - - - - - - - - - - - - - - - - - - -
-
- - -
-

-Chapter 4 Identification data

-

Peptide identification is performed using third-party software - there is no package to run these searches directly in R. When using command-line search engines, it is possible to hard-code or automatically generate the search command lines and run them from R using a system() call. This allows one to generate these commands reproducibly (especially useful if many command lines need to be run) and to keep a record of the exact command in the R script.

-

The example below illustrates this for 3 mzML files to be searched -using MSGFplus:

-
(mzmls <- paste0("file_", 1:3, ".mzML"))
-
## [1] "file_1.mzML" "file_2.mzML" "file_3.mzML"
-
(mzids <- sub("mzML", "mzid", mzmls))
-
## [1] "file_1.mzid" "file_2.mzid" "file_3.mzid"
-
paste0("java -jar /path/to/MSGFPlus.jar",
-       " -s ", mzmls,
-       " -o ", mzids,
-       " -d uniprot.fas",
-       " -t 20ppm",
-       " -m 0",
-       " int 1")
-
## [1] "java -jar /path/to/MSGFPlus.jar -s file_1.mzML -o file_1.mzid -d uniprot.fas -t 20ppm -m 0 int 1"
-## [2] "java -jar /path/to/MSGFPlus.jar -s file_2.mzML -o file_2.mzid -d uniprot.fas -t 20ppm -m 0 int 1"
-## [3] "java -jar /path/to/MSGFPlus.jar -s file_3.mzML -o file_3.mzid -d uniprot.fas -t 20ppm -m 0 int 1"
-
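To actually run the searches rather than only printing the command lines, the strings can be stored and passed to system(); a sketch, assuming MSGFPlus.jar, the mzML files and uniprot.fas exist at the indicated paths (-inst selects the MS-GF+ instrument type):

cmds <- paste0("java -jar /path/to/MSGFPlus.jar",
               " -s ", mzmls,
               " -o ", mzids,
               " -d uniprot.fas -t 20ppm -m 0 -inst 1")
for (cmd in cmds) system(cmd)  ## run each search in turn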
-

-4.1 Identification data.frame
-

-

Let’s use the identification from msdata:

-
idf <- msdata::ident(full.names = TRUE)
-basename(idf)
-
## [1] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid"
-

The easiest way to read identification data in mzIdentML (often abbreviated to mzid) into R is with the PSM() function from the PSMatch package. The function will parse the file and return a DataFrame.

-
library(PSMatch)
-id <- PSM(idf)
-dim(id)
-
## [1] 5802   35
-
names(id)
-
##  [1] "sequence"                 "spectrumID"              
-##  [3] "chargeState"              "rank"                    
-##  [5] "passThreshold"            "experimentalMassToCharge"
-##  [7] "calculatedMassToCharge"   "peptideRef"              
-##  [9] "modNum"                   "isDecoy"                 
-## [11] "post"                     "pre"                     
-## [13] "start"                    "end"                     
-## [15] "DatabaseAccess"           "DBseqLength"             
-## [17] "DatabaseSeq"              "DatabaseDescription"     
-## [19] "scan.number.s."           "acquisitionNum"          
-## [21] "spectrumFile"             "idFile"                  
-## [23] "MS.GF.RawScore"           "MS.GF.DeNovoScore"       
-## [25] "MS.GF.SpecEValue"         "MS.GF.EValue"            
-## [27] "MS.GF.QValue"             "MS.GF.PepQValue"         
-## [29] "modPeptideRef"            "modName"                 
-## [31] "modMass"                  "modLocation"             
-## [33] "subOriginalResidue"       "subReplacementResidue"   
-## [35] "subLocation"
-
-

-► Question -

-
-

Verify that this table contains 5802 matches for 5343 scans and 4938 peptide sequences.

-

- -

-
-
-
-

-► Solution -

- -
-

The PSM data are read as is, without any filtering. As we can see -below, we still have all the hits from the forward and reverse (decoy) -databases.

-
table(id$isDecoy)
-
## 
-## FALSE  TRUE 
-##  2906  2896
-
-
-

-4.2 Keeping all matches
-

-

The data also contains multiple matches for several spectra. The table below shows the number of spectra that have 1, 2, … up to 5 matches.

-
table(table(id$spectrumID))
-
## 
-##    1    2    3    4    5 
-## 4936  369   26   10    2
-

Below, we can see how scan 1774 has 4 matches, all to sequence -RTRYQAEVR, which itself matches to 4 different proteins:

-
i <- which(id$spectrumID == "controllerType=0 controllerNumber=1 scan=1774")
-data.frame(id[i, ])[1:5]
-
##    sequence                                    spectrumID chargeState rank
-## 1 RTRYQAEVR controllerType=0 controllerNumber=1 scan=1774           2    1
-## 2 RTRYQAEVR controllerType=0 controllerNumber=1 scan=1774           2    1
-## 3 RTRYQAEVR controllerType=0 controllerNumber=1 scan=1774           2    1
-## 4 RTRYQAEVR controllerType=0 controllerNumber=1 scan=1774           2    1
-##   passThreshold
-## 1          TRUE
-## 2          TRUE
-## 3          TRUE
-## 4          TRUE
-

If the goal is to keep all the matches, but arranged by scan/spectrum, one can reduce the PSM object by the spectrumID variable, so that each scan corresponds to a single row that still stores all values:

-
id2 <- reducePSMs(id, id$spectrumID)
-id2
-
## Reduced PSM with 5343 rows and 35 columns.
-## names(35): sequence spectrumID ... subReplacementResidue subLocation
-

The resulting object contains a single entry for scan 1774 with -information for the multiple matches stored as lists within the cells.

-
j <- which(id2$spectrumID == "controllerType=0 controllerNumber=1 scan=1774")
-id2[j, ]
-
## Reduced PSM with 1 rows and 35 columns.
-## names(35): sequence spectrumID ... subReplacementResidue subLocation
-
id2[j, "DatabaseAccess"]
-
## CharacterList of length 1
-## [["controllerType=0 controllerNumber=1 scan=1774"]] ECA2104 ECA2867 ECA3427 ECA4142
-

This is the type of complete identification table that could be used to annotate a raw mass spectrometry Spectra object, as shown below.

-
-
-

-4.3 Filtering data
-

-

Often, the PSM data is filtered to only retain reliable matches. The MSnID package can be used to set thresholds to attain user-defined PSM, peptide or protein-level FDRs. Here, we will simply filter out unreliable identifications manually.

-

Here, the filter() function from the dplyr package comes in very handy. We will thus start by converting the DataFrame to a tibble.

-
library("dplyr")
-id_tbl <- tidyr::as_tibble(id)
-id_tbl
-
## # A tibble: 5,802 × 35
-##    sequence    spectrumID chargeState  rank passThreshold experimentalMassToCh…¹
-##    <chr>       <chr>            <int> <int> <lgl>                          <dbl>
-##  1 RQCRTDFLNY… controlle…           3     1 TRUE                            548.
-##  2 ESVALADQVT… controlle…           2     1 TRUE                           1288.
-##  3 KELLCLAMQI… controlle…           2     1 TRUE                            744.
-##  4 QRMARTSDKQ… controlle…           3     1 TRUE                            913.
-##  5 KDEGSTEPLK… controlle…           3     1 TRUE                            927.
-##  6 DGGPAIYGHE… controlle…           3     1 TRUE                            969.
-##  7 QRMARTSDKQ… controlle…           2     1 TRUE                           1369.
-##  8 CIDRARHVEV… controlle…           3     1 TRUE                           1285.
-##  9 CIDRARHVEV… controlle…           3     1 TRUE                           1285.
-## 10 VGRCRPIINY… controlle…           2     1 TRUE                           1102.
-## # ℹ 5,792 more rows
-## # ℹ abbreviated name: ¹​experimentalMassToCharge
-## # ℹ 29 more variables: calculatedMassToCharge <dbl>, peptideRef <chr>,
-## #   modNum <int>, isDecoy <lgl>, post <chr>, pre <chr>, start <int>, end <int>,
-## #   DatabaseAccess <chr>, DBseqLength <int>, DatabaseSeq <chr>,
-## #   DatabaseDescription <chr>, scan.number.s. <dbl>, acquisitionNum <dbl>,
-## #   spectrumFile <chr>, idFile <chr>, MS.GF.RawScore <dbl>, …
-
-

► Question

  • Remove decoy hits

► Solution
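One possible approach (a sketch, not necessarily the hidden solution), using the id_tbl tibble created above:

id_tbl <- filter(id_tbl, !isDecoy)
nrow(id_tbl)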

- -
-
-

► Question

  • Keep first rank matches

► Solution
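Continuing the sketch from above:

id_tbl <- filter(id_tbl, rank == 1)
nrow(id_tbl)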

- -
-
-

► Question

  • Remove shared peptides. Start by identifying scans that match different proteins. For example scan 4884 matches proteins XXX_ECA3406 and ECA3415. Scan 4099 matches XXX_ECA4416_1, XXX_ECA4416_2 and XXX_ECA4416_3. Then remove the scans that match any of these proteins.

► Solution
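A simplified sketch of the first step, flagging and removing the scans whose PSMs hit more than one protein (the exercise as stated goes further and removes all scans matching any of the flagged proteins, so the resulting numbers may differ from the hidden solution):

mltm <- id_tbl |>
    group_by(spectrumID) |>
    mutate(nProts = n_distinct(DatabaseAccess)) |>
    filter(nProts > 1) |>
    pull(spectrumID) |>
    unique()
id_tbl <- filter(id_tbl, !spectrumID %in% mltm)
nrow(id_tbl)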

- -
-

Which leaves us with 2666 PSMs.

-

This can also be achieved with the filterPSMs() function, or the individual filterPsmRank(), filterPsmDecoy() and filterPsmShared() functions:

-
id_filtered <- filterPSMs(id)
-
## Starting with 5802 PSMs:
-
## Removed 2896 decoy hits.
-
## Removed 155 PSMs with rank > 1.
-
## Removed 85 shared peptides.
-
## 2666 PSMs left.
-

The describePeptides() and describeProteins() functions from the PSMatch package provide useful summaries of peptides and proteins in a PSM search result.

  • describePeptides() gives the number of unique and shared peptides and, for the latter, the size of their protein groups:
-
describePeptides(id_filtered)
-
## 2324 peptides composed of
-
##  unique peptides: 2324
-
##  shared peptides (among protein):
-
##   ()
-
  • describeProteins() gives the number of proteins defined by only unique, only shared, or a mixture of unique/shared peptides:
-
describeProteins(id_filtered)
-
## 1466 proteins composed of
-
##  only unique peptides: 1466
-
##  only shared peptides: 0
-
##  unique and shared peptides: 0
-

The Understanding protein groups with adjacency matrices PSMatch vignette provides additional tools to explore how proteins were inferred from peptides.

-
-

► Question

Compare the distribution of raw identification scores of the decoy and non-decoy hits. Interpret the figure.

► Solution
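A sketch of one way to do this, using the unfiltered id table and the MS.GF.RawScore column shown in the tibble above:

library("ggplot2")
data.frame(id) |>
    ggplot(aes(x = MS.GF.RawScore, colour = isDecoy)) +
    geom_density()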

- -
-
-

► Question

The tidyverse tools are fit for data wrangling with identification data. Using the above identification dataframe, calculate the length of each peptide (you can use nchar with the peptide sequence) and the number of peptides for each protein (defined as DatabaseDescription). Plot the length of the proteins against their respective number of peptides.

► Solution
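A sketch, assuming the filtered tibble and the column names shown in the table above:

id_tbl |>
    mutate(peplen = nchar(sequence)) |>
    group_by(DatabaseDescription) |>
    summarise(protlen = median(DBseqLength),
              npep = n_distinct(sequence)) |>
    ggplot(aes(x = npep, y = protlen)) +
    geom_point() +
    scale_x_log10() +
    scale_y_log10()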

- -
-

If you would like to learn more about how the mzid data are handled by PSMatch via the mzR and mzID packages, check out section 6.2 in the annex.

-
-
-

4.4 Adding identification data to raw data

We are going to use the sp object created in the previous chapter and the id_filtered variable generated above.

Identification data (as a DataFrame) can be merged into raw data (as a Spectra object) by adding new spectra variables to the appropriate MS2 spectra. Scans and peptide-spectrum matches can be matched by their spectrum identifiers.

-
-

► Question

Identify the spectrum identifier columns in the sp and id_filtered variables.

► Solution
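A quick look at the two variables in question (these are the ones used in the joinSpectraData() call below):

head(sp$spectrumId)
head(id_filtered$spectrumID)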

- -
-

We still have several spectrum identifiers that are matched by more than one PSM:

-
table(table(id_filtered$spectrumID))
-
## 
-##    1    2    3    4 
-## 2630   13    2    1
-

Let’s look at "controllerType=0 controllerNumber=1 scan=5490", which has 4 matching PSMs, in detail.

-
which(table(id_filtered$spectrumID) == 4)
-
## controllerType=0 controllerNumber=1 scan=5490 
-##                                          1903
-
id_4 <- id_filtered[id_filtered$spectrumID == "controllerType=0 controllerNumber=1 scan=5490", ] %>%
-    as.data.frame()
-id_4
-
##           sequence                                    spectrumID chargeState
-## 1 KCNQCLKVACTLFYCK controllerType=0 controllerNumber=1 scan=5490           3
-## 2 KCNQCLKVACTLFYCK controllerType=0 controllerNumber=1 scan=5490           3
-##   rank passThreshold experimentalMassToCharge calculatedMassToCharge peptideRef
-## 1    1          TRUE                 698.6633               698.3315     Pep453
-## 2    1          TRUE                 698.6633               698.3315     Pep453
-##   modNum isDecoy post pre start end DatabaseAccess DBseqLength DatabaseSeq
-## 1      4   FALSE    C   K   127 142        ECA0668         302            
-## 2      4   FALSE    C   K   127 142        ECA0668         302            
-##            DatabaseDescription scan.number.s. acquisitionNum
-## 1 ECA0668 hypothetical protein           5490           5490
-## 2 ECA0668 hypothetical protein           5490           5490
-##                                                          spectrumFile
-## 1 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-## 2 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-##                                                                idFile
-## 1 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid
-## 2 TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid
-##   MS.GF.RawScore MS.GF.DeNovoScore MS.GF.SpecEValue MS.GF.EValue MS.GF.QValue
-## 1            -22                79     4.555588e-07     1.307689    0.9006211
-## 2            -22                79     4.555588e-07     1.307689    0.9006211
-##   MS.GF.PepQValue modPeptideRef         modName  modMass modLocation
-## 1       0.8901099        Pep453 Carbamidomethyl 57.02146           2
-## 2       0.8901099        Pep453 Carbamidomethyl 57.02146           5
-##   subOriginalResidue subReplacementResidue subLocation
-## 1               <NA>                  <NA>          NA
-## 2               <NA>                  <NA>          NA
-##  [ reached 'max' / getOption("max.print") -- omitted 2 rows ]
-

We can see that these 4 PSMs differ by the location of the Carbamidomethyl modification.

-
id_4[, c("modName", "modLocation")]
-
##           modName modLocation
-## 1 Carbamidomethyl           2
-## 2 Carbamidomethyl           5
-## 3 Carbamidomethyl          10
-## 4 Carbamidomethyl          15
-

Let’s reduce that PSM table before joining it to the Spectra object, to make sure we have unique one-to-one matches between the raw spectra and the PSMs.

-
id_filtered <- reducePSMs(id_filtered, id_filtered$spectrumID)
-id_filtered
-
## Reduced PSM with 2646 rows and 35 columns.
-## names(35): sequence spectrumID ... subReplacementResidue subLocation
-

The two objects can thus simply be joined using:

-
sp <- joinSpectraData(sp, id_filtered,
-                      by.x = "spectrumId",
-                      by.y = "spectrumID")
-spectraVariables(sp)
-
##  [1] "msLevel"                  "rtime"                   
-##  [3] "acquisitionNum"           "scanIndex"               
-##  [5] "dataStorage"              "dataOrigin"              
-##  [7] "centroided"               "smoothed"                
-##  [9] "polarity"                 "precScanNum"             
-## [11] "precursorMz"              "precursorIntensity"      
-## [13] "precursorCharge"          "collisionEnergy"         
-## [15] "isolationWindowLowerMz"   "isolationWindowTargetMz" 
-## [17] "isolationWindowUpperMz"   "peaksCount"              
-## [19] "totIonCurrent"            "basePeakMZ"              
-## [21] "basePeakIntensity"        "ionisationEnergy"        
-## [23] "lowMZ"                    "highMZ"                  
-## [25] "mergedScan"               "mergedResultScanNum"     
-## [27] "mergedResultStartScanNum" "mergedResultEndScanNum"  
-## [29] "injectionTime"            "filterString"            
-## [31] "spectrumId"               "ionMobilityDriftTime"    
-## [33] "scanWindowLowerLimit"     "scanWindowUpperLimit"    
-## [35] "rtime_minute"             "sequence"                
-## [37] "chargeState"              "rank"                    
-## [39] "passThreshold"            "experimentalMassToCharge"
-## [41] "calculatedMassToCharge"   "peptideRef"              
-## [43] "modNum"                   "isDecoy"                 
-## [45] "post"                     "pre"                     
-## [47] "start"                    "end"                     
-## [49] "DatabaseAccess"           "DBseqLength"             
-## [51] "DatabaseSeq"              "DatabaseDescription"     
-## [53] "scan.number.s."           "acquisitionNum.y"        
-## [55] "spectrumFile"             "idFile"                  
-## [57] "MS.GF.RawScore"           "MS.GF.DeNovoScore"       
-## [59] "MS.GF.SpecEValue"         "MS.GF.EValue"            
-## [61] "MS.GF.QValue"             "MS.GF.PepQValue"         
-## [63] "modPeptideRef"            "modName"                 
-## [65] "modMass"                  "modLocation"             
-## [67] "subOriginalResidue"       "subReplacementResidue"   
-## [69] "subLocation"
-
-

► Question

Verify that the identification data has been added to the correct spectra.

► Solution
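One possible sanity check (a sketch): identified sequences should only be present in MS2 scans, and the scan numbers recorded in the raw and identification data should agree.

table(msLevel(sp), !is.na(sp$sequence))
all(sp$acquisitionNum[!is.na(sp$sequence)] ==
    sp$acquisitionNum.y[!is.na(sp$sequence)])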

- -
-
-
-

4.5 An identification-annotated chromatogram

Now that we have combined raw data and their associated peptide-spectrum matches, we can produce an improved total ion chromatogram, identifying MS1 scans that lead to successful identifications.

The countIdentifications() function is going to tally the number of identifications (i.e. non-missing characters in the sequence spectra variable) for each scan. In the case of MS2 scans, these will be either 1 or 0, depending on the presence of a sequence. For MS1 scans, the function will count the number of sequences for the descendant MS2 scans, i.e. those produced from precursor ions from each MS1 scan.

-
sp <- countIdentifications(sp)
-

Below, we see on the second line that 3457 MS2 scans lead to no PSM, while 2646 lead to an identification. Among all MS1 scans, 833 lead to no MS2 scans with PSMs. 30 MS1 scans generated one MS2 scan that lead to a PSM, 45 lead to two PSMs, …

-
table(msLevel(sp), sp$countIdentifications)
-
##    
-##        0    1    2    3    4    5    6    7    8    9   10
-##   1  833   30   45   97  139  132   92   42   17    3    1
-##   2 3457 2646    0    0    0    0    0    0    0    0    0
-

These data can also be visualised on the total ion chromatogram:

-
sp |>
-filterMsLevel(1) |>
-spectraData() |>
-as_tibble() |>
-ggplot(aes(x = rtime,
-           y = totIonCurrent)) +
-    geom_line(alpha = 0.25) +
-    geom_point(aes(colour = ifelse(countIdentifications == 0,
-                                   NA, countIdentifications)),
-               size = 0.75,
-               alpha = 0.5) +
-    labs(colour = "Number of ids")
-
-

-

-
-
-
-

4.6 Visualising peptide-spectrum matches

Let’s choose an MS2 spectrum with a high identification score and plot it.

-
i <- which(sp$MS.GF.RawScore > 100)[1]
-plotSpectra(sp[i])
-

-

We have seen above that we can add labels to each peak using the labels argument in plotSpectra(). The addFragments() function takes a spectrum as input (that is a Spectra object of length 1) and annotates its peaks.

-
addFragments(sp[i])
-
##   [1] NA    NA    NA    "b1"  NA    NA    NA    NA    NA    NA    NA    NA   
-##  [13] NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA   
-##  [25] NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA   
-##  [37] NA    NA    NA    NA    NA    NA    NA    "y1_" NA    NA    NA    NA   
-##  [49] NA    "y1"  NA    NA    NA    NA    NA    NA    NA    NA    NA    NA   
-##  [61] NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA   
-##  [73] NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA   
-##  [85] NA    NA    "b2"  NA    NA    NA    NA    NA    NA    NA    NA    NA   
-##  [97] NA    NA    NA    NA   
-##  [ reached getOption("max.print") -- omitted 227 entries ]
-

It can be directly used with plotSpectra():

-
plotSpectra(sp[i], labels = addFragments,
-            labelPos = 3, labelCol = "steelblue")
-

-

When a precursor peptide ion is fragmented in a CID cell, it breaks at specific bonds, producing sets of peaks (a, b, c and x, y, z) that can be predicted.

-
-

Peptide fragmentation.
-

The annotation of spectra is obtained by simulating fragmentation of a peptide and matching observed peaks to fragments:

-
sp[i]$sequence
-
## [1] "THSQEEMQHMQR"
-
calculateFragments(sp[i]$sequence)
-
## Modifications used: C=57.02146
-
##           mz ion type pos z         seq
-## 1   102.0550  b1    b   1 1           T
-## 2   239.1139  b2    b   2 1          TH
-## 3   326.1459  b3    b   3 1         THS
-## 4   454.2045  b4    b   4 1        THSQ
-## 5   583.2471  b5    b   5 1       THSQE
-## 6   712.2897  b6    b   6 1      THSQEE
-## 7   843.3301  b7    b   7 1     THSQEEM
-## 8   971.3887  b8    b   8 1    THSQEEMQ
-## 9  1108.4476  b9    b   9 1   THSQEEMQH
-## 10 1239.4881 b10    b  10 1  THSQEEMQHM
-## 11 1367.5467 b11    b  11 1 THSQEEMQHMQ
-## 12  175.1190  y1    y   1 1           R
-## 13  303.1775  y2    y   2 1          QR
-## 14  434.2180  y3    y   3 1         MQR
-## 15  571.2769  y4    y   4 1        HMQR
-## 16  699.3355  y5    y   5 1       QHMQR
-##  [ reached 'max' / getOption("max.print") -- omitted 42 rows ]
-
-
-

4.7 Comparing spectra

The compareSpectra() function can be used to compare spectra (by default, computing the normalised dot product).

-
-

► Question

  1. Create a new Spectra object containing the MS2 spectra with sequences "SQILQQAGTSVLSQANQVPQTVLSLLR" and "TKGLNVMQNLLTAHPDVQAVFAQNDEMALGALR".

► Solution
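A sketch, selecting the annotated MS2 scans whose sequence matches either peptide (assuming the joined sp object from above):

k <- which(sp$sequence %in% c("SQILQQAGTSVLSQANQVPQTVLSLLR",
                              "TKGLNVMQNLLTAHPDVQAVFAQNDEMALGALR"))
sp_k <- sp[k]
sp_k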

- -
-
-

► Question

  2. Calculate the 5 by 5 similarity matrix between all spectra using compareSpectra(). See the ?Spectra man page for details. Draw a heatmap of that matrix.

► Solution
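A sketch, assuming the 5-spectra object sp_k created in the previous sketch:

mat <- compareSpectra(sp_k)
rownames(mat) <- colnames(mat) <- strtrim(sp_k$sequence, 3)
mat
heatmap(mat)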

- -
-
-

► Question

  3. Compare the spectra with the plotting function seen previously.

► Solution
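A sketch, assuming plotSpectraMirror() from the Spectra package is the plotting function referred to, applied to the first two spectra:

plotSpectraMirror(sp_k[1], sp_k[2])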

- -
-
-
-

4.8 Summary exercise

► Question

Download the first 3 mzML and mzID files from the PXD022816 project (Morgenstern, Barzilay, and Levin 2021, “RawBeans: A Simple, Vendor-Independent, Raw-Data Quality-Control Tool”, Journal of Proteome Research, https://doi.org/10.1021/acs.jproteome.0c00956).

► Solution
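A sketch using the rpx package seen in the introduction chapter (the exact file names depend on the project content):

library("rpx")
px2 <- PXDataset("PXD022816")
fns <- pxfiles(px2)
(mzmls <- pxget(px2, grep("mzML", fns, value = TRUE)[1:3]))
(mzids <- pxget(px2, grep("mzid", fns, value = TRUE)[1:3]))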

- -
-
-

► Question

Generate a Spectra object and a table of filtered PSMs. Visualise the total ion chromatograms and check the quality of the identification data by comparing the density of the decoy and target PSMs id scores for each file.

► Solution

- -
-
-

► Question

Join the raw and identification data. Beware though that the joining must now be performed by spectrum ids and by files.

► Solution

- -
-
-

► Question

Extract the PSMs that have been matched to peptides from protein O43175 and compare and cluster the scans. Hint: once you have created the smaller Spectra object with the scans of interest, switch to an in-memory backend to speed up the calculations.

► Solution
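A sketch of the hint, assuming the joined object is called sp and that the protein accession is stored in the DatabaseAccess spectra variable; MsBackendMemory() is one of the in-memory backends provided by the Spectra package:

sp_prot <- sp[which(sp$DatabaseAccess == "O43175")]
sp_prot <- setBackend(sp_prot, MsBackendMemory())
sp_prot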

- -
-
-

► Question

Generate total ion chromatograms for each acquisition and annotate the MS1 scans with the number of PSMs using the countIdentifications() function, as shown above. The function will automatically perform the counts in parallel for each acquisition.

► Solution

- -
-
-
-

4.9 Exploration and Assessment of Identifications using MSnID

The MSnID package extracts MS/MS ID data from mzIdentML (leveraging the mzID package) or text files. After collating the search results from multiple datasets it assesses their identification quality and optimises filtering criteria to achieve the maximum number of identifications while not exceeding a specified false discovery rate. It also contains a number of utilities to explore the MS/MS results and assess missed and irregular enzymatic cleavages, mass measurement accuracy, etc.

-
-

4.9.1 Step-by-step work-flow

Let’s reproduce parts of the analysis described in the MSnID vignette. You can explore more with

-
vignette("msnid_vignette", package = "MSnID")
-

The MSnID package can be used for post-search filtering of MS/MS identifications. One starts with the construction of an MSnID object that is populated with identification results that can be imported from a data.frame or from mzIdentML files. Here, we will use the example identification data provided with the package.

-
mzids <- system.file("extdata", "c_elegans.mzid.gz", package="MSnID")
-basename(mzids)
-
## [1] "c_elegans.mzid.gz"
-

We start by loading the package, initialising the MSnID object, and adding the identification result from our mzid file (there could of course be more than one).

-
library("MSnID")
-
## 
-## Attaching package: 'MSnID'
-
## The following object is masked from 'package:ProtGenerics':
-## 
-##     peptides
-
msnid <- MSnID(".")
-
## Note, the anticipated/suggested columns in the
-## peptide-to-spectrum matching results are:
-## -----------------------------------------------
-## accession
-## calculatedMassToCharge
-## chargeState
-## experimentalMassToCharge
-## isDecoy
-## peptide
-## spectrumFile
-## spectrumID
-
msnid <- read_mzIDs(msnid, mzids)
-
## Loaded cached data
-
show(msnid)
-
## MSnID object
-## Working directory: "."
-## #Spectrum Files:  1 
-## #PSMs: 12263 at 36 % FDR
-## #peptides: 9489 at 44 % FDR
-## #accessions: 7414 at 76 % FDR
-

Printing the MSnID object returns some basic information such as

  • Working directory.
  • Number of spectrum files used to generate data.
  • Number of peptide-to-spectrum matches and corresponding FDR.
  • Number of unique peptide sequences and corresponding FDR.
  • Number of unique proteins or amino acid sequence accessions and corresponding FDR.

The package then makes it possible to define, optimise and apply filtering based for example on missed cleavages, identification scores, precursor mass errors, etc. and to assess PSM, peptide and protein FDR levels. To function properly, it expects to have access to the following data

-
## [1] "accession"                "calculatedMassToCharge"  
-## [3] "chargeState"              "experimentalMassToCharge"
-## [5] "isDecoy"                  "peptide"                 
-## [7] "spectrumFile"             "spectrumID"
-

which are indeed present in our data:

-
names(msnid)
-
##  [1] "spectrumID"                "scan number(s)"           
-##  [3] "acquisitionNum"            "passThreshold"            
-##  [5] "rank"                      "calculatedMassToCharge"   
-##  [7] "experimentalMassToCharge"  "chargeState"              
-##  [9] "MS-GF:DeNovoScore"         "MS-GF:EValue"             
-## [11] "MS-GF:PepQValue"           "MS-GF:QValue"             
-## [13] "MS-GF:RawScore"            "MS-GF:SpecEValue"         
-## [15] "AssumedDissociationMethod" "IsotopeError"             
-## [17] "isDecoy"                   "post"                     
-## [19] "pre"                       "end"                      
-## [21] "start"                     "accession"                
-## [23] "length"                    "description"              
-## [25] "pepSeq"                    "modified"                 
-## [27] "modification"              "idFile"                   
-## [29] "spectrumFile"              "databaseFile"             
-## [31] "peptide"
-

Here, we summarise a few steps and redirect the reader to the package’s vignette for more details:

-
-
-

4.9.2 Analysis of peptide sequences

Cleaning irregular cleavages at the termini of the peptides and missed cleavage sites within the peptide sequences. The following two function calls create the new numMissCleavages and numIrregCleavages columns in the MSnID object

-
msnid <- assess_termini(msnid, validCleavagePattern="[KR]\\.[^P]")
-msnid <- assess_missed_cleavages(msnid, missedCleavagePattern="[KR](?=[^P$])")
-
-
-

4.9.3 Trimming the data

Now, we can use the apply_filter() function to effectively apply filters. The strings passed to the function represent expressions that will be evaluated, thus keeping only PSMs that have 0 irregular cleavages and 2 or fewer missed cleavages.

-
msnid <- apply_filter(msnid, "numIrregCleavages == 0")
-msnid <- apply_filter(msnid, "numMissCleavages <= 2")
-show(msnid)
-
## MSnID object
-## Working directory: "."
-## #Spectrum Files:  1 
-## #PSMs: 7838 at 17 % FDR
-## #peptides: 5598 at 23 % FDR
-## #accessions: 3759 at 53 % FDR
-
-
-

4.9.4 Parent ion mass errors

Using "calculatedMassToCharge" and "experimentalMassToCharge", the mass_measurement_error() function calculates the parent ion mass measurement error in parts per million.

-
summary(mass_measurement_error(msnid))
-
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-## -2184.0640    -0.6992     0.0000    17.6146     0.7512  2012.5178
-

We then filter out any matches that do not fit the +/- 20 ppm tolerance:

-
msnid <- apply_filter(msnid, "abs(mass_measurement_error(msnid)) < 20")
-summary(mass_measurement_error(msnid))
-
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-## -19.7797  -0.5866   0.0000  -0.2970   0.5713  19.6758
-
-
-

4.9.5 Filtering criteria

Filtering of the identification data will rely on

  • the -log10 transformed MS-GF+ Spectrum E-value, reflecting the goodness of match between experimental and theoretical fragmentation patterns
-
msnid$msmsScore <- -log10(msnid$`MS-GF:SpecEValue`)
-
    -
  • the absolute mass measurement error (in ppm units) of the parent ion
  • -
-
msnid$absParentMassErrorPPM <- abs(mass_measurement_error(msnid))
-
-
-

4.9.6 Setting filters

MS2 filters are handled by dedicated MSnIDFilter objects, where each individual filter is defined by a name (one present in names(msnid)), a comparison operator (>, <, =, …) defining whether hits above or below the threshold should be retained, and the threshold value itself.

-
filtObj <- MSnIDFilter(msnid)
-filtObj$absParentMassErrorPPM <- list(comparison="<", threshold=10.0)
-filtObj$msmsScore <- list(comparison=">", threshold=10.0)
-show(filtObj)
-
## MSnIDFilter object
-## (absParentMassErrorPPM < 10) & (msmsScore > 10)
-

We can then evaluate the filter on the identification data object, which returns the false discovery rate and number of retained identifications for the filtering criteria at hand.

-
evaluate_filter(msnid, filtObj)
-
##           fdr    n
-## PSM         0 3807
-## peptide     0 2455
-## accession   0 1009
-
-
-

4.9.7 Filter optimisation

Rather than setting filtering values by hand, as shown above, these can be set automatically to meet a specific false discovery rate.

-
filtObj.grid <- optimize_filter(filtObj, msnid, fdr.max=0.01,
-                                method="Grid", level="peptide",
-                                n.iter=500)
-show(filtObj.grid)
-
## MSnIDFilter object
-## (absParentMassErrorPPM < 3) & (msmsScore > 7.4)
-
evaluate_filter(msnid, filtObj.grid)
-
##                   fdr    n
-## PSM       0.004097561 5146
-## peptide   0.006447651 3278
-## accession 0.021996616 1208
-

Filters can eventually be applied (rather than just evaluated) using the apply_filter() function.

-
msnid <- apply_filter(msnid, filtObj.grid)
-show(msnid)
-
## MSnID object
-## Working directory: "."
-## #Spectrum Files:  1 
-## #PSMs: 5146 at 0.41 % FDR
-## #peptides: 3278 at 0.64 % FDR
-## #accessions: 1208 at 2.2 % FDR
-

And finally, identifications that matched decoy and contaminant protein sequences are removed

-
msnid <- apply_filter(msnid, "isDecoy == FALSE")
-msnid <- apply_filter(msnid, "!grepl('Contaminant',accession)")
-show(msnid)
-
## MSnID object
-## Working directory: "."
-## #Spectrum Files:  1 
-## #PSMs: 5117 at 0 % FDR
-## #peptides: 3251 at 0 % FDR
-## #accessions: 1179 at 0 % FDR
-
-
-

4.9.8 Export MSnID data

The resulting filtered identification data can be exported to a data.frame (or to a dedicated MSnSet data structure from the MSnbase package for quantitative MS data, described below) and further processed and analysed using appropriate statistical tests.

-
head(psms(msnid))
-
##   spectrumID scan number(s) acquisitionNum passThreshold rank
-## 1 index=7151           8819           7151          TRUE    1
-## 2 index=8520          10419           8520          TRUE    1
-##   calculatedMassToCharge experimentalMassToCharge chargeState MS-GF:DeNovoScore
-## 1               1270.318                 1270.318           3               287
-## 2               1426.737                 1426.739           3               270
-##   MS-GF:EValue MS-GF:PepQValue MS-GF:QValue MS-GF:RawScore MS-GF:SpecEValue
-## 1 1.709082e-24               0            0            239     1.007452e-31
-## 2 3.780745e-24               0            0            230     2.217275e-31
-##   AssumedDissociationMethod IsotopeError isDecoy post pre end start accession
-## 1                       CID            0   FALSE    A   K 283   249   CE02347
-## 2                       CID            0   FALSE    A   K 182   142   CE07055
-##   length
-## 1    393
-## 2    206
-##                                                                                                                           description
-## 1 WBGene00001993; locus:hpd-1; 4-hydroxyphenylpyruvate dioxygenase; status:Confirmed; UniProt:Q22633; protein_id:CAA90315.1; T21C12.2
-## 2           WBGene00001755; locus:gst-7; glutathione S-transferase; status:Confirmed; UniProt:P91253; protein_id:AAB37846.1; F11G11.2
-##                                      pepSeq modified modification
-## 1       AISQIQEYVDYYGGSGVQHIALNTSDIITAIEALR    FALSE         <NA>
-## 2 SAGSGYLVGDSLTFVDLLVAQHTADLLAANAALLDEFPQFK    FALSE         <NA>
-##              idFile                                   spectrumFile
-## 1 c_elegans.mzid.gz c_elegans_A_3_1_21Apr10_Draco_10-03-04_dta.txt
-## 2 c_elegans.mzid.gz c_elegans_A_3_1_21Apr10_Draco_10-03-04_dta.txt
-##               databaseFile                                       peptide
-## 1 ID_004174_E48C5B52.fasta       K.AISQIQEYVDYYGGSGVQHIALNTSDIITAIEALR.A
-## 2 ID_004174_E48C5B52.fasta K.SAGSGYLVGDSLTFVDLLVAQHTADLLAANAALLDEFPQFK.A
-##   numIrregCleavages numMissCleavages msmsScore absParentMassErrorPPM
-## 1                 0                0  30.99678             0.3843772
-## 2                 0                0  30.65418             1.3689451
-##  [ reached 'max' / getOption("max.print") -- omitted 4 rows ]
- -
-
-
-
-
-
    -
  1. Previously named PSM.↩︎
  2. The rownames aren’t needed here and are removed to reduce the output of the next code chunk displaying parts of id2.↩︎
- - -

- - -

-


-
-
diff --git a/docs/sec-msintro.html b/docs/sec-msintro.html deleted file mode 100644 index 84ee364..0000000 --- a/docs/sec-msintro.html +++ /dev/null @@ -1,410 +0,0 @@

Chapter 2 Introduction

2.1 How does mass spectrometry work?

Mass spectrometry (MS) is a technology that separates charged molecules (ions) based on their mass to charge ratio (M/Z). It is often coupled to chromatography (liquid LC, but can also be gas-based GC). The time an analyte takes to elute from the chromatography column is the retention time.

-
- -

Figure 2.1: A chromatogram, illustrating the total amount of analytes over the retention time.
-

A mass spectrometer is composed of three components:

-
    -
  1. The source, that ionises the molecules: examples are Matrix-assisted laser desorption/ionisation (MALDI) or electrospray ionisation (ESI).
  2. The analyser, that separates the ions: Time of flight (TOF) or Orbitrap.
  3. The detector, that quantifies the ions.
-

When using mass spectrometry for proteomics, the proteins are first digested with a protease such as trypsin. In mass shotgun proteomics, the analytes assayed in the mass spectrometer are peptides.

-

Often, ions are subjected to more than a single MS round. After a first round of separation, the peaks in the spectra, called MS1 spectra, represent peptides. At this stage, the only information we possess about these peptides are their retention time and their mass-to-charge ratio (we can also infer their charge by inspecting their isotopic envelope, i.e. the peaks of the individual isotopes, see below), which is not enough to infer their identity (i.e. their sequence).

-

In MSMS (or MS2), the settings of the mass spectrometer are set automatically to select a certain number of MS1 peaks (for example 20)1. Once a narrow M/Z range has been selected (corresponding to one high-intensity peak, a peptide, and some background noise), it is fragmented (using for example collision-induced dissociation (CID), higher energy collisional dissociation (HCD) or electron-transfer dissociation (ETD)). The fragment ions are then themselves separated in the analyser to produce an MS2 spectrum. The unique fragment ion pattern can then be used to infer the peptide sequence using de novo sequencing (when the spectrum is of high enough quality) or using a search engine such as, for example, Mascot, MSGF+, …, that will match the observed, experimental spectrum to theoretical spectra (see details below).

-
- -

Figure 2.2: Schematics of a mass spectrometer and two rounds of MS.
-

The animation below shows how 25 different ions (i.e. having different M/Z values) are separated throughout the MS analysis and are eventually detected (i.e. quantified). The final frame shows the hypothetical spectrum.

-
- -

Figure 2.3: Separation and detection of ions in a mass spectrometer.
-

The figures below illustrate the two rounds of MS. The spectrum on the left is an MS1 spectrum acquired after 21 minutes and 3 seconds of elution. 10 peaks, highlighted by dotted vertical lines, were selected for MS2 analysis. The peak at M/Z 460.79 (488.8) is highlighted by a red (orange) vertical line on the MS1 spectrum and the fragment spectra are shown on the MS2 spectrum on the top (bottom) right figure.

-
- -

Figure 2.4: Parent ions in the MS1 spectrum (left) and two selected fragment ion MS2 spectra (right).
-

The figures below represent the 3 dimensions of MS data: a set of spectra (M/Z and intensity) over retention time, as well as the interleaved nature of MS1 and MS2 (and there could be more levels) data.

-
- -

Figure 2.5: MS1 spectra over retention time.
-
- -

Figure 2.6: MS2 spectra interleaved between two MS1 spectra.
-
-
-

2.2 Accessing data

From the ProteomeXchange database

MS-based proteomics data is disseminated through the ProteomeXchange infrastructure, which centrally coordinates submission, storage and dissemination through multiple data repositories, such as the PRoteomics IDEntifications (PRIDE) database at the EBI for mass spectrometry-based experiments (including quantitative data, contrary to what the name suggests), PASSEL at the ISB for Selected Reaction Monitoring (SRM, i.e. targeted) data and the MassIVE resource. These data can be downloaded within R using the rpx package.

-
library("rpx")
-

Using the unique PXD000001 identifier, we can retrieve the relevant metadata that will be stored in a PXDataset object. The names of the files available in this data can be retrieved with the pxfiles() accessor function.

-
px <- PXDataset("PXD000001")
-
## Loading PXD000001 from cache.
-
px
-
## Project PXD000001 with 11 files
-## 
-
## Resource ID BFC225 in cache in /home/lgatto/.cache/R/rpx.
-
##  [1] 'F063721.dat' ... [11] 'erwinia_carotovora.fasta'
-##  Use 'pxfiles(.)' to see all files.
-
pxfiles(px)
-
## Project PXD000001 files (11):
-##  [remote] F063721.dat
-##  [local]  F063721.dat-mztab.txt
-##  [remote] PRIDE_Exp_Complete_Ac_22134.xml.gz
-##  [remote] PRIDE_Exp_mzData_Ac_22134.xml.gz
-##  [remote] PXD000001_mztab.txt
-##  [remote] README.txt
-##  [local]  TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-##  [remote] TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzXML
-##  [local]  TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzXML
-##  [remote] TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.raw
-##  ...
-

Other metadata for the px data set:

-
pxtax(px)
-
## [1] "Erwinia carotovora"
-
pxurl(px)
-
## [1] "ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2012/03/PXD000001"
-
pxref(px)
-
## [1] "Gatto L, Christoforou A; Using R and Bioconductor for proteomics data analysis., Biochim Biophys Acta, 2013 May 18, doi:10.1016/j.bbapap.2013.04.032 PMID:23692960"
-

Data files can then be downloaded with the pxget() function. Below, we retrieve the raw data file. The file is downloaded2 in the working directory and the name of the file is returned by the function and stored in the mzf variable for later use3.

-
fn <- "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML"
-mzf <- pxget(px, fn)
-
## Loading TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML from cache.
-
mzf
-
## [1] "/home/lgatto/.cache/R/rpx/8ee512042c5ff_TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML"
-
-
-

Data packages

Some data are also distributed through dedicated packages. The msdata package, for example, provides some general raw data files relevant for both proteomics and metabolomics.

-
library("msdata")
-## proteomics raw data
-proteomics()
-
## [1] "MRM-standmix-5.mzML.gz"                                                
-## [2] "MS3TMT10_01022016_32917-33481.mzML.gz"                                 
-## [3] "MS3TMT11.mzML"                                                         
-## [4] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML.gz"
-## [5] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML.gz"
-
## proteomics identification data
-ident()
-
## [1] "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzid"
-
## quantitative data
-quant()
-
## [1] "cptac_a_b_peptides.txt"
-

More often, such experiment packages distribute processed data; an example of such is the pRolocdata package, that offers quantitative proteomics data.

- -
-
-
-
-
-
    -
  1. Here, we will focus on data dependent acquisition (DDA), where MS1 peaks are selected. In data independent acquisition (DIA), all peaks in the MS1 spectrum are fragmented.↩︎
  2. If the file is already available, it is not downloaded a second time.↩︎
  3. This and other files are also available in the msdata package, described below.↩︎
-
- - -

- - -

-


-
-
diff --git a/docs/sec-quant.html b/docs/sec-quant.html deleted file mode 100644 index aae7f09..0000000 --- a/docs/sec-quant.html +++ /dev/null @@ -1,1695 +0,0 @@

Chapter 5 Quantitative data

5.1 Quantitation methodologies

There is a wide range of proteomics quantitation techniques that can broadly be classified as labelled vs. label-free, depending on whether the features are labelled prior to the MS acquisition, and by the MS level at which quantitation is inferred, namely MS1 or MS2.

      Label-free   Labelled
MS1   XIC          SILAC, 15N
MS2   Counting     iTRAQ, TMT
-
-

5.1.1 Label-free MS2: Spectral counting

In spectral counting, one simply counts the number of quantified peptides that are assigned to a protein.

-
- -

Figure 5.1: Spectral counting. Figure from the Pbase package.
-
-
-

5.1.2 Labelled MS2: Isobaric tagging

Isobaric tagging refers to labelling with isobaric tags, i.e. chemical tags that have the same mass and hence can’t be distinguished by the spectrometer. The peptides of different samples (4, 6, 10, 11 or 16) are labelled with different tags and combined prior to mass spectrometry acquisition. Given that they are isobaric, all identical peptides, irrespective of the tag and thus the sample of origin, are co-analysed, up to fragmentation prior to MS2 analysis. During fragmentation, the isobaric tags fall off, fragment themselves, and result in a set of sample-specific peaks. These specific peaks can be used to infer sample-specific quantitation, while the rest of the MS2 spectrum is used for identification.

-
- -

Figure 5.2: iTRAQ 4-plex isobaric tagging. Tandem Mass Tags (TMT) offer up to 16 tags.
-
-
-

5.1.3 Label-free MS1: extracted ion chromatograms

In label-free quantitation, the precursor peaks that match an identified peptide are integrated over retention time and the area under that extracted ion chromatogram is used to quantify that peptide in that sample.

-
- -

Figure 5.3: Label-free quantitation. Figure credit Johannes Rainer (https://github.com/jorainer/).
-
-
-

5.1.4 Labelled MS1: SILAC

In SILAC quantitation, samples are grown in a medium that contains heavy amino acids (typically arginine and lysine). All proteins grown in this heavy growth medium contain the heavy form of these amino acids. Two samples, one grown in heavy medium and one grown in normal (light) medium, are then combined and analysed together. The heavy peptide precursor peaks are systematically shifted compared to the light ones, and the ratio between the heights of a heavy and a light peak can be used to calculate peptide and protein fold-changes.

-
- -

Figure 5.4: SILAC quantitation. Figure credit Wikimedia Commons.
-

These different quantitation techniques come with their respective benefits and distinct challenges, such as large quantities of raw data processing, data transformation and normalisation, missing values, and different underlying statistical models for the quantitative data (count data for spectral counting, continuous data for the others).

-

In terms of raw data quantitation in R/Bioconductor, most efforts have been devoted to MS2-level quantitation. Label-free XIC quantitation has been addressed in the frame of metabolomics data processing by the xcms infrastructure.
-
-
-

5.2 QFeatures

Mass spectrometry-based quantitative proteomics data can be represented as a matrix of quantitative values for features (PSMs, peptides, proteins) arranged along the rows, measured for a set of samples, arranged along the columns. There is a common representation for such quantitative data sets, namely the SummarizedExperiment (Morgan et al. 2020, SummarizedExperiment: SummarizedExperiment Container, https://bioconductor.org/packages/SummarizedExperiment) class:

-
- -

Figure 5.5: Schematic representation of the anatomy of a SummarizedExperiment object. (Figure taken from the SummarizedExperiment package vignette.)
-
    -
  • The sample (columns) metadata can be accessed with the colData() function.
  • The features (rows) metadata can be accessed with the rowData() function.
  • If the features represent ranges along genomic coordinates, these can be accessed with rowRanges().
  • Additional metadata describing the overall experiment can be accessed with metadata().
  • The quantitative data can be accessed with assay().
  • assays() returns a list of matrix-like assays.
-
-

5.2.1 The QFeatures class

While mass spectrometers acquire data for spectra/peptides, the biological entities of interest are the proteins. As part of the data processing, we are thus required to aggregate low-level quantitative features into higher level data.

-
- -

Figure 5.6: Conceptual representation of a QFeatures object and the aggregative relation between different assays.
-

We are going to start to familiarise ourselves with the QFeatures class implemented in the QFeatures package. The class is derived from the Bioconductor MultiAssayExperiment (Ramos et al. 2017, “Software for the Integration of Multi-Omics Experiments in Bioconductor”, Cancer Research 77(21); e39-42) (MAE) class. Let’s start by loading the QFeatures package.

-
library("QFeatures")
-

Next, we load the feat1 test data, which is composed of a single assay of class SummarizedExperiment with 10 rows and 2 columns.

-
data(feat1)
-feat1
-
## An instance of class QFeatures containing 1 assays:
-##  [1] psms: SummarizedExperiment with 10 rows and 2 columns
-

Let’s perform some simple operations to familiarise ourselves with the QFeatures class:

  • Extract the sample metadata using the colData() accessor (like you have previously done with SummarizedExperiment objects).
-
colData(feat1)
-
## DataFrame with 2 rows and 1 column
-##        Group
-##    <integer>
-## S1         1
-## S2         2
-

We can also further annotate the experiment by adding columns to the colData slot:

-
colData(feat1)$X <- c("X1", "X2")
-feat1$Y <- c("Y1", "Y2")
-colData(feat1)
-
## DataFrame with 2 rows and 3 columns
-##        Group           X           Y
-##    <integer> <character> <character>
-## S1         1          X1          Y1
-## S2         2          X2          Y2
-
    -
  • Extract the first (and only) assay composing this QFeatures data using the [[ operator (as you have done to extract elements of a list) by using the assay’s index or name.
-
feat1[[1]]
-
## class: SummarizedExperiment 
-## dim: 10 2 
-## metadata(0):
-## assays(1): ''
-## rownames(10): PSM1 PSM2 ... PSM9 PSM10
-## rowData names(5): Sequence Protein Var location pval
-## colnames(2): S1 S2
-## colData names(0):
-
feat1[["psms"]]
-
## class: SummarizedExperiment 
-## dim: 10 2 
-## metadata(0):
-## assays(1): ''
-## rownames(10): PSM1 PSM2 ... PSM9 PSM10
-## rowData names(5): Sequence Protein Var location pval
-## colnames(2): S1 S2
-## colData names(0):
-
    -
  • Extract the psms assay’s row data and quantitative values.
  • -
-
assay(feat1[[1]])
-
##       S1 S2
-## PSM1   1 11
-## PSM2   2 12
-## PSM3   3 13
-## PSM4   4 14
-## PSM5   5 15
-## PSM6   6 16
-## PSM7   7 17
-## PSM8   8 18
-## PSM9   9 19
-## PSM10 10 20
-
rowData(feat1[[1]])
-
## DataFrame with 10 rows and 5 columns
-##            Sequence     Protein       Var      location      pval
-##         <character> <character> <integer>   <character> <numeric>
-## PSM1       SYGFNAAR       ProtA         1 Mitochondr...     0.084
-## PSM2       SYGFNAAR       ProtA         2 Mitochondr...     0.077
-## PSM3       SYGFNAAR       ProtA         3 Mitochondr...     0.063
-## PSM4       ELGNDAYK       ProtA         4 Mitochondr...     0.073
-## PSM5       ELGNDAYK       ProtA         5 Mitochondr...     0.012
-## PSM6       ELGNDAYK       ProtA         6 Mitochondr...     0.011
-## PSM7  IAEESNFPFI...       ProtB         7       unknown     0.075
-## PSM8  IAEESNFPFI...       ProtB         8       unknown     0.038
-## PSM9  IAEESNFPFI...       ProtB         9       unknown     0.028
-## PSM10 IAEESNFPFI...       ProtB        10       unknown     0.097
-
-
-

5.2.2 Feature aggregation

The central functionality of the QFeatures infrastructure is the aggregation of features into higher-level features while retaining the link between the different levels. This can be done with the aggregateFeatures() function.

The call below will

  • operate on the psms assay of the feat1 object;
  • aggregate the rows of the assay following the grouping defined in the Sequence row data variable;
  • perform aggregation using the colMeans() function;
  • create a new assay named peptides and add it to the feat1 object.
-
feat1 <- aggregateFeatures(feat1, i = "psms",
-                           fcol = "Sequence",
-                           name = "peptides",
-                           fun = colMeans)
-feat1
-
## An instance of class QFeatures containing 2 assays:
-##  [1] psms: SummarizedExperiment with 10 rows and 2 columns 
-##  [2] peptides: SummarizedExperiment with 3 rows and 2 columns
-
    -
  • Let’s convince ourselves that we understand the effect of feature aggregation and repeat the calculations manually and check the content of the new assay’s row data.
-
## SYGFNAAR
-colMeans(assay(feat1[[1]])[1:3, ])
-
## S1 S2 
-##  2 12
-
assay(feat1[[2]])["SYGFNAAR", ]
-
## S1 S2 
-##  2 12
-
## ELGNDAYK
-colMeans(assay(feat1[[1]])[4:6, ])
-
## S1 S2 
-##  5 15
-
assay(feat1[[2]])["ELGNDAYK", ]
-
## S1 S2 
-##  5 15
-
## IAEESNFPFIK
-colMeans(assay(feat1[[1]])[7:10, ])
-
##   S1   S2 
-##  8.5 18.5
-
assay(feat1[[2]])["IAEESNFPFIK", ]
-
##   S1   S2 
-##  8.5 18.5
-
rowData(feat1[[2]])
-
## DataFrame with 3 rows and 4 columns
-##                  Sequence     Protein      location        .n
-##               <character> <character>   <character> <integer>
-## ELGNDAYK         ELGNDAYK       ProtA Mitochondr...         3
-## IAEESNFPFIK IAEESNFPFI...       ProtB       unknown         4
-## SYGFNAAR         SYGFNAAR       ProtA Mitochondr...         3
-

We can now aggregate the peptide-level data into a new protein-level assay using the colMedians() aggregation function.

-
feat1 <- aggregateFeatures(feat1, i = "peptides",
-                           fcol = "Protein",
-                           name = "proteins",
-                           fun = colMedians)
-feat1
-
## An instance of class QFeatures containing 3 assays:
-##  [1] psms: SummarizedExperiment with 10 rows and 2 columns 
-##  [2] peptides: SummarizedExperiment with 3 rows and 2 columns 
-##  [3] proteins: SummarizedExperiment with 2 rows and 2 columns
-
assay(feat1[["proteins"]])
-
##        S1   S2
-## ProtA 3.5 13.5
-## ProtB 8.5 18.5
-
-
-

5.2.3 Subsetting and filtering

The link between the assays becomes apparent when we now subset the assays for protein A as shown below or using the subsetByFeature() function. This creates a new instance of class QFeatures containing assays with the expression data for the protein, its peptides and their PSMs.

-
feat1["ProtA", , ]
-
## An instance of class QFeatures containing 3 assays:
-##  [1] psms: SummarizedExperiment with 6 rows and 2 columns 
-##  [2] peptides: SummarizedExperiment with 2 rows and 2 columns 
-##  [3] proteins: SummarizedExperiment with 1 rows and 2 columns
-

The filterFeatures() function can be used to filter rows of the assays composing a QFeatures object using the row data variables. We can for example retain rows that have a pval < 0.05, which would only keep rows in the psms assay because the pval is only relevant for that assay.

-
filterFeatures(feat1, ~ pval < 0.05)
-
## 'pval' found in 1 out of 3 assay(s)
-## No filter applied to the following assay(s) because one or more filtering variables are missing in the rowData: peptides, proteins.
-## You can control whether to remove or keep the features using the 'keep' argument (see '?filterFeature').
-
## An instance of class QFeatures containing 3 assays:
-##  [1] psms: SummarizedExperiment with 4 rows and 2 columns 
-##  [2] peptides: SummarizedExperiment with 0 rows and 2 columns 
-##  [3] proteins: SummarizedExperiment with 0 rows and 2 columns
-
-

► Question

As the message above implies, it is also possible to apply a filter to only the assays that have the filtering variable, by setting the keep argument.

► Solution
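A sketch, using the keep argument mentioned in the message printed above:

filterFeatures(feat1, ~ pval < 0.05, keep = TRUE)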

- -
-

On the other hand, if we filter assay rows for those that localise to the mitochondrion, we retain the relevant protein, peptides and PSMs.

-
filterFeatures(feat1, ~ location == "Mitochondrion")
-
## 'location' found in 3 out of 3 assay(s)
-
## An instance of class QFeatures containing 3 assays:
-##  [1] psms: SummarizedExperiment with 6 rows and 2 columns 
-##  [2] peptides: SummarizedExperiment with 2 rows and 2 columns 
-##  [3] proteins: SummarizedExperiment with 1 rows and 2 columns
-
-

► Question

As an exercise, let’s filter rows that do not localise to the mitochondrion.

► Solution
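One possible formulation (a sketch):

filterFeatures(feat1, ~ location != "Mitochondrion")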

- -
-

You can refer to the Quantitative features for mass spectrometry data vignette and the QFeatures manual page for more details about the class.

-
-
-
-

5.3 Creating QFeatures object

While QFeatures objects can be created manually (see ?QFeatures for details), most users have their quantitative data in a spreadsheet or a data.frame. In such cases, the easiest is to use the readQFeatures() function to extract the quantitative data and metadata columns. Below, we load the hlpsms dataframe that contains data for 28 PSMs from the TMT-10plex hyperLOPIT spatial proteomics experiment from (Christoforou et al. 2016, “A Draft Map of the Mouse Pluripotent Stem Cell Spatial Proteome”, Nat Commun 7: 8992, https://doi.org/10.1038/ncomms9992). The ecol argument specifies that columns 1 to 10 contain quantitation data, and that the assay should be named psms in the returned QFeatures object, to reflect the nature of the data.

-
data(hlpsms)
-hl <- readQFeatures(hlpsms, ecol = 1:10, name = "psms")
-hl
-
## An instance of class QFeatures containing 1 assays:
-##  [1] psms: SummarizedExperiment with 3010 rows and 10 columns
-

Below, we see that we can extract an assay using its index or its name. The individual assays are stored as SummarizedExperiment objects and we can further access their quantitative data and metadata using the assay() and rowData() functions.

-
hl[[1]]
-
## class: SummarizedExperiment 
-## dim: 3010 10 
-## metadata(0):
-## assays(1): ''
-## rownames(3010): 1 2 ... 3009 3010
-## rowData names(18): Sequence ProteinDescriptions ... RTmin markers
-## colnames(10): X126 X127C ... X130N X131
-## colData names(0):
-
hl[["psms"]]
-
## class: SummarizedExperiment 
-## dim: 3010 10 
-## metadata(0):
-## assays(1): ''
-## rownames(3010): 1 2 ... 3009 3010
-## rowData names(18): Sequence ProteinDescriptions ... RTmin markers
-## colnames(10): X126 X127C ... X130N X131
-## colData names(0):
-
head(assay(hl[["psms"]]))
-
##         X126      X127C       X127N      X128C       X128N      X129C
-## 1 0.12283431 0.08045915 0.070804055 0.09386901 0.051815695 0.13034383
-## 2 0.35268185 0.14162381 0.167523880 0.07843497 0.071087436 0.03214548
-## 3 0.01546089 0.16142297 0.086938133 0.23120844 0.114664348 0.09610188
-## 4 0.04702854 0.09288723 0.102012167 0.11125409 0.067969116 0.14155358
-## 5 0.01044693 0.15866147 0.167315736 0.21017494 0.147946673 0.07088253
-## 6 0.04955362 0.01215244 0.002477681 0.01297833 0.002988949 0.06253195
-##        X129N       X130C      X130N       X131
-## 1 0.17540095 0.040068658 0.11478839 0.11961594
-## 2 0.06686260 0.031961793 0.02810434 0.02957384
-## 3 0.15977819 0.010127118 0.08059400 0.04370403
-## 4 0.18015910 0.035329902 0.12166589 0.10014038
-## 5 0.17555789 0.007088253 0.02884754 0.02307803
-## 6 0.01726511 0.172651119 0.37007905 0.29732174
-
head(rowData(hl[["psms"]]))
-
## DataFrame with 6 rows and 18 columns
-##      Sequence ProteinDescriptions NbProteins ProteinGroupAccessions
-##   <character>         <character>  <integer>            <character>
-## 1     SQGEIDk       Tetratrico...          1                 Q8BYY4
-## 2     YEAQGDk       Vacuolar p...          1                 P46467
-## 3     TTScDTk       C-type man...          1                 Q64449
-## 4     aEELESR       Liprin-alp...          1                 P60469
-##   Modifications    qValue       PEP  IonScore NbMissedCleavages
-##     <character> <numeric> <numeric> <integer>         <integer>
-## 1 K7(TMT6ple...     0.008   0.11800        27                 0
-## 2 K7(TMT6ple...     0.001   0.01070        27                 0
-## 3 C4(Carbami...     0.008   0.11800        11                 0
-## 4 N-Term(TMT...     0.002   0.04450        24                 0
-##   IsolationInterference IonInjectTimems Intensity    Charge      mzDa      MHDa
-##               <integer>       <integer> <numeric> <integer> <numeric> <numeric>
-## 1                     0              70    335000         2   503.274   1005.54
-## 2                     0              70    926000         2   520.267   1039.53
-## 3                     0              70    159000         2   521.258   1041.51
-## 4                     0              70    232000         2   531.785   1062.56
-##   DeltaMassPPM     RTmin       markers
-##      <numeric> <numeric>   <character>
-## 1        -0.38     24.02       unknown
-## 2         0.61     18.85       unknown
-## 3         1.11     10.17       unknown
-## 4         0.35     29.18       unknown
-##  [ reached getOption("max.print") -- omitted 2 rows ]
-

For further details on how to manipulate such objects, refer to the MultiAssayExperiment (Ramos et al. 2017) and SummarizedExperiment (Morgan et al. 2020) packages.

-

It is also possible to first create a SummarizedExperiment, and then only include it into a QFeatures object.

-
se <- readSummarizedExperiment(hlpsms, ecol = 1:10)
-se
-
## class: SummarizedExperiment 
-## dim: 3010 10 
-## metadata(0):
-## assays(1): ''
-## rownames(3010): 1 2 ... 3009 3010
-## rowData names(18): Sequence ProteinDescriptions ... RTmin markers
-## colnames(10): X126 X127C ... X130N X131
-## colData names(0):
-
QFeatures(list(psm = se))
-
## An instance of class QFeatures containing 1 assays:
-##  [1] psm: SummarizedExperiment with 3010 rows and 10 columns
-

At this stage, i.e. at the beginning of the analysis, whether you have a SummarizedExperiment or a QFeatures object, it is a good time to define the experimental design in the colData slot.

-
-

Exercise

The CPTAC spike-in study 6 (Paulovich et al. 2010, “Interlaboratory Study Characterizing a Yeast Performance Standard for Benchmarking LC-MS Platform Performance”, Mol. Cell. Proteomics 9 (2): 242–54) combines the Sigma UPS1 standard containing 48 different human proteins that are spiked in at 5 different concentrations (conditions A to E) into a constant yeast protein background. The samples were acquired in triplicate on different instruments in different labs. We are going to start with a subset of the CPTAC study 6 containing conditions A and B for a single lab.

-
- -

-Figure 5.7: The CPTAC spike-in study design (credit Lieven Clement, statOmics, Ghent University).
-

The peptide-level data, as processed by MaxQuant (Cox and Mann 2008Cox, J, and M Mann. 2008. “MaxQuant Enables High Peptide Identification Rates, Individualized p.p.b.-Range Mass Accuracies and Proteome-Wide Protein Quantification.” Nat Biotechnol 26 (12): 1367–72. https://doi.org/10.1038/nbt.1511.) is -available in the msdata package:

-
basename(f <- msdata::quant(pattern = "cptac", full.names = TRUE))
-
## [1] "cptac_a_b_peptides.txt"
-
-

-► Question -

-
-

Read these data in as either a SummarizedExperiment or a QFeatures -object.

-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-

Before proceeding, we are going to clean up the sample names by -removing the unnecessary Intensity prefix and annotate the -experiment in the object’s colData.

-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-

There are many row variables that aren’t useful here. Get rid of all of them but Sequence, Proteins, Leading.razor.protein, PEP, Score, Reverse, and Potential.contaminant.

-

- -

-
-
-
-

-► Solution -

- -
-
-
-
-

-5.4 Analysis pipeline
-

-

A typical quantitative proteomics data processing workflow is composed of the following steps, which we are going to apply to the cptac data created above.

-
  • Data import
  • Exploratory data analysis (PCA)
  • Missing data management (filtering and/or imputation)
  • Data cleaning
  • Transformation and normalisation
  • Aggregation
  • Downstream analysis
-
library("tidyverse")
-library("ggplot2")
-library("QFeatures")
-library("limma")
-
-

-5.4.1 Missing values
-

-

Missing values can be highly frequent in proteomics. They can arise for two types of reasons, namely biological or technical.

-
  1. Values that are missing due to the absence (or extremely low concentration) of a protein are observed for biological reasons, and their pattern isn’t random (MNAR). A protein missing due to the suppression of its expression will not be missing at random: it will be missing in the condition in which it was suppressed, and be present in the condition where it is expressed.

  2. Due to its data-dependent acquisition, mass spectrometry isn’t capable of assaying all peptides in a sample. Peptides that are less abundant than some of their co-eluting ions, peptides that do not ionise well or peptides that do not get identified might be sporadically missing in the final quantitation table, despite their presence in the biological samples. Their absence patterns are (completely) random (MAR or MCAR) in such cases.
-

Often, third-party software that produces quantitative data uses zeros instead of properly reporting missing values. We can use the zeroIsNA() function to replace the 0s by NAs in our cptac_se object and then explore the missing data patterns across columns and rows.

-
cptac_se <- zeroIsNA(cptac_se)
-nNA(cptac_se)
-
## $nNA
-## DataFrame with 1 row and 2 columns
-##         nNA       pNA
-##   <integer> <numeric>
-## 1     31130  0.452497
-## 
-## $nNArows
-## DataFrame with 11466 rows and 3 columns
-##                name       nNA       pNA
-##         <character> <integer> <numeric>
-## 1     AAAAGAGGAG...         4  0.666667
-## 2         AAAALAGGK         0  0.000000
-## 3        AAAALAGGKK         0  0.000000
-## 4     AAADALSDLE...         0  0.000000
-## 5     AAADALSDLE...         0  0.000000
-## ...             ...       ...       ...
-## 11462 YYSIYDLGNN...         6  1.000000
-## 11463 YYTFNGPNYN...         3  0.500000
-## 11464    YYTITEVATR         4  0.666667
-## 11465 YYTVFDRDNN...         6  1.000000
-## 11466 YYTVFDRDNN...         6  1.000000
-## 
-## $nNAcols
-## DataFrame with 6 rows and 3 columns
-##          name       nNA       pNA
-##   <character> <integer> <numeric>
-## 1        6A_7      4743  0.413658
-## 2        6A_8      5483  0.478196
-## 3        6A_9      5320  0.463980
-## 4        6B_7      4721  0.411739
-## 5        6B_8      5563  0.485174
-## 6        6B_9      5300  0.462236
-
- -

-Figure 5.8: Distribution of missing values (white). Peptide rows with more missing values are moved towards the top of the figure.
-

Let’s now explore these missing values:

-
    -
  • Explore the number or proportion of missing values across peptides -and samples of the cptac_se data.
  • -
-
barplot(nNA(cptac_se)$nNAcols$pNA)
-

-
table(nNA(cptac_se)$nNArows$nNA)
-
## 
-##    0    1    2    3    4    5    6 
-## 4059  990  884  717  934  807 3075
-
    -
  • Remove rows that have too many missing values. You can do this by -hand or using the filterNA() function.
  • -
-
## remove rows that have 4 or more NAs out of 6
-cptac_se <- filterNA(cptac_se, pNA = 4/6)
-
-
-

-5.4.2 Imputation
-

-

Imputation is the technique of replacing missing data with probable values. This can be done with the impute() method. As discussed above, there are however two types of missing values in mass spectrometry-based proteomics, namely data missing at random (MAR) and data missing not at random (MNAR), and these need to be imputed with different types of imputation methods (Lazar et al. 2016Lazar, C, L Gatto, M Ferro, C Bruley, and T Burger. 2016. “Accounting for the Multiple Natures of Missing Values in Label-Free Quantitative Proteomics Data Sets to Compare Imputation Strategies.” J Proteome Res 15 (4): 1116–25. https://doi.org/10.1021/acs.jproteome.5b00981.).

-
- -

-Figure 5.9: Mixed imputation method. Black cells represent presence of quantitation values and light grey corresponds to missing data. The two groups of interest are depicted in green and blue along the heatmap columns. Two classes of proteins are annotated on the left: yellow are proteins with randomly occurring missing values (if any) while proteins in brown are candidates for non-random missing value imputation.
-
- -

-Figure 5.10: Effect of the nature of missing values on their imputation. Root-mean-square error (RMSE) observations standard deviation ratio (RSR), KNN and MinDet imputation. Lower (blue) is better.
-

Generally, it is recommended to use hot deck methods (nearest neighbour (left), maximum likelihood, …) when data are missing at random. Conversely, MNAR features should ideally be imputed with a left-censored (minimum value (right), but not zero, …) method.

-

There are various methods to perform data imputation, as described in -?impute. The imp4p package contains additional -functionality, including some to estimate the randomness of missing -data.

-

The general syntax for imputation is shown below, using the se_na2 -object as an example:

-
data(se_na2)
-## impute missing values using knn imputation
-impute(se_na2, method = "knn")
-
## Imputing along margin 1 (features/rows).
-
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 12 rows with more than 50 % entries missing;
-##  mean imputation used for these rows
-
## class: SummarizedExperiment 
-## dim: 689 16 
-## metadata(3): MSnbaseFiles MSnbaseProcessing MSnbaseVersion
-## assays(1): ''
-## rownames(689): AT1G09210 AT1G21750 ... AT4G11150 AT4G39080
-## rowData names(2): nNA randna
-## colnames(16): M1F1A M1F4A ... M2F8B M2F11B
-## colData names(1): nNA
-
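
Following the same syntax, left-censored methods, better suited for MNAR features, could for example be applied as follows (a minimal sketch; the MinDet method is provided through the imputeLCMD package):

## Sketch: left-censored imputation, better suited for MNAR features
impute(se_na2, method = "MinDet") ## deterministic minimal value
impute(se_na2, method = "zero")   ## replace missing values by 0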
-

-► Question -

-
-

Following the example above, apply a mixed imputation, using knn for -data missing at random and the zero imputation for data missing not at -random. Hint: the randna variable defines which features are assumed -to be missing at random.

-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-

When assessing missing data imputation methods, such as in Lazar et -al. (2016), -one often replaces values with missing data, imputes these with a -method of choice, then quantifies the difference between original -(expected) and observed (imputed) values. Here, using the se_na2 -data, use this strategy to assess the difference between knn and -Bayesian PCA imputation.

-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-

When assessing the impact of missing value imputation on real data, -one can’t use the strategy above. Another useful approach is to assess -the impact of the imputation method on the distribution of the -quantitative data. For instance, here is the intensity distribution of -the se_na2 data. Verify the effect of applying knn, zero, -MinDet and bpca on this distribution.

-
plot(density(na.omit(assay(se_na2))))
-
- -

-Figure 5.11: Intensity distribution of the naset data.
-

- -

-
-
-
-

-► Solution -

- -
-

Tip: When downstream analyses permit, it might be safer not to -impute data and deal explicitly with missing values. Indeed missing -data imputation is not straightforward, and is likely to dramatically -fail when a high proportion of data is missing (10s of %). It is -possible to keep NAs when performing hypothesis tests7, but (generally) not to perform a principal component -analysis.

-
-
-

-5.4.3 Identification quality control
-

-

As discussed in the previous chapter, PSMs are deemed relevant after -comparison against hits from a decoy database. The origin of these -hits is recorded with + in the Reverse variable:

-
table(rowData(cptac_se)$Reverse)
-
## 
-##         + 
-## 7572   12
-

Similarly, a proteomics experiment is also searched against a database -of contaminants:

-
table(rowData(cptac_se)$Potential.contaminant)
-
## 
-##         + 
-## 7558   26
-

Let’s visualise some of the cptac metadata using standard ggplot2 code:
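
For example (an illustrative sketch only, not the solution to the questions below), the distribution of the identification scores could be plotted with:

## Sketch: distribution of the identification scores
rowData(cptac_se) |>
    as.data.frame() |>
    ggplot(aes(x = Score)) +
    geom_histogram()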

-
-

-► Question -

-
-

Visualise the identification score and the posterior error probability (PEP) distributions from forward and reverse hits and interpret the figure.

-

- -

-
-
-
-

-► Solution -

- -
-

Note: it is also possible to compute and visualise protein groups -as connected components starting from a quantitative dataset such as a -SummarizedExperiment. See the Using quantitative -data -section in the Understanding protein groups with adjacency matrices -vignette.

-
-
-

-5.4.4 Creating the QFeatures data
-

-

We can now create our QFeatures object using the -SummarizedExperiment as shown below.

-
cptac <- QFeatures(list(peptides = cptac_se))
-cptac
-
## An instance of class QFeatures containing 1 assays:
-##  [1] peptides: SummarizedExperiment with 7584 rows and 6 columns
-

We should also populate the colData of the QFeatures object using the colData of the SummarizedExperiment.

-
colData(cptac) <- colData(cptac_se)
-

Note that it is also possible to directly create a QFeatures object with the readQFeatures() function, using the same arguments as in the readSummarizedExperiment() call used above. In addition, most functions used above and below work both on single SummarizedExperiment objects and on assays within a QFeatures object.
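
For instance, reusing the hlpsms data from above (a sketch; the psms assay name is arbitrary):

## Sketch: create a QFeatures object in one step
readQFeatures(hlpsms, ecol = 1:10, name = "psms")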

-
-
-

-5.4.5 Filtering out contaminants and reverse hits
-

-
-

-► Question -

-
-

Using the filterFeatures() function, filter out the reverse and -contaminant hits, and also retain those that have a posterior error -probability smaller than 0.05.
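
One possible approach (a sketch of the filterFeatures() formula interface; the actual solution may differ) could be:

## Sketch: remove reverse and contaminant hits, keep PEP < 0.05
cptac <- cptac |>
    filterFeatures(~ Reverse != "+") |>
    filterFeatures(~ Potential.contaminant != "+") |>
    filterFeatures(~ PEP < 0.05)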

-

- -

-
-
-
-

-► Solution -

- -
-
-
-

-5.4.6 Log-transformation and normalisation
-

-

The two code chunks below log-transform and normalise the data, using the assay i as input and adding a new assay named as defined by name.

-
cptac <- logTransform(cptac, i = "peptides",
-                      name = "log_peptides")
-
-

-► Question -

-
-

Use the normalize() method to normalise the data. The syntax is the -same as logTransform(). Use the center.median method.
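
A possible call (a sketch; the lognorm_peptides name below is the one used in later code chunks) would be:

## Sketch: median-centring of the log-transformed peptide intensities
cptac <- normalize(cptac, i = "log_peptides",
                   name = "lognorm_peptides",
                   method = "center.median")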

-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-

Visualise the result of the transformations above. The plotDensities() function from the limma package is very convenient, but feel free to use boxplots, violin plots, or any other visualisation that you deem useful to assess the transformations.
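
For example (a sketch, assuming the log_peptides and lognorm_peptides assays created above):

## Sketch: compare intensity distributions before and after normalisation
par(mfrow = c(1, 2))
limma::plotDensities(assay(cptac[["log_peptides"]]))
limma::plotDensities(assay(cptac[["lognorm_peptides"]]))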

-

- -

-
-
-
-

-► Solution -

- -
-
-
-

-5.4.7 Aggregation
-

-
-

-► Question -

-
-

Use median aggregation to aggregate peptides into protein values. This is not necessarily the best choice, as we will see later, but it is a good start.
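
One way to do this (a sketch; the proteins_med name matches the assay used below, and Leading.razor.protein is assumed to be the grouping variable):

## Sketch: aggregate peptides into proteins using the column medians
cptac <- aggregateFeatures(cptac,
                           i = "lognorm_peptides",
                           name = "proteins_med",
                           fcol = "Leading.razor.protein",
                           fun = colMedians,
                           na.rm = TRUE)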

-

- -

-
-
-
-

-► Solution -

- -
-

Looking at the .n row variable computed during the aggregation, we -see that most proteins result from the aggregation of 5 peptides or -less, while very few proteins are accounted for by tens of peptides.

-
table(rowData(cptac[["proteins_med"]])$.n)
-
## 
-##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
-## 327 234 167 132  84  73  62  49  49  29  29  24  20  13  15  12   4   6  11   5 
-##  21  22  23  24  25  26  28  29  30  31  32  34  37  38  39  42  51  52  62 
-##   7   4   7   2   2   3   1   3   1   2   2   1   1   1   1   2   1   1   1
-
-
-

-5.4.8 Principal component analysis
-

-
library("factoextra")
-
-pca_pep <-
-    cptac[["lognorm_peptides"]] %>%
-    filterNA() %>%
-    assay() %>%
-    t() %>%
-    prcomp(scale = TRUE, center = TRUE) %>%
-    fviz_pca_ind(habillage = cptac$condition, title = "Peptides")
-
-pca_prot <-
-    cptac[["proteins_med"]] %>%
-    filterNA() %>%
-    assay() %>%
-    t() %>%
-    prcomp() %>%
-    fviz_pca_ind(habillage = cptac$condition,
-                 title = "Proteins (median aggregation)")
-
library("patchwork")
-pca_pep + pca_prot
-
- -

-Figure 5.13: Peptide and protein level PCA analyses.
-
-
-

-5.4.9 Visualisation
-

-

Below, we use the longFormat() function to extract the quantitative -and row data in a long format, that can be directly reused by the -tidyverse tools.

-
longFormat(cptac["P02787ups|TRFE_HUMAN_UPS", ,
-                 c("lognorm_peptides", "proteins_med")]) %>%
-    as_tibble() %>%
-    mutate(condition = ifelse(grepl("A", colname), "A", "B")) %>%
-    ggplot(aes(x = colname, y = value, colour = rowname, shape = condition)) +
-    geom_point(size = 3) +
-    geom_line(aes(group = rowname)) +
-    facet_grid(~ assay) +
-    ggtitle("P02787ups|TRFE_HUMAN_UPS")
-
- -

-Figure 5.14: Peptide and protein expression profile.
-

We can also visualise the assays within a QFeatures object and their relations.

-
plot(cptac)
-

-
-

-► Question -

-
-

The example above shows a simple linear relationship between assays. Create a more interesting one by applying a different normalisation method on the log_peptides assay and aggregate that new normalised peptide assay. Visualise the relationship with plot(), as above.

-

- -

-
-
-
-

-► Solution -

- -
-
-
-

-5.4.10 Statistical analysis
-

-

R in general and Bioconductor in particular are well suited for the -statistical analysis of quantitative proteomics data. Several -packages provide dedicated resources for proteomics data:

-
    -
  • MSstats and MSstatsTMT: A set of tools -for statistical relative protein significance analysis in Data -dependent (DDA), SRM, Data independent acquisition (DIA) and TMT -experiments.

  • -
  • msmsTests: Statistical tests for label-free LC-MS/MS -data by spectral counts, to discover differentially expressed -proteins between two biological conditions. Three tests are -available: Poisson GLM regression, quasi-likelihood GLM regression, -and the negative binomial of the edgeR -package. All can be readily applied on MSnSet instances produced, -for example by MSnID.

  • -
  • DEP provides an integrated analysis workflow for the -analysis of mass spectrometry proteomics data for differential -protein expression or differential enrichment.

  • -
  • MSqRob: The MSqRob package -allows a user to do quantitative protein-level statistical inference -on LC-MS proteomics data. More specifically, our package makes use -of peptide-level input data, thus correcting for unbalancedness and -peptide-specific biases. As previously shown (Goeminne et -al. (2015)), this -approach is both more sensitive and specific than summarizing -peptide-level input to protein-level values. Model estimates are -stabilized by ridge regression, empirical Bayes variance estimation -and downweighing of outliers. Currently, only label-free proteomics -data types are -supported. msqrob2 is now -available and makes use of the QFeatures infrastructure.

  • -
  • proDA accounts for missing -values in label-free mass spectrometry data without imputation. The -package implements a probabilistic dropout model that ensures that -the information from observed and missing values are properly -combined. It adds empirical Bayesian priors to increase power to -detect differentially abundant proteins.

  • -
-

Others, while not specific to proteomics, are also recommended, such as the limma package. When analysing spectral counting data, methods for high throughput sequencing data are applicable. Below, we illustrate how to apply a typical edgeR test to count data using the msms.edgeR function from the msmsTests package.

-

Below, we are going to perform our statistical analysis on the protein -data using limma.

-
prots <- getWithColData(cptac, "proteins_med")
-
## Warning: 'experiments' dropped; see 'metadata'
-
## Warning: Ignoring redundant column names in 'colData(x)':
-

The limma package is the precursor package that enables the consistent application of linear models to normally distributed omics data in general, and microarrays in particular.

-

The limma package implements an empirical Bayes method that borrows information across features to estimate the standard error and calculate (so-called moderated) t statistics. This approach is demonstrably more powerful than a standard t-test when the number of samples is low.

-

The code chunk below illustrates how to set up the model, fit it, and -apply the empirical Bayes moderation.

-
library("limma")
-design <- model.matrix(~ prots$condition)
-fit <- lmFit(assay(prots), design)
-
## Warning: Partial NA coefficients for 25 probe(s)
-
fit <- eBayes(fit)
-

Finally, the topTable() function is used to extract the results for the coefficient of interest.

-
res <-
-    topTable(fit, coef = "prots$condition6B", number = Inf) %>%
-    rownames_to_column("protein") %>%
-    as_tibble() %>%
-    mutate(TP = grepl("ups", protein))
-

Note the warning about partial NA coefficients for 25 probes:

-
na_coefs <-
-    filter(res, is.na(t)) %>%
-    pull(protein)
-assay(prots[na_coefs, ])
-
##                                6A_7      6A_8       6A_9       6B_7       6B_8
-## P00167ups|CYB5_HUMAN_UPS        NaN       NaN        NaN -0.7840558 -2.0282987
-## P01112ups|RASH_HUMAN_UPS        NaN       NaN        NaN -1.5564896        NaN
-## P05413ups|FABPH_HUMAN_UPS       NaN       NaN        NaN -3.3419480        NaN
-## P08758ups|ANXA5_HUMAN_UPS       NaN       NaN        NaN -2.7973872 -2.0137585
-## sp|P06704|CDC31_YEAST           NaN       NaN        NaN -1.2032046 -2.1252371
-## sp|P25574|EMC1_YEAST      -1.506177 -1.983737 -0.7795009        NaN        NaN
-## sp|P32608|RTG2_YEAST            NaN       NaN        NaN        NaN -4.4424189
-## sp|P32769|HBS1_YEAST            NaN -1.384031 -0.7285780        NaN        NaN
-## sp|P34217|PIN4_YEAST            NaN       NaN        NaN -0.8378614 -0.1316397
-## sp|P34237|CASP_YEAST            NaN       NaN        NaN -1.5645172 -1.6600291
-## sp|P38166|SFT2_YEAST      -1.585685 -1.076707        NaN        NaN        NaN
-## sp|P40056|GET2_YEAST            NaN -1.091696 -1.4014211        NaN        NaN
-## sp|P40533|TED1_YEAST            NaN       NaN        NaN -2.0491876        NaN
-## sp|P43582|WWM1_YEAST            NaN       NaN        NaN -0.5538711 -0.7360990
-## sp|P46965|SPC1_YEAST            NaN -3.428771 -3.6321984        NaN        NaN
-## sp|P48363|PFD3_YEAST            NaN       NaN        NaN -0.1904905        NaN
-##                                 6B_9
-## P00167ups|CYB5_HUMAN_UPS  -1.1230809
-## P01112ups|RASH_HUMAN_UPS  -1.5618192
-## P05413ups|FABPH_HUMAN_UPS -3.8907081
-## P08758ups|ANXA5_HUMAN_UPS -2.0894752
-## sp|P06704|CDC31_YEAST     -1.5844104
-## sp|P25574|EMC1_YEAST             NaN
-## sp|P32608|RTG2_YEAST      -2.7873186
-## sp|P32769|HBS1_YEAST             NaN
-## sp|P34217|PIN4_YEAST      -0.1989392
-## sp|P34237|CASP_YEAST      -1.6877463
-## sp|P38166|SFT2_YEAST             NaN
-## sp|P40056|GET2_YEAST             NaN
-## sp|P40533|TED1_YEAST      -1.7474812
-## sp|P43582|WWM1_YEAST      -0.7207043
-## sp|P46965|SPC1_YEAST             NaN
-## sp|P48363|PFD3_YEAST      -0.5087747
-##  [ reached getOption("max.print") -- omitted 9 rows ]
-

We can now visualise the results using a volcano plot:

-
res %>%
-    ggplot(aes(x = logFC, y = -log10(adj.P.Val))) +
-    geom_point(aes(colour = TP)) +
-    geom_vline(xintercept = c(-1, 1)) +
-    geom_hline(yintercept = -log10(0.05)) +
-    scale_color_manual(values = c("black","red"))
-
## Warning: Removed 25 rows containing missing values (`geom_point()`).
-
- -

-Figure 5.15: Volcano plot highlighting spiked-in proteins in red.
-

Using the pipeline described above, we would identify a single differentially expressed protein at a 5 percent FDR but miss out on the other 36 expected spike-in proteins.

-

We can assess our results in terms of true/false positives/negatives (a sketch of how to tabulate these follows the list):

-
  • True positives: 1
  • False positives: 0
  • True negatives: 1330
  • False negatives: 32
-
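
One way to tabulate these (a sketch, using the res tibble created above and a 0.05 adjusted p-value cut-off):

## Sketch: cross-tabulate significance calls against spike-in status
table(significant = !is.na(res$adj.P.Val) & res$adj.P.Val < 0.05,
      spikein = res$TP)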
-
-
-

-5.5 Summary exercise
-

-

As shown below, it is possible to substantially improve these results by aggregating features using a robust summarisation (available as MsCoreUtils::robustSummary()), i.e. robust regression with M-estimation using Huber weights, as described in section 2.7 of (Sticker et al. 2019Sticker, Adriaan, Ludger Goeminne, Lennart Martens, and Lieven Clement. 2019. “Robust Summarization and Inference in Proteome-Wide Label-Free Quantification.” bioRxiv. https://doi.org/10.1101/668863.).

-
- -

-Figure 5.16: Aggregation using robust summarisation.
-
  • True positives: 21
  • False positives: 2
  • True negatives: 1340
  • False negatives: 12
-

Repeat and adapt what we have seen here using, for example, the -robustSummary() function.
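
For example, the aggregation step above could be replaced by something along these lines (a sketch; the proteins_rob assay name is arbitrary):

## Sketch: robust aggregation of peptides into proteins
cptac <- aggregateFeatures(cptac,
                           i = "lognorm_peptides",
                           name = "proteins_rob",
                           fcol = "Leading.razor.protein",
                           fun = MsCoreUtils::robustSummary)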

- -
-
-
-
-
    -
  1. Still, it is -recommended to explore missingness as part of the exploratory data -analysis.↩︎

  2. -
-

-
-
diff --git a/docs/sec-raw.html b/docs/sec-raw.html deleted file mode 100644 index 16cf473..0000000 --- a/docs/sec-raw.html +++ /dev/null @@ -1,1104 +0,0 @@
-
- - -
-

-Chapter 3 Raw MS data

-

In this section, we will learn how to read raw data in one of the -commonly used open formats (mzML, mzXML, netCDF or mgf) into -R.

-
-

-3.1 What is raw data in R
-

-

When we manipulate complex data, we need a way to abstract it.

-

The abstraction saves us from having to know about all -the details of that data and its associated metadata. In R, we -think of MS data as illustrated on the figure below (taken from -(Gatto, Gibb, and Rainer 2020Gatto, Laurent, Sebastian Gibb, and Johannes Rainer. 2020. MSnbase, Efficient and Elegant r-Based Processing and Visualisation of Raw Mass Spectrometry Data.” J. Proteome Res., September.)): a metadata table and a set of raw spectra. This allows -to rely on a few easy-to-remember conventions to make mundane and -repetitive tasks trivial and be able to complete more complex things -easily. Abstractions provide a smoother approach to handle complex -data using common patterns.

-
- -

-Figure 3.1: Schematic representation of what is referred to by raw data: a collection of mass spectra and a table containing spectrum-level annotations along the lines. Raw data are imported from one of the many community-maintained open standards formats (mzML, mzXML, mzData or ANDI-MS/netCDF).
-
-

-3.1.1 The Spectra class
-

-

We are going to use the -Spectra package -as an abstraction to raw mass spectrometry data.

-
library(Spectra)
-

Spectra is part of the R for Mass Spectrometry -initiative. It -defines the Spectra class that is used as a raw data abstraction, to -manipulate MS data and metadata. The best way to learn about a data -structure is to create one by hand.

-

Let’s create a DataFrame4 containing MS levels, retention time, m/z and intensities -for 2 spectra:

-
spd <- DataFrame(msLevel = c(1L, 2L), rtime = c(1.1, 1.2))
-spd$mz <- list(c(100, 103.2, 104.3, 106.5), c(45.6, 120.4, 190.2))
-spd$intensity <- list(c(200, 400, 34.2, 17), c(12.3, 15.2, 6.8))
-spd
-
## DataFrame with 2 rows and 4 columns
-##     msLevel     rtime                    mz             intensity
-##   <integer> <numeric>                <list>                <list>
-## 1         1       1.1 100.0,103.2,104.3,... 200.0,400.0, 34.2,...
-## 2         2       1.2      45.6,120.4,190.2        12.3,15.2, 6.8
-

And now convert this DataFrame into a Spectra object:

-
sp0 <- Spectra(spd)
-sp0
-
## MSn data (Spectra) with 2 spectra in a MsBackendMemory backend:
-##     msLevel     rtime scanIndex
-##   <integer> <numeric> <integer>
-## 1         1       1.1        NA
-## 2         2       1.2        NA
-##  ... 16 more variables/columns.
-
-

Exercise

-

Explore the newly created object using the following (a sketch of these calls is shown after the list):

-
    -
  • -spectraVariables to extract all the metadata variables. Compare these to the -spectra variables available from the previous example.
  • -
  • -spectraData to extract all the metadata.
  • -
  • -peaksData to extract a list containing the raw data.
  • -
  • -[ to create subsets.
  • -
-
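
For instance (a minimal sketch, not a full answer):

## Sketch: basic exploration of the sp0 object
spectraVariables(sp0) ## available spectra variables
spectraData(sp0)      ## full metadata table
peaksData(sp0)[[1]]   ## m/z and intensity values of the first spectrum
sp0[2]                ## subset to the second spectrum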
-
-
-

-3.1.2 Spectra from mzML files
-

-

Let’s now create a new object using the mzML data previously -downloaded and available in the mzf file.

-
mzf
-
## [1] "/home/lgatto/.cache/R/rpx/8ee512042c5ff_TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML"
-
sp <- Spectra(mzf)
-sp
-
## MSn data (Spectra) with 7534 spectra in a MsBackendMzR backend:
-##        msLevel     rtime scanIndex
-##      <integer> <numeric> <integer>
-## 1            1    0.4584         1
-## 2            1    0.9725         2
-## 3            1    1.8524         3
-## 4            1    2.7424         4
-## 5            1    3.6124         5
-## ...        ...       ...       ...
-## 7530         2   3600.47      7530
-## 7531         2   3600.83      7531
-## 7532         2   3601.18      7532
-## 7533         2   3601.57      7533
-## 7534         2   3601.98      7534
-##  ... 33 more variables/columns.
-## 
-## file(s):
-## 8ee512042c5ff_TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML
-
-

Exercise

-
    -
  • Repeat the data manipulations above.
  • -
  • Check the number of scans in the object with length().
  • -
  • Note the difference in the first line when showing the object in the -console. We will get back to this idea of backend later.
  • -
-

Mass spectrometry data in Spectra objects can be thought of as a -list of individual spectra, with each spectrum having a set of -variables associated with it. Besides core spectra variables (such -as MS level or retention time) an arbitrary number of optional -variables can be assigned to a spectrum. The core spectra variables -all have their own accessor method and it is guaranteed that a value -is returned by it (or NA if the information is not available). The -core variables and their data type are (alphabetically ordered):

-
    -
  • -acquisitionNum integer(1): the index of acquisition of a -spectrum during a MS run.
  • -
  • -centroided logical(1): whether the spectrum is in profile or -centroid mode.
  • -
  • -collisionEnergy numeric(1): collision energy used to create an -MSn spectrum.
  • -
  • -dataOrigin character(1): the origin of the spectrum’s data, -e.g. the mzML file from which it was read.
  • -
  • -dataStorage character(1): the (current) storage location of the -spectrum data. This value depends on the backend used to handle and -provide the data. For an in-memory backend like the -MsBackendMemory this will be "<memory>", for an on-disk -backend such as the MsBackendHdf5Peaks it will be the name of the -HDF5 file where the spectrum’s peak data is stored.
  • -
  • -intensity numeric: intensity values for the spectrum’s peaks.
  • -
  • -isolationWindowLowerMz numeric(1): lower m/z for the isolation -window in which the (MSn) spectrum was measured.
  • -
  • -isolationWindowTargetMz numeric(1): the target m/z for the -isolation window in which the (MSn) spectrum was measured.
  • -
  • -isolationWindowUpperMz numeric(1): upper m/z for the isolation -window in which the (MSn) spectrum was measured.
  • -
  • -msLevel integer(1): the MS level of the spectrum.
  • -
  • -mz numeric: the m/z values for the spectrum’s peaks.
  • -
  • -polarity integer(1): the polarity of the spectrum (0 and 1 -representing negative and positive polarity, respectively).
  • -
  • -precScanNum integer(1): the scan (acquisition) number of the -precursor for an MSn spectrum.
  • -
  • -precursorCharge integer(1): the charge of the precursor of an -MSn spectrum.
  • -
  • -precursorIntensity numeric(1): the intensity of the precursor of -an MSn spectrum.
  • -
  • -precursorMz numeric(1): the m/z of the precursor of an MSn -spectrum.
  • -
  • -rtime numeric(1): the retention time of a spectrum.
  • -
  • -scanIndex integer(1): the index of a spectrum within a (raw) -file.
  • -
  • -smoothed logical(1): whether the spectrum was smoothed.
  • -
-

For details on the individual variables and their getter/setter -function see the help for Spectra (?Spectra). Also note that these -variables are suggested, but not required to characterize a -spectrum. Also, some only make sense for MSn, but not for MS1 spectra.

-

In addition to the core spectra variables it is also possible to add additional -spectra variables to a Spectra object. As an example we add below a spectra -variable representing the retention times in minutes to the object. This -information can then be extracted again using the $ notation (similar to -accessing a column in a data.frame, i.e., $ and the name of the spectra -variable).

-
sp$rtime_minute <- rtime(sp) / 60
-sp$rtime_minute |> head()
-
## [1] 0.00764000 0.01620833 0.03087333 0.04570667 0.06020667 0.07487500
-
-
-

Exercise

-
    -
  • Extract a set of spectra variables using the accessor (for example -msLevel(.)) or using the $ notation (for example .$msLevel).
  • -
  • How many MS levels are there, and how many scans of each level?
  • -
  • Extract the index of the MS2 spectrum with the highest base peak -intensity.
  • -
  • Are the data centroided or in profile mode?
  • -
  • Pick a spectrum of each level and visually check whether it is -centroided or in profile mode. You can use the plotSpectra() -function to visualise peaks and set the m/z range with the xlim -arguments.
  • -
-
-
-

Exercise

-

Using the first raw data file starting with MS3TMT10, answer the -following questions:

-
    -
  • How many spectra are there in that file?
  • -
  • How many MS levels, and how many spectra per MS level?
  • -
  • What is the index of the MS2 spectrum with the highest precursor -intensity?
  • -
  • Plot one spectrum of each level. Are they centroided or in profile -mode?
  • -
-

These objects and their manipulations are not limited to single files or -samples. Below we load data from two mzML files. The MS data from both files in -the Spectra is organized linearly (first all spectra from the first file -and then from the second). The dataOrigin function can be used to identify -spectra from the different data files.

-
(fls <- dir(system.file("sciex", package = "msdata"), full.names = TRUE))
-
## [1] "/home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/sciex/20171016_POOL_POS_1_105-134.mzML"
-## [2] "/home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/sciex/20171016_POOL_POS_3_105-134.mzML"
-
sp_sciex <- Spectra(fls)
-table(dataOrigin(sp_sciex))
-
## 
-## /home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/sciex/20171016_POOL_POS_1_105-134.mzML 
-##                                                                                               931 
-## /home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/sciex/20171016_POOL_POS_3_105-134.mzML 
-##                                                                                               931
-
-
-
-

-3.1.3 Backends
-

-

The Spectra package allows the use of different backends to store mass spectrometry data while providing, via the Spectra class, a unified interface to that data. With the setBackend function it is possible to change between different backends and hence different data representations. The Spectra package defines a set of example backends, but any object extending the base MsBackend class could be used instead. The default backends are:

-
    -
  • -MsBackendMzR: this backend keeps only general spectra variables in memory -and relies on the mzR package to read mass peaks (m/z and -intensity values) from the original MS files on-demand.
  • -
-
sp_sciex
-
## MSn data (Spectra) with 1862 spectra in a MsBackendMzR backend:
-##        msLevel     rtime scanIndex
-##      <integer> <numeric> <integer>
-## 1            1     0.280         1
-## 2            1     0.559         2
-## 3            1     0.838         3
-## 4            1     1.117         4
-## 5            1     1.396         5
-## ...        ...       ...       ...
-## 1858         1   258.636       927
-## 1859         1   258.915       928
-## 1860         1   259.194       929
-## 1861         1   259.473       930
-## 1862         1   259.752       931
-##  ... 33 more variables/columns.
-## 
-## file(s):
-## 20171016_POOL_POS_1_105-134.mzML
-## 20171016_POOL_POS_3_105-134.mzML
-
    -
  • -MsBackendMemory and MsBackendDataFrame: the full mass spectrometry data is -stored (in-memory) within the object. Keeping the data in memory guarantees -high performance but has also, depending on the number of mass peaks in each -spectrum, a much higher memory footprint.
  • -
-
setBackend(sp_sciex, MsBackendMemory())
-
## MSn data (Spectra) with 1862 spectra in a MsBackendMemory backend:
-##        msLevel     rtime scanIndex
-##      <integer> <numeric> <integer>
-## 1            1     0.280         1
-## 2            1     0.559         2
-## 3            1     0.838         3
-## 4            1     1.117         4
-## 5            1     1.396         5
-## ...        ...       ...       ...
-## 1858         1   258.636       927
-## 1859         1   258.915       928
-## 1860         1   259.194       929
-## 1861         1   259.473       930
-## 1862         1   259.752       931
-##  ... 33 more variables/columns.
-## Processing:
-##  Switch backend from MsBackendMzR to MsBackendMemory [Wed Sep  6 11:50:11 2023]
-
    -
  • -MsBackendHdf5Peaks: similar to MsBackendMzR this backend reads peak data -only on-demand from disk while all other spectra variables are kept in -memory. The peak data are stored in Hdf5 files which guarantees scalability.
  • -
-

With the example below we load the data from a single mzML file and use a -MsBackendHdf5Peaks backend for data storage. The hdf5path parameter allows -us to specify the storage location of the HDF5 file.

-
sp_hdf5 <- setBackend(sp_sciex, MsBackendHdf5Peaks(), hdf5path = tempdir())
-sp_hdf5
-
## MSn data (Spectra) with 1862 spectra in a MsBackendHdf5Peaks backend:
-##        msLevel     rtime scanIndex
-##      <integer> <numeric> <integer>
-## 1            1     0.280         1
-## 2            1     0.559         2
-## 3            1     0.838         3
-## 4            1     1.117         4
-## 5            1     1.396         5
-## ...        ...       ...       ...
-## 1858         1   258.636       927
-## 1859         1   258.915       928
-## 1860         1   259.194       929
-## 1861         1   259.473       930
-## 1862         1   259.752       931
-##  ... 33 more variables/columns.
-## 
-## file(s):
-##  20171016_POOL_POS_1_105-134.h5
-##  20171016_POOL_POS_3_105-134.h5
-## Processing:
-##  Switch backend from MsBackendMzR to MsBackendHdf5Peaks [Wed Sep  6 11:50:18 2023]
-
table(sp_hdf5$dataOrigin)
-
## 
-## /home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/sciex/20171016_POOL_POS_1_105-134.mzML 
-##                                                                                               931 
-## /home/lgatto/disk/R/x86_64-pc-linux-gnu-library/4.3/msdata/sciex/20171016_POOL_POS_3_105-134.mzML 
-##                                                                                               931
-
table(sp_hdf5$dataStorage)
-
## 
-## /tmp/RtmphN3t3B/20171016_POOL_POS_1_105-134.h5 
-##                                            931 
-## /tmp/RtmphN3t3B/20171016_POOL_POS_3_105-134.h5 
-##                                            931
-

All of the above-mentioned backends support changing all of their spectra variables, except the MsBackendMzR, which does not support changing m/z or intensity values for the mass peaks.

-

Next to these default backends, there is a set of other backend implementations provided by additional R packages. The MsBackendSql, for example, allows all MS data to be stored in (and retrieved from) an SQL database, thus guaranteeing a minimal memory footprint.

-

Other backends focus on specific file formats such as -MsBackendMgf for files -in mgf file format or on specific acquisitions such as -MsBackendTimsTof -or provide access to certain MS data resources such as the -MsBackendMassbank. -Additional backends are being developed to address specific needs or -technologies, while remaining compliant with the Spectra interface.

-

If you would like to learn more about how the raw MS formats are -handled by Spectra via the mzR package, -check out the 6.1 section in the annex.

-

See also Spectra -backends -for more information on different backends, their properties and -advantages/disadvantages.

-
-
-
-

-3.2 Visualisation of raw MS data
-

-

The importance of flexible access to specialised data becomes visible in the figure below (taken from the RforProteomics visualisation vignette). Not only can we access specific data and understand/visualise them, but we can traverse all the data and extract/visualise/understand structured slices of data.

-

The figure below shows an illustration of how mass spectrometry -works:

-
  1. The chromatogram at the top displays the total ion current along the retention time. The vertical line identifies one scan in particular at retention time 1800.68 seconds (the 2807th scan).

  2. The spectra on the second line represent the full MS1 spectrum marked by the red line. The vertical lines identify the 10 precursor ions that were selected for MS2 analysis. The zoom on the right shows one specific precursor peak.

  3. The MS2 spectra displayed along the two rows at the bottom are those resulting from the fragmentation of the 10 precursor peaks identified by the vertical bars above.
-

-

We are going to reproduce the figure above through a set of exercises.

-
-

-► Question -

-
-
    -
  1. The chromatogram can be created by extracting the totIonCurrent -and rtime variables for all MS1 spectra. Annotate the spectrum of -interest.
  2. -
-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-
    -
  1. The filterPrecursorScan() function can be used to retain a set of parent (MS1) and children (MS2) scans, as defined by an acquisition number. Use it to extract the MS1 scan of interest and all its MS2 children.
  2. -
-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-
    -
  1. Plot the MS1 spectrum of interest and highlight all the peaks that -will be selected for MS2 analysis.
  2. -
-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-
    -
  1. Zoom in on the m/z values between 521.1 and 522.5 to reveal the isotopic envelope of that peak.
  2. -
-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-
    -
  1. The plotSpectra() function is used to plot all 10 MS2 spectra in -one call.
  2. -
-

- -

-
-
-
-

-► Solution -

- -
-

It is possible to label the peaks with the plotSpectra() -function. The labels argument is either a character of appropriate -length (i.e. with a label for each peak) or, as illustrated below, a -function that computes the labels.

-
mzLabel <- function(z) {
-    z <- peaksData(z)[[1L]]
-    lbls <- format(z[, "mz"], digits = 4)
-    lbls[z[, "intensity"] < 1e5] <- ""
-    lbls
-}
-
-plotSpectra(ms_2[7],
-            xlim = c(126, 132),
-            labels = mzLabel,
-            labelSrt = -30, labelPos = 2,
-            labelOffset = 0.1)
-

-

Spectra can also be compared either by overlay or mirror plotting -using the plotSpectraOverlay() and plotSpectraMirror() functions.

-
-

-► Question -

-
-

Filter MS2 level spectra and find any 2 MS2 spectra that have matching -precursor peaks based on the precursor m/z values.

-

- -

-
-
-
-

-► Solution -

- -
-
-

-► Question -

-
-

Visualise the matching pair using the plotSpectraOverlay() and -plotSpectraMirror() functions.

-

- -

-
-
-
-

-► Solution -

- -
-

It is also possible to explore raw data interactively with the -SpectraVis -package:

-
    -
  • The -browseSpectra() -function opens a simple shiny application that allows to browse -through the individual scans of a Spectra object.

  • -
  • The -plotlySpectra() -function displays a single spectrum using -plotly allowing to explore (zooming, -panning) the spectrum interactively.

  • -
-
-

-► Question -

-
-

Test the SpectraVis functions on some of the Spectra objects produced above.

-

- -

-
-
-
-
-

-3.3 Raw data processing and manipulation
-

-

Apart from classical subsetting operations such as [ and split, a set of -filter functions are defined for Spectra objects that filter/reduce the number -of spectra within the object (for detailed help please see the ?Spectra help):

-
    -
  • -filterAcquisitionNum: retains spectra with certain acquisition numbers.
  • -
  • -filterDataOrigin: subsets to spectra from specific origins.
  • -
  • -filterDataStorage: subsets to spectra from certain data storage files.
  • -
  • -filterEmptySpectra: removes spectra without mass peaks.
  • -
  • -filterMzRange: subsets spectra keeping only peaks with an m/z within the -provided m/z range.
  • -
  • -filterIsolationWindow: keeps spectra with the provided mz in their -isolation window (m/z range).
  • -
  • -filterMsLevel: filters by MS level.
  • -
  • -filterPolarity: filters by polarity.
  • -
  • -filterPrecursorIsotopes: identifies precursor ions (from fragment spectra) -that could represent isotopes of the same molecule. For each of these spectra -groups only the spectrum of the monoisotopic precursor ion is returned. MS1 -spectra are returned without filtering.
  • -
  • -filterPrecursorMaxIntensity: filters spectra keeping, for groups of spectra -with similar precursor m/z, the one spectrum with the highest precursor -intensity. All MS1 spectra are returned without filtering.
  • -
  • -filterPrecursorMzRange: retains (MSn) spectra with a precursor m/z within -the provided m/z range.
  • -
  • -filterPrecursorMzValues: retains (MSn) spectra with precursor m/z value -matching the provided value(s) considering also a tolerance and ppm.
  • -
  • -filterPrecursorCharge: retains (MSn) spectra with specified precursor charge(s).
  • -
  • -filterPrecursorScan: retains (parent and children) scans of an acquisition -number.
  • -
  • -filterRt: filters based on retention time range.
  • -
-

In addition to these, there is also a set of filter functions that operate on -the peak data, filtering and modifying the number of peaks of each spectrum -within a Spectra:

-
    -
  • -combinePeaks: groups peaks within each spectrum based on similarity of their -m/z values and combines these into a single peak per peak group.
  • -
  • -deisotopeSpectra: deisotopes each individual spectrum keeping only the -monoisotopic peak for peaks groups of potential isotopologues.
  • -
  • -filterIntensity: filter each spectrum keeping only peaks with intensities -meeting certain criteria.
  • -
  • -filterMzRange: subsets peaks data within each spectrum keeping only peaks -with their m/z values within the specified m/z range.
  • -
  • -filterPrecursorPeaks: removes peaks with either an m/z value matching the -precursor m/z of the respective spectrum (with parameter mz = "==") or peaks -with an m/z value larger or equal to the precursor m/z (with parameter -mz = ">=").
  • -
  • -filterMzValues: subsets peaks within each spectrum keeping or removing (all) -peaks matching provided m/z value(s) (given parameters ppm and tolerance).
  • -
  • -reduceSpectra: filters individual spectra keeping only the largest peak for -groups of peaks with similar m/z values.
  • -
-
-

-► Question -

-
-

Using the sp_sciex data, select all spectra measured in the second -mzML file and subsequently filter them to retain spectra measured -between 175 and 189 seconds in the measurement run.
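
One possible approach (a sketch; the actual solution may differ):

## Sketch: spectra from the second file, between 175 and 189 seconds
sp_sciex |>
    filterDataOrigin(fls[2]) |>
    filterRt(c(175, 189))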

-

- -

-
-
-
-

-► Solution -

- -
-

As an example of data processing, we use below the pickPeaks() -function. This function allows to convert profile mode MS data to centroid -mode data (a process also referred to as centroiding).

-
plotSpectra(sp[2807], xlim = c(521.2, 522.5))
-

-

Centroiding reduces the profile mode MS data to a representative single mass -peak per ion.

-
pickPeaks(sp[2807]) |>
-    filterIntensity(1e7) |>
-    plotSpectra(xlim = c(521.2, 522.5))
-

-
-
-

-3.4 A note on efficiency
-

-
-

-3.4.1 Backends
-

-

The figure below (taken from (Gatto, Gibb, and Rainer 2020Gatto, Laurent, Sebastian Gibb, and Johannes Rainer. 2020. MSnbase, Efficient and Elegant r-Based Processing and Visualisation of Raw Mass Spectrometry Data.” J. Proteome Res., September.)) illustrates the respective -advantages of storing data in memory or on disk. The benchmarking was -done for the MSnbase package but also applies to the Spectra backends.

-
- -

-Figure 3.2: (a) Reading time (triplicates, in seconds) and (b) data size in memory (in MB) to read/store 1, 5, and 10 files containing 1431 MS1 (on-disk only) and 6103 MS2 (on-disk and in-memory) spectra. (c) Filtering benchmark assessed over 10 iterations on in-memory and on-disk data containing 6103 MS2 spectra. (d) Access time to spectra for the in-memory (left) and on-disk (right) backends for 1, 10, 100, 1000, 5000, and all 6103 spectra. Benchmarks were performed on a Dell XPS laptop with an Intel i5-8250U processor 1.60 GHz (4 cores, 8 threads), 7.5 GB RAM running Ubuntu 18.04.4 LTS 64-bit, and an SSD drive. The data used for the benchmarking are a TMT 4-plex experiment acquired on a LTQ Orbitrap Velos (Thermo Fisher Scientific) available in the msdata package.
-
-
-

-3.4.2 Parallel processing
-

-

Most functions on Spectra support (and use) parallel processing out -of the box. Peak data access and manipulation methods perform by -default parallel processing on a per-file basis (i.e. using the -dataStorage variable as splitting factor). Spectra uses -BiocParallel for -parallel processing and all functions use the default registered -parallel processing setup of that package.
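
For instance, the registered default can be inspected or changed with BiocParallel (a minimal sketch):

library("BiocParallel")
bpparam()               ## currently registered default backend
register(SerialParam()) ## e.g. disable parallel processing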

-
-
-

-3.4.3 Lazy evaluation
-

-

Data manipulations on Spectra objects are not immediately applied to -the peak data. They are added to a so called processing queue which is -applied each time peak data is accessed (with the peaksData, mz or -intensity functions). Thanks to this processing queue data -manipulation operations are also possible for read-only backends -(e.g. mzML-file based backends or database-based backends). The -information about the number of such processing steps can be seen -below (next to Lazy evaluation queue).

-
min(intensity(sp_sciex[1]))
-
## [1] 0
-
sp_sciex <- filterIntensity(sp_sciex, intensity = c(10, Inf))
-sp_sciex ## Note the lazy evaluation queue
-
## MSn data (Spectra) with 1862 spectra in a MsBackendMzR backend:
-##        msLevel     rtime scanIndex
-##      <integer> <numeric> <integer>
-## 1            1     0.280         1
-## 2            1     0.559         2
-## 3            1     0.838         3
-## 4            1     1.117         4
-## 5            1     1.396         5
-## ...        ...       ...       ...
-## 1858         1   258.636       927
-## 1859         1   258.915       928
-## 1860         1   259.194       929
-## 1861         1   259.473       930
-## 1862         1   259.752       931
-##  ... 33 more variables/columns.
-## 
-## file(s):
-## 20171016_POOL_POS_1_105-134.mzML
-## 20171016_POOL_POS_3_105-134.mzML
-## Lazy evaluation queue: 1 processing step(s)
-## Processing:
-##  Remove peaks with intensities outside [10, Inf] in spectra of MS level(s) 1. [Wed Sep  6 11:50:20 2023]
-
min(intensity(sp_sciex[1]))
-
## [1] 412
-
sp_sciex@processingQueue
-
## [[1]]
-## Object of class "ProcessingStep"
-##  Function: user-provided function
-##  Arguments:
-##   o intensity = 10Inf
-##   o msLevel = 1
-

Through this lazy evaluation system it is also possible to undo data -manipulations:

-
sp_sciex <- reset(sp_sciex)
-sp_sciex@processingQueue
-
## list()
-
min(intensity(sp_sciex[1]))
-
## [1] 0
-

More information on this lazy evaluation concept implemented in Spectra is -provided in the Spectra -backends -vignette.

- -
-
-
-
-
-
    -
  1. As defined in the Bioconductor S4Vectors -package.↩︎

  2. -
-

-
-
diff --git a/docs/sec-si.html b/docs/sec-si.html deleted file mode 100644 index 2a9fdf8..0000000 --- a/docs/sec-si.html +++ /dev/null @@ -1,358 +0,0 @@
-
- -
-

-Chapter 7 Additional materials and session information

-
-

-7.1 Additional materials
-

-
    -
  • The Single-cell proteomics data analysis using QFeatures and -scp workshop -is provided as two vignettes. The first one provides a general -introduction to the QFeatures class in the general context of mass -spectrometry-based proteomics data manipulation. The second vignette -focuses on single-cell application and introduces the scp package -(Vanderaa and Gatto 2021Vanderaa, Christophe, and Laurent Gatto. 2021. “Replication of Single-Cell Proteomics Data Reveals Important Computational Challenges.” Expert Rev. Proteomics, October.) as an extension of QFeatures. This second -vignette also provides exercises that give the attendee the -opportunity to apply the learned concepts to reproduce a published -analysis on a subset of a real data set.

  • -
  • -

    The SpectraTutorials package -provides three different vignettes:

    - -
  • -
  • A tutorial presenting Use Cases and Examples for Annotation of -Untargeted Metabolomics -Data using -the MetaboAnnotation and MetaboCoreUtils packages -(Rainer et al. 2022Rainer, Johannes, Andrea Vicini, Liesa Salzer, Jan Stanstrup, Josep M Badia, Steffen Neumann, Michael A Stravs, et al. 2022. “A Modular and Expandable Ecosystem for Metabolomics Data Annotation in R.” Metabolites 12 (2): 173.).

  • -
  • Exploring and analyzing LC-MS data with Spectra and -xcms provides an -overview of recent developments in Bioconductor to work with mass -spectrometry -(MsExperiment, -Spectra) and -specifically LC-MS data (xcms) -and walks through the preprocessing of a small data set emphasizing -on selection of data-dependent settings for the individual -pre-processing steps.

  • -
-
-
-

-7.2 Questions and help
-

-

For questions about specific software or their usage, please refer to -the software’s github issue page, or use the Bioconductor support -site.

-
-
-

-7.3 Session information
-

-

The following packages have been used to generate this document.

-
sessionInfo()
-
## R version 4.3.1 Patched (2023-07-10 r84676)
-## Platform: x86_64-pc-linux-gnu (64-bit)
-## Running under: Manjaro Linux
-## 
-## Matrix products: default
-## BLAS:   /usr/lib/libblas.so.3.11.0 
-## LAPACK: /usr/lib/liblapack.so.3.11.0
-## 
-## locale:
-##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
-##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
-##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
-##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
-##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
-## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
-## 
-## time zone: Europe/Brussels
-## tzcode source: system (glibc)
-## 
-## attached base packages:
-## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
-## [8] base     
-## 
-## other attached packages:
-##  [1] mzID_1.38.0                 patchwork_1.1.3            
-##  [3] factoextra_1.0.7            gplots_3.1.3               
-##  [5] limma_3.56.2                lubridate_1.9.2            
-##  [7] forcats_1.0.0               stringr_1.5.0              
-##  [9] purrr_1.0.2                 readr_2.1.4                
-## [11] tibble_3.2.1                tidyverse_2.0.0            
-## [13] MSnID_1.34.0                magrittr_2.0.3             
-## [15] tidyr_1.3.0                 ggplot2_3.4.3              
-## [17] dplyr_1.1.2                 msdata_0.40.0              
-## [19] rpx_2.8.0                   MsCoreUtils_1.12.0         
-## [21] QFeatures_1.11.1            MultiAssayExperiment_1.26.0
-## [23] SummarizedExperiment_1.30.2 Biobase_2.60.0             
-## [25] GenomicRanges_1.52.0        GenomeInfoDb_1.36.1        
-## [27] IRanges_2.34.1              MatrixGenerics_1.12.3      
-## [29] matrixStats_1.0.0           Spectra_1.10.2             
-## [31] ProtGenerics_1.32.0         BiocParallel_1.34.2        
-## [33] S4Vectors_0.38.1            BiocGenerics_0.46.0        
-## [35] mzR_2.34.1                  Rcpp_1.0.11                
-## [37] BiocStyle_2.28.0           
-## 
-## loaded via a namespace (and not attached):
-##   [1] later_1.3.1             bitops_1.0-7            filelock_1.0.2         
-##   [4] R.oo_1.25.0             preprocessCore_1.62.1   XML_3.99-0.14          
-##   [7] lifecycle_1.0.3         rstatix_0.7.2           doParallel_1.0.17      
-##  [10] lattice_0.21-8          MASS_7.3-60             backports_1.4.1        
-##  [13] sass_0.4.7              rmarkdown_2.24          jquerylib_0.1.4        
-##  [16] yaml_2.3.7              httpuv_1.6.11           DBI_1.1.3              
-##  [19] RColorBrewer_1.1-3      abind_1.4-5             zlibbioc_1.46.0        
-##  [22] R.cache_0.16.0          R.utils_2.12.2          AnnotationFilter_1.24.0
-##  [25] RCurl_1.98-1.12         rappdirs_0.3.3          GenomeInfoDbData_1.2.10
-##  [28] ggrepel_0.9.3           pheatmap_1.0.12         MSnbase_2.27.1         
-##  [31] ncdf4_1.21              codetools_0.2-19        DelayedArray_0.26.7    
-##  [34] xml2_1.3.5              tidyselect_1.2.0        farver_2.1.1           
-##  [37] BiocFileCache_2.8.0     jsonlite_1.8.7          ellipsis_0.3.2         
-##  [40] iterators_1.0.14        foreach_1.5.2           tools_4.3.1            
-##  [43] glue_1.6.2              BiocBaseUtils_1.2.0     xfun_0.40              
-##  [46] withr_2.5.0             BiocManager_1.30.22     fastmap_1.1.1          
-##  [49] rhdf5filters_1.12.1     fansi_1.0.4             caTools_1.18.2         
-##  [52] digest_0.6.33           timechange_0.2.0        R6_2.5.1               
-##  [55] mime_0.12               colorspace_2.1-0        gtools_3.9.4           
-##  [58] RSQLite_2.3.1           R.methodsS3_1.8.2       utf8_1.2.3             
-##  [61] generics_0.1.3          data.table_1.14.8       httr_1.4.7             
-##  [64] S4Arrays_1.0.5          pkgconfig_2.0.3         gtable_0.3.4           
-##  [67] blob_1.2.4              impute_1.74.1           XVector_0.40.0         
-##  [70] htmltools_0.5.6         carData_3.0-5           bookdown_0.34.2        
-##  [73] MALDIquant_1.22.1       clue_0.3-64             scales_1.2.1           
-##  [76] png_0.1-8               knitr_1.43              rstudioapi_0.15.0      
-##  [79] tzdb_0.4.0              reshape2_1.4.4          curl_5.0.2             
-##  [82] cachem_1.0.8            rhdf5_2.44.0            BiocVersion_3.17.1     
-##  [85] KernSmooth_2.23-22      parallel_4.3.1          AnnotationDbi_1.62.2   
-##  [88] vsn_3.68.0              msmbstyle_0.0.19        pillar_1.9.0           
-##  [91] grid_4.3.1              vctrs_0.6.3             pcaMethods_1.92.0      
-##  [94] promises_1.2.1          ggpubr_0.6.0            car_3.1-2              
-##  [97] dbplyr_2.3.3            xtable_1.8-4            cluster_2.1.4          
-## [100] evaluate_0.21          
-##  [ reached getOption("max.print") -- omitted 28 entries ]
-
-Page built: 2023-09-06 using R version 4.3.1 Patched (2023-07-10 r84676)
-
- - - - - diff --git a/docs/style.css b/docs/style.css deleted file mode 100644 index 4c51529..0000000 --- a/docs/style.css +++ /dev/null @@ -1,5 +0,0 @@ -/* original background colour is #1881c2 */ - -:root { - --main-bg-color: #115a88; -} diff --git a/img/F02-3D-MS1-MS2-scans-100-1200-lattice.png b/img/F02-3D-MS1-MS2-scans-100-1200-lattice.png deleted file mode 100644 index 4440af6..0000000 Binary files a/img/F02-3D-MS1-MS2-scans-100-1200-lattice.png and /dev/null differ diff --git a/img/F02-3D-MS1-scans-400-1200-lattice.png b/img/F02-3D-MS1-scans-400-1200-lattice.png deleted file mode 100644 index 3672c9c..0000000 Binary files a/img/F02-3D-MS1-scans-400-1200-lattice.png and /dev/null differ diff --git a/img/MS1-MS2-spectra.png b/img/MS1-MS2-spectra.png deleted file mode 100644 index 8713843..0000000 Binary files a/img/MS1-MS2-spectra.png and /dev/null differ diff --git a/img/MSGFgui.png b/img/MSGFgui.png deleted file mode 100644 index 7195618..0000000 Binary files a/img/MSGFgui.png and /dev/null differ diff --git a/img/SE.png b/img/SE.png deleted file mode 100644 index 54974c1..0000000 Binary files a/img/SE.png and /dev/null differ diff --git a/img/SchematicMS2.png b/img/SchematicMS2.png deleted file mode 100644 index 7ff3198..0000000 Binary files a/img/SchematicMS2.png and /dev/null differ diff --git a/img/Silac.png b/img/Silac.png deleted file mode 100644 index b799a6c..0000000 Binary files a/img/Silac.png and /dev/null differ diff --git a/img/chromatogram.png b/img/chromatogram.png deleted file mode 100644 index 578e1bb..0000000 Binary files a/img/chromatogram.png and /dev/null differ diff --git a/img/chrompeaks.png b/img/chrompeaks.png deleted file mode 100644 index b6d71a3..0000000 Binary files a/img/chrompeaks.png and /dev/null differ diff --git a/img/cptac.png b/img/cptac.png deleted file mode 100644 index fa4b3d4..0000000 Binary files a/img/cptac.png and /dev/null differ diff --git a/img/frag.png b/img/frag.png deleted file mode 100644 index 309ba91..0000000 Binary files a/img/frag.png and /dev/null differ diff --git a/img/imp-sim.png b/img/imp-sim.png deleted file mode 100644 index 45054a1..0000000 Binary files a/img/imp-sim.png and /dev/null differ diff --git a/img/itraq.png b/img/itraq.png deleted file mode 100644 index f995f61..0000000 Binary files a/img/itraq.png and /dev/null differ diff --git a/img/mstut.gif b/img/mstut.gif deleted file mode 100644 index 63dd457..0000000 Binary files a/img/mstut.gif and /dev/null differ diff --git a/img/msvisfig.png b/img/msvisfig.png deleted file mode 100644 index c8d368b..0000000 Binary files a/img/msvisfig.png and /dev/null differ diff --git a/img/pbase.png b/img/pbase.png deleted file mode 100644 index b02a089..0000000 Binary files a/img/pbase.png and /dev/null differ diff --git a/img/pr0c00313_0002.gif b/img/pr0c00313_0002.gif deleted file mode 100644 index 8ddfa50..0000000 Binary files a/img/pr0c00313_0002.gif and /dev/null differ diff --git a/img/raw.png b/img/raw.png deleted file mode 100644 index e961670..0000000 Binary files a/img/raw.png and /dev/null differ diff --git a/img/vp2.png b/img/vp2.png deleted file mode 100644 index de9f234..0000000 Binary files a/img/vp2.png and /dev/null differ diff --git a/inst/.gitignore b/inst/.gitignore new file mode 100644 index 0000000..c1adf9a --- /dev/null +++ b/inst/.gitignore @@ -0,0 +1,4 @@ +/.quarto/ +/docs/ +*_cache +*_files diff --git a/inst/LICENSE.qmd b/inst/LICENSE.qmd new file mode 100644 index 0000000..7ce3af0 --- /dev/null +++ b/inst/LICENSE.qmd @@ -0,0 +1,9 @@ +--- +title: 
"Open Source License" +--- + +The "R for Mass Spectrometry" *package* is licensed under the [GNU GPL v3](https://www.gnu.org/licenses/gpl-3.0.en.html). + +The "R for Mass Spectrometry" *book* rendered by the package is licensed under the [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/). + + diff --git a/inst/_quarto.yml b/inst/_quarto.yml new file mode 100644 index 0000000..0f09571 --- /dev/null +++ b/inst/_quarto.yml @@ -0,0 +1,12 @@ +project: + type: book + output-dir: docs + +metadata-files: + - assets/_book.yml + - assets/_website.yml + - assets/_format.yml + - assets/_knitr.yml + +filters: + - extensions/tools-tabset-ext/tools-tabset.lua diff --git a/inst/assets/_book.yml b/inst/assets/_book.yml new file mode 100644 index 0000000..6ab17cf --- /dev/null +++ b/inst/assets/_book.yml @@ -0,0 +1,24 @@ +book: + license: "CC BY-NC-SA" + title: "R for Mass Spectrometry" + chapters: + - index.qmd + - pages/introduction.qmd + - pages/raw-ms-data.qmd + - pages/20-id.qmd + - pages/30-quant.qmd + - pages/95-annex.qmd + - pages/99-si.qmd + cover-image: assets/cover.png + favicon: assets/favicon.png + sidebar: + tools: + - icon: git + menu: + - text: Source Code + url: https://github.com/js2264/R4MS/ + - text: Browse version `devel` + url: https://js2264.github.io/R4MS/docs/devel/ + style: "docked" + background: "light" + collapse-level: 5 diff --git a/inst/assets/_format.yml b/inst/assets/_format.yml new file mode 100644 index 0000000..5c1b255 --- /dev/null +++ b/inst/assets/_format.yml @@ -0,0 +1,15 @@ +format: + html: + grid: + sidebar-width: 500px + body-width: 800px + theme: + - cosmo + - assets/book.scss + mainfont: "Atkinson Hyperlegible, sans-serif" + highlight-style: atom-one + code-link: true + editor: visual + bibliography: assets/bibliography.bib + from: markdown+emoji + # pdf: default diff --git a/inst/assets/_knitr.yml b/inst/assets/_knitr.yml new file mode 100644 index 0000000..d0001a2 --- /dev/null +++ b/inst/assets/_knitr.yml @@ -0,0 +1,15 @@ +knitr: + opts_chunk: + collapse: true + comment: "## " + cache: false + fig.align: "center" + python.reticulate: false + R.options: + dplyr.print_min: 6 + dplyr.print_max: 6 + pillar.max_footer_lines: 2 + pillar.min_chars: 15 + stringr.view_n: 6 + pillar.bold: TRUE + width: 77 # 80 - 3 for #> comments diff --git a/inst/assets/_website.yml b/inst/assets/_website.yml new file mode 100644 index 0000000..7ebfca5 --- /dev/null +++ b/inst/assets/_website.yml @@ -0,0 +1,14 @@ +website: + back-to-top-navigation: true + search: + location: sidebar + page-footer: + background: light + left: | + This book was built with BiocBook with :heart: + center: + - text: "License" + href: LICENSE.qmd + repo-branch: devel + repo-actions: [edit, issue] + open-graph: true diff --git a/packages.bib b/inst/assets/bibliography.bib similarity index 50% rename from packages.bib rename to inst/assets/bibliography.bib index cdcaaf6..9c2f8d7 100644 --- a/packages.bib +++ b/inst/assets/bibliography.bib @@ -1,3 +1,74 @@ +@Manual{serizay2023, + title = {BiocBook: Write, publish and maintain versioned Quarto books with Bioconductor}, + author = {Jacques Serizay}, + year = {2023}, + note = {R package version 0.99.0}, + url = {https://github.com/js2264/BiocBook}, +} + +@Manual{lun2023, + title = {rebook: Re-using Content in Bioconductor Books}, + author = {Aaron Lun}, + year = {2023}, + note = {R package version 1.11.1}, + url = {https://bioconductor.org/packages/rebook}, + doi = {10.18129/B9.bioc.rebook}, +} + 
+@software{Allaire_Quarto_2022, +author = {Allaire, J.J. and Teague, Charles and Scheidegger, Carlos and Xie, Yihui and Dervieux, Christophe}, +doi = {10.5281/zenodo.5960048}, +month = jan, +title = {{Quarto}}, +url = {https://github.com/quarto-dev/quarto-cli}, +version = {1.2}, +year = {2022} +} + +@Manual{Wickham2022, + title = {devtools: Tools to Make Developing R Packages Easier}, + author = {Hadley Wickham and Jim Hester and Winston Chang and Jennifer Bryan}, + year = {2022}, + note = {R package version 2.4.5}, + url = {https://CRAN.R-project.org/package=devtools}, +} + + +@Manual{serizay2023, + title = {BiocBook: Write, publish and maintain versioned Quarto books with Bioconductor}, + author = {Jacques Serizay}, + year = {2023}, + note = {R package version 0.99.0}, + url = {https://github.com/js2264/BiocBook}, +} + +@Manual{lun2023, + title = {rebook: Re-using Content in Bioconductor Books}, + author = {Aaron Lun}, + year = {2023}, + note = {R package version 1.11.1}, + url = {https://bioconductor.org/packages/rebook}, + doi = {10.18129/B9.bioc.rebook}, +} + +@software{Allaire_Quarto_2022, +author = {Allaire, J.J. and Teague, Charles and Scheidegger, Carlos and Xie, Yihui and Dervieux, Christophe}, +doi = {10.5281/zenodo.5960048}, +month = jan, +title = {{Quarto}}, +url = {https://github.com/quarto-dev/quarto-cli}, +version = {1.2}, +year = {2022} +} + +@Manual{Wickham2022, + title = {devtools: Tools to Make Developing R Packages Easier}, + author = {Hadley Wickham and Jim Hester and Winston Chang and Jennifer Bryan}, + year = {2022}, + note = {R package version 2.4.5}, + url = {https://CRAN.R-project.org/package=devtools}, +} + @Manual{R-base, title = {R: A Language and Environment for Statistical Computing}, author = {{R Core Team}}, @@ -617,3 +688,435 @@ @Article{tidyverse2019 doi = {10.21105/joss.01686}, } +@Manual{R-base, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2021}, + url = {https://www.R-project.org/}, +} + +@Manual{R-bookdown, + title = {bookdown: Authoring Books and Technical Documents with R Markdown}, + author = {Yihui Xie}, + year = {2021}, + note = {R package version 0.21.6}, + url = {https://github.com/rstudio/bookdown}, +} + +@Manual{R-msmbstyle, + title = {msmbstyle: MSMB Styles for R Markdown Documents}, + author = {Mike Smith}, + year = {2021}, + note = {R package version 0.0.18}, +} + +@Manual{R-rmarkdown, + title = {rmarkdown: Dynamic Documents for R}, + author = {JJ Allaire and Yihui Xie and Jonathan McPherson and Javier Luraschi and Kevin Ushey and Aron Atkins and Hadley Wickham and Joe Cheng and Winston Chang and Richard Iannone}, + year = {2021}, + note = {R package version 2.7}, + url = {https://CRAN.R-project.org/package=rmarkdown}, +} + +@Book{bookdown2016, + title = {bookdown: Authoring Books and Technical Documents with {R} Markdown}, + author = {Yihui Xie}, + publisher = {Chapman and Hall/CRC}, + address = {Boca Raton, Florida}, + year = {2016}, + note = {ISBN 978-1138700109}, + url = {https://github.com/rstudio/bookdown}, +} + +@Book{rmarkdown2018, + title = {R Markdown: The Definitive Guide}, + author = {Yihui Xie and J.J. 
Allaire and Garrett Grolemund}, + publisher = {Chapman and Hall/CRC}, + address = {Boca Raton, Florida}, + year = {2018}, + note = {ISBN 9781138359338}, + url = {https://bookdown.org/yihui/rmarkdown}, +} + +@Book{rmarkdown2020, + title = {R Markdown Cookbook}, + author = {Yihui Xie and Christophe Dervieux and Emily Riederer}, + publisher = {Chapman and Hall/CRC}, + address = {Boca Raton, Florida}, + year = {2020}, + note = {ISBN 9780367563837}, + url = {https://bookdown.org/yihui/rmarkdown-cookbook}, +} + + +@ARTICLE{Gatto:2020, + title = "{MSnbase}, efficient and elegant R-based processing and + visualisation of raw mass spectrometry data", + author = "Gatto, Laurent and Gibb, Sebastian and Rainer, Johannes", + abstract = "We present version 2 of the MSnbase R/Bioconductor package. + MSnbase provides infrastructure for the manipulation, processing + and visualisation of mass spectrometry data. We focus on the new + on-disk infrastructure, that allows the handling of large raw + mass spectrometry experiments on commodity hardware and + illustrate how the package is used for elegant data processing, + method development and visualisation.", + journal = "J. Proteome Res.", + month = sep, + year = 2020, + language = "en" +} + +@Article{MAE, + title = {Software For The Integration Of Multi-Omics Experiments + In Bioconductor}, + author = {Marcel Ramos and Lucas Schiffer and Angela Re and Rimsha + Azhar and Azfar Basunia and Carmen Rodriguez Cabrera + and Tiffany Chan and Philip Chapman and Sean Davis + and David Gomez-Cabrero and Aedin C. Culhane and + Benjamin Haibe-Kains and Kasper Hansen and Hanish + Kodali and Marie Stephie Louis and Arvind Singh Mer + and Markus Reister and Martin Morgan and Vincent + Carey and Levi Waldron}, + journal = {Cancer Research}, + year = {2017}, + volume = {77(21); e39-42}, + } + + +@Manual{SE, + title = {SummarizedExperiment: SummarizedExperiment container}, + author = {Martin Morgan and Valerie Obenchain and Jim Hester and Hervé Pagès}, + year = {2020}, + note = {R package version 1.21.0}, + url = {https://bioconductor.org/packages/SummarizedExperiment}, + } + +@Article{Christoforou:2016, + author = {Christoforou, Andy and Mulvey, Claire M and + Breckels, Lisa M and Geladaki, Aikaterini and + Hurrell, Tracey and Hayward, Penelope C and Naake, + Thomas and Gatto, Laurent and Viner, Rosa and + Martinez Arias, Alfonso and Lilley, Kathryn S}, + title = {A draft map of the mouse pluripotent stem cell + spatial proteome.}, + journal = {Nat Commun}, + year = {2016}, + month = {}, + number = {}, + volume = {7}, + pages = {8992}, + doi = {10.1038/ncomms9992}, + PMID = {26754106}} + +@article{Sticker:2019, + author = {Sticker, Adriaan and Goeminne, Ludger and Martens, Lennart and Clement, Lieven}, + title = {Robust summarization and inference in proteome-wide label-free quantification}, + elocation-id = {668863}, + year = {2019}, + doi = {10.1101/668863}, + publisher = {Cold Spring Harbor Laboratory}, + abstract = {Label-Free Quantitative mass spectrometry based + workflows for differential expression (DE) analysis + of proteins impose important challenges on the data + analysis due to peptide-specific effects and context + dependent missingness of peptide + intensities. Peptide-based workflows, like MSqRob, + test for DE directly from peptide intensities and + outper-form summarization methods which first + aggregate MS1 peptide intensities to protein + intensities before DE analysis. 
However, these + methods are computationally expensive, often hard to + understand for the non-specialised end-user, and do + not provide protein summaries, which are important + for visualisation or downstream processing. In this + work, we therefore evaluate state-of-the-art + summarization strategies using a benchmark spike-in + dataset and discuss why and when these fail compared + to the state-of-the-art peptide based model, + MSqRob. Based on this evaluation, we propose a novel + summarization strategy, MSqRob-Sum, which estimates + MSqRob{\textquoteright}s model parameters in a + two-stage procedure circumventing the drawbacks of + peptide-based workflows. MSqRobSum maintains + MSqRob{\textquoteright}s superior performance, while + providing useful protein expression summaries for + plotting and downstream analysis. Summarising + peptide to protein intensities considerably reduces + the computational complexity, the memory footprint + and the model complexity, and makes it easier to + disseminate DE inferred on protein + summaries. Moreover, MSqRobSum provides a highly + modular analysis framework, which provides + researchers with full flexibility to develop data + analysis workflows tailored towards their specific + applications.}, + URL = {https://www.biorxiv.org/content/early/2019/06/13/668863}, + eprint = {https://www.biorxiv.org/content/early/2019/06/13/668863.full.pdf}, + journal = {bioRxiv} +} + +@ARTICLE{Paulovich:2010, + title = "Interlaboratory study characterizing a yeast performance standard + for benchmarking {LC-MS} platform performance", + author = "Paulovich, Amanda G and Billheimer, Dean and Ham, Amy-Joan L and + Vega-Montoto, Lorenzo and Rudnick, Paul A and Tabb, David L and + Wang, Pei and Blackman, Ronald K and Bunk, David M and Cardasis, + Helene L and Clauser, Karl R and Kinsinger, Christopher R and + Schilling, Birgit and Tegeler, Tony J and Variyath, Asokan + Mulayath and Wang, Mu and Whiteaker, Jeffrey R and Zimmerman, + Lisa J and Fenyo, David and Carr, Steven A and Fisher, Susan J + and Gibson, Bradford W and Mesri, Mehdi and Neubert, Thomas A and + Regnier, Fred E and Rodriguez, Henry and Spiegelman, Cliff and + Stein, Stephen E and Tempst, Paul and Liebler, Daniel C", + abstract = "Optimal performance of LC-MS/MS platforms is critical to + generating high quality proteomics data. Although individual + laboratories have developed quality control samples, there is no + widely available performance standard of biological complexity + (and associated reference data sets) for benchmarking of platform + performance for analysis of complex biological proteomes across + different laboratories in the community. Individual preparations + of the yeast Saccharomyces cerevisiae proteome have been used + extensively by laboratories in the proteomics community to + characterize LC-MS platform performance. The yeast proteome is + uniquely attractive as a performance standard because it is the + most extensively characterized complex biological proteome and + the only one associated with several large scale studies + estimating the abundance of all detectable proteins. In this + study, we describe a standard operating protocol for large scale + production of the yeast performance standard and offer aliquots + to the community through the National Institute of Standards and + Technology where the yeast proteome is under development as a + certified reference material to meet the long term needs of the + community. 
Using a series of metrics that characterize LC-MS + performance, we provide a reference data set demonstrating + typical performance of commonly used ion trap instrument + platforms in expert laboratories; the results provide a basis for + laboratories to benchmark their own performance, to improve upon + current methods, and to evaluate new technologies. Additionally, + we demonstrate how the yeast reference, spiked with human + proteins, can be used to benchmark the power of proteomics + platforms for detection of differentially expressed proteins at + different levels of concentration in a complex matrix, thereby + providing a metric to evaluate and minimize pre-analytical and + analytical variation in comparative proteomics experiments.", + journal = "Mol. Cell. Proteomics", + volume = 9, + number = 2, + pages = "242--254", + month = feb, + year = 2010, + language = "en" +} + +@Article{Lazar:2016, + author = {Lazar, C and Gatto, L and Ferro, M and Bruley, C + and Burger, T}, + title = {Accounting for the Multiple Natures of Missing + Values in Label-Free Quantitative Proteomics Data + Sets to Compare Imputation Strategies.}, + journal = {J Proteome Res}, + year = {2016}, + month = {Apr}, + number = {4}, + volume = {15}, + pages = {1116-25}, + doi = {10.1021/acs.jproteome.5b00981}, + PMID = {26906401} +} + +@Article{Cox:2008, + author = {Cox, J and Mann, M}, + title = {MaxQuant enables high peptide identification + rates, individualized p.p.b.-range mass accuracies + and proteome-wide protein quantification.}, + journal = {Nat Biotechnol}, + year = {2008}, + month = {Dec}, + number = {12}, + volume = {26}, + pages = {1367-72}, + doi = {10.1038/nbt.1511}, + PMID = {19029910}} + +@article{Morgenstern:2020, + author = {Morgenstern, David and Barzilay, Rotem and Levin, Yishai}, + title = {{RawBeans}: A Simple, Vendor-Independent, Raw-Data Quality-Control Tool}, + journal = {Journal of Proteome Research}, + year = {2021}, + doi = {10.1021/acs.jproteome.0c00956}, + note ={PMID: 33657803}, + URL = {https://doi.org/10.1021/acs.jproteome.0c00956}, + eprint = {https://doi.org/10.1021/acs.jproteome.0c00956} +} + + +@ARTICLE{Vanderaa:2021, + title = "Replication of single-cell proteomics data reveals important + computational challenges", + author = "Vanderaa, Christophe and Gatto, Laurent", + abstract = "INTRODUCTION: Mass spectrometry-based proteomics is actively + embracing quantitative, single-cell level analyses. Indeed, + recent advances in sample preparation and mass spectrometry (MS) + have enabled the emergence of quantitative MS-based single-cell + proteomics (SCP). While exciting and promising, SCP still has + many rough edges. The current analysis workflows are custom and + built from scratch. The field is therefore craving for + standardized software that promotes principled and reproducible + SCP data analyses. AREAS COVERED: This special report is the + first step toward the formalization and standardization of SCP + data analysis. scp, the software that accompanies this work, + successfully replicates one of the landmark SCP studies and is + applicable to other experiments and designs. We created a + repository containing the replicated workflow with comprehensive + documentation in order to favor further dissemination and + improvements of SCP data analyses. EXPERT OPINION: Replicating + SCP data analyses uncovers important challenges in SCP data + analysis. We describe two such challenges in detail: batch + correction and data missingness. 
We provide the current + state-of-the-art and illustrate the associated limitations. We + also highlight the intimate dependence that exists between batch + effects and data missingness and offer avenues for dealing with + these exciting challenges.", + journal = "Expert Rev. Proteomics", + month = oct, + year = 2021, + keywords = "Bioconductor; R; batch correction; imputation; mass spectrometry; + proteomics; replication; reproducible research; single-cell; + software", + language = "en" +} + + +@ARTICLE{Rainer:2022, + title = "A Modular and Expandable Ecosystem for Metabolomics Data + Annotation in {R}", + author = "Rainer, Johannes and Vicini, Andrea and Salzer, Liesa and + Stanstrup, Jan and Badia, Josep M and Neumann, Steffen and + Stravs, Michael A and Verri Hernandes, Vinicius and Gatto, + Laurent and Gibb, Sebastian and Witting, Michael", + abstract = "Liquid chromatography-mass spectrometry (LC-MS)-based untargeted + metabolomics experiments have become increasingly popular + because of the wide range of metabolites that can be analyzed + and the possibility to measure novel compounds. LC-MS + instrumentation and analysis conditions can differ substantially + among laboratories and experiments, thus resulting in + non-standardized datasets demanding customized annotation + workflows. We present an ecosystem of R packages, centered + around the MetaboCoreUtils, MetaboAnnotation and CompoundDb + packages that together provide a modular infrastructure for the + annotation of untargeted metabolomics data. Initial annotation + can be performed based on MS1 properties such as m/z and + retention times, followed by an MS2-based annotation in which + experimental fragment spectra are compared against a reference + library. Such reference databases can be created and managed + with the CompoundDb package. The ecosystem supports data from a + variety of formats, including, but not limited to, MSP, MGF, + mzML, mzXML, netCDF as well as MassBank text files and SQL + databases. Through its highly customizable functionality, the + presented infrastructure allows to build reproducible annotation + workflows tailored for and adapted to most untargeted + LC-MS-based datasets. All core functionality, which supports + base R data types, is exported, also facilitating its re-use in + other R packages. Finally, all packages are thoroughly + unit-tested and documented and are available on GitHub and + through Bioconductor.", + journal = "Metabolites", + publisher = "Multidisciplinary Digital Publishing Institute", + volume = 12, + number = 2, + pages = "173", + month = feb, + year = 2022, + language = "en" +} + +@article{Sinha:2020, + author = {Sinha, Ankit and Mann, Matthias}, + title = "{A beginner’s guide to mass spectrometry–based proteomics}", + journal = {The Biochemist}, + year = {2020}, + month = {09}, + abstract = "{Mass spectrometry (MS)-based proteomics is the most + comprehensive approach for the quantitative + profiling of proteins, their interactions and + modifications. It is a challenging topic as a firm + grasp requires expertise in biochemistry for sample + preparation, analytical chemistry for + instrumentation and computational biology for data + analysis. In this short guide, we highlight the + various components of a mass spectrometer, the + sample preparation process for conversion of + proteins into peptides, and quantification and + analysis strategies. 
The advancing technology of + MS-based proteomics now opens up opportunities in + clinical applications and single-cell analysis.}", + issn = {0954-982X}, + doi = {10.1042/BIO20200057}, + url = {https://doi.org/10.1042/BIO20200057}, + note = {BIO20200057}, + eprint = {https://portlandpress.com/biochemist/article-pdf/doi/10.1042/BIO20200057/892770/bio20200057.pdf}, +} + +@Article{Steen:2004, + title = "The {ABC's} (and {XYZ's}) of peptide sequencing", + author = "Steen, Hanno and Mann, Matthias", + abstract = "Proteomics is an increasingly powerful and indispensable + technology in molecular cell biology. It can be used to identify + the components of small protein complexes and large organelles, + to determine post-translational modifications and in + sophisticated functional screens. The key - but little understood + - technology in mass-spectrometry-based proteomics is peptide + sequencing, which we describe and review here in an easily + accessible format.", + journal = "Nat. Rev. Mol. Cell Biol.", + volume = 5, + number = 9, + pages = "699--711", + month = sep, + year = 2004, + language = "en" +} + +@ARTICLE{Marcotte:2007, + title = "How do shotgun proteomics algorithms identify proteins?", + author = "Marcotte, Edward M", + journal = "Nat. Biotechnol.", + volume = 25, + number = 7, + pages = "755--757", + month = jul, + year = 2007, + language = "en" +} + + +@ARTICLE{Shuken:2023, + title = "An Introduction to Mass {Spectrometry-Based} Proteomics", + author = "Shuken, Steven R", + abstract = "Mass spectrometry is unmatched in its versatility for studying + practically any aspect of the proteome. Because the foundations + of mass spectrometry-based proteomics are complex and span + multiple scientific fields, proteomics can be perceived as having + a high barrier to entry. This tutorial is intended to be an + accessible illustrated guide to the technical details of a + relatively simple quantitative proteomic experiment. An attempt + is made to explain the relevant concepts to those with limited + knowledge of mass spectrometry and a basic understanding of + proteins. An experimental overview is provided, from the + beginning of sample preparation to the analysis of protein group + quantities, with explanations of how the data are acquired, + processed, and analyzed. A selection of advanced topics is + briefly surveyed and works for further reading are cited. To + conclude, a brief discussion of the future of proteomics is + given, considering next-generation protein sequencing + technologies that may complement mass spectrometry to create a + fruitful future for proteomics.", + journal = "J. Proteome Res.", + month = jun, + year = 2023, + keywords = "bottom-up; data-dependent acquisition; label-free quantification; + mass spectrometry; proteomics; untargeted proteomics", + language = "en" +} diff --git a/inst/assets/book.scss b/inst/assets/book.scss new file mode 100644 index 0000000..174271f --- /dev/null +++ b/inst/assets/book.scss @@ -0,0 +1,249 @@ +/*-- scss:defaults --*/ + +$primary: #070707 !default; +$body-color: #070707 !default; + +/*-- scss:rules --*/ + +/* ------------------------------------------------------ */ +/* ------------------------------------------------------ */ +/* ------------------ CUSTOM RULES ---------------------- */ +/* ------------------------------------------------------ */ +/* ------------------------------------------------------ */ + +// Add any custom css styling here... 
+ +/* Callout ------------------------------------------------ */ + +$icon: url('data:image/svg+xml,') !default; + +div.callout-question.callout { + border-left-color: #0df0a1; +} + +div.callout-question.callout-style-default>.callout-header { + background-color: #bfebce; +} + +.callout-question.icon.callout-style-default div.callout-icon-container { + padding-top: 0.1em; + padding-right: 0.35em; +} + +div.callout-answer.callout { + border-left-color: #0dcaf0; +} + +div.callout-answer.callout-style-default>.callout-header { + background-color: #bfe4eb; +} + +.callout-answer.icon .callout-icon { + display: unset !important; +} + +div.callout-answer.icon.callout-captioned .callout-icon::before { + background-image: $icon; +} + +.callout-answer.icon.callout-style-default div.callout-icon-container { + padding-top: 0.1em; + padding-right: 0.35em; +} + + +/* ------------------------------------------------------ */ +/* ------------------------------------------------------ */ +/* ----------------- DEFAULT RULES ---------------------- */ +/* ------------------------------------------------------ */ +/* ------------------------------------------------------ */ + +code { color: #070707; } + +/* Code chunks ------------------------------------------ */ + +div.sourceCode { + background-color: #ffffff00; + border: 2px; + border-radius: 8px; + box-shadow: 0 0 0 0 rgba(0, 0, 0, 0.06), 0 2px 5px 0 rgba(0, 0, 0, 0.06), 0 10px 10px 0 rgba(0, 0, 0, 0.05), 0 22px 13px 0 rgba(0, 0, 0, 0.03), 0 39px 16px 0 rgba(0, 0, 0, 0.01), 0 61px 17px 0 rgba(0, 0, 0, 0); +} + +pre.sourceCode.r, code.sourceCode.r { + + $border: 2px; + color: #070707; + background: #FFF; + background-clip: padding-box; /* !importanté */ + border: solid $border transparent; /* !importanté */ + border-radius: 8px; + + &:before { + content: ''; + position: absolute; + top: 0; right: 0; bottom: 0; left: 0; + z-index: -1; + margin: -$border; /* !importanté */ + border-radius: inherit; /* !importanté */ + background: linear-gradient(to right, #18a603, #0484a9, #0087af); + } + +} + +/* Chapter label ---------------------------------------- */ + +.chapter-number::before { + content: "Chapter "; +} + +.chapter-number::after { + content: " –"; +} + +/* Cover image ------------------------------------------ */ + +.quarto-cover-image { + max-width: 250px; + float: right; + margin-left: 30px; + margin-top: -30px; + margin-right: 10%; +} + +/* Left navbar ------------------------------------------ */ + +div.sidebar-item-container .active, div.sidebar-item-container .show>.nav-link, div.sidebar-item-container .sidebar-link>code { + font-weight: 800; +} + +.sidebar-title { + font-weight: 800; + background: linear-gradient(to right, #18a603, #0484a9, #0087af); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; +} + +.sidebar-tools-main { + font-weight: normal; + background: white; + color: black; + -webkit-background-clip: text; + -webkit-text-fill-color: black; +} + +.sidebar-navigation li a { + text-decoration: underline; +} + +.text-start { + text-align: left !important; + font-weight: 800; +} + +#quarto-sidebar { + transition: width .15s ease-in; + padding: 14px 10%; + background-color: white; +} + +.sidebar.sidebar-navigation:not(.rollup) { + border-right: 0px !important +} + +.sidebar-navigation .sidebar-item { + font-size: 1rem; + line-height: 2em; +} + +.sidebar-menu-container { + border: solid #add2dd 1px; + border-radius: 8px; + padding: 8px; + margin-top: 25px; +} + +/* right navbar ----------------------------------------- 
*/ + +.sidebar nav[role=doc-toc] ul>li>a.active, .sidebar nav[role=doc-toc] ul>li>ul>li>a.active { + border-left: 4px solid #3792ad; + font-weight: 800; +} + +#toc-title+ ul > li > .nav-link { + font-weight: 800; +} + +/* Headings and text styling --------------------------- */ + +h1 { + font-size: 36px; + color: #070707; + font-weight: 700; + border-image: linear-gradient(to right, #18a603, #0484a9, #0087af) 1; + border-bottom-style: solid; + border-bottom-width: 4px; +} +h2 { + margin-top: 3rem; + margin-bottom: 1rem; + font-size: 32px; + border-bottom: 0px; +} +h3 { margin-top: 1.5em; font-size: 1.2rem; } +h4 { margin-top: 1.5em; font-size: 1.1rem; } +h5 { margin-top: 1.5em; font-size: 1rem; } + +h1, .h1, h2, .h2, h3, .h3, h4, .h4, h5, .h5 { + line-height: 120%; + margin: 0 0 1rem; + width: fit-content; + padding-top: 0.5rem; + color: #070707; +} + +p { + margin: 0 0 1rem; + font-size: 1rem; + color: #070707; + line-height: 130%; + display: block; + margin-block-start: 1em; + margin-block-end: 1em; + margin-inline-start: 0px; + margin-inline-end: 0px; +} + +.quarto-section-identifier { + color: #070707; + font-weight: normal; +} + +ul li::marker { + color: #3792ad; +} + +/* Underlining links ------------------------------------ */ + +.citation a, .footnote-ref { + text-decoration: underline; +} + +/* Printing --------------------------------------------- */ + +@media print { + :root { + font-size: 11pt; + } + #quarto-sidebar, #TOC, .nav-page { + display: none; + } + .page-columns .content { + grid-column-start: page-start; + } + .fixed-top { + position: relative; + } + .panel-caption, .figure-caption, figcaption { + color: #666; + } +} diff --git a/inst/assets/cover.png b/inst/assets/cover.png new file mode 100644 index 0000000..0c59537 Binary files /dev/null and b/inst/assets/cover.png differ diff --git a/inst/assets/favicon.png b/inst/assets/favicon.png new file mode 100644 index 0000000..d36072a Binary files /dev/null and b/inst/assets/favicon.png differ diff --git a/inst/extensions/pandoc-ext/section-bibliographies/_extension.yml b/inst/extensions/pandoc-ext/section-bibliographies/_extension.yml new file mode 100644 index 0000000..2f3e8c7 --- /dev/null +++ b/inst/extensions/pandoc-ext/section-bibliographies/_extension.yml @@ -0,0 +1,6 @@ +name: section-bibliographies +author: Albert Krewinkel +version: 0.0.1 +contributes: + filters: + - section-bibliographies.lua diff --git a/inst/extensions/pandoc-ext/section-bibliographies/section-bibliographies.lua b/inst/extensions/pandoc-ext/section-bibliographies/section-bibliographies.lua new file mode 100644 index 0000000..957aad3 --- /dev/null +++ b/inst/extensions/pandoc-ext/section-bibliographies/section-bibliographies.lua @@ -0,0 +1,148 @@ +--- greetings.lua – turns any document into a friendly greeting +--- +--- Copyright: © 2018 Jesse Rosenthal, 2020–2022 Albert Krewinkel +--- License: MIT – see LICENSE for details + +-- pandoc.utils.make_sections exists since pandoc 2.8 +PANDOC_VERSION:must_be_at_least {2,8} + +local utils = require 'pandoc.utils' +local run_json_filter = utils.run_json_filter + +--- The document's metadata +local meta +-- Lowest level at which bibliographies should be generated. +local section_refs_level +-- original bibliography value +local orig_bibliography + +-- Returns true iff a div is a section div. 
+local function is_section_div (div) + return div.t == 'Div' + and div.classes[1] == 'section' + and div.attributes.number +end + +local function section_header (div) + local header = div.content and div.content[1] + local is_header = is_section_div(div) + and header + and header.t == 'Header' + return is_header and header or nil +end + +local function adjust_refs_components (div) + local header = section_header(div) + if not header then + return div + end + local blocks = div.content + local bib_header = blocks:find_if(function (b) + return b.attr and b.identifier == 'bibliography' + end) + local refs = blocks:find_if(function (b) + return b.attr and b.identifier == 'refs' + end) + if bib_header then + bib_header.identifier = 'bibliography-' .. header.attributes.number + bib_header.level = header.level + 1 + end + if refs and refs.identifier == 'refs' then + refs.identifier = 'refs-' .. header.attributes.number + end + return div +end + +local function run_citeproc (doc) + if PANDOC_VERSION >= '2.19.1' then + return pandoc.utils.citeproc(doc) + elseif PANDOC_VERSION >= '2.11' then + local args = {'--from=json', '--to=json', '--citeproc'} + return run_json_filter(doc, 'pandoc', args) + else + return run_json_filter(doc, 'pandoc-citeproc', {FORMAT, '-q'}) + end +end + +--- Create a bibliography for a given topic. This acts on all +-- section divs at or above `section_refs_level` +local function create_section_bibliography (div) + -- don't do anything if there is no bibliography + if not meta.bibliography and not meta.references then + return nil + end + local header = section_header(div) + -- Blocks for which a bibliography will be generated + local subsections + local blocks + if not header or section_refs_level < header.level then + -- Don't do anything for lower level sections. + return nil + elseif section_refs_level == header.level then + blocks = div.content + subsections = pandoc.List:new{} + else + blocks = div.content:filter(function (b) + return not is_section_div(b) + end) + subsections = div.content:filter(is_section_div) + end + local tmp_doc = pandoc.Pandoc(blocks, meta) + local new_doc = run_citeproc(tmp_doc) + div.content = new_doc.blocks .. subsections + return adjust_refs_components(div) +end + +--- Remove remaining section divs +local function flatten_sections (div) + local header = section_header(div) + if not header then + return nil + else + header.identifier = div.identifier + header.attributes.number = nil + div.content[1] = header + return div.content + end +end + +--- Filter to the references div and bibliography header added by +--- pandoc-citeproc. +local remove_pandoc_citeproc_results = { + Header = function (header) + return header.identifier == 'bibliography' + and {} + or nil + end, + Div = function (div) + return div.identifier == 'refs' + and {} + or nil + end +} + +local function restore_bibliography (meta) + meta.bibliography = orig_bibliography + return meta +end + +--- Setup the document for further processing by wrapping all +--- sections in Div elements. 
+function setup_document (doc) + -- save meta for other filter functions + meta = doc.meta + section_refs_level = tonumber(meta["section-bibs-level"]) or 1 + orig_bibliography = meta.bibliography + meta.bibliography = meta['section-bibs-bibliography'] or meta.bibliography + local sections = utils.make_sections(true, nil, doc.blocks) + return pandoc.Pandoc(sections, doc.meta) +end + +return { + -- remove result of previous pandoc-citeproc run (for backwards + -- compatibility) + remove_pandoc_citeproc_results, + {Pandoc = setup_document}, + {Div = create_section_bibliography}, + {Div = flatten_sections, Meta = restore_bibliography} +} diff --git a/inst/extensions/tools-tabset-ext/tools-tabset.lua b/inst/extensions/tools-tabset-ext/tools-tabset.lua new file mode 100644 index 0000000..6b11e00 --- /dev/null +++ b/inst/extensions/tools-tabset-ext/tools-tabset.lua @@ -0,0 +1,39 @@ + + +local kTabsetIcons = { + ["VS Code"] = "vscode-logo.jpg", + ["R"] = "rstudio-logo.jpg", + ["Terminal"] = "text-editor-logo.jpg" +} + +local injected = false +local function injectChooseYourTool() + if not injected then + injected = true + quarto.doc.include_text('after-body', [[ + + ]]) + end +end + +function Tabset(el) + if el.attr.attributes["group"] == "tools-tabset" then + injectChooseYourTool() + for i, tab in ipairs(el.tabs) do + local text = pandoc.utils.stringify(tab.title) + local icon = kTabsetIcons[text] + if icon then + tab.title.content:insert(1, pandoc.Image("", "/pages/images/" .. icon)) + end + end + end + return el +end \ No newline at end of file diff --git a/index.Rmd b/inst/index.qmd similarity index 69% rename from index.Rmd rename to inst/index.qmd index fb8478c..eacc643 100644 --- a/index.Rmd +++ b/inst/index.qmd @@ -1,25 +1,37 @@ --- -title: "R for Mass Spectrometry" -subtitle: "Applications in Proteomics and Metabolomics" -author: "Laurent Gatto, Sebastian Gibb, Johannes Rainer" -date: "`r Sys.Date()`" -output: - msmbstyle::msmb_html_book: - highlight: tango - toc: TRUE - toc_depth: 1 - split_by: chapter - margin_references: TRUE - css: style.css -bibliography: [refs.bib, packages.bib] -link-citations: yes +license: "CC BY-SA" --- -# Preamble +```{r "intro"} +#| echo: false +intro <- tryCatch( + { + description <- packageDescription("R4MS") + pkg <- description$Package + version <- description$Version + authors <- eval(parse(text = description$Authors)) + license <- description$License + glue::glue( + "**Package:** {pkg}
\n", + "**Authors:** {paste(format(authors, include = c('given', 'family', 'role')), collapse = ', ')}
\n", + "**Compiled:** {as.character(Sys.Date())}
\n", + "**Package version:** {version}
\n", + "**R version:** {R.version.string}
\n", + "**BioC version:** {BiocManager::version()}
\n", + "**Package license:** {license}
\n", + "**Book license:** CC BY-NC-SA
" + ) + }, + error = function(e) {"Local preview"} +) +``` + +`r intro` + +# Welcome {-} ```{r, echo = FALSE} options(bitmapType="cairo") - ``` The aim of the [R for Mass @@ -32,10 +44,6 @@ development efforts of its core members under the RforMassSpectrometry organisation to facilitate dissemination and accessibility of their work. -```{r sticker, fig.cap = "The *R for Mass Spectrometry* intiative sticker, designed by Johannes Rainer.", out.width = '50%', fig.margin=TRUE, echo=FALSE} -knitr::include_graphics("https://github.com/rformassspectrometry/stickers/raw/master/sticker/RforMassSpectrometry.png") -``` - This material introduces participants to the analysis and exploration of mass spectrometry (MS) based proteomics data using R and Bioconductor. The course will cover all levels of MS data, from raw @@ -66,12 +74,6 @@ data structures such as data frames, vectors, matrices, ... and their manipulation) is required. Familiarity with other Bioconductor omics data classes and the tidyverse syntax is useful, but not necessary. - -```{r bib, include=FALSE} -# create a bib file for the R packages used in this document -knitr::write_bib(c('base', 'rmarkdown', 'bookdown', 'msmbstyle'), file = 'skeleton.bib') -``` - ```{r env_0, echo = FALSE, message = FALSE, warning = FALSE} suppressPackageStartupMessages(library("BiocStyle")) suppressPackageStartupMessages(library("mzR")) @@ -107,7 +109,6 @@ BiocManager::install("PSMatch") BiocManager::install("pheatmap") BiocManager::install("limma") BiocManager::install("MSnID") -BiocManager::install("RforMassSpectrometry/SpectraVis") ``` Follow the instructions in [this @@ -116,25 +117,6 @@ to install the packages and download some of the data used in the following chapters. All software versions used to generate this document are recoded at the end of the book in \@ref(sec-si). -To compile and render the teaching material, you will also need -the `r BiocStyle::Biocpkg("BiocStyle")` package and the (slighly -modified) [Modern Statistics for Model Biology (msmb) HTML Book -Style](https://www-huber.embl.de/users/msmith/msmbstyle/) by Mike -Smith: - -```{r setup2, eval = FALSE} -BiocManager::install(c("bookdown", "BiocStyle", "lgatto/msmbstyle")) -``` - -Run the [installation -script](https://github.com/rformassspectrometry/docs/blob/main/install_docs_deps.R) -by executing the line below to install all requirements to compile the -book: - -```{r source, eval = FALSE} -source("https://raw.githubusercontent.com/rformassspectrometry/docs/main/install_docs_deps.R") -``` - ## Acknowledgments {-} Thank you to [Charlotte Soneson](https://github.com/csoneson) for @@ -153,3 +135,57 @@ Attribution-ShareAlike 4.0 International License. You are free to and **adapt** (remix, transform, and build upon the material) for any purpose, even commercially, as long as you give appropriate credit and distribute your contributions under the same license as the original. 
+ + +# Docker image {-} + +A `Docker` image built from this repository is available here: + +👉 [ghcr.io/js2264/r4ms](https://ghcr.io/js2264/r4ms) 🐳 + +::: {.callout-tip icon='true'} +## Get started now 🎉 + +You can get access to all the packages used in this book in < 1 minute, +using this command in a terminal: + +```{sh "docker", filename="bash"} +#| eval: false +docker run -it ghcr.io/js2264/r4ms:devel R +``` + +::: + +# RStudio Server {-} + +An RStudio Server instance can be initiated from the `Docker` image as follows: + +```{sh "rstudio", filename="bash"} +#| eval: false +docker run \ + --volume : \ + -e PASSWORD=OHCA \ + -p 8787:8787 \ + ghcr.io/js2264/r4ms:devel +``` + +The initiated RStudio Server instance will be available at +[https://localhost:8787](https://localhost:8787). + +# Session info {-} + +::: {.callout-note collapse="true"} + +## Click to expand 👇 + +```{r "session info"} +#| cache: false +sessioninfo::session_info( + installed.packages()[,"Package"], + include_base = TRUE +) +``` + +::: + +# References {-} diff --git a/20-id.Rmd b/inst/pages/20-id.qmd similarity index 84% rename from 20-id.Rmd rename to inst/pages/20-id.qmd index ac65891..ea37dcb 100644 --- a/20-id.Rmd +++ b/inst/pages/20-id.qmd @@ -11,17 +11,18 @@ the exact command. The example below illustrates this for 3 mzML files to be searched using `MSGFplus`: -```{r msgf, eval = TRUE} -(mzmls <- paste0("file_", 1:3, ".mzML")) -(mzids <- sub("mzML", "mzid", mzmls)) +```{r msgf} +mzmls <- paste0("file_", 1:3, ".mzML") +mzids <- sub("mzML", "mzid", mzmls) -paste0("java -jar /path/to/MSGFPlus.jar", +cmds <- paste0("java -jar /path/to/MSGFPlus.jar", " -s ", mzmls, " -o ", mzids, " -d uniprot.fas", " -t 20ppm", " -m 0", " int 1") +cmds ``` ## Identification data.frame @@ -55,14 +56,12 @@ n_scans <- length(unique(id$spectrumID)) n_seqs <- length(unique(id$sequence)) ``` -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Verify that this table contains `r n_matches` matches for `r n_scans` scans and `r n_seqs` peptides sequences. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} nrow(id) ## number of matches @@ -70,7 +69,9 @@ length(unique(id$spectrumID)) ## number of scans length(unique(id$sequence)) ## number of peptide sequences ``` -`r msmbstyle::solution_end()` +::: + +::: The PSM data are read as is, without any filtering. As we can see below, we still have all the hits from the forward and reverse (decoy) @@ -93,7 +94,6 @@ table(table(id$spectrumID)) Below, we can see how scan 1774 has 4 matches, all to sequence `RTRYQAEVR`, which itself matches to 4 different proteins: - ```{r} i <- which(id$spectrumID == "controllerType=0 controllerNumber=1 scan=1774") data.frame(id[i, ])[1:5] @@ -138,44 +138,36 @@ Here, the `filter()` from the `dplyr` package comes very handy. We will thus start by converting the `DataFrame` to a `tibble`. 
```{r, message = FALSE} -library("dplyr") +library(dplyr) id_tbl <- tidyr::as_tibble(id) id_tbl ``` -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} - Remove decoy hits -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r, message = FALSE} -id_tbl <- id_tbl %>% +id_tbl <- id_tbl |> filter(!isDecoy) id_tbl ``` -`r msmbstyle::solution_end()` -`r msmbstyle::question_begin()` +::: - Keep first rank matches -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` - +::: {.callout-answer .icon .callout-note collapse=true} ```{r} -id_tbl <- id_tbl %>% +id_tbl <- id_tbl |> filter(rank == 1) id_tbl ``` -`r msmbstyle::solution_end()` - -`r msmbstyle::question_begin()` +::: - Remove shared peptides. Start by identifying scans that match different proteins. For example scan 4884 matches proteins @@ -183,27 +175,28 @@ id_tbl `XXX_ECA4416_2` and `XXX_ECA4416_3`. Then remove the scans that match any of these proteins. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} mltm <- - id_tbl %>% - group_by(spectrumID) %>% - mutate(nProts = length(unique(DatabaseAccess))) %>% - filter(nProts > 1) %>% + id_tbl |> + group_by(spectrumID) |> + mutate(nProts = length(unique(DatabaseAccess))) |> + filter(nProts > 1) |> select(spectrumID, nProts) mltm ``` + ```{r} id_tbl <- - id_tbl %>% + id_tbl |> filter(!spectrumID %in% mltm$spectrumID) id_tbl ``` -`r msmbstyle::solution_end()` +::: + +::: Which leaves us with `r nrow(id_tbl)` PSMs. @@ -219,7 +212,6 @@ The `describePeptides()` and `describeProteins()` functions from the `PSMatch` package provide useful summaries of preptides and proteins in a PSM search result. - - `describePeptides()` gives the number of unique and shared peptides and for the latter, the size of their protein groups: @@ -239,28 +231,28 @@ matrices](https://rformassspectrometry.github.io/PSMatch/articles/AdjacencyMatri `PSMatch` vignette provides additional tools to explore how proteins were inferred from peptides. -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Compare the distribution of raw identification scores of the decoy and non-decoy hits. Interpret the figure. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} library(ggplot2) -as_tibble(id) %>% +as_tibble(id) |> ggplot(aes(x = MS.GF.RawScore, colour = isDecoy)) + geom_density() ``` -`r msmbstyle::solution_end()` +::: -`r msmbstyle::question_begin()` +::: -The `r CRANpkg("tidyverse")` +::: {.callout-question .icon .callout-note} + +The `r BiocStyle::CRANpkg("tidyverse")` tools are fit for data wrangling with identification data. Using the above identification dataframe, calculate the length of each peptide (you can use `nchar` with the peptide sequence `sequence`) and the @@ -268,27 +260,22 @@ number of peptides for each protein (defined as `DatabaseDescription`). Plot the length of the proteins against their respective number of peptides. 
-`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` - +::: {.callout-answer .icon .callout-note collapse=true} ```{r answid1, fig.cap = "Identifcation data wrangling."} -suppressPackageStartupMessages(library("dplyr")) -iddf <- as_tibble(id_filtered) %>% +iddf <- as_tibble(id_filtered) |> mutate(peplen = nchar(sequence)) -npeps <- iddf %>% - group_by(DatabaseAccess) %>% - tally +npeps <- iddf |> + group_by(DatabaseAccess) |> + tally() iddf <- full_join(iddf, npeps) -library("ggplot2") ggplot(iddf, aes(x = n, y = DBseqLength)) + geom_point() ``` -`r msmbstyle::solution_end()` - +::: +::: If you would like to learn more about how the mzid data are handled by `PSMatch` via the `r BiocStyle::Biocpkg("mzR")` and `r BiocStyle::Biocpkg("mzID")` @@ -300,19 +287,33 @@ packages, check out the \@ref(sec-id2) section in the annex. We are goind to use the `sp` object created in the previous chapter and the `id_filtered` variable generated above. +::: {.callout-tip collapse="true"} + +## Generating the `sp` object from scratch 👇 + +```{r} +library(rpx) +library(Spectra) +fn <- "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML" +px <- PXDataset("PXD000001") +mzf <- pxget(px, fn) +sp <- Spectra(mzf) +sp +``` + +::: + Identification data (as a `DataFrame`) can be merged into raw data (as a `Spectra` object) by adding new spectra variables to the appropriate MS2 spectra. Scans and peptide-spectrum matches can be matched by their spectrum identifers. -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Identify the spectum identifier columns in the `sp` the `id_filtered` variables. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} In the raw data, it is encoded as `spectrumId`, while in the identification data, we have `spectrumID`. @@ -322,7 +323,9 @@ spectraVariables(sp) names(id_filtered) ``` -`r msmbstyle::solution_end()` +::: + +::: We still have several PTMs that are matched to a single spectrum identifier: @@ -334,10 +337,9 @@ table(table(id_filtered$spectrumID)) Let's look at `"controllerType=0 controllerNumber=1 scan=5490"`, the has 4 matching PSMs in detail. - ```{r} which(table(id_filtered$spectrumID) == 4) -id_4 <- id_filtered[id_filtered$spectrumID == "controllerType=0 controllerNumber=1 scan=5490", ] %>% +id_4 <- id_filtered[id_filtered$spectrumID == "controllerType=0 controllerNumber=1 scan=5490", ] |> as.data.frame() id_4 ``` @@ -353,14 +355,11 @@ Let's reduce that PSM table before joining it to the `Spectra` object, to make sure we have unique one-to-one matches between the raw spectra and the PSMs. - ```{r, warning = FALSE} id_filtered <- reducePSMs(id_filtered, id_filtered$spectrumID) id_filtered ``` - - These two data can thus simply be joined using: ```{r} @@ -370,15 +369,12 @@ sp <- joinSpectraData(sp, id_filtered, spectraVariables(sp) ``` -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Verify that the identification data has been added to the correct spectra. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` - +::: {.callout-answer .icon .callout-note collapse=true} Let's first verify that no identification data has been added to the MS1 scans. 
@@ -400,8 +396,10 @@ Let's compare the precursor/peptide mass to charges sp_2 <- sp_2[!is.na(sp_2$sequence)] summary(sp_2$precursorMz - sp_2$experimentalMassToCharge) ``` -`r msmbstyle::solution_end()` +::: + +::: ## An identification-annotated chromatogram @@ -417,7 +415,6 @@ either 1 or 0, depending on the presence of a sequence. For MS1 scans, the function will count the number of sequences for the descendant MS2 scans, i.e. those produced from precursor ions from each MS1 scan. - ```{r nSequence, cache = FALSE} sp <- countIdentifications(sp) ``` @@ -435,11 +432,11 @@ These data can also be visualised on the total ion chromatogram: ```{r nSequencePlot, fig.fullwidth = TRUE, fig.width = 8, fig.height = 4} sp |> -filterMsLevel(1) |> -spectraData() |> -as_tibble() |> -ggplot(aes(x = rtime, - y = totIonCurrent)) + + filterMsLevel(1) |> + spectraData() |> + as_tibble() |> + ggplot(aes(x = rtime, + y = totIonCurrent)) + geom_line(alpha = 0.25) + geom_point(aes(colour = ifelse(countIdentifications == 0, NA, countIdentifications)), @@ -474,12 +471,10 @@ plotSpectra(sp[i], labels = addFragments, labelPos = 3, labelCol = "steelblue") ``` - When a precursor peptide ion is fragmented in a CID cell, it breaks at specific bonds, producing sets of peaks (*a*, *b*, *c* and *x*, *y*, *z*) that can be predicted. - ```{r frag_img, results='markup', fig.margin=FALSE, fig.cap="Peptide fragmentation.", echo=FALSE, out.width = "80%"} knitr::include_graphics("img/frag.png") ``` @@ -497,34 +492,31 @@ calculateFragments(sp[i]$sequence) The `compareSpectra()` function can be used to compare spectra (by default, computing the normalised dot product). -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} -1. Create a new `Spectra` object containing the MS2 spectra with - sequences `"SQILQQAGTSVLSQANQVPQTVLSLLR"` and - `"TKGLNVMQNLLTAHPDVQAVFAQNDEMALGALR"`. +Create a new `Spectra` object containing the MS2 spectra with +sequences `"SQILQQAGTSVLSQANQVPQTVLSLLR"` and +`"TKGLNVMQNLLTAHPDVQAVFAQNDEMALGALR"`. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} k <- which(sp$sequence %in% c("SQILQQAGTSVLSQANQVPQTVLSLLR", "TKGLNVMQNLLTAHPDVQAVFAQNDEMALGALR")) sp_k <- sp[k] sp_k ``` -`r msmbstyle::solution_end()` - -`r msmbstyle::question_begin()` +::: -2. Calculate the `r length(sp_k)` by `r length(sp_k)` similarity - matrix between all spectra using `compareSpectra`. See the - `?Spectra` man page for details. Draw a heatmap of that matrix. +::: -`r msmbstyle::question_end()` +::: {.callout-question .icon .callout-note} -`r msmbstyle::solution_begin()` +Calculate the `r length(sp_k)` by `r length(sp_k)` similarity +matrix between all spectra using `compareSpectra`. See the +`?Spectra` man page for details. Draw a heatmap of that matrix. +::: {.callout-answer .icon .callout-note collapse=true} ```{r} mat <- compareSpectra(sp_k) @@ -533,41 +525,37 @@ mat pheatmap::pheatmap(mat) ``` -`r msmbstyle::solution_end()` +::: -`r msmbstyle::question_begin()` +::: -3. Compare the spectra with the plotting function seen previously. +::: {.callout-question .icon .callout-note} -`r msmbstyle::question_end()` +Compare the spectra with the plotting function seen previously. 
-`r msmbstyle::solution_begin()` - - -```{r} -filterIntensity(sp_k, 1e3) %>% plotSpectra(main = sp_k$sequence) -``` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} +filterIntensity(sp_k, 1e3) |> plotSpectra(main = sp_k$sequence) par(mfrow = c(3, 1)) plotSpectraMirror(sp_k[1], sp_k[2], main = "TK...") plotSpectraMirror(sp_k[3], sp_k[4], main = "SQ...") plotSpectraMirror(sp_k[3], sp_k[4], main = "SQ...") ``` -`r msmbstyle::solution_end()` +::: + +::: ## Summary exercise -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Download the 3 first mzML and mzID files from the [PXD022816](https://www.ebi.ac.uk/pride/archive/projects/PXD022816) project [@Morgenstern:2020]. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} ## Getting data from PX/PRIDE @@ -586,41 +574,41 @@ pxfiles(PXD022816) (mzmls <- pxget(PXD022816, grep("mzML", pxfiles(PXD022816))[1:3])) ``` -`r msmbstyle::solution_end()` +::: +::: -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Generate a `Spectra` object and a table of filtered PSMs. Visualise the total ion chromatograms and check the quality of the identification data by comparing the density of the decoy and target PSMs id scores for each file. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r, message = FALSE} ## Loading raw data -library("Spectra") +library(Spectra) sp <- Spectra(mzmls) sp ## number of spectra per file table(basename(sp$dataOrigin)) + ## all levels are centroided table(sp$centroided, sp$msLevel) ``` ```{r, message = FALSE, fig.width = 12} -library("ggplot2") -library("tidyr") -library("magrittr") +library(ggplot2) +library(tidyr) +library(magrittr) ## Chromatograms -filterMsLevel(sp, 1) %>% - spectraData() %>% - as_tibble() %>% +filterMsLevel(sp, 1) |> + spectraData() |> + as_tibble() |> ggplot(aes(x = rtime, y = totIonCurrent, colour = basename(dataOrigin))) + @@ -629,13 +617,13 @@ filterMsLevel(sp, 1) %>% ```{r, message = FALSE, fig.width = 12} ## Identification data -library("PSMatch") +library(PSMatch) id <- PSM(mzids) ## Number of PSMs per acquisition table(id$idFile) -tidyr::as_tibble(id) %>% +tidyr::as_tibble(id) |> ggplot(aes(x = MetaMorpheus.score, colour = isDecoy)) + geom_density() + @@ -647,17 +635,16 @@ id_filtered <- filterPSMs(id) max(id_filtered$PSM.level.q.value) ``` -`r msmbstyle::solution_end()` +::: +::: -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Join the raw and identification data. Beware though that the joining must now be performed by spectrum ids and by files. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} ## primary key for spectra @@ -683,19 +670,19 @@ sp <- joinSpectraData(sp, id_filtered, by.x = "pkey") ## Number of MS2 scans with a PSM table(!is.na(filterMsLevel(sp, 2)$sequence)) ``` -`r msmbstyle::solution_end()` +::: + +::: -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Extract the PSMs that have been matched to peptides from protein `O43175` and compare and cluster the scans. Hint: once you have created the smaller `Spectra` object with the scans of interest, switch to an in-memory backend to seed up the calculations. 
-`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} sp_O43175 <- sp[which(sp$DatabaseAccess == "O43175")] @@ -724,20 +711,19 @@ spectraData(sp_O43175[i])$modName plotSpectraMirror(sp_O43175[4], sp_O43175[9]) plotSpectraMirror(sp_O43175[2], sp_O43175[10]) ``` -`r msmbstyle::solution_end()` +::: + +::: -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Generate total ion chromatograms for each acquisition and annotate the MS1 scans with the number of PSMs using the `countIdentifications()` function, as shown above. The function will automatically perform the counts in parallel for each acquisition. -`r msmbstyle::question_end()` - - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} sp <- countIdentifications(sp) @@ -746,22 +732,23 @@ table(msLevel(sp), sp$countIdentifications) ```{r} sp |> - filterMsLevel(1) |> - spectraData() |> - as_tibble() |> - ggplot(aes(x = rtime, - y = totIonCurrent)) + - geom_line(alpha = 0.25) + - geom_point(aes(colour = ifelse(countIdentifications == 0, - NA, countIdentifications)), - size = 0.75, - alpha = 0.5) + - scale_colour_gradient(low = "orange", high = "red") + - facet_grid(sub("^.+_", "", basename(dataOrigin)) ~ .) + - labs(colour = "Number of ids") + filterMsLevel(1) |> + spectraData() |> + as_tibble() |> + ggplot(aes(x = rtime, y = totIonCurrent)) + + geom_line(alpha = 0.25) + + geom_point(aes(colour = ifelse(countIdentifications == 0, + NA, countIdentifications)), + size = 0.75, + alpha = 0.5) + + scale_colour_gradient(low = "orange", high = "red") + + facet_grid(sub("^.+_", "", basename(dataOrigin)) ~ .) + + labs(colour = "Number of ids") ``` -`r msmbstyle::solution_end()` +::: + +::: ## Exploration and Assessment of Identifications using `MSnID` @@ -787,7 +774,7 @@ vignette. You can explore more with vignette("msnid_vignette", package = "MSnID") ``` -The `r Biocpkg("MSnID")` package can be used for post-search filtering +The `r BiocStyle::Biocpkg("MSnID")` package can be used for post-search filtering of MS/MS identifications. One starts with the construction of an `MSnID` object that is populated with identification results that can be imported from a `data.frame` or from `mzIdenML` files. Here, we @@ -803,7 +790,7 @@ add the identification result from our `mzid` file (there could of course be more than one). ```{r msnid1, warning = FALSE} -library("MSnID") +library(MSnID) msnid <- MSnID(".") msnid <- read_mzIDs(msnid, mzids) show(msnid) @@ -817,7 +804,6 @@ Printing the `MSnID` object returns some basic information such as * Number of unique peptide sequences and corresponding FDR. * Number of unique proteins or amino acid sequence accessions and corresponding FDR. - The package then enables to define, optimise and apply filtering based for example on missed cleavages, identification scores, precursor mass errors, etc. and assess PSM, peptide and protein FDR levels. To @@ -836,7 +822,6 @@ names(msnid) Here, we summarise a few steps and redirect the reader to the package's vignette for more details: - ### Analysis of peptide sequences Cleaning irregular cleavages at the termini of the peptides and @@ -962,3 +947,5 @@ further processed and analysed using appropriate statistical tests. 
```{r} head(psms(msnid)) ``` + +# References {-} diff --git a/30-quant.Rmd b/inst/pages/30-quant.qmd similarity index 90% rename from 30-quant.Rmd rename to inst/pages/30-quant.qmd index 40e6502..6b6ce3d 100644 --- a/30-quant.Rmd +++ b/inst/pages/30-quant.qmd @@ -55,7 +55,6 @@ that sample. knitr::include_graphics("./img/chrompeaks.png") ``` - ### Labelled MS1: SILAC In SILAC quantitation, sample are grown in a medium that contains @@ -67,7 +66,6 @@ peptides precursor peaks are systematically shifted compared to the light ones, and the ratio between the height of a heavy and light peaks can be used to calculate peptide and protein fold-changes. - ```{r silab, echo=FALSE, out.width = "75%", fig.cap = "Silac quantitation. Figure credit Wikimedia Commons."} knitr::include_graphics("./img/Silac.png") ``` @@ -78,20 +76,19 @@ processing, data transformation and normalisation, missing values, and different underlying statistical models for the quantitative data (count data for spectral counting, continuous data for the others). - In terms of raw data quantitation in R/Bioconductor, most efforts have been devoted to MS2-level quantitation. Label-free XIC quantitation has been addressed in the frame of metabolomics data processing by the -`r Biocpkg("xcms")` infrastructure. +`r BiocStyle::Biocpkg("xcms")` infrastructure. - - - + + + - + ## QFeatures {#sec-qf} @@ -124,7 +121,6 @@ biological entity of interest are the protein. As part of the data processing, we are thus required to **aggregate** low-level quantitative features into higher level data. - ```{r featuresplot, fig.cap = "Conceptual representation of a `QFeatures` object and the aggregative relation between different assays.", echo = FALSE} par(mar = c(0, 0, 0, 0)) plot(NA, xlim = c(0, 12), ylim = c(0, 20), @@ -192,9 +188,8 @@ package. The class is derived from the Bioconductor `MultiAssayExperiment` [@MAE] (MAE) class. Let's start by loading the `QFeatures` package. - ```{r, message = FALSE} -library("QFeatures") +library(QFeatures) ``` Next, we load the `feat1` test data, which is composed of single @@ -209,12 +204,21 @@ feat1 Let's perform some simple operations to familiarise ourselves with the `QFeatures` class: +::: {.callout-question .icon .callout-note} + - Extract the sample metadata using the `colData()` accessor (like you have previously done with `SummarizedExperiment` objects). +::: {.callout-answer .icon .callout-note collapse=true} + ```{r cd} colData(feat1) ``` + +::: + +::: + We can also further annotate the experiment by adding columns to the `colData` slot: ```{r cd2} @@ -223,21 +227,22 @@ feat1$Y <- c("Y1", "Y2") colData(feat1) ``` +::: {.callout-question .icon .callout-note} + - Extract the first (and only) assay composing this `QFeatures` data using the `[[` operator (as you have done to extract elements of a list) by using the assay's index or name. +::: {.callout-answer .icon .callout-note collapse=true} + ```{r assay1} feat1[[1]] feat1[["psms"]] ``` -- Extract the `psms` assay's row data and quantitative values. +::: -```{r rd} -assay(feat1[[1]]) -rowData(feat1[[1]]) -``` +::: ### Feature aggregation @@ -285,7 +290,6 @@ assay(feat1[[2]])["IAEESNFPFIK", ] rowData(feat1[[2]]) ``` - We can now aggregate the peptide-level data into a new protein-level assay using the `colMedians()` aggregation function. @@ -298,7 +302,6 @@ feat1 assay(feat1[["proteins"]]) ``` - ### Subsetting and filtering The link between the assays becomes apparent when we now subset the @@ -311,7 +314,6 @@ PSMs. 
feat1["ProtA", , ] ``` - The `filterFeatures()` function can be used to filter rows the assays composing a `QFeatures` object using the row data variables. We can for example retain rows that have a `pval` < 0.05, which would only @@ -322,20 +324,21 @@ that assay. filterFeatures(feat1, ~ pval < 0.05) ``` -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} As the message above implies, it is also possible to apply a filter to only the assays that have a filtering variables by setting the `keep` variables. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r ff1b} filterFeatures(feat1, ~ pval < 0.05, keep = TRUE) ``` -`r msmbstyle::solution_end()` + +::: + +::: On the other hand, if we filter assay rows for those that localise to the mitochondrion, we retain the relevant protein, peptides and PSMs. @@ -344,20 +347,20 @@ the mitochondrion, we retain the relevant protein, peptides and PSMs. filterFeatures(feat1, ~ location == "Mitochondrion") ``` -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} As an exercise, let's filter rows that do not localise to the mitochondrion. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r ff3} filterFeatures(feat1, ~ location != "Mitochondrion") ``` -`r msmbstyle::solution_end()` +::: + +::: You can refer to the [*Quantitative features for mass spectrometry data*](https://rformassspectrometry.github.io/QFeatures/articles/QFeatures.html) @@ -365,7 +368,6 @@ vignette and the `QFeatures` [manual page](https://rformassspectrometry.github.io/QFeatures/reference/QFeatures-class.html) for more details about the class. - ## Creating `QFeatures` object ```{r loaddfr, echo = FALSE} @@ -383,7 +385,6 @@ from [@Christoforou:2016]. The `ecol` argument specifies that columns `psms` in the returned `QFeatures` object, to reflect the nature of the data. - ```{r readQFeatures} data(hlpsms) hl <- readQFeatures(hlpsms, ecol = 1:10, name = "psms") @@ -395,8 +396,6 @@ name. The individual assays are stored as *SummarizedExperiment* object and further access its quantitative data and metadata using the `assay` and `rowData` functions. - - ```{r subsetassay} hl[[1]] hl[["psms"]] @@ -445,15 +444,12 @@ available in the `msdata` package: basename(f <- msdata::quant(pattern = "cptac", full.names = TRUE)) ``` - -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Read these data in as either a `SummarizedExperiment` or a `QFeatures` object. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} From the names of the columns, we see that the quantitative columns, starting with `"Intensity."` (note the dot!) are at positions 56 to @@ -477,17 +473,17 @@ cptac_se <- readSummarizedExperiment(f, ecol = i, cptac_se ``` -`r msmbstyle::solution_end()` +::: -`r msmbstyle::question_begin()` +::: + +::: {.callout-question .icon .callout-note} Before proceeding, we are going to clean up the sample names by removing the unnecessary *Intensity* prefix and annotate the experiment in the object's `colData`. 
-`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r cptac_names} colnames(cptac_se) <- sub("I.+\\.", "", colnames(cptac_se)) @@ -495,17 +491,18 @@ cptac_se$condition <- sub("_[7-9]", "", colnames(cptac_se)) cptac_se$id <- sub("^.+_", "", colnames(cptac_se)) colData(cptac_se) ``` -`r msmbstyle::solution_end()` -`r msmbstyle::question_begin()` +::: + +::: + +::: {.callout-question .icon .callout-note} There are many row variables that aren't useful here. Get rid or all of them but `Sequence`, `Proteins`, `Leading.razor.protein`, `PEP`, `Score`, `Reverse`, and `Potential.contaminant`. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r vars} keep_var <- c("Sequence", "Proteins", "Leading.razor.protein", "PEP", @@ -514,8 +511,9 @@ keep_var <- c("Sequence", "Proteins", "Leading.razor.protein", "PEP", rowData(cptac_se) <- rowData(cptac_se)[, keep_var] ``` -`r msmbstyle::solution_end()` +::: +::: ## Analysis pipeline @@ -532,10 +530,10 @@ above. - Downstream analysis ```{r pkgs, message = FALSE} -library("tidyverse") -library("ggplot2") -library("QFeatures") -library("limma") +library(tidyverse) +library(ggplot2) +library(QFeatures) +library(limma) ``` ### Missing values @@ -565,7 +563,6 @@ instead of properly reporting missing values. We can use the `cptac_se` object and then explore the missing data patterns across columns and rows. - ```{r na} cptac_se <- zeroIsNA(cptac_se) nNA(cptac_se) @@ -618,7 +615,6 @@ table(nNA(cptac_se)$nNArows$nNA) cptac_se <- filterNA(cptac_se, pNA = 4/6) ``` - ### Imputation Imputation is the technique of replacing missing data with probable @@ -631,12 +627,11 @@ to be imputed with [different types of imputation methods](https://rformassspectrometry.github.io/QFeatures/articles/Processing.html#imputation-1) [@Lazar:2016]. - ```{r miximp, echo = FALSE, fig.cap = "Mixed imputation method. Black cells represent presence of quantitation values and light grey corresponds to missing data. The two groups of interest are depicted in green and blue along the heatmap columns. Two classes of proteins are annotated on the left: yellow are proteins with randomly occurring missing values (if any) while proteins in brown are candidates for non-random missing value imputation."} data(se_na2) x <- assay(impute(se_na2, "zero")) x[x != 0] <- 1 -suppressPackageStartupMessages(library("gplots")) +suppressPackageStartupMessages(library(gplots)) heatmap.2(x, col = c("lightgray", "black"), scale = "none", dendrogram = "none", trace = "none", keysize = 0.5, key = FALSE, @@ -648,14 +643,14 @@ heatmap.2(x, col = c("lightgray", "black"), ```{r lazar, fig.cap = "Effect of the nature of missing values on their imputation. Root-mean-square error (RMSE) observations standard deviation ratio (RSR), KNN and MinDet imputation. Lower (blue) is better.", echo = FALSE, out.width='100%'} knitr::include_graphics("./img/imp-sim.png") ``` + Generally, it is recommended to use **hot deck** methods (nearest neighbour (**left**), maximum likelihood, ...) when data are missing at random.Conversely, MNAR features should ideally be imputed with a **left-censor** (minimum value (**right**), but not zero, ...) method. - There are various methods to perform data imputation, as described in -`?impute`. The `r CRANpkg("imp4p")` package contains additional +`?impute`. 
The `r BiocStyle::CRANpkg("imp4p")` package contains additional functionality, including some to estimate the randomness of missing data. @@ -668,16 +663,14 @@ data(se_na2) impute(se_na2, method = "knn") ``` -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Following the example above, apply a mixed imputation, using knn for data missing at random and the zero imputation for data missing not at random. Hint: the `randna` variable defines which features are assumed to be missing at random. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r naex1, message = FALSE} impute(se_na2, "mixed", @@ -685,9 +678,11 @@ impute(se_na2, "mixed", mar = "knn", mnar = "zero") ``` -`r msmbstyle::solution_end()` +::: -`r msmbstyle::question_begin()` +::: + +::: {.callout-question .icon .callout-note} When assessing missing data imputation methods, such as in [Lazar et al. (2016)](https://pubs.acs.org/doi/abs/10.1021/acs.jproteome.5b00981), @@ -697,10 +692,7 @@ method of choice, then quantifies the difference between original data, use this strategy to assess the difference between knn and Bayesian PCA imputation. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` - +::: {.callout-answer .icon .callout-note collapse=true} ```{r naex2, cache = TRUE} imp1 <- impute(se_na2, method = "knn") @@ -708,11 +700,12 @@ imp2 <- impute(se_na2, method = "bpca") summary(abs(assay(imp1)[is.na(assay(se_na2))] - assay(imp2)[is.na(assay(se_na2))])) summary(as.numeric(na.omit(assay(se_na2)))) ``` -`r msmbstyle::solution_end()` +::: -`r msmbstyle::question_begin()` +::: +::: {.callout-question .icon .callout-note} When assessing the impact of missing value imputation on real data, one can't use the strategy above. Another useful approach is to assess @@ -721,14 +714,11 @@ quantitative data. For instance, here is the intensity distribution of the `se_na2` data. Verify the effect of applying `knn`, `zero`, `MinDet` and `bpca` on this distribution. -```{r nasetdist, fig.cap = "Intensity disctribution of the `naset` data."} +```{r nasetdist, fig.cap = "Intensity disctribution of the naset data."} plot(density(na.omit(assay(se_na2)))) ``` -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` - +::: {.callout-answer .icon .callout-note collapse=true} ```{r naex3, cache = TRUE} cls <- c("black", "red", "blue", "steelblue", "orange") @@ -741,8 +731,9 @@ legend("topright", legend = c("orig", "knn", "zero", "MinDet", "bpca"), col = cls, lwd = 2, bty = "n") ``` -`r msmbstyle::solution_end()` +::: +::: **Tip**: When downstream analyses permit, it might be safer not to impute data and deal explicitly with missing values. Indeed missing @@ -753,7 +744,6 @@ recommended to explore missingness as part of the exploratory data analysis.], but (generally) not to perform a principal component analysis. - ### Identification quality control As discussed in the previous chapter, PSMs are deemed relevant after @@ -774,36 +764,32 @@ table(rowData(cptac_se)$Potential.contaminant) Let's visualise some of the cptac's metadata using standard `ggplot2` code: - -`r msmbstyle::question_begin()` - +::: {.callout-question .icon .callout-note} Visualise the identification score and the posterior probability probability (PEP) distributions from forward and reverse hits and interpret the figure. 
-`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` - +::: {.callout-answer .icon .callout-note collapse=true} ```{r idqc1} -rowData(cptac_se) %>% - as_tibble() %>% +rowData(cptac_se) |> + as_tibble() |> ggplot(aes(x = Score, colour = Reverse)) + geom_density() ``` ```{r idqc2} -rowData(cptac_se) %>% - as_tibble() %>% +rowData(cptac_se) |> + as_tibble() |> ggplot(aes(x = PEP, colour = Reverse)) + geom_density() ``` -`r msmbstyle::solution_end()` +::: +::: **Note**: it is also possible to compute and visualise protein groups as connected components starting from a quantitative dataset such as a @@ -835,31 +821,29 @@ with the `readQFeatures()` function and the same arguments as the used above and below work on single `SummarizedExperiment` objects or assays within a `QFeatures` object. - ### Filtering out contaminants and reverse hits -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Using the `filterFeatures()` function, filter out the reverse and contaminant hits, and also retain those that have a posterior error probability smaller than 0.05. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r qcfilter} cptac <- - cptac %>% - filterFeatures(~ Reverse != "+") %>% - filterFeatures(~ Potential.contaminant != "+") %>% + cptac |> + filterFeatures(~ Reverse != "+") |> + filterFeatures(~ Potential.contaminant != "+") |> filterFeatures(~ PEP < 0.05) ``` -`r msmbstyle::solution_end()` +::: -### Log-transformation and normalisation +::: +### Log-transformation and normalisation The two code chunks below log-transform and normalise using the assay `i` as input and adding a new one names as defined by `name`. @@ -869,15 +853,12 @@ cptac <- logTransform(cptac, i = "peptides", name = "log_peptides") ``` - -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Use the `normalize()` method to normalise the data. The syntax is the same as `logTransform()`. Use the `center.median` method. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r nrom} cptac <- normalize(cptac, i = "log_peptides", @@ -885,18 +866,18 @@ cptac <- normalize(cptac, i = "log_peptides", method = "center.median") ``` -`r msmbstyle::solution_end()` +::: + +::: -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Visualise the result of the transformations above. The `plotDensities()` function from the `limma` package is very convenient, but feel free to use boxplots, violin plots, or any other visualisation that you deem useful to assess the tranformations. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r plotdens, fig.cap = "Three peptide level assays: raw data, log transformed and normalised.", fig.width = 15, fig.height = 5} par(mfrow = c(1, 3)) @@ -904,20 +885,20 @@ limma::plotDensities(assay(cptac[["peptides"]])) limma::plotDensities(assay(cptac[["log_peptides"]])) limma::plotDensities(assay(cptac[["lognorm_peptides"]])) ``` -`r msmbstyle::solution_end()` -### Aggregation +::: +::: -`r msmbstyle::question_begin()` +### Aggregation + +::: {.callout-question .icon .callout-note} Use median aggregation to aggregation peptides into protein values. This is not necessarily the best choice, as we will see later, but a good start. 
-`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r qfagg1, message = FALSE} cptac <- @@ -929,7 +910,9 @@ cptac <- na.rm = TRUE) ``` -`r msmbstyle::solution_end()` +::: + +::: Looking at the `.n` row variable computed during the aggregation, we see that most proteins result from the aggregation of 5 peptides or @@ -942,29 +925,29 @@ table(rowData(cptac[["proteins_med"]])$.n) ### Principal component analysis ```{r pca, message = FALSE} -library("factoextra") +library(factoextra) pca_pep <- - cptac[["lognorm_peptides"]] %>% - filterNA() %>% - assay() %>% - t() %>% - prcomp(scale = TRUE, center = TRUE) %>% + cptac[["lognorm_peptides"]] |> + filterNA() |> + assay() |> + t() |> + prcomp(scale = TRUE, center = TRUE) |> fviz_pca_ind(habillage = cptac$condition, title = "Peptides") pca_prot <- - cptac[["proteins_med"]] %>% - filterNA() %>% - assay() %>% - t() %>% - prcomp() %>% + cptac[["proteins_med"]] |> + filterNA() |> + assay() |> + t() |> + prcomp() |> fviz_pca_ind(habillage = cptac$condition, title = "Proteins (median aggregation)") ``` ```{r plotpca, fig.width = 12, fig.height = 6, fig.cap = "Peptide and protein level PCA analyses."} -library("patchwork") +library(patchwork) pca_pep + pca_prot ``` @@ -974,12 +957,11 @@ Below, we use the `longFormat()` function to extract the quantitative and row data in a long format, that can be directly reused by the tidyverse tools. - ```{r vis, message = FALSE, warning = FALSE, fig.width = 12, fig.height = 6, fig.cap = "Peptide and protein expression profile."} longFormat(cptac["P02787ups|TRFE_HUMAN_UPS", , - c("lognorm_peptides", "proteins_med")]) %>% - as_tibble() %>% - mutate(condition = ifelse(grepl("A", colname), "A", "B")) %>% + c("lognorm_peptides", "proteins_med")]) |> + as_tibble() |> + mutate(condition = ifelse(grepl("A", colname), "A", "B")) |> ggplot(aes(x = colname, y = value, colour = rowname, shape = condition)) + geom_point(size = 3) + geom_line(aes(group = rowname)) + @@ -994,7 +976,7 @@ their relation. plot(cptac) ``` -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} The example above shows a simple linear relationship between assays. Create a more interesting one by applying a different @@ -1002,10 +984,7 @@ normalisation method on the *log_peptides* assay and aggreate that new normalised peptide assay. Visualise the relationship with `plot()`, as above. -`r msmbstyle::question_end()` - - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r plotqf2, message = FALSE} normalize(cptac, "log_peptides", @@ -1020,8 +999,9 @@ normalize(cptac, "log_peptides", plot() ``` -`r msmbstyle::solution_end()` +::: +::: ### Statistical analysis @@ -1029,20 +1009,20 @@ R in general and Bioconductor in particular are well suited for the statistical analysis of quantitative proteomics data. Several packages provide dedicated resources for proteomics data: -- `r Biocpkg("MSstats")` and `r Biocpkg("MSstatsTMT")`: A set of tools +- `r BiocStyle::Biocpkg("MSstats")` and `r BiocStyle::Biocpkg("MSstatsTMT")`: A set of tools for statistical relative protein significance analysis in Data dependent (DDA), SRM, Data independent acquisition (DIA) and TMT experiments. -- `r Biocpkg("msmsTests")`: Statistical tests for label-free LC-MS/MS +- `r BiocStyle::Biocpkg("msmsTests")`: Statistical tests for label-free LC-MS/MS data by spectral counts, to discover differentially expressed proteins between two biological conditions. 
Three tests are available: Poisson GLM regression, quasi-likelihood GLM regression, - and the negative binomial of the `r Biocpkg("edgeR")` + and the negative binomial of the `r BiocStyle::Biocpkg("edgeR")` package. All can be readily applied on `MSnSet` instances produced, for example by `MSnID`. -- `r Biocpkg("DEP")` provides an integrated analysis workflow for the +- `r BiocStyle::Biocpkg("DEP")` provides an integrated analysis workflow for the analysis of mass spectrometry proteomics data for differential protein expression or differential enrichment. @@ -1068,7 +1048,7 @@ packages provide dedicated resources for proteomics data: detect differentially abundant proteins. Others, while not specfic to proteomics, are also recommended, such as -the `r Biocpkg("limma")` package. When analysing spectral counting +the `r BiocStyle::Biocpkg("limma")` package. When analysing spectral counting data, methods for high throughput sequencing data are applicable. Below, we illustrate how to apply a typical `edgeR` test to count data using the `msms.edgeR` function from the `msmsTests` @@ -1078,7 +1058,6 @@ package. Below, we are going to perform our statistical analysis on the protein data using `limma`. - ```{r protse} prots <- getWithColData(cptac, "proteins_med") ``` @@ -1098,7 +1077,7 @@ The code chunk below illustrates how to set up the model, fit it, and apply the empirical Bayes moderation. ```{r limma, message = FALSE} -library("limma") +library(limma) design <- model.matrix(~ prots$condition) fit <- lmFit(assay(prots), design) fit <- eBayes(fit) @@ -1109,9 +1088,9 @@ the coefficient of interest. ```{r res} res <- - topTable(fit, coef = "prots$condition6B", number = Inf) %>% - rownames_to_column("protein") %>% - as_tibble() %>% + topTable(fit, coef = "prots$condition6B", number = Inf) |> + rownames_to_column("protein") |> + as_tibble() |> mutate(TP = grepl("ups", protein)) ``` @@ -1119,7 +1098,7 @@ Note the warning about partial `NA` coefficients for 23 probes: ```{r nacoefs} na_coefs <- - filter(res, is.na(t)) %>% + filter(res, is.na(t)) |> pull(protein) assay(prots[na_coefs, ]) ``` @@ -1127,7 +1106,7 @@ assay(prots[na_coefs, ]) We can now visualise the results using a volcano plot: ```{r vp, fig.cap = "Volcano plot highlighing spiked-in proteins in red."} -res %>% +p <- res |> ggplot(aes(x = logFC, y = -log10(adj.P.Val))) + geom_point(aes(colour = TP)) + geom_vline(xintercept = c(-1, 1)) + @@ -1135,7 +1114,6 @@ res %>% scale_color_manual(values = c("black","red")) ``` - Using the pipeline described above, we would would identify a single differentially expressed protein at an 5 percent FDR but miss out the other `r sum(res$TP) - 1` expected spike-in proteins. @@ -1147,7 +1125,6 @@ We can assess our results in terms of true/false postitves/negatives: - True negatives: `r nrow(filter(res, adj.P.Val > 0.05 & !TP))` - False negatives: `r nrow(filter(res, adj.P.Val > 0.05 & TP))` - ## Summary exercice As shown below, it is possible to substantially improve these results @@ -1156,7 +1133,6 @@ by aggregating features using a robust summarisation (available as M-estimation using Huber weights, as described in section 2.7 in [@Sticker:2019]. - ```{r echo = FALSE, message = FALSE, warning = FALSE, fig.cap = "Aggregation using robust summarisation."} knitr::include_graphics("./img/vp2.png") ``` @@ -1168,3 +1144,5 @@ knitr::include_graphics("./img/vp2.png") Repeat and adapt what we have seen here using, for example, the `robustSummary()` function. 
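One possible starting point for this exercise (a sketch only, not evaluated here, reusing the `cptac` object from above; the robust aggregation is the only new step, and the `limma` modelling can then be repeated unchanged on the new assay):

```{r, eval = FALSE}
## Aggregate the normalised peptides into proteins using a robust
## summarisation (M-estimation with Huber weights) instead of the
## median, then extract the new assay for the limma analysis.
cptac <- aggregateFeatures(cptac,
                           i = "lognorm_peptides",
                           name = "proteins_rob",
                           fcol = "Leading.razor.protein",
                           fun = MsCoreUtils::robustSummary,
                           na.rm = TRUE)
prots_rob <- getWithColData(cptac, "proteins_rob")
```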
+ +# References {-} diff --git a/95-annex.Rmd b/inst/pages/95-annex.qmd similarity index 92% rename from 95-annex.Rmd rename to inst/pages/95-annex.qmd index 770988c..03b8dfa 100644 --- a/95-annex.Rmd +++ b/inst/pages/95-annex.qmd @@ -1,6 +1,5 @@ # Annex {#sec-anx} - ## Raw MS data under the hood: the `mzR` package {#sec-raw2} The `mzR` package is a direct interface to the @@ -28,9 +27,8 @@ The three main functions of `mzR` are Other functions such as `instrumentInfo`, or `runInfo` can be used to gather general information about a run. - ```{r rawms} -library("mzR") +library(mzR) ms <- openMSfile(f2) ms ``` @@ -46,15 +44,13 @@ head(peaks(ms, 117)) str(peaks(ms, 1:5)) ``` -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Let's extract the index of the MS2 spectrum with the highest base peak intensity and plot its spectrum. Is the data centroided or in profile mode? -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r ex_raw, echo=TRUE, eval=TRUE, fig.align='center'} hd2 <- hd[hd$msLevel == 2, ] @@ -66,18 +62,16 @@ mz <- hd2[i, "basePeakMZ"] plot(pi, type = "h", xlim = c(mz - 0.5, mz + 0.5)) ``` -`r msmbstyle::solution_end()` +::: +::: -`r msmbstyle::question_begin()` +::: {.callout-question .icon .callout-note} Pick an MS1 spectrum and visually check whether it is centroided or in profile mode. -`r msmbstyle::question_end()` - - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r ex_raw2} ## Zooming into spectrum 300 (an MS1 spectrum). @@ -88,8 +82,9 @@ mz <- hd[j, "basePeakMZ"] plot(pj, type = "l", xlim = c(mz - 0.5, mz + 0.5)) ``` -`r msmbstyle::solution_end()` +::: +::: ## PSM data under the hood {#sec-id2} @@ -121,8 +116,8 @@ The main functions are `mzID` to read the data into a dedicated data class and `flatten` to transform it into a `data.frame`. ```{r mzid1, warning = FALSE} -idf -library("mzID") +library(mzID) +idf <- msdata::ident(full.names = TRUE) id <- mzID(idf) id ``` @@ -145,7 +140,7 @@ relevant data on demand. It has also accessor functions such as to see all available methods. ```{r idmzr} -library("mzR") +library(mzR) id2 <- openIDfile(idf) id2 softwareInfo(id2) diff --git a/99-si.Rmd b/inst/pages/99-si.qmd similarity index 88% rename from 99-si.Rmd rename to inst/pages/99-si.qmd index 4bdb47a..921a2e8 100644 --- a/99-si.Rmd +++ b/inst/pages/99-si.qmd @@ -1,4 +1,4 @@ -# Additional materials and session information {#sec-si} +# Additional materials and help {#sec-si} ## Additional materials @@ -52,18 +52,4 @@ For questions about specific software or their usage, please refer to the software's github issue page, or use the [Bioconductor support site](http://support.bioconductor.org/). - -## Session information - -The following packages have been used to generate this document. 
- -```{r include=FALSE} -# automatically create a bib database for R packages -knitr::write_bib(c( - .packages(), 'bookdown', 'knitr', 'rmarkdown', 'msmbstyle' -), 'packages.bib') -``` - -```{r si} -sessionInfo() -``` +# References {-} diff --git a/inst/pages/images/rstudio-logo.jpg b/inst/pages/images/rstudio-logo.jpg new file mode 100644 index 0000000..0177f5c Binary files /dev/null and b/inst/pages/images/rstudio-logo.jpg differ diff --git a/inst/pages/images/text-editor-logo.jpg b/inst/pages/images/text-editor-logo.jpg new file mode 100644 index 0000000..bc9e17b Binary files /dev/null and b/inst/pages/images/text-editor-logo.jpg differ diff --git a/inst/pages/images/vscode-logo.jpg b/inst/pages/images/vscode-logo.jpg new file mode 100644 index 0000000..22c00ce Binary files /dev/null and b/inst/pages/images/vscode-logo.jpg differ diff --git a/docs/img/F02-3D-MS1-MS2-scans-100-1200-lattice.png b/inst/pages/img/F02-3D-MS1-MS2-scans-100-1200-lattice.png similarity index 100% rename from docs/img/F02-3D-MS1-MS2-scans-100-1200-lattice.png rename to inst/pages/img/F02-3D-MS1-MS2-scans-100-1200-lattice.png diff --git a/docs/img/F02-3D-MS1-scans-400-1200-lattice.png b/inst/pages/img/F02-3D-MS1-scans-400-1200-lattice.png similarity index 100% rename from docs/img/F02-3D-MS1-scans-400-1200-lattice.png rename to inst/pages/img/F02-3D-MS1-scans-400-1200-lattice.png diff --git a/docs/img/MS1-MS2-spectra.png b/inst/pages/img/MS1-MS2-spectra.png similarity index 100% rename from docs/img/MS1-MS2-spectra.png rename to inst/pages/img/MS1-MS2-spectra.png diff --git a/docs/img/MSGFgui.png b/inst/pages/img/MSGFgui.png similarity index 100% rename from docs/img/MSGFgui.png rename to inst/pages/img/MSGFgui.png diff --git a/docs/img/SE.png b/inst/pages/img/SE.png similarity index 100% rename from docs/img/SE.png rename to inst/pages/img/SE.png diff --git a/docs/img/SchematicMS2.png b/inst/pages/img/SchematicMS2.png similarity index 100% rename from docs/img/SchematicMS2.png rename to inst/pages/img/SchematicMS2.png diff --git a/docs/img/Silac.png b/inst/pages/img/Silac.png similarity index 100% rename from docs/img/Silac.png rename to inst/pages/img/Silac.png diff --git a/docs/img/chromatogram.png b/inst/pages/img/chromatogram.png similarity index 100% rename from docs/img/chromatogram.png rename to inst/pages/img/chromatogram.png diff --git a/docs/img/chrompeaks.png b/inst/pages/img/chrompeaks.png similarity index 100% rename from docs/img/chrompeaks.png rename to inst/pages/img/chrompeaks.png diff --git a/docs/img/cptac.png b/inst/pages/img/cptac.png similarity index 100% rename from docs/img/cptac.png rename to inst/pages/img/cptac.png diff --git a/img/features.png b/inst/pages/img/features.png similarity index 100% rename from img/features.png rename to inst/pages/img/features.png diff --git a/docs/img/frag.png b/inst/pages/img/frag.png similarity index 100% rename from docs/img/frag.png rename to inst/pages/img/frag.png diff --git a/docs/img/imp-sim.png b/inst/pages/img/imp-sim.png similarity index 100% rename from docs/img/imp-sim.png rename to inst/pages/img/imp-sim.png diff --git a/docs/img/itraq.png b/inst/pages/img/itraq.png similarity index 100% rename from docs/img/itraq.png rename to inst/pages/img/itraq.png diff --git a/img/msanim1.gif b/inst/pages/img/msanim1.gif similarity index 100% rename from img/msanim1.gif rename to inst/pages/img/msanim1.gif diff --git a/img/msanim2.gif b/inst/pages/img/msanim2.gif similarity index 100% rename from img/msanim2.gif rename to inst/pages/img/msanim2.gif 
diff --git a/img/msnset.png b/inst/pages/img/msnset.png similarity index 100% rename from img/msnset.png rename to inst/pages/img/msnset.png diff --git a/docs/img/mstut.gif b/inst/pages/img/mstut.gif similarity index 100% rename from docs/img/mstut.gif rename to inst/pages/img/mstut.gif diff --git a/docs/img/msvisfig.png b/inst/pages/img/msvisfig.png similarity index 100% rename from docs/img/msvisfig.png rename to inst/pages/img/msvisfig.png diff --git a/docs/img/pbase.png b/inst/pages/img/pbase.png similarity index 100% rename from docs/img/pbase.png rename to inst/pages/img/pbase.png diff --git a/docs/img/pr0c00313_0002.gif b/inst/pages/img/pr0c00313_0002.gif similarity index 100% rename from docs/img/pr0c00313_0002.gif rename to inst/pages/img/pr0c00313_0002.gif diff --git a/img/pset.jpg b/inst/pages/img/pset.jpg similarity index 100% rename from img/pset.jpg rename to inst/pages/img/pset.jpg diff --git a/docs/img/raw.png b/inst/pages/img/raw.png similarity index 100% rename from docs/img/raw.png rename to inst/pages/img/raw.png diff --git a/docs/img/vp2.png b/inst/pages/img/vp2.png similarity index 100% rename from docs/img/vp2.png rename to inst/pages/img/vp2.png diff --git a/05-intro.Rmd b/inst/pages/introduction.qmd similarity index 98% rename from 05-intro.Rmd rename to inst/pages/introduction.qmd index 42796d0..2427776 100644 --- a/05-intro.Rmd +++ b/inst/pages/introduction.qmd @@ -1,6 +1,5 @@ # Introduction {#sec-msintro} - ## How does mass spectrometry work? Mass spectrometry (MS) is a technology that *separates* charged @@ -9,7 +8,6 @@ often coupled to chromatography (liquid LC, but can also be gas-based GC). The time an analyte takes to elute from the chromatography column is the *retention time*. - ```{r, results='markup', fig.cap="A chromatogram, illustrating the total amount of analytes over the retention time.", echo=FALSE, purl=FALSE, out.width='100%', fig.align='center'} knitr::include_graphics("./img/chromatogram.png") ``` @@ -51,18 +49,15 @@ is of high enough quality) or using a search engine such as, for example Mascot, MSGF+, ..., that will match the observed, experimental spectrum to theoretical spectra (see details below). - ```{r, results='markup', fig.cap="Schematics of a mass spectrometer and two rounds of MS.", echo=FALSE, purl=FALSE, out.width='100%', fig.align='center'} knitr::include_graphics("./img/SchematicMS2.png") ``` - The animation below show how 25 ions different ions (i.e. having different M/Z values) are separated throughout the MS analysis and are eventually detected (i.e. quantified). The final frame shows the hypothetical spectrum. - ```{r, results='markup', fig.cap="Separation and detection of ions in a mass spectrometer.", echo=FALSE, purl=FALSE, out.width='100%', fig.align='center'} knitr::include_graphics("./img/mstut.gif") ``` @@ -94,7 +89,7 @@ knitr::include_graphics("./img/F02-3D-MS1-MS2-scans-100-1200-lattice.png") ## Accessing data -### From the ProteomeXchange database {-} +### From the ProteomeXchange database MS-based proteomics data is disseminated through the [ProteomeXchange](http://www.proteomexchange.org/) infrastructure, @@ -107,11 +102,10 @@ quantitative data, as opposed as the name suggests), Reaction Monitoring (SRM, i.e. targeted) data and the [MassIVE](http://massive.ucsd.edu/ProteoSAFe/static/massive.jsp) resource. These data can be downloaded within R using the -`r Biocpkg("rpx")` package. - +`r BiocStyle::Biocpkg("rpx")` package. 
```{r rpx} -library("rpx") +library(rpx) ``` Using the unique `PXD000001` identifier, we can retrieve the relevant @@ -145,7 +139,7 @@ mzf <- pxget(px, fn) mzf ``` -### Data packages {-} +### Data packages Some data are also distributed through dedicated packages. The `r BiocStyle::Biocexptpkg("msdata")`, for example, provides some @@ -153,7 +147,7 @@ general raw data files relevant for both proteomics and metabolomics. ```{r msdatafiles, message = FALSE} -library("msdata") +library(msdata) ## proteomics raw data proteomics() ## proteomics identification data diff --git a/10-raw.Rmd b/inst/pages/raw-ms-data.qmd similarity index 89% rename from 10-raw.Rmd rename to inst/pages/raw-ms-data.qmd index 2b35a67..469a0fe 100644 --- a/10-raw.Rmd +++ b/inst/pages/raw-ms-data.qmd @@ -1,16 +1,9 @@ -# Raw MS data {#sec-raw} - +# Raw MS data {#sec-raw} In this section, we will learn how to read raw data in one of the commonly used open formats (`mzML`, `mzXML`, `netCDF` or `mgf`) into R. -```{r rwpkgs, echo = FALSE, message = FALSE, warning = FALSE} -## x <- RforProteomics:::msDataTab() -## sel <- x[, 1] %in% c("Raw", "Peak lists") -## knitr::kable(x[sel, ]) -``` - ## What is raw data in R When we manipulate complex data, we need a way to abstract it. @@ -64,7 +57,7 @@ sp0 #### Exercise {-} -Explore the newly created object using +Explore the newly created object using: - `spectraVariables` to extract all the metadata variables. Compare these to the spectra variables available from the previous example. @@ -78,6 +71,10 @@ Let's now create a new object using the mzML data previously downloaded and available in the `mzf` file. ```{r, spectra2} +library(rpx) +fn <- "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML" +px <- PXDataset("PXD000001") +mzf <- pxget(px, fn) mzf sp <- Spectra(mzf) sp @@ -154,7 +151,6 @@ sp$rtime_minute <- rtime(sp) / 60 sp$rtime_minute |> head() ``` - #### Exercise {-} - Extract a set of spectra variables using the accessor (for example @@ -180,7 +176,6 @@ following questions: * Plot one spectrum of each level. Are they centroided or in profile mode? - These objects and their manipulations are not limited to single files or samples. Below we load data from two mzML files. The MS data from both files in the `Spectra` is organized linearly (first all spectra from the first file @@ -188,7 +183,7 @@ and then from the second). The `dataOrigin` function can be used to identify spectra from the different data files. ```{r, sciex_mzr} -(fls <- dir(system.file("sciex", package = "msdata"), full.names = TRUE)) +fls <- dir(system.file("sciex", package = "msdata"), full.names = TRUE) sp_sciex <- Spectra(fls) table(dataOrigin(sp_sciex)) ``` @@ -203,7 +198,7 @@ example backends but any object extending the base `MsBackend` class could be used instead. The default backends are: - `MsBackendMzR`: this backend keeps only general spectra variables in memory - and relies on the `r Biocpkg("mzR")` package to read mass peaks (m/z and + and relies on the `r BiocStyle::Biocpkg("mzR")` package to read mass peaks (m/z and intensity values) from the original MS files on-demand. ```{r sciex_mzr_show} @@ -215,7 +210,6 @@ sp_sciex high performance but has also, depending on the number of mass peaks in each spectrum, a much higher memory footprint. - ```{r sciex_dfr} setBackend(sp_sciex, MsBackendMemory()) ``` @@ -263,7 +257,6 @@ backends](https://jorainer.github.io/SpectraTutorials/articles/Spectra-backends. 
for more information on different backends, their properties and advantages/disadvantages. - ## Visualisation of raw MS data The importance of flexible access to specialised data becomes visible @@ -273,7 +266,6 @@ Not only can we access specific data and understand/visualise them, but we can transverse all the data and extract/visualise/understand structured slices of data. - The figure below shows an illustration of how mass spectrometry works: @@ -296,16 +288,11 @@ knitr::include_graphics("./img/msvisfig.png") We are going to reproduce the figure above through a set of exercices. -`r msmbstyle::question_begin()` - -1. The chromatogram can be created by extracting the `totIonCurrent` - and `rtime` variables for all MS1 spectra. Annotate the spectrum of - interest. +- The chromatogram can be created by extracting the `totIonCurrent` + and `rtime` variables for all MS1 spectra. + Annotate the spectrum of interest. -`r msmbstyle::question_end()` - - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} with(spectraData(filterMsLevel(sp, 1)), @@ -313,33 +300,26 @@ with(spectraData(filterMsLevel(sp, 1)), abline(v = rtime(sp)[2807], col = "red") ``` -`r msmbstyle::solution_end()` - -`r msmbstyle::question_begin()` +::: -2. The `filterPrecursorScan()` function can be used to retain a set - parent (MS1) and children scans (MS2), as defined by an acquisition - number. Use it to extract the MS1 scan of interest and all its MS2 - children. +- The `filterPrecursorScan()` function can be used to retain a set + parent (MS1) and children scans (MS2), as defined by an acquisition + number. Use it to extract the MS1 scan of interest and all its MS2 + children. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} ms_2 <- filterPrecursorScan(sp, 2807) ms_2 ``` -`r msmbstyle::solution_end()` - -`r msmbstyle::question_begin()` -3. Plot the MS1 spectrum of interest and highlight all the peaks that - will be selected for MS2 analysis. +::: -`r msmbstyle::question_end()` +- Plot the MS1 spectrum of interest and highlight all the peaks that + will be selected for MS2 analysis. -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} plotSpectra(sp[2807], xlim = c(400, 1000)) @@ -347,47 +327,36 @@ abline(v = precursorMz(ms_2)[-1], col = "grey") abline(v = precursorMz(ms_2)[2], col = "red") ``` -`r msmbstyle::solution_end()` +::: +- Zoom in mz values 521.1 and 522.5 to reveal the isotopic envelope + of that peak. -`r msmbstyle::question_begin()` - -4. Zoom in mz values 521.1 and 522.5 to reveal the isotopic envelope - of that peak. - -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} plotSpectra(sp[2807], xlim = c(521.2, 522.5), type = "l") abline(v = precursorMz(ms_2)[2], col = "red") ``` -`r msmbstyle::solution_end()` - +::: -`r msmbstyle::question_begin()` +- The `plotSpectra()` function is used to plot all 10 MS2 spectra in + one call. -5. The `plotSpectra()` function is used to plot all 10 MS2 spectra in - one call. +::: {.callout-answer .icon .callout-note collapse=true} -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` - -```{r, fig.height = 12, fig.width = 8} +```{r} plotSpectra(ms_2[-1]) ``` -`r msmbstyle::solution_end()` +::: It is possible to label the peaks with the `plotSpectra()` function. 
The `labels` argument is either a `character` of appropriate length (i.e. with a label for each peak) or, as illustrated below, a function that computes the labels. - ```{r} mzLabel <- function(z) { z <- peaksData(z)[[1L]] @@ -406,14 +375,10 @@ plotSpectra(ms_2[7], Spectra can also be compared either by overlay or mirror plotting using the `plotSpectraOverlay()` and `plotSpectraMirror()` functions. -`r msmbstyle::question_begin()` - -Filter MS2 level spectra and find any 2 MS2 spectra that have matching +- Filter MS2 level spectra and find any 2 MS2 spectra that have matching precursor peaks based on the precursor m/z values. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} sp2 <- filterMsLevel(sp, 2L) @@ -422,16 +387,12 @@ i <- which(precursorMz(sp2) == precursorMz(sp2)[37]) sp2i <- sp2[i] ``` -`r msmbstyle::solution_end()` - -`r msmbstyle::question_begin()` +::: -Visualise the matching pair using the `plotSpectraOverlay()` and +- Visualise the matching pair using the `plotSpectraOverlay()` and `plotSpectraMirror()` functions. -`r msmbstyle::question_end()` - -`r msmbstyle::solution_begin()` +::: {.callout-answer .icon .callout-note collapse=true} ```{r} plotSpectraOverlay(sp2i, col = c("red", "steelblue")) @@ -440,8 +401,8 @@ plotSpectraOverlay(sp2i, col = c("red", "steelblue")) ```{r} plotSpectraMirror(sp2i[1], sp2i[2]) ``` -`r msmbstyle::solution_end()` +::: It is also possible to explore raw data interactively with the [`SpectraVis` @@ -458,15 +419,9 @@ package](https://rformassspectrometry.github.io/SpectraVis/): [`plotly`](https://plotly.com/r/) allowing to explore (zooming, panning) the spectrum interactively. - -`r msmbstyle::question_begin()` - -Test the `SpectraVis` function on some the `Spectra` objects produce +- Test the `SpectraVis` function on some the `Spectra` objects produce above. -`r msmbstyle::question_end()` - - ## Raw data processing and manipulation Apart from *classical* subsetting operations such as `[` and `split`, a set of @@ -521,18 +476,13 @@ within a `Spectra`: - `reduceSpectra`: filters individual spectra keeping only the largest peak for groups of peaks with similar m/z values. - -`r msmbstyle::question_begin()` - -Using the `sp_sciex` data, select all spectra measured in the second +- Using the `sp_sciex` data, select all spectra measured in the second mzML file and subsequently filter them to retain spectra measured between 175 and 189 seconds in the measurement run. -`r msmbstyle::question_end()` +::: {.callout-answer .icon .callout-note collapse=true} -`r msmbstyle::solution_begin()` - -```{r filterfile-filterrt1} +```{r} fls <- unique(dataOrigin(sp_sciex)) fls file_2 <- filterDataOrigin(sp_sciex, dataOrigin = fls[2]) @@ -547,7 +497,7 @@ sp_sciex |> filterRt(c(175, 189)) ``` -`r msmbstyle::solution_end()` +::: As an example of data processing, we use below the `pickPeaks()` function. This function allows to convert *profile mode* MS data to *centroid @@ -599,7 +549,6 @@ manipulation operations are also possible for read-only backends information about the number of such processing steps can be seen below (next to Lazy evaluation queue). - ```{r} min(intensity(sp_sciex[1])) sp_sciex <- filterIntensity(sp_sciex, intensity = c(10, Inf)) @@ -621,3 +570,5 @@ More information on this lazy evaluation concept implemented in `Spectra` is provided in the [Spectra backends](https://jorainer.github.io/SpectraTutorials/articles/Spectra-backends.html) vignette. 
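To make such queued operations permanent, the data first need to sit in a writeable backend (a sketch, not evaluated; `setBackend()` and `applyProcessing()` are part of the `Spectra` package):

```{r, eval = FALSE}
## Move the data to an in-memory (writeable) backend and then write the
## queued operations (e.g. the intensity filter applied above) to the
## peak data; this empties the lazy evaluation queue.
sp_mem <- setBackend(sp_sciex, MsBackendMemory())
sp_mem <- applyProcessing(sp_mem)
sp_mem
```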
+ +# References {-} diff --git a/install_docs_deps.R b/install_docs_deps.R deleted file mode 100644 index a22f718..0000000 --- a/install_docs_deps.R +++ /dev/null @@ -1,11 +0,0 @@ -if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager") - -deps <- c("dplyr", "factoextra", "ggplot2", "gplots", "limma", - "magrittr", "MsCoreUtils", "msdata", "MSnID", "mzID", "mzR", - "patchwork", "PSMatch", "QFeatures", "rpx", "Spectra", - "tidyr", "tidyverse", "impute", "MSnID") -BiocManager::install(deps, ask = FALSE, udpate = TRUE) - -deps2 <- c("lgatto/msmbstyle", "BiocStyle", "bookdown") -BiocManager::install(deps2, ask = FALSE, udpate = TRUE) diff --git a/refs.bib b/refs.bib deleted file mode 100644 index c37168f..0000000 --- a/refs.bib +++ /dev/null @@ -1,432 +0,0 @@ -@Manual{R-base, - title = {R: A Language and Environment for Statistical Computing}, - author = {{R Core Team}}, - organization = {R Foundation for Statistical Computing}, - address = {Vienna, Austria}, - year = {2021}, - url = {https://www.R-project.org/}, -} - -@Manual{R-bookdown, - title = {bookdown: Authoring Books and Technical Documents with R Markdown}, - author = {Yihui Xie}, - year = {2021}, - note = {R package version 0.21.6}, - url = {https://github.com/rstudio/bookdown}, -} - -@Manual{R-msmbstyle, - title = {msmbstyle: MSMB Styles for R Markdown Documents}, - author = {Mike Smith}, - year = {2021}, - note = {R package version 0.0.18}, -} - -@Manual{R-rmarkdown, - title = {rmarkdown: Dynamic Documents for R}, - author = {JJ Allaire and Yihui Xie and Jonathan McPherson and Javier Luraschi and Kevin Ushey and Aron Atkins and Hadley Wickham and Joe Cheng and Winston Chang and Richard Iannone}, - year = {2021}, - note = {R package version 2.7}, - url = {https://CRAN.R-project.org/package=rmarkdown}, -} - -@Book{bookdown2016, - title = {bookdown: Authoring Books and Technical Documents with {R} Markdown}, - author = {Yihui Xie}, - publisher = {Chapman and Hall/CRC}, - address = {Boca Raton, Florida}, - year = {2016}, - note = {ISBN 978-1138700109}, - url = {https://github.com/rstudio/bookdown}, -} - -@Book{rmarkdown2018, - title = {R Markdown: The Definitive Guide}, - author = {Yihui Xie and J.J. Allaire and Garrett Grolemund}, - publisher = {Chapman and Hall/CRC}, - address = {Boca Raton, Florida}, - year = {2018}, - note = {ISBN 9781138359338}, - url = {https://bookdown.org/yihui/rmarkdown}, -} - -@Book{rmarkdown2020, - title = {R Markdown Cookbook}, - author = {Yihui Xie and Christophe Dervieux and Emily Riederer}, - publisher = {Chapman and Hall/CRC}, - address = {Boca Raton, Florida}, - year = {2020}, - note = {ISBN 9780367563837}, - url = {https://bookdown.org/yihui/rmarkdown-cookbook}, -} - - -@ARTICLE{Gatto:2020, - title = "{MSnbase}, efficient and elegant R-based processing and - visualisation of raw mass spectrometry data", - author = "Gatto, Laurent and Gibb, Sebastian and Rainer, Johannes", - abstract = "We present version 2 of the MSnbase R/Bioconductor package. - MSnbase provides infrastructure for the manipulation, processing - and visualisation of mass spectrometry data. We focus on the new - on-disk infrastructure, that allows the handling of large raw - mass spectrometry experiments on commodity hardware and - illustrate how the package is used for elegant data processing, - method development and visualisation.", - journal = "J. 
Proteome Res.", - month = sep, - year = 2020, - language = "en" -} - -@Article{MAE, - title = {Software For The Integration Of Multi-Omics Experiments - In Bioconductor}, - author = {Marcel Ramos and Lucas Schiffer and Angela Re and Rimsha - Azhar and Azfar Basunia and Carmen Rodriguez Cabrera - and Tiffany Chan and Philip Chapman and Sean Davis - and David Gomez-Cabrero and Aedin C. Culhane and - Benjamin Haibe-Kains and Kasper Hansen and Hanish - Kodali and Marie Stephie Louis and Arvind Singh Mer - and Markus Reister and Martin Morgan and Vincent - Carey and Levi Waldron}, - journal = {Cancer Research}, - year = {2017}, - volume = {77(21); e39-42}, - } - - -@Manual{SE, - title = {SummarizedExperiment: SummarizedExperiment container}, - author = {Martin Morgan and Valerie Obenchain and Jim Hester and Hervé Pagès}, - year = {2020}, - note = {R package version 1.21.0}, - url = {https://bioconductor.org/packages/SummarizedExperiment}, - } - -@Article{Christoforou:2016, - author = {Christoforou, Andy and Mulvey, Claire M and - Breckels, Lisa M and Geladaki, Aikaterini and - Hurrell, Tracey and Hayward, Penelope C and Naake, - Thomas and Gatto, Laurent and Viner, Rosa and - Martinez Arias, Alfonso and Lilley, Kathryn S}, - title = {A draft map of the mouse pluripotent stem cell - spatial proteome.}, - journal = {Nat Commun}, - year = {2016}, - month = {}, - number = {}, - volume = {7}, - pages = {8992}, - doi = {10.1038/ncomms9992}, - PMID = {26754106}} - -@article{Sticker:2019, - author = {Sticker, Adriaan and Goeminne, Ludger and Martens, Lennart and Clement, Lieven}, - title = {Robust summarization and inference in proteome-wide label-free quantification}, - elocation-id = {668863}, - year = {2019}, - doi = {10.1101/668863}, - publisher = {Cold Spring Harbor Laboratory}, - abstract = {Label-Free Quantitative mass spectrometry based - workflows for differential expression (DE) analysis - of proteins impose important challenges on the data - analysis due to peptide-specific effects and context - dependent missingness of peptide - intensities. Peptide-based workflows, like MSqRob, - test for DE directly from peptide intensities and - outper-form summarization methods which first - aggregate MS1 peptide intensities to protein - intensities before DE analysis. However, these - methods are computationally expensive, often hard to - understand for the non-specialised end-user, and do - not provide protein summaries, which are important - for visualisation or downstream processing. In this - work, we therefore evaluate state-of-the-art - summarization strategies using a benchmark spike-in - dataset and discuss why and when these fail compared - to the state-of-the-art peptide based model, - MSqRob. Based on this evaluation, we propose a novel - summarization strategy, MSqRob-Sum, which estimates - MSqRob{\textquoteright}s model parameters in a - two-stage procedure circumventing the drawbacks of - peptide-based workflows. MSqRobSum maintains - MSqRob{\textquoteright}s superior performance, while - providing useful protein expression summaries for - plotting and downstream analysis. Summarising - peptide to protein intensities considerably reduces - the computational complexity, the memory footprint - and the model complexity, and makes it easier to - disseminate DE inferred on protein - summaries. 
Moreover, MSqRobSum provides a highly - modular analysis framework, which provides - researchers with full flexibility to develop data - analysis workflows tailored towards their specific - applications.}, - URL = {https://www.biorxiv.org/content/early/2019/06/13/668863}, - eprint = {https://www.biorxiv.org/content/early/2019/06/13/668863.full.pdf}, - journal = {bioRxiv} -} - -@ARTICLE{Paulovich:2010, - title = "Interlaboratory study characterizing a yeast performance standard - for benchmarking {LC-MS} platform performance", - author = "Paulovich, Amanda G and Billheimer, Dean and Ham, Amy-Joan L and - Vega-Montoto, Lorenzo and Rudnick, Paul A and Tabb, David L and - Wang, Pei and Blackman, Ronald K and Bunk, David M and Cardasis, - Helene L and Clauser, Karl R and Kinsinger, Christopher R and - Schilling, Birgit and Tegeler, Tony J and Variyath, Asokan - Mulayath and Wang, Mu and Whiteaker, Jeffrey R and Zimmerman, - Lisa J and Fenyo, David and Carr, Steven A and Fisher, Susan J - and Gibson, Bradford W and Mesri, Mehdi and Neubert, Thomas A and - Regnier, Fred E and Rodriguez, Henry and Spiegelman, Cliff and - Stein, Stephen E and Tempst, Paul and Liebler, Daniel C", - abstract = "Optimal performance of LC-MS/MS platforms is critical to - generating high quality proteomics data. Although individual - laboratories have developed quality control samples, there is no - widely available performance standard of biological complexity - (and associated reference data sets) for benchmarking of platform - performance for analysis of complex biological proteomes across - different laboratories in the community. Individual preparations - of the yeast Saccharomyces cerevisiae proteome have been used - extensively by laboratories in the proteomics community to - characterize LC-MS platform performance. The yeast proteome is - uniquely attractive as a performance standard because it is the - most extensively characterized complex biological proteome and - the only one associated with several large scale studies - estimating the abundance of all detectable proteins. In this - study, we describe a standard operating protocol for large scale - production of the yeast performance standard and offer aliquots - to the community through the National Institute of Standards and - Technology where the yeast proteome is under development as a - certified reference material to meet the long term needs of the - community. Using a series of metrics that characterize LC-MS - performance, we provide a reference data set demonstrating - typical performance of commonly used ion trap instrument - platforms in expert laboratories; the results provide a basis for - laboratories to benchmark their own performance, to improve upon - current methods, and to evaluate new technologies. Additionally, - we demonstrate how the yeast reference, spiked with human - proteins, can be used to benchmark the power of proteomics - platforms for detection of differentially expressed proteins at - different levels of concentration in a complex matrix, thereby - providing a metric to evaluate and minimize pre-analytical and - analytical variation in comparative proteomics experiments.", - journal = "Mol. Cell. 
Proteomics", - volume = 9, - number = 2, - pages = "242--254", - month = feb, - year = 2010, - language = "en" -} - -@Article{Lazar:2016, - author = {Lazar, C and Gatto, L and Ferro, M and Bruley, C - and Burger, T}, - title = {Accounting for the Multiple Natures of Missing - Values in Label-Free Quantitative Proteomics Data - Sets to Compare Imputation Strategies.}, - journal = {J Proteome Res}, - year = {2016}, - month = {Apr}, - number = {4}, - volume = {15}, - pages = {1116-25}, - doi = {10.1021/acs.jproteome.5b00981}, - PMID = {26906401} -} - -@Article{Cox:2008, - author = {Cox, J and Mann, M}, - title = {MaxQuant enables high peptide identification - rates, individualized p.p.b.-range mass accuracies - and proteome-wide protein quantification.}, - journal = {Nat Biotechnol}, - year = {2008}, - month = {Dec}, - number = {12}, - volume = {26}, - pages = {1367-72}, - doi = {10.1038/nbt.1511}, - PMID = {19029910}} - -@article{Morgenstern:2020, - author = {Morgenstern, David and Barzilay, Rotem and Levin, Yishai}, - title = {{RawBeans}: A Simple, Vendor-Independent, Raw-Data Quality-Control Tool}, - journal = {Journal of Proteome Research}, - year = {2021}, - doi = {10.1021/acs.jproteome.0c00956}, - note ={PMID: 33657803}, - URL = {https://doi.org/10.1021/acs.jproteome.0c00956}, - eprint = {https://doi.org/10.1021/acs.jproteome.0c00956} -} - - -@ARTICLE{Vanderaa:2021, - title = "Replication of single-cell proteomics data reveals important - computational challenges", - author = "Vanderaa, Christophe and Gatto, Laurent", - abstract = "INTRODUCTION: Mass spectrometry-based proteomics is actively - embracing quantitative, single-cell level analyses. Indeed, - recent advances in sample preparation and mass spectrometry (MS) - have enabled the emergence of quantitative MS-based single-cell - proteomics (SCP). While exciting and promising, SCP still has - many rough edges. The current analysis workflows are custom and - built from scratch. The field is therefore craving for - standardized software that promotes principled and reproducible - SCP data analyses. AREAS COVERED: This special report is the - first step toward the formalization and standardization of SCP - data analysis. scp, the software that accompanies this work, - successfully replicates one of the landmark SCP studies and is - applicable to other experiments and designs. We created a - repository containing the replicated workflow with comprehensive - documentation in order to favor further dissemination and - improvements of SCP data analyses. EXPERT OPINION: Replicating - SCP data analyses uncovers important challenges in SCP data - analysis. We describe two such challenges in detail: batch - correction and data missingness. We provide the current - state-of-the-art and illustrate the associated limitations. We - also highlight the intimate dependence that exists between batch - effects and data missingness and offer avenues for dealing with - these exciting challenges.", - journal = "Expert Rev. 
Proteomics", - month = oct, - year = 2021, - keywords = "Bioconductor; R; batch correction; imputation; mass spectrometry; - proteomics; replication; reproducible research; single-cell; - software", - language = "en" -} - - -@ARTICLE{Rainer:2022, - title = "A Modular and Expandable Ecosystem for Metabolomics Data - Annotation in {R}", - author = "Rainer, Johannes and Vicini, Andrea and Salzer, Liesa and - Stanstrup, Jan and Badia, Josep M and Neumann, Steffen and - Stravs, Michael A and Verri Hernandes, Vinicius and Gatto, - Laurent and Gibb, Sebastian and Witting, Michael", - abstract = "Liquid chromatography-mass spectrometry (LC-MS)-based untargeted - metabolomics experiments have become increasingly popular - because of the wide range of metabolites that can be analyzed - and the possibility to measure novel compounds. LC-MS - instrumentation and analysis conditions can differ substantially - among laboratories and experiments, thus resulting in - non-standardized datasets demanding customized annotation - workflows. We present an ecosystem of R packages, centered - around the MetaboCoreUtils, MetaboAnnotation and CompoundDb - packages that together provide a modular infrastructure for the - annotation of untargeted metabolomics data. Initial annotation - can be performed based on MS1 properties such as m/z and - retention times, followed by an MS2-based annotation in which - experimental fragment spectra are compared against a reference - library. Such reference databases can be created and managed - with the CompoundDb package. The ecosystem supports data from a - variety of formats, including, but not limited to, MSP, MGF, - mzML, mzXML, netCDF as well as MassBank text files and SQL - databases. Through its highly customizable functionality, the - presented infrastructure allows to build reproducible annotation - workflows tailored for and adapted to most untargeted - LC-MS-based datasets. All core functionality, which supports - base R data types, is exported, also facilitating its re-use in - other R packages. Finally, all packages are thoroughly - unit-tested and documented and are available on GitHub and - through Bioconductor.", - journal = "Metabolites", - publisher = "Multidisciplinary Digital Publishing Institute", - volume = 12, - number = 2, - pages = "173", - month = feb, - year = 2022, - language = "en" -} - -@article{Sinha:2020, - author = {Sinha, Ankit and Mann, Matthias}, - title = "{A beginner’s guide to mass spectrometry–based proteomics}", - journal = {The Biochemist}, - year = {2020}, - month = {09}, - abstract = "{Mass spectrometry (MS)-based proteomics is the most - comprehensive approach for the quantitative - profiling of proteins, their interactions and - modifications. It is a challenging topic as a firm - grasp requires expertise in biochemistry for sample - preparation, analytical chemistry for - instrumentation and computational biology for data - analysis. In this short guide, we highlight the - various components of a mass spectrometer, the - sample preparation process for conversion of - proteins into peptides, and quantification and - analysis strategies. 
The advancing technology of - MS-based proteomics now opens up opportunities in - clinical applications and single-cell analysis.}", - issn = {0954-982X}, - doi = {10.1042/BIO20200057}, - url = {https://doi.org/10.1042/BIO20200057}, - note = {BIO20200057}, - eprint = {https://portlandpress.com/biochemist/article-pdf/doi/10.1042/BIO20200057/892770/bio20200057.pdf}, -} - -@Article{Steen:2004, - title = "The {ABC's} (and {XYZ's}) of peptide sequencing", - author = "Steen, Hanno and Mann, Matthias", - abstract = "Proteomics is an increasingly powerful and indispensable - technology in molecular cell biology. It can be used to identify - the components of small protein complexes and large organelles, - to determine post-translational modifications and in - sophisticated functional screens. The key - but little understood - - technology in mass-spectrometry-based proteomics is peptide - sequencing, which we describe and review here in an easily - accessible format.", - journal = "Nat. Rev. Mol. Cell Biol.", - volume = 5, - number = 9, - pages = "699--711", - month = sep, - year = 2004, - language = "en" -} - -@ARTICLE{Marcotte:2007, - title = "How do shotgun proteomics algorithms identify proteins?", - author = "Marcotte, Edward M", - journal = "Nat. Biotechnol.", - volume = 25, - number = 7, - pages = "755--757", - month = jul, - year = 2007, - language = "en" -} - - -@ARTICLE{Shuken:2023, - title = "An Introduction to Mass {Spectrometry-Based} Proteomics", - author = "Shuken, Steven R", - abstract = "Mass spectrometry is unmatched in its versatility for studying - practically any aspect of the proteome. Because the foundations - of mass spectrometry-based proteomics are complex and span - multiple scientific fields, proteomics can be perceived as having - a high barrier to entry. This tutorial is intended to be an - accessible illustrated guide to the technical details of a - relatively simple quantitative proteomic experiment. An attempt - is made to explain the relevant concepts to those with limited - knowledge of mass spectrometry and a basic understanding of - proteins. An experimental overview is provided, from the - beginning of sample preparation to the analysis of protein group - quantities, with explanations of how the data are acquired, - processed, and analyzed. A selection of advanced topics is - briefly surveyed and works for further reading are cited. To - conclude, a brief discussion of the future of proteomics is - given, considering next-generation protein sequencing - technologies that may complement mass spectrometry to create a - fruitful future for proteomics.", - journal = "J. 
Proteome Res.", - month = jun, - year = 2023, - keywords = "bottom-up; data-dependent acquisition; label-free quantification; - mass spectrometry; proteomics; untargeted proteomics", - language = "en" -} diff --git a/skeleton.bib b/skeleton.bib deleted file mode 100644 index 77e82d8..0000000 --- a/skeleton.bib +++ /dev/null @@ -1,62 +0,0 @@ -@Manual{R-base, - title = {R: A Language and Environment for Statistical Computing}, - author = {{R Core Team}}, - organization = {R Foundation for Statistical Computing}, - address = {Vienna, Austria}, - year = {2023}, - url = {https://www.R-project.org/}, -} - -@Manual{R-bookdown, - title = {bookdown: Authoring Books and Technical Documents with R Markdown}, - author = {Yihui Xie}, - note = {R package version 0.34.2, https://pkgs.rstudio.com/bookdown/}, - url = {https://github.com/rstudio/bookdown}, - year = {2023}, -} - -@Manual{R-msmbstyle, - title = {msmbstyle: MSMB Styles for R Markdown Documents}, - author = {Mike Smith}, - year = {2023}, - note = {R package version 0.0.19}, -} - -@Manual{R-rmarkdown, - title = {rmarkdown: Dynamic Documents for R}, - author = {JJ Allaire and Yihui Xie and Christophe Dervieux and Jonathan McPherson and Javier Luraschi and Kevin Ushey and Aron Atkins and Hadley Wickham and Joe Cheng and Winston Chang and Richard Iannone}, - year = {2023}, - note = {R package version 2.24}, - url = {https://CRAN.R-project.org/package=rmarkdown}, -} - -@Book{bookdown2016, - title = {bookdown: Authoring Books and Technical Documents with {R} Markdown}, - author = {Yihui Xie}, - publisher = {Chapman and Hall/CRC}, - address = {Boca Raton, Florida}, - year = {2016}, - isbn = {978-1138700109}, - url = {https://bookdown.org/yihui/bookdown}, -} - -@Book{rmarkdown2018, - title = {R Markdown: The Definitive Guide}, - author = {Yihui Xie and J.J. Allaire and Garrett Grolemund}, - publisher = {Chapman and Hall/CRC}, - address = {Boca Raton, Florida}, - year = {2018}, - isbn = {9781138359338}, - url = {https://bookdown.org/yihui/rmarkdown}, -} - -@Book{rmarkdown2020, - title = {R Markdown Cookbook}, - author = {Yihui Xie and Christophe Dervieux and Emily Riederer}, - publisher = {Chapman and Hall/CRC}, - address = {Boca Raton, Florida}, - year = {2020}, - isbn = {9780367563837}, - url = {https://bookdown.org/yihui/rmarkdown-cookbook}, -} - diff --git a/style.css b/style.css deleted file mode 100644 index 4c51529..0000000 --- a/style.css +++ /dev/null @@ -1,5 +0,0 @@ -/* original background colour is #1881c2 */ - -:root { - --main-bg-color: #115a88; -} diff --git a/vignettes/Makefile b/vignettes/Makefile new file mode 100644 index 0000000..e9d4040 --- /dev/null +++ b/vignettes/Makefile @@ -0,0 +1,5 @@ +all: render + +render: + quarto render ../inst/ + mkdir -p ../inst/doc && mv ../inst/docs ../inst/doc/book diff --git a/vignettes/stub.Rmd b/vignettes/stub.Rmd new file mode 100644 index 0000000..aaa1b3c --- /dev/null +++ b/vignettes/stub.Rmd @@ -0,0 +1,19 @@ +--- +vignette: > + %\VignetteIndexEntry{Link to book} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, echo=FALSE} +# This is adapted from Aaron Lun's approach in OSCA.* books +link <- BiocStyle::Biocbook( + read.dcf('../DESCRIPTION')[1], + label="link" +) +URL <- sub(".*\\((.+))", "\\1", link) +``` + + + +