From 0c68bbc9fc8d3550cbb3647587162f88fe05de2b Mon Sep 17 00:00:00 2001 From: ppxasjsm Date: Wed, 7 Aug 2024 17:52:21 +0100 Subject: [PATCH 01/11] adding paper skeleton --- paper/paper.bib | 28 ++++++++++++++++++ paper/paper.md | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 paper/paper.bib create mode 100644 paper/paper.md diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 0000000..79dd542 --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,28 @@ +@article{engelberger2021developing, + title = {Developing and {{Implementing Cloud-Based Tutorials That Combine Bioinformatics Software}}, {{Interactive Coding}}, and {{Visualization Exercises}} for {{Distance Learning}} on {{Structural Bioinformatics}}}, + author = {Engelberger, Felipe and {Galaz-Davison}, Pablo and Bravo, Graciela and Rivera, Maira and {Ram{\'i}rez-Sarmiento}, C{\'e}sar A.}, + year = {2021}, + month = may, + journal = {J. Chem. Educ.}, + volume = {98}, + number = {5}, + pages = {1801--1807}, + publisher = {American Chemical Society}, + issn = {0021-9584}, + doi = {10.1021/acs.jchemed.1c00022}, + urldate = {2024-08-07}, + abstract = {The COVID-19 pandemic has swiftly forced a change in learning strategies across educational institutions, from extensively relying on in-person activities toward online teaching. It is particularly difficult to adapt courses that depend on physical equipment to be now carried out remotely. This is the case for bioinformatics, which typically requires dedicated computer classrooms, as the logistics of granting remote access to a workstation or relying on the computational resources of each student is not trivial. A possible workaround is using cloud server-based computing resources, such as Google Colaboratory, a free web browser application that allows the writing and execution of Python programming through Jupyter notebooks, integrating text, images, and code cells. 
Following a cloud-based approach, we migrated the practical activities of a course on molecular modeling and simulation into the Google Colaboratory environment resulting in 12 tutorials that introduce students to topics such as phylogenetic analysis, molecular modeling, molecular docking, several flavors of molecular dynamics, and coevolutionary analysis. Each of these notebooks includes a brief introduction to the topic, software installation, execution of the required tools, and analysis of results, with each step properly described. Using a Likert scale questionnaire, a pool of students positively evaluated these tutorials in terms of the time required for their completion, their ability to understand the content and exercises developed in each session, and the practical significance and impact that these computational tools have on scientific research. All tutorials are freely available at https://github.com/pb3lab/ibm3202.}, + file = {/Users/toni_brain/Zotero/storage/XIFB34JG/Engelberger et al. - 2021 - Developing and Implementing Cloud-Based Tutorials .pdf} +} + +@misc{thompson2024openff, + title = {{{OpenFF Interchange}}}, + author = {Thompson, Matthew and Wagner, Jeff and Gilmer, Justin B. and Timalsina, Umesh and Quach, Co D. and Boothroyd, Simon and Mitchell, Joshua A.}, + year = {2024}, + month = aug, + doi = {10.5281/zenodo.13155316}, + urldate = {2024-08-07}, + abstract = {0.3.29 GROMACS export performance improements and bugfixes For the complete release notes, please see the release history. For help installing Interchange, the installation instructions. Please report bugs, request features, or ask questions through our issue tracker. 
Please note that there may still be some changes to the API prior to a stable 1.0.0 release.}, + howpublished = {Zenodo}, + file = {/Users/toni_brain/Zotero/storage/GAHP5Y64/13155316.html} +} diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..eca1bbf --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,79 @@ +--- +title: 'Course Materials for an Introduction to Data-Driven Chemistry' +tags: + - molecular simulation + - python + - graduate level +authors: + - name: Matteo T. Degiacomi + orcid: 0000-0003-4672-471X + affiliation: "1" + - name: Micaela Matta + orcid: xxx + affiliation: "2" + - name: Antonia S. J. S. Mey + orcid: 0000-0001-7512-5252 + affiliation: "3" + +affiliations: + - name: Department of Physics, Durham University, South Road, Durham, DH1 3LE, United Kingdom + index: 1 + - name: xxxx + index: 2 + - name: EaStCHEM School of Chemistry, University of Edinburgh, Joseph Black Building, David Brewster Road, Edinburgh, EH9 3FJ, United Kingdom + index: 3 + + + +date: 07 August 2024 +bibliography: paper.bib +--- + +# Summary + +This will be a summary when I have grown up + +# Statement of Need + +Here is where we talk about why this course is important +# Overview, Content, and Structure +## Target Audience + +Graduate level + +## Content + +Here we discuss the content of the course +A summary of each unit can be found in Table 1 below: + +Table: Summary of course material. + +| Session | Content Summary | Materials | +|------|---------------------------------|-----------| +| 01 | An Introduction to algorithmic thinking and using Jupyter notebooks |[Unit 1 Notebook](https://github.com/Edinburgh-Chemistry-Teaching/DDC/blob/main/Unit_01/Unit_01_problem_solving_I.ipynb)| + + +Here we talk about the content more. + + +# Assessment and feedback + +Assessment and feedback: Carpentries style + + + +# Conclusion + +Some conclusions + +# Contributions to the course + +MTD, MM, and ASJSM conceived the course. 
+ +# Acknowledgements + +- OpenFF tutorials, particularly Matt Thompson, Jeff Wagner, and Josh Mitchell +- Charlie Laughton for inspiration on discussing RMSD in conjunction with equilibration +- Rohan Gorantla for help with Colab + +# References From 4e94b39af943b23bd49ed8ffe6dcf607f7e4ebb6 Mon Sep 17 00:00:00 2001 From: Matteo Degiacomi Date: Mon, 12 Aug 2024 15:01:00 +0100 Subject: [PATCH 02/11] Update paper.md --- paper/paper.md | 60 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index eca1bbf..989d86c 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -24,56 +24,80 @@ affiliations: index: 3 - -date: 07 August 2024 +date: 12 August 2024 bibliography: paper.bib --- # Summary -This will be a summary when I have grown up +[...] # Statement of Need -Here is where we talk about why this course is important +Biomolecular systems have been the first to be subjected to molecular dynamics simulations, and are thus underpinned by half a century history rich of methodological developments, embodied in a vast array of available software. +[...] + # Overview, Content, and Structure + ## Target Audience -Graduate level +This is a graduate-level course, aimed at beginners in biomolecular simulation. It is expected that students are already familiar with molecular dynamics simulation theory, and have a basic working knowledge of Python and its core scientific packages (numpy, scipy, matplotlib). ## Content -Here we discuss the content of the course -A summary of each unit can be found in Table 1 below: +The objective of this course is not to make students proficient in one or few selected software for MD simulation preparation, execution, or analysisis. Instead, it is instead aimed at providing students with a general overview of the key decision-making required to carry out MD simulations of biomolecules and extracting quantitative data from them. 
In this context, the course is subdivided in two Units featuring lectures and practical sessions. Lectures are software-agnostic, whereas practical sessions demonstrate how those concepts could be put into practice by exposing student to authentic tasks leveraging on commonly used Python packages. While each practical session can be run by students on their own computer, these are also available in Google colab. This solution, requiring on local installation, is especially suitable for for those unfamiliar with how to set-up a Python environment. + +### Unit 1: Simulation Preparation -Table: Summary of course material. +The first Unit is dedicated to providing background on protein structure, and how to prepare a protein for biomolecular simulation. In this Unit, students learn about how to critically observe a protein structure, and make informed decisions required to set-up a simulation faithfully recapitulating a biologically relevant system. -| Session | Content Summary | Materials | -|------|---------------------------------|-----------| -| 01 | An Introduction to algorithmic thinking and using Jupyter notebooks |[Unit 1 Notebook](https://github.com/Edinburgh-Chemistry-Teaching/DDC/blob/main/Unit_01/Unit_01_problem_solving_I.ipynb)| +| Session | Materials | +|------------------------------------|-----------| +| L1: Introduction to Proteins | [Lecture Slides](https://github.com/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/1_Introduction/Lecture_1_Introduction.pdf) | +| L2: Understanding Protein Systems | [Lecture Slides](https://github.com/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/2_Protein_Preparation/Lecture_2_Protein_Prep.pdf) +| P: Understanding Protein Systems, contd. 
| [Webserver](https://server.poissonboltzmann.org/pdb2pqr)| +| L3: Protein-Ligand Docking | [Lecture Slides](https://github.com/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/3_Docking/Lecture_3_Docking.pdf)| +| P: Protein-Ligand Docking | [![Docking](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/3_Docking/3_Docking.ipynb)| +| L4: Simulation Setup | [Lecture Slides](https://github.com/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/4_Simulation_Setup/Lecture_4_Simulation_setup.pdf) | +| P: Simulation Setup | [![Simulation](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/4_Simulation_Setup/4_Simulation_Setup.ipynb) | -Here we talk about the content more. +### Unit 2: Simulation Analysis + +The second Unit is dedicated to providing the students with means to extract relevant quantitative information from a molecular dynamics simulation trajectory. A key aspect of this Unit lies in the demonstration of how machine learning techniques (clustering, dimensionality reduction, classification) can be used to extract meaningful information from noisy and high-dimensional data associated with biomolecular MD simulations. 
+ +| Session | Materials | +|-----------------------------------------------------|-----------| +| L5: Simulation Basic Analyses | [Lecture Slides](https://github.com/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/5_Analysis_MDAnalysis/Lecture_5_Analysis_MDAnalysis.pdf)| +| P: Simulation Basic Analyses | [![Analysis_0](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/5_Analysis_MDAnalysis/5_Analysis_MDAnalysis.ipynb)| +| L6: Dimensionality Reduction | [Lecture Slides](https://github.com/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/6_Analysis_DR/Lecture_6_DR.pdf) | +| P: Dimensionality Reduction, part 1 | [![Analysis_1](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/6_Analysis_DR/6_Analysis_part1.ipynb)| +| P: Dimensionality Reduction, part 2 | [![Analysis_2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/6_Analysis_DR/6_Analysis_part2.ipynb)| +| L7: Clustering | [Lecture Slides](https://github.com/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/7_Analysis_clustering/Lecture_7_Clustering.pdf)| +| P: Clustering | [![Analysis_3](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/7_Analysis_clustering/7_clustering.ipynb) | +| L8: Data Classification | [Lecture Slides](https://github.com/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/8_Analysis_classification/Lecture_8_classification.pdf) | +| P: Data Classification | [![Analysis_4](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/8_Analysis_classification/1_classification.ipynb) | +| L9: Markov State Modelling | [Lecture 
Slides](https://github.com/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/9_Analysis_MSM/Lecture_9_MSM.pdf) | # Assessment and feedback -Assessment and feedback: Carpentries style +Each Jupyter notebook contains information on a specific topic, as well as tasks the student is asked to carry out independently. The tasks range from interpreting data previously produced, to running presented code with different parameters, to solving a specific problem by implementing a short Python code. Solutions to all questions are provided in the notebook as drop-down cells. +In our own teaching practice, we provide students with post-its of two different colours that can be displayed on their computer screen --- yellow indicating that everything is clear, pink indicating that help is required. At the end of each practical session, students are asked to use these same post-its to provide instructors with feedback on something they liked (yellow post-it), and something that requires improvement (pink post-it). In the three years we have delivered this course, this approach has enabled us to gather comprehensive feedback, helping us fine-tune the teaching material and our own delivery style. A key observation is that students, when presented with a new notebook, especially appreciate the instructors spending a few minutes describing the overall notebook structure and the tasks it features, before working through the beginning of it. # Conclusion -Some conclusions +[...] + # Contributions to the course -MTD, MM, and ASJSM conceived the course. +MTD, MM, and ASJSM conceived the course. # Acknowledgements -- OpenFF tutorials, particularly Matt Thompson, Jeff Wagner, and Josh Mitchell -- Charlie Laughton for inspiration on discussing RMSD in conjunction with equilibration -- Rohan Gorantla for help with Colab +Part of the material in this course was adapted from other sources with the agreement of the respective authors. 
In this context, we would like to acknowledge Matt Thompson, Jeff Wagner, and Josh Mitchell for the material featured in the OpenFF tutorials, César Ramírez-Sarmiento and collaborators for material featured in the AutoDock tutorial, and Ifan Alibay and Richard Gowers for material in the MDAnalysis tutorial. We also wish to acknowledge Charlie Laughton for inspiration on discussing RMSD in conjunction with equilibration, and Rohan Gorantla for help with setting up Colab environments. # References From 432bf05d0888b4b0f2c7d54ffb295675532277a8 Mon Sep 17 00:00:00 2001 From: Matteo Degiacomi Date: Mon, 12 Aug 2024 15:50:09 +0100 Subject: [PATCH 03/11] Update paper.md --- paper/paper.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 989d86c..9455d2a 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -30,22 +30,22 @@ bibliography: paper.bib # Summary -[...] +We present an open-source course featuring a blend of lectures and practical sessions teaching how to set-up and analyse molecular dynamics (MD) simulations of biomolecules, with a particular focus on proteins. The material presented in this course has been delivered since 2022 to cohorts of 30-40 postgraduates attending the UK-based CCP5 Summer School on Molecular simulation. # Statement of Need -Biomolecular systems have been the first to be subjected to molecular dynamics simulations, and are thus underpinned by half a century history rich of methodological developments, embodied in a vast array of available software. -[...] +Biomolecular systems have been the first to be subjected to molecular dynamics (MD) simulations, and are thus underpinned by half a century history rich of methodological developments, embodied in a vast array of available software. 
The improvement in physical models dictating interatomic interactions couple with an ever-increasing availability of computational power have enabled MD simulations to establish themselves as a technique complementary to experimental data. Starting from the simulation of small proteins for only a few nanoseconds, nowadays large biomolecular complexes featuring millions of atoms can be simulated for timescales orders of magnitude longer. The data produced by MD simulations is noisy and high-dimensional though, and its usefulness is directly dependent on how faithfully the molecular system simulated recapitulates the phisiochemical conditions of its real counterpart. +While since the mid-70s strides have been made to automate the preparation of biologically-relevant atomistic models and the analysis of simulation data, the modern computational scientist is still required to make critical decisions on how the system should be assembled and simulated, and on which quantities should be extracted from the resulting simulation so as to precisely and accurately explain, or predict, experimental data. # Overview, Content, and Structure ## Target Audience -This is a graduate-level course, aimed at beginners in biomolecular simulation. It is expected that students are already familiar with molecular dynamics simulation theory, and have a basic working knowledge of Python and its core scientific packages (numpy, scipy, matplotlib). +This is a graduate-level course, aimed at beginners in biomolecular simulation. It is expected that students are already familiar with key concepts of molecular dynamics simulation theory, and have a basic working knowledge of Python and its core scientific packages (numpy, scipy, matplotlib). ## Content -The objective of this course is not to make students proficient in one or few selected software for MD simulation preparation, execution, or analysisis. 
Instead, it is instead aimed at providing students with a general overview of the key decision-making required to carry out MD simulations of biomolecules and extracting quantitative data from them. In this context, the course is subdivided in two Units featuring lectures and practical sessions. Lectures are software-agnostic, whereas practical sessions demonstrate how those concepts could be put into practice by exposing student to authentic tasks leveraging on commonly used Python packages. While each practical session can be run by students on their own computer, these are also available in Google colab. This solution, requiring on local installation, is especially suitable for for those unfamiliar with how to set-up a Python environment. +The objective of this course is not to make students proficient in one or few selected software for MD simulation preparation, execution, or analysisis. Instead, it is aimed at providing students with a general overview of the key decision-making required to carry out MD simulations of biomolecules and extracting quantitative data from them. In this context, the course is subdivided in two Units featuring lectures and practical sessions. Lectures are software-agnostic, whereas practical sessions demonstrate how those concepts could be put into practice by exposing student to authentic tasks leveraging on commonly used Python packages. While each practical session can be run by students on their own computer, these are also available in Google colab. This solution, requiring on local installation, is especially suitable for those unfamiliar with how to set-up a Python environment. ### Unit 1: Simulation Preparation @@ -89,7 +89,9 @@ In our own teaching practice, we provide students with post-its of two different # Conclusion -[...] 
+Thanks to the increasing availability of computational power and software automating many of the processes associated with biomolecular simulation and analysis, the palette of questions addressable with MD is broadening. While this is positive, it is still crucial that the computational scientist has a clear understanding of what is being simulated, and how. Indeed, to date many decisions associated with system building and analysis cannot be delegated to a machine without human verification. In this context, we see our course as a first stepping-stone, detailing the key decisions that need to be made, providing examples of how this can be done in practice, and signposting relevant software and specialised analysis techniques for further education. + +Despite its long history, MD is still an evolving field. New techniques pushing the boundary of what is possible keep emerging, as exemplified by the current revolution constituted by the advent of modern machine learning techniques. While we expect that majority of the concepts presented in this course will be valid for many years to come, we are endeavouring to keeping the course material up-to-date by highlighting current methodological trends. For instance, in the most recent interation presented in this publication a discussion on how models produced by AlphaFold should be interpreted and used is introduced. # Contributions to the course From c14776ca6749241df385a4299e7545eb439b5930 Mon Sep 17 00:00:00 2001 From: Matteo Degiacomi Date: Mon, 12 Aug 2024 15:52:01 +0100 Subject: [PATCH 04/11] Update paper.md --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 9455d2a..3047e47 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -45,7 +45,7 @@ This is a graduate-level course, aimed at beginners in biomolecular simulation. 
## Content -The objective of this course is not to make students proficient in one or few selected software for MD simulation preparation, execution, or analysisis. Instead, it is aimed at providing students with a general overview of the key decision-making required to carry out MD simulations of biomolecules and extracting quantitative data from them. In this context, the course is subdivided in two Units featuring lectures and practical sessions. Lectures are software-agnostic, whereas practical sessions demonstrate how those concepts could be put into practice by exposing student to authentic tasks leveraging on commonly used Python packages. While each practical session can be run by students on their own computer, these are also available in Google colab. This solution, requiring on local installation, is especially suitable for those unfamiliar with how to set-up a Python environment. +The objective of this course is not to make students proficient in one or few selected software for MD simulation preparation, execution, or analysisis. Instead, it is aimed at providing students with a general overview of the key decision-making required to carry out MD simulations of biomolecules and extracting quantitative data from them. In this context, the course is subdivided in two Units featuring lectures and practical sessions. Lectures are software-agnostic, whereas practical sessions demonstrate how those concepts could be put into practice by exposing student to authentic tasks leveraging on commonly used Python packages. While each practical session can be run by students on their own computer, these are also available in Google colab. This solution, requiring no local installation, is especially suitable for those unfamiliar with how to set-up a Python environment, or having limited access to computational resources. 
### Unit 1: Simulation Preparation From 7faa5daba7153f72274b49c8b6d11680aabf59d8 Mon Sep 17 00:00:00 2001 From: Matteo Degiacomi Date: Mon, 12 Aug 2024 16:06:55 +0100 Subject: [PATCH 05/11] Update paper.md --- paper/paper.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 3047e47..0c6588c 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -30,11 +30,11 @@ bibliography: paper.bib # Summary -We present an open-source course featuring a blend of lectures and practical sessions teaching how to set-up and analyse molecular dynamics (MD) simulations of biomolecules, with a particular focus on proteins. The material presented in this course has been delivered since 2022 to cohorts of 30-40 postgraduates attending the UK-based CCP5 Summer School on Molecular simulation. +We present an open-source course featuring a blend of lectures and practical sessions teaching how to set-up and analyse molecular dynamics (MD) simulations of biomolecules, with a particular focus on proteins. The material presented in this course has been delivered since 2022 to cohorts of 30-40 international postgraduates attending the UK-based CCP5 Summer School on Molecular simulation. # Statement of Need -Biomolecular systems have been the first to be subjected to molecular dynamics (MD) simulations, and are thus underpinned by half a century history rich of methodological developments, embodied in a vast array of available software. The improvement in physical models dictating interatomic interactions couple with an ever-increasing availability of computational power have enabled MD simulations to establish themselves as a technique complementary to experimental data. Starting from the simulation of small proteins for only a few nanoseconds, nowadays large biomolecular complexes featuring millions of atoms can be simulated for timescales orders of magnitude longer. 
The data produced by MD simulations is noisy and high-dimensional though, and its usefulness is directly dependent on how faithfully the molecular system simulated recapitulates the phisiochemical conditions of its real counterpart. +Biomolecular systems have been the first to be subjected to molecular dynamics (MD) simulations, and are thus underpinned by half a century history rich of methodological developments, embodied in a vast array of available software. The improvement in physical models dictating interatomic interactions coupled with an ever-increasing availability of computational power have enabled MD simulations to establish themselves as a technique complementary to experimental data [ref_review?]. Starting from the simulation of small proteins for only a few nanoseconds [ref Levitt-Warshel], nowadays large biomolecular complexes featuring millions of atoms can be simulated for timescales orders of magnitude longer [refs]. The data produced by MD simulations is noisy and high-dimensional though, and its usefulness is directly dependent on how faithfully the molecular system simulated recapitulates the phisiochemical conditions of its real counterpart. While since the mid-70s strides have been made to automate the preparation of biologically-relevant atomistic models and the analysis of simulation data, the modern computational scientist is still required to make critical decisions on how the system should be assembled and simulated, and on which quantities should be extracted from the resulting simulation so as to precisely and accurately explain, or predict, experimental data. # Overview, Content, and Structure @@ -91,7 +91,7 @@ In our own teaching practice, we provide students with post-its of two different Thanks to the increasing availability of computational power and software automating many of the processes associated with biomolecular simulation and analysis, the palette of questions addressable with MD is broadening. 
While this is positive, it is still crucial that the computational scientist has a clear understanding of what is being simulated, and how. Indeed, to date many decisions associated with system building and analysis cannot be delegated to a machine without human verification. In this context, we see our course as a first stepping-stone, detailing the key decisions that need to be made, providing examples of how this can be done in practice, and signposting relevant software and specialised analysis techniques for further education. -Despite its long history, MD is still an evolving field. New techniques pushing the boundary of what is possible keep emerging, as exemplified by the current revolution constituted by the advent of modern machine learning techniques. While we expect that majority of the concepts presented in this course will be valid for many years to come, we are endeavouring to keeping the course material up-to-date by highlighting current methodological trends. For instance, in the most recent interation presented in this publication a discussion on how models produced by AlphaFold should be interpreted and used is introduced. +Despite its long history, MD is still an evolving field. New techniques pushing the boundary of what is possible keep emerging, as exemplified by the current revolution constituted by the advent of modern machine learning techniques. While we expect that the majority of the concepts presented in this course will be valid for many years to come, we are endeavouring to keep the course material up-to-date by highlighting current methodological trends. For instance, in the most recent iteration presented in this publication, a discussion on how models produced by AlphaFold [ref] should be interpreted and used is introduced. 
# Contributions to the course From e6e71866e7e042b30136c7140ac803019a1ed005 Mon Sep 17 00:00:00 2001 From: Matteo Degiacomi Date: Mon, 12 Aug 2024 16:15:14 +0100 Subject: [PATCH 06/11] Update paper.md --- paper/paper.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 0c6588c..ad8163a 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -30,12 +30,14 @@ bibliography: paper.bib # Summary -We present an open-source course featuring a blend of lectures and practical sessions teaching how to set-up and analyse molecular dynamics (MD) simulations of biomolecules, with a particular focus on proteins. The material presented in this course has been delivered since 2022 to cohorts of 30-40 international postgraduates attending the UK-based CCP5 Summer School on Molecular simulation. +We present an open-source course featuring a blend of lectures and practical sessions teaching how to set-up and analyse molecular dynamics (MD) simulations of biomolecules, with a particular focus on proteins. # Statement of Need -Biomolecular systems have been the first to be subjected to molecular dynamics (MD) simulations, and are thus underpinned by half a century history rich of methodological developments, embodied in a vast array of available software. The improvement in physical models dictating interatomic interactions coupled with an ever-increasing availability of computational power have enabled MD simulations to establish themselves as a technique complementary to experimental data [ref_review?]. Starting from the simulation of small proteins for only a few nanoseconds [ref Levitt-Warshel], nowadays large biomolecular complexes featuring millions of atoms can be simulated for timescales orders of magnitude longer [refs]. 
The data produced by MD simulations is noisy and high-dimensional though, and its usefulness is directly dependent on how faithfully the molecular system simulated recapitulates the phisiochemical conditions of its real counterpart. -While since the mid-70s strides have been made to automate the preparation of biologically-relevant atomistic models and the analysis of simulation data, the modern computational scientist is still required to make critical decisions on how the system should be assembled and simulated, and on which quantities should be extracted from the resulting simulation so as to precisely and accurately explain, or predict, experimental data. +Biomolecular systems have been the first to be subjected to molecular dynamics (MD) simulations, and are thus underpinned by a half-century history rich in methodological developments, embodied in a vast array of available software. The improvement in physical models dictating interatomic interactions, coupled with an ever-increasing availability of computational power, has enabled MD simulations to establish themselves as a technique complementary to experimental data [ref_review?]. Starting from the simulation of small proteins for only a few nanoseconds [ref Levitt-Warshel], nowadays large biomolecular complexes featuring millions of atoms can be simulated for timescales orders of magnitude longer [refs]. The data produced by MD simulations is noisy and high-dimensional though, and its usefulness is directly dependent on how faithfully the molecular system simulated recapitulates the physicochemical conditions of its real counterpart. 
While since the mid-70s strides have been made to automate the preparation of biologically-relevant atomistic models and the analysis of simulation data, the modern computational scientist is still required to make critical decisions on how the system should be assembled and simulated, and on which quantities should be extracted from the resulting simulation so as to precisely and accurately explain, or predict, experimental data. + +The material presented in this course has been deeloped to be be delivered in the scope of the UK-based CCP5 Summer School on Molecular simulation. Since 2022, is has been delivered to three cohorts of 30-40 international postgraduates. A first unique aspect of this course is that it provides under the same hood information on both the set-up and the analysis of MD simulations, typically presented separately. A second unique aspect is that it demonstrates how machine learning techniques can be used to extract relevant information from an MD simulation. + # Overview, Content, and Structure @@ -45,7 +47,7 @@ This is a graduate-level course, aimed at beginners in biomolecular simulation. ## Content -The objective of this course is not to make students proficient in one or few selected software for MD simulation preparation, execution, or analysisis. Instead, it is aimed at providing students with a general overview of the key decision-making required to carry out MD simulations of biomolecules and extracting quantitative data from them. In this context, the course is subdivided in two Units featuring lectures and practical sessions. Lectures are software-agnostic, whereas practical sessions demonstrate how those concepts could be put into practice by exposing student to authentic tasks leveraging on commonly used Python packages. While each practical session can be run by students on their own computer, these are also available in Google colab. 
This solution, requiring no local installation, is especially suitable for those unfamiliar with how to set-up a Python environment, or having limited access to computational resources. +The objective of this course is not to make students proficient in one or few selected software for MD simulation preparation, execution, or analysisis. Instead, it is aimed at providing students with a general overview of the key decision-making required to carry out MD simulations of biomolecules and extracting quantitative data from them. In this context, the course is subdivided in two Units featuring lectures and practical sessions. Lectures are software-agnostic, whereas practical sessions demonstrate how those concepts could be put into practice by exposing student to authentic tasks leveraging on commonly used Python packages, such as MDAnalysis [ref] and scikit-learn [ref]. While each practical session can be run by students on their own computer, these are also available in Google colab. This solution, requiring no local installation, is especially suitable for those unfamiliar with how to set-up a Python environment, or having limited access to computational resources. ### Unit 1: Simulation Preparation From f3f650b0487abccc049e43740cf8c5ab4e118fe7 Mon Sep 17 00:00:00 2001 From: Matteo Degiacomi Date: Mon, 12 Aug 2024 17:07:02 +0100 Subject: [PATCH 07/11] Update paper.md --- paper/paper.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index ad8163a..43bd4c2 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -30,7 +30,7 @@ bibliography: paper.bib # Summary -We present an open-source course featuring a blend of lectures and practical sessions teaching how to set-up and analyse molecular dynamics (MD) simulations of biomolecules, with a particular focus on proteins. 
+We present an open-source course teaching how to set-up and analyse molecular dynamics (MD) simulations of biomolecules, with a particular focus on proteins. The course features a blend of lectures coupled with practical sessions using Jupyter notebooks. # Statement of Need @@ -64,7 +64,7 @@ The first Unit is dedicated to providing background on protein structure, and ho | P: Simulation Setup | [![Simulation](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/4_Simulation_Setup/4_Simulation_Setup.ipynb) | -### Unit 2: Simulation Anaylsis +### Unit 2: Simulation Analysis The second Unit is dedicated to providing the students with means to extract relevant quantitative information from a molecular dynamics simulation trajectory. A key aspect of this Unit lies in the demonstration of how machine learning techniques (clustering, dimensionality reduction, classification) can be used to extract meaningful information from noisty and high-dimensional data associated with biomolecular MD simulations. From 30df367e4213835517b1db47345b44d85ebd6c6d Mon Sep 17 00:00:00 2001 From: Matteo Degiacomi Date: Wed, 14 Aug 2024 17:03:42 +0100 Subject: [PATCH 08/11] Update paper.md --- paper/paper.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 43bd4c2..5ce26ea 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -34,7 +34,7 @@ We present an open-source course teaching how to set-up and analyse molecular dy # Statement of Need -Biomolecular systems have been the first to be subjected to molecular dynamics (MD) simulations, and are thus underpinned by half a century history rich of methodological developments, embodied in a vast array of available software. 
The improvement in physical models dictating interatomic interactions coupled with an ever-increasing availability of computational power have enabled MD simulations to establish themselves as a technique complementary to experimental data [ref_review?]. Starting from the simulation of small proteins for only a few nanoseconds [ref Levitt-Warshel], nowadays large biomolecular complexes featuring millions of atoms can be simulated for timescales orders of magnitude longer [refs]. The data produced by MD simulations is noisy and high-dimensional though, and its usefulness is directly dependent on how faithfully the molecular system simulated recapitulates the phisiochemical conditions of its real counterpart. While since the mid-70s strides have been made to automate the preparation of biologically-relevant atomistic models and the analysis of simulation data, the modern computational scientist is still required to make critical decisions on how the system should be assembled and simulated, and on which quantities should be extracted from the resulting simulation so as to precisely and accurately explain, or predict, experimental data. +Biomolecular systems have been the first to be subjected to molecular dynamics (MD) simulations, and are thus underpinned by half a century history rich of methodological developments, embodied in a wide range of specialised software. The improvement in physical models dictating interatomic interactions coupled with an ever-increasing availability of computational power have enabled MD simulations to establish themselves as a technique complementary to experimental data [ref_review?]. Starting from the simulation of small proteins for only a few nanoseconds [ref Levitt-Warshel], nowadays large biomolecular complexes featuring millions of atoms can be simulated for timescales orders of magnitude longer [refs]. 
The data produced by MD simulations is noisy and high-dimensional though, and its usefulness is directly dependent on how faithfully the molecular system simulated recapitulates the physiochemical conditions of its real-world counterpart. Since the mid-1970s, significant progress has been made in automating the preparation of biologically relevant atomistic models and the analysis of simulation data. Nonetheless, modern computational scientists must still make critical decisions about how to assemble and simulate the system, as well as which quantities to extract from the resulting data to accurately explain or predict experimental outcomes. The material presented in this course has been deeloped to be be delivered in the scope of the UK-based CCP5 Summer School on Molecular simulation. Since 2022, is has been delivered to three cohorts of 30-40 international postgraduates. A first unique aspect of this course is that it provides under the same hood information on both the set-up and the analysis of MD simulations, typically presented separately. A second unique aspect is that it demonstrates how machine learning techniques can be used to extract relevant information from an MD simulation. @@ -51,7 +51,7 @@ The objective of this course is not to make students proficient in one or few se ### Unit 1: Simulation Preparation -The first Unit is dedicated to providing background on protein structure, and how to prepare a protein for biomolecular simulation. In this Unit, students learn about how to critically observe a protein structure, and make informed decisions required to set-up a simulation faithfully recapitulating a biologically relevant system. +The first Unit is dedicated to providing background on protein structure, and how to prepare a protein for biomolecular simulation. 
In this Unit, students learn about how to critically observe a protein structure, and make informed decisions required to set-up a simulation that faithfully recapitulates a biologically relevant system. | Session | Materials | |------------------------------------|-----------| @@ -63,7 +63,6 @@ The first Unit is dedicated to providing background on protein structure, and ho | L3: Simulation Setup | [Lecture Slides](https://github.com/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/4_Simulation_Setup/Lecture_4_Simulation_setup.pdf) | | P: Simulation Setup | [![Simulation](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/CCPBioSim/Into_to_MD_simulation_and_analysis/blob/main/4_Simulation_Setup/4_Simulation_Setup.ipynb) | - ### Unit 2: Simulation Analysis The second Unit is dedicated to providing the students with means to extract relevant quantitative information from a molecular dynamics simulation trajectory. A key aspect of this Unit lies in the demonstration of how machine learning techniques (clustering, dimensionality reduction, classification) can be used to extract meaningful information from noisty and high-dimensional data associated with biomolecular MD simulations. @@ -84,24 +83,26 @@ The second Unit is dedicated to providing the students with means to extract rel # Assessment and feedback -Each Jupyter notebook contains information on a specific topic, as well as task the student is asked to carry out independently. The tasks range from interpreting data previously produced, to running presented code with different parameters, to solving a specific problem by implementing a short Python code. Solutions to all questions are provided in the notebook as drop-down cells. +Each Jupyter notebook contains information on a specific topic, as well as tasks the student is asked to carry out independently. 
The tasks range from interpreting data previously produced, to running presented code with different parameters, to solving a specific problem by implementing a short Python code. Solutions to all questions are provided in each notebook as drop-down cells, enabling students to self-assess their understanding. In our own teaching practice, we provide students with post-its of two different colours that can be displayed on their computer screen --- yellow indicating that everything is clear, pink indicating that help is required. At the end of each practical session, studends are asked to use these same post-its to provide instructors with feedback on something they liked (yellow post-it), and something that requires improvement (pink post-it). In the three years we have delivered this course, this appoach has enabled us to gather comprehensive feedback, helping us fine-tuning the teaching material and our own delivery style. A key observation is that students, when presented with a new notebook, especially appreciate the instructors spending few minutes describing the overall notebook structure and the tasks it features, before working through the beginning of it. # Conclusion -Thanks to the increasing availability of computational power and software automating many of the processes associated with biomolecular simulation and analysis, the palette of questions addressable with MD is broadening. While this is positive, it is still crucial that the computational scientist has a clear understanding of what is being simulated, and how. Indeed, to date many decisions associated with system building and analysis cannot be delegated to a machine without human verification. In this context, we see our course as a first stepping-stone, detailing the key decisions that need to be made, providing examples of how this can be done in practice, and signposting relevant software and specialised analysis techniques for further education. 
+Thanks to the increasing availability of computational power and software automating many of the processes associated with biomolecular simulation and analysis, the palette of questions addressable with MD is broadening. While this is positive, it remains crucial for computational scientists to have a clear understanding of what is being simulated and how. Indeed, to date many decisions associated with system building and analysis cannot be delegated to a machine without human verification. In this context, we see our course as a first stepping-stone, detailing the key decisions that need to be made, providing examples of how this can be done in practice, and directing learners to relevant software and specialized analysis techniques for further education. -Despite its long history, MD is still an evolving field. New techniques pushing the boundary of what is possible keep emerging, as exemplified by the current revolution constituted by the advent of modern machine learning techniques. While we expect that majority of the concepts presented in this course will be valid for many years to come, we are endeavouring to keeping the course material up-to-date by highlighting current methodological trends. For instance, in the most recent interation presented in this publication a discussion on how models produced by AlphaFold [ref] should be interpreted and used is introduced. +Despite its long history, MD remains an evolving field. New techniques that push the boundaries of what is possible keep emerging, as exemplified by the current revolution associated with the integration of modern machine learning techniques in molecular modelling pipelines. While we expect that majority of the concepts presented in this course will be valid for many years to come, we are endeavouring to keeping the course material up-to-date by highlighting current methodological trends. 
For instance in the latest iteration of this course we have introduced a discussion on how how to interpret and use models produced by AlphaFold [ref]. # Contributions to the course MTD, MM, and ASJSM conceived the course. + # Acknowledgements -Part of the material in this course was adapted from other sources with the agreement of respective authors. In this context, we would like to acknowledge Matt Thompson, Jeff Wagner, and Josh Mitchell for the material featured in the OpenFF tutorials, César Ramírez-Sarmiento and collaborators for material featured in the Autodock tutorial, and Ifan Alibay and Richard Gowers for material in the MDAnalysis tutorial. We also wish to acknowledge Charlie Laughton for inspiration on discussing RMSD in conjunction with equilibration, and Rohan Gorantla for help with with setting up Colab environments. +Parts of the material in this course have been adapted from other sources with the agreement of respective authors. In this context, we would like to acknowledge Matt Thompson, Jeff Wagner, and Josh Mitchell for the material featured in the OpenFF tutorials, César Ramírez-Sarmiento and collaborators for material featured in the Autodock tutorial, and Irfan Alibay and Richard Gowers for material in the MDAnalysis tutorial. We also wish to acknowledge Charlie Laughton for inspiration on discussing RMSD in conjunction with equilibration, and Rohan Gorantla for help with setting up Colab environments.
+ # References From 08e60aba8424ec69106fb36b119a46d5fbfd498a Mon Sep 17 00:00:00 2001 From: Matteo Degiacomi Date: Thu, 22 Aug 2024 15:17:26 +0100 Subject: [PATCH 09/11] Update paper.md --- paper/paper.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 5ce26ea..361e8df 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -9,7 +9,7 @@ authors: orcid: 0000-0003-4672-471X affiliation: "1" - name: Micaela Matta - orcid: xxx + orcid: 0000-0002-9852-3154 affiliation: "2" - name: Antonia S. J. S. Mey orcid: 0000-0001-7512-5252 @@ -18,13 +18,13 @@ authors: affiliations: - name: Department of Physics, Durham University, South Road, Durham, DH1 3LE, United Kingdom index: 1 - - name: xxxx + - name: Department of Chemistry, King's College London, London, SE1 1DB, United Kingdom index: 2 - name: EaStCHEM School of Chemistry, University of Edinburgh, Joseph Black Building, David Brewster Road, Edinburgh, EH9 3FJ, United Kingdom index: 3 -date: 12 August 2024 +date: 22 August 2024 bibliography: paper.bib --- @@ -36,7 +36,7 @@ We present an open-source course teaching how to set-up and analyse molecular dy Biomolecular systems have been the first to be subjected to molecular dynamics (MD) simulations, and are thus underpinned by half a century history rich of methodological developments, embodied in a wide range of specialised software. The improvement in physical models dictating interatomic interactions coupled with an ever-increasing availability of computational power have enabled MD simulations to establish themselves as a technique complementary to experimental data [ref_review?]. Starting from the simulation of small proteins for only a few nanoseconds [ref Levitt-Warshel], nowadays large biomolecular complexes featuring millions of atoms can be simulated for timescales orders of magnitude longer [refs]. 
The data produced by MD simulations is noisy and high-dimensional though, and its usefulness is directly dependent on how faithfully the molecular system simulated recapitulates the physiochemical conditions of its real-world counterpart. Since the mid-1970s, significant progress has been made in automating the preparation of biologically relevant atomistic models and the analysis of simulation data. Nonetheless, modern computational scientists must still make critical decisions about how to assemble and simulate the system, as well as which quantities to extract from the resulting data to accurately explain or predict experimental outcomes. -The material presented in this course has been deeloped to be be delivered in the scope of the UK-based CCP5 Summer School on Molecular simulation. Since 2022, is has been delivered to three cohorts of 30-40 international postgraduates. A first unique aspect of this course is that it provides under the same hood information on both the set-up and the analysis of MD simulations, typically presented separately. A second unique aspect is that it demonstrates how machine learning techniques can be used to extract relevant information from an MD simulation. +The material presented in this course has been developed in the scope of the UK-based CCP5 Summer School on Molecular simulation. Since 2022, is has been delivered to three cohorts of 30-40 international postgraduates. A first key aspect of this course is that, under the same hood, it provides information on both the set-up and the analysis of MD simulations, typically presented separately. A second key aspect is that it demonstrates how machine learning techniques can be used to extract relevant information from an MD simulation. 
# Overview, Content, and Structure @@ -85,7 +85,7 @@ The second Unit is dedicated to providing the students with means to extract rel Each Jupyter notebook contains information on a specific topic, as well as tasks the student is asked to carry out independently. The tasks range from interpreting data previously produced, to running presented code with different parameters, to solving a specific problem by implementing a short Python code. Solutions to all questions are provided in each notebook as drop-down cells, enabling students to self-assess their understanding. -In our own teaching practice, we provide students with post-its of two different colours that can be displayed on their computer screen --- yellow indicating that everything is clear, pink indicating that help is required. At the end of each practical session, studends are asked to use these same post-its to provide instructors with feedback on something they liked (yellow post-it), and something that requires improvement (pink post-it). In the three years we have delivered this course, this appoach has enabled us to gather comprehensive feedback, helping us fine-tuning the teaching material and our own delivery style. A key observation is that students, when presented with a new notebook, especially appreciate the instructors spending few minutes describing the overall notebook structure and the tasks it features, before working through the beginning of it. +In our teaching practice, we provide students with post-its of two different colours that can be displayed on their computer screen --- yellow indicating that everything is clear, pink indicating that help is required. At the end of each practical session, students are asked to use these same post-its to provide instructors with feedback on something they liked (yellow post-it), and something that requires improvement (pink post-it).
In the three years we have delivered this course, this approach has enabled us to gather comprehensive feedback, helping us fine-tune the teaching material and our own delivery style. A key observation is that students, when presented with a new notebook, especially appreciate the instructors spending a few minutes describing the overall notebook structure and the tasks it features, before working through the beginning of it. # Conclusion From d57e569f8281747b1e8a5d55bb62b73a4923da89 Mon Sep 17 00:00:00 2001 From: ppxasjsm Date: Thu, 22 Aug 2024 15:34:02 +0100 Subject: [PATCH 10/11] small fixes to paper --- paper/paper.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 361e8df..987961f 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -30,11 +30,11 @@ bibliography: paper.bib # Summary -We present an open-source course teaching how to set-up and analyse molecular dynamics (MD) simulations of biomolecules, with a particular focus on proteins. The course features a blend of lectures coupled with practical sessions using Jupyter notebooks. +We present an open-source course teaching how to set-up and analyse molecular dynamics (MD) simulations of biomolecules using proteins as a use-case. The course consists of a blend of lectures and practical sessions using Jupyter notebooks. # Statement of Need -Biomolecular systems have been the first to be subjected to molecular dynamics (MD) simulations, and are thus underpinned by half a century history rich of methodological developments, embodied in a wide range of specialised software. The improvement in physical models dictating interatomic interactions coupled with an ever-increasing availability of computational power have enabled MD simulations to establish themselves as a technique complementary to experimental data [ref_review?].
Starting from the simulation of small proteins for only a few nanoseconds [ref Levitt-Warshel], nowadays large biomolecular complexes featuring millions of atoms can be simulated for timescales orders of magnitude longer [refs]. The data produced by MD simulations is noisy and high-dimensional though, and its usefulness is directly dependent on how faithfully the molecular system simulated recapitulates the physiochemical conditions of its real-world counterpart. Since the mid-1970s, significant progress has been made in automating the preparation of biologically relevant atomistic models and the analysis of simulation data. Nonetheless, modern computational scientists must still make critical decisions about how to assemble and simulate the system, as well as which quantities to extract from the resulting data to accurately explain or predict experimental outcomes. +Biomolecular systems were one of the first systems used in molecular dynamics (MD) simulations [@Levitt2017]. As such biomolecular simulations build on a rich half a century history rich of methodological developments, embodied in a wide range of specialised software. The improvement in physical models dictating interatomic interactions coupled with an ever-increasing availability of computational power have enabled MD simulations to establish themselves as a technique complementary to experimental data [ref_review?]. Starting from the simulation of small proteins for only a few nanoseconds [ref Levitt-Warshel], nowadays large biomolecular complexes featuring millions of atoms can be simulated for timescales orders of magnitude longer [refs]. The data produced by MD simulations is noisy and high-dimensional though, and its usefulness is directly dependent on how faithfully the molecular system simulated recapitulates the physiochemical conditions of its real-world counterpart. 
Since the mid-1970s, significant progress has been made in automating the preparation of biologically relevant atomistic models and the analysis of simulation data. Nonetheless, modern computational scientists must still make critical decisions about how to assemble and simulate the system, as well as which quantities to extract from the resulting data to accurately explain or predict experimental outcomes. The material presented in this course has been developed in the scope of the UK-based CCP5 Summer School on Molecular simulation. Since 2022, is has been delivered to three cohorts of 30-40 international postgraduates. A first key aspect of this course is that, under the same hood, it provides information on both the set-up and the analysis of MD simulations, typically presented separately. A second key aspect is that it demonstrates how machine learning techniques can be used to extract relevant information from an MD simulation. From 37e6d6a471429147e073f588aa8b38b6e350eafd Mon Sep 17 00:00:00 2001 From: ppxasjsm Date: Thu, 22 Aug 2024 16:49:23 +0100 Subject: [PATCH 11/11] adding references --- paper/paper.bib | 147 +++++++++++++++++++++++++++++++++++++++++++----- paper/paper.md | 6 +- 2 files changed, 137 insertions(+), 16 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index 79dd542..a818357 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -1,4 +1,50 @@ -@article{engelberger2021developing, +@misc{thompson2024openff, + title = {{{OpenFF Interchange}}}, + author = {Thompson, Matthew and Wagner, Jeff and Gilmer, Justin B. and Timalsina, Umesh and Quach, Co D. and Boothroyd, Simon and Mitchell, Joshua A.}, + year = {2024}, + month = aug, + doi = {10.5281/zenodo.13155316}, + urldate = {2024-08-07}, + abstract = {0.3.29 GROMACS export performance improements and bugfixes For the complete release notes, please see the release history. For help installing Interchange, the installation instructions. 
Please report bugs, request features, or ask questions through our issue tracker. Please note that there may still be some changes to the API prior to a stable 1.0.0 release.}, + howpublished = {Zenodo}, + file = {/Users/toni_brain/Zotero/storage/GAHP5Y64/13155316.html} +} + +@article{levitt1975computer, + title = {Computer Simulation of Protein Folding}, + author = {Levitt, Michael and Warshel, Arieh}, + year = {1975}, + month = feb, + journal = {Nature}, + volume = {253}, + number = {5494}, + pages = {694--698}, + publisher = {Nature Publishing Group}, + issn = {1476-4687}, + doi = {10.1038/253694a0}, + urldate = {2024-08-22}, + abstract = {A new and very simple representation of protein conformations has been used together with energy minimisation and thermalisation to simulate protein folding. Under certain conditions, the method succeeds in `renaturing' bovine pancreatic trypsin inhibitor from an open-chain conformation into a folded conformation close to that of the native molecule.}, + copyright = {1975 Springer Nature Limited}, + langid = {english}, + keywords = {Humanities and Social Sciences,multidisciplinary,Science} +} + +@article{lindorff-larsen2011howa, + title = {How {{Fast-Folding Proteins Fold}}}, + author = {{Lindorff-Larsen}, Kresten and Piana, Stefano and Dror, Ron O. and Shaw, David E.}, + year = {2011}, + month = oct, + journal = {Science}, + volume = {334}, + number = {6055}, + pages = {517--520}, + publisher = {American Association for the Advancement of Science}, + doi = {10.1126/science.1208351}, + urldate = {2024-08-22}, + abstract = {An outstanding challenge in the field of molecular biology has been to understand the process by which proteins fold into their characteristic three-dimensional structures. Here, we report the results of atomic-level molecular dynamics simulations, over periods ranging between 100 {$\mu$}s and 1 ms, that reveal a set of common principles underlying the folding of 12 structurally diverse proteins. 
In simulations conducted with a single physics-based energy function, the proteins, representing all three major structural classes, spontaneously and repeatedly fold to their experimentally determined native structures. Early in the folding process, the protein backbone adopts a nativelike topology while certain secondary structure elements and a small number of nonlocal contacts form. In most cases, folding follows a single dominant route in which elements of the native structure appear in an order highly correlated with their propensity to form in the unfolded state.} +} + +@article{engelberger2021developingb, title = {Developing and {{Implementing Cloud-Based Tutorials That Combine Bioinformatics Software}}, {{Interactive Coding}}, and {{Visualization Exercises}} for {{Distance Learning}} on {{Structural Bioinformatics}}}, author = {Engelberger, Felipe and {Galaz-Davison}, Pablo and Bravo, Graciela and Rivera, Maira and {Ram{\'i}rez-Sarmiento}, C{\'e}sar A.}, year = {2021}, @@ -10,19 +56,94 @@ @article{engelberger2021developing publisher = {American Chemical Society}, issn = {0021-9584}, doi = {10.1021/acs.jchemed.1c00022}, - urldate = {2024-08-07}, + urldate = {2024-08-22}, abstract = {The COVID-19 pandemic has swiftly forced a change in learning strategies across educational institutions, from extensively relying on in-person activities toward online teaching. It is particularly difficult to adapt courses that depend on physical equipment to be now carried out remotely. This is the case for bioinformatics, which typically requires dedicated computer classrooms, as the logistics of granting remote access to a workstation or relying on the computational resources of each student is not trivial. A possible workaround is using cloud server-based computing resources, such as Google Colaboratory, a free web browser application that allows the writing and execution of Python programming through Jupyter notebooks, integrating text, images, and code cells. 
Following a cloud-based approach, we migrated the practical activities of a course on molecular modeling and simulation into the Google Colaboratory environment resulting in 12 tutorials that introduce students to topics such as phylogenetic analysis, molecular modeling, molecular docking, several flavors of molecular dynamics, and coevolutionary analysis. Each of these notebooks includes a brief introduction to the topic, software installation, execution of the required tools, and analysis of results, with each step properly described. Using a Likert scale questionnaire, a pool of students positively evaluated these tutorials in terms of the time required for their completion, their ability to understand the content and exercises developed in each session, and the practical significance and impact that these computational tools have on scientific research. All tutorials are freely available at https://github.com/pb3lab/ibm3202.}, - file = {/Users/toni_brain/Zotero/storage/XIFB34JG/Engelberger et al. - 2021 - Developing and Implementing Cloud-Based Tutorials .pdf} + file = {/Users/toni_brain/Zotero/storage/Q7DNX2C6/Engelberger et al. - 2021 - Developing and Implementing Cloud-Based Tutorials .pdf} } -@misc{thompson2024openff, - title = {{{OpenFF Interchange}}}, - author = {Thompson, Matthew and Wagner, Jeff and Gilmer, Justin B. and Timalsina, Umesh and Quach, Co D. and Boothroyd, Simon and Mitchell, Joshua A.}, - year = {2024}, - month = aug, - doi = {10.5281/zenodo.13155316}, - urldate = {2024-08-07}, - abstract = {0.3.29 GROMACS export performance improements and bugfixes For the complete release notes, please see the release history. For help installing Interchange, the installation instructions. Please report bugs, request features, or ask questions through our issue tracker. 
Please note that there may still be some changes to the API prior to a stable 1.0.0 release.}, - howpublished = {Zenodo}, - file = {/Users/toni_brain/Zotero/storage/GAHP5Y64/13155316.html} +@article{ciccotti2022molecular, + title = {Molecular Simulations: Past, Present, and Future (a {{Topical Issue}} in {{EPJB}})}, + shorttitle = {Molecular Simulations}, + author = {Ciccotti, G. and Dellago, C. and Ferrario, M. and Hern{\'a}ndez, E. R. and Tuckerman, M. E.}, + year = {2022}, + month = jan, + journal = {Eur. Phys. J. B}, + volume = {95}, + number = {1}, + pages = {3}, + issn = {1434-6036}, + doi = {10.1140/epjb/s10051-021-00249-x}, + urldate = {2024-08-22}, + langid = {english}, + file = {/Users/toni_brain/Zotero/storage/D7H43BJJ/Ciccotti et al. - 2022 - Molecular simulations past, present, and future (.pdf} +} + +@article{hollingsworth2018molecular, + title = {Molecular {{Dynamics Simulation}} for {{All}}}, + author = {Hollingsworth, Scott A. and Dror, Ron O.}, + year = {2018}, + month = sep, + journal = {Neuron}, + volume = {99}, + number = {6}, + pages = {1129--1143}, + publisher = {Elsevier}, + issn = {0896-6273}, + doi = {10.1016/j.neuron.2018.08.011}, + urldate = {2024-08-22}, + langid = {english}, + pmid = {30236283}, + keywords = {allostery,biomolecular simulation,conformational change,drug design,drug discovery,experimental design,MD simulations,protein,structural biology}, + file = {/Users/toni_brain/Zotero/storage/FP552XVR/Hollingsworth and Dror - 2018 - Molecular Dynamics Simulation for All.pdf} +} + +@article{alibay2023building, + title = {Building a Community-Driven Ecosystem for Fast, Reproducible, and Reusable Molecular Simulation Analysis Using {{MDAnalysis}}}, + author = {Alibay, Irfan and Barnoud, Jonathan and Beckstein, Oliver and Gowers, Richard J. and Loche, Philip R. and {MacDermott-Opeskin}, Hugo and Matta, Micaela and Naughton, Fiona B.
and Reddy, Tyler and Wang, Lily}, + year = {2023}, + month = feb, + journal = {Biophysical Journal}, + volume = {122}, + number = {3}, + pages = {420a}, + publisher = {Elsevier}, + issn = {0006-3495}, + doi = {10.1016/j.bpj.2022.11.2277}, + urldate = {2024-08-22}, + langid = {english}, + file = {/Users/toni_brain/Zotero/storage/BHBHKCP8/Alibay et al. - 2023 - Building a community-driven ecosystem for fast, re.pdf} +} + +@article{michaud-agrawal2011mdanalysis, + title = {{{MDAnalysis}}: {{A}} Toolkit for the Analysis of Molecular Dynamics Simulations}, + shorttitle = {{{MDAnalysis}}}, + author = {{Michaud-Agrawal}, Naveen and Denning, Elizabeth J. and Woolf, Thomas B. and Beckstein, Oliver}, + year = {2011}, + journal = {Journal of Computational Chemistry}, + volume = {32}, + number = {10}, + pages = {2319--2327}, + issn = {1096-987X}, + doi = {10.1002/jcc.21787}, + urldate = {2024-08-22}, + abstract = {MDAnalysis is an object-oriented library for structural and temporal analysis of molecular dynamics (MD) simulation trajectories and individual protein structures. It is written in the Python language with some performance-critical code in C. It uses the powerful NumPy package to expose trajectory data as fast and efficient NumPy arrays. It has been tested on systems of millions of particles. Many common file formats of simulation packages including CHARMM, Gromacs, Amber, and NAMD and the Protein Data Bank format can be read and written. Atoms can be selected with a syntax similar to CHARMM's powerful selection commands. MDAnalysis enables both novice and experienced programmers to rapidly write their own analytical tools and access data stored in trajectories in an easily accessible manner that facilitates interactive explorative analysis. MDAnalysis has been tested on and works for most Unix-based platforms such as Linux and Mac OS X. It is freely available under the GNU General Public License from http://mdanalysis.googlecode.com. 
{\copyright} 2011 Wiley Periodicals, Inc. J Comput Chem 2011}, + copyright = {Copyright {\copyright} 2011 Wiley Periodicals, Inc.}, + langid = {english}, + keywords = {analysis,membrane systems,molecular dynamics simulations,object-oriented design,proteins,Python programming language,software}, + file = {/Users/toni_brain/Zotero/storage/8FBCZC37/Michaud-Agrawal et al. - 2011 - MDAnalysis A toolkit for the analysis of molecula.pdf;/Users/toni_brain/Zotero/storage/XH6AXMAY/jcc.html} +} + +@article{pedregosa2011scikitlearn, + title = {Scikit-Learn: {{Machine Learning}} in {{Python}}}, + shorttitle = {Scikit-Learn}, + author = {Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and Vanderplas, Jake and Passos, Alexandre and Cournapeau, David and Brucher, Matthieu and Perrot, Matthieu and Duchesnay, {\'E}douard}, + year = {2011}, + journal = {Journal of Machine Learning Research}, + volume = {12}, + number = {85}, + pages = {2825--2830}, + issn = {1533-7928}, + urldate = {2024-08-22}, + abstract = {Scikit-learn is a Python module integrating a wide range of state-of-the-art machine learning algorithms for medium-scale supervised and unsupervised problems. This package focuses on bringing machine learning to non-specialists using a general-purpose high-level language. Emphasis is put on ease of use, performance, documentation, and API consistency. It has minimal dependencies and is distributed under the simplified BSD license, encouraging its use in both academic and commercial settings. Source code, binaries, and documentation can be downloaded from http://scikit-learn.sourceforge.net.}, + file = {/Users/toni_brain/Zotero/storage/CZAWDZJI/Pedregosa et al. 
- 2011 - Scikit-learn Machine Learning in Python.pdf;/Users/toni_brain/Zotero/storage/EEVRH94Z/scikit-learn.html} } diff --git a/paper/paper.md b/paper/paper.md index 987961f..ad37fe2 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -34,9 +34,9 @@ We present an open-source course teaching how to set-up and analyse molecular dy # Statement of Need -Biomolecular systems were one of the first systems used in molecular dynamics (MD) simulations [@Levitt2017]. As such biomolecular simulations build on a rich half a century history rich of methodological developments, embodied in a wide range of specialised software. The improvement in physical models dictating interatomic interactions coupled with an ever-increasing availability of computational power have enabled MD simulations to establish themselves as a technique complementary to experimental data [ref_review?]. Starting from the simulation of small proteins for only a few nanoseconds [ref Levitt-Warshel], nowadays large biomolecular complexes featuring millions of atoms can be simulated for timescales orders of magnitude longer [refs]. The data produced by MD simulations is noisy and high-dimensional though, and its usefulness is directly dependent on how faithfully the molecular system simulated recapitulates the physiochemical conditions of its real-world counterpart. Since the mid-1970s, significant progress has been made in automating the preparation of biologically relevant atomistic models and the analysis of simulation data. Nonetheless, modern computational scientists must still make critical decisions about how to assemble and simulate the system, as well as which quantities to extract from the resulting data to accurately explain or predict experimental outcomes. +Biomolecular systems were one of the first systems used in molecular dynamics (MD) simulations [@levitt1975computer]. 
As such, biomolecular simulations build on a rich half-century history of methodological developments, embodied in a wide range of specialised software. The improvement in physical models dictating interatomic interactions, coupled with an ever-increasing availability of computational power, has enabled MD simulations to establish themselves as a technique complementary to experimental data [@hollingsworth2018molecular; @ciccotti2022molecular]. Starting from the simulation of small proteins for only a few nanoseconds [@levitt1975computer], nowadays large biomolecular complexes featuring millions of atoms can be simulated for timescales orders of magnitude longer [@lindorff-larsen2011howa]. The data produced by MD simulations is noisy and high-dimensional though, and its usefulness is directly dependent on how faithfully the molecular system simulated recapitulates the physicochemical conditions of its real-world counterpart. Since the mid-1970s, significant progress has been made in automating the preparation of biologically relevant atomistic models and the analysis of simulation data. Nonetheless, modern computational scientists must still make critical decisions on how to assemble and simulate the system, as well as which quantities to extract from the resulting data to accurately explain or predict experimental outcomes. -The material presented in this course has been developed in the scope of the UK-based CCP5 Summer School on Molecular simulation. Since 2022, is has been delivered to three cohorts of 30-40 international postgraduates. A first key aspect of this course is that, under the same hood, it provides information on both the set-up and the analysis of MD simulations, typically presented separately. A second key aspect is that it demonstrates how machine learning techniques can be used to extract relevant information from an MD simulation.
+The material presented in this course has been developed in the scope of the UK-based CCP5 Summer School on Molecular simulation. Since 2022, it has been delivered to three cohorts of 30-40 international postgraduates. A first key aspect of this course is that, under one roof, it provides information on both the set-up and the analysis of MD simulations, typically presented separately. A second key aspect is that it demonstrates how machine learning techniques can be integrated in the analysis of MD simulations and used to extract relevant information from them. # Overview, Content, and Structure @@ -47,7 +47,7 @@ This is a graduate-level course, aimed at beginners in biomolecular simulation. ## Content -The objective of this course is not to make students proficient in one or few selected software for MD simulation preparation, execution, or analysisis. Instead, it is aimed at providing students with a general overview of the key decision-making required to carry out MD simulations of biomolecules and extracting quantitative data from them. In this context, the course is subdivided in two Units featuring lectures and practical sessions. Lectures are software-agnostic, whereas practical sessions demonstrate how those concepts could be put into practice by exposing student to authentic tasks leveraging on commonly used Python packages, such as MDAnalysis [ref] and scikit-learn [ref]. While each practical session can be run by students on their own computer, these are also available in Google colab. This solution, requiring no local installation, is especially suitable for those unfamiliar with how to set-up a Python environment, or having limited access to computational resources. +The objective of this course is not to make students proficient in one or a few selected software packages for MD simulation preparation, execution, or analysis.
Instead, it is aimed at providing students with a general overview of the key decision-making required to carry out MD simulations of biomolecules and extract quantitative data from them. In this context, the course is subdivided into two Units featuring lectures and practical sessions. Lectures are software-agnostic, whereas practical sessions demonstrate how those concepts could be put into practice by exposing students to authentic tasks leveraging commonly used Python packages, such as MDAnalysis [@michaud-agrawal2011mdanalysis; @alibay2023building] and scikit-learn [@pedregosa2011scikitlearn]. While each practical session can be run by students on their own computer, these are also available in Google Colab. This solution, requiring no local installation, is especially suitable for those unfamiliar with how to set up a Python environment, or having limited access to computational resources. ### Unit 1: Simulation Preparation