diff --git a/.binder/environment.yml b/.binder/environment.yml new file mode 100644 index 0000000..4327043 --- /dev/null +++ b/.binder/environment.yml @@ -0,0 +1,28 @@ +name: metadatainrcr +channels: + - conda-forge + - bioconda + - defaults + - r +dependencies: + - r-base + - cwltool + - r-readr + - r-readxl + - r-stringr + - r-lubridate + - r-rvest + - r-ggplot2 + - r-wordcloud + - r-tidytext + - r-rmarkdown + - r-ggpubr + - r-ggthemes + - r-here + - r-bibtex + - conda-build + - autopep8 + - entrez-direct + - jupyter + - pandas + - scikit-learn diff --git a/.binder/start b/.binder/start new file mode 100755 index 0000000..b6acd2a --- /dev/null +++ b/.binder/start @@ -0,0 +1,6 @@ +#!/bin/bash + +# source: https://discourse.jupyter.org/t/glibcxx-3-4-26-not-found-from-rstudio/7778/8 +set -e +export LD_LIBRARY_PATH=${NB_PYTHON_PREFIX}/lib:${LD_LIBRARY_PATH} +exec "$@" \ No newline at end of file diff --git a/.gitignore b/.gitignore index 0911757..a001607 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,25 @@ __pycache__ data/lens/citespace/ .ipynb_checkpoints src/condabuilds/ -src/timeline.html \ No newline at end of file +src/timeline.html +.ipython/ + +.local/ + +.cache/ + +.conda/ + +.jupyter/ + +.rstudio/ + +.subversion/ + +.bash_history + +.bashrc + +.jupyter-server-log.txt + +src/timeline/timeline.html diff --git a/.here b/.here new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index de173e5..21c1b20 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,31 @@ # The Role of Metadata in Reproducible Computational Research + This is a supplemental resource to Leipzig et al. 
"The Role of Metadata in Reproducible Computational Research" https://arxiv.org/pdf/2006.08589.pdf ## Organization + ``` ├───data/ │ ├───examples/ Examples of metadata standards │ ├───lens/ Search exports for scimetric journal analysis │ └───standards.tsv Raw standards table ├───src/ +│ ├───cwl/tools/ CWL configuration to produce the timeline plot +│ ├───manuscript/ Manuscript revision document │ ├───secrets/ │ │ └───api.template.py Replace this with api.py using your NCBI/NCBO keys -│ ├───ncbo_ontologies.py Scimetric ontology popularity analysis -│ ├───scimetric.ipynb Scimetric journal meta/rcr frequency analysis -│ ├───timeline.R Produces the RCR case study timeline in the paper +│ ├───ontologies/ Scimetric ontology popularity analysis +│ ├───repotutils/ Scripts for automating management of this repository +│ ├───scimetric/ Scimetric journal meta/rcr frequency analysis in a Jupyter Notebook +│ ├───timeline/ R Markdown document to produce the RCR case study timeline in the paper, incl. helper files for execution with CWL (wrapper script, Dockerfile) │ ├───wget2jsonld.py Helper script to convert wget output to jsonld -│ └───wordcloud.R Produces word cloud from cited abstracts +│ └───wordcloud/ R script to produce word cloud from cited abstracts ├───LICENSE The LICENSE file ├───README.md What you are looking at ├───environment.osx.yaml OSX pinned Conda depenencies ├───environment.unpinned.yaml Unpinned Conda depenencies └───ro-crate-metadata.jsonld RO Crate config +└───.binder Environment configuration files for usage with Binder (mybinder.org) ``` @@ -97,23 +103,34 @@ https://stackoverflow.com/questions/1740341/what-is-the-difference-between-rdf-a ## How to generate the timeline for this article Install [cwltool](https://github.com/common-workflow-language/cwltool) + ``` pip install cwltool cwltool src/cwl/tools/timeline.cwl --reportfile timeline.html ``` -## Contribute - -Contributions welcome! 
- -## License +Note that the tool requires Docker for running the computing environment; see the file `timeline/Dockerfile` for the definition of the image used in the `.cwl` file. -[![CC0](http://mirrors.creativecommons.org/presskit/buttons/88x31/svg/cc-zero.svg)](https://creativecommons.org/publicdomain/zero/1.0/) +## Run on Binder +[MyBinder](https://mybinder.org/) is a tool for creating executable computing environments based on standard and widely used dependency management files. +You can easily run important parts of the analysis for the manuscript by clicking on the badges below. +Binder will create a container using the environment configuration from the directory `.binder/` and provide you with an interactive environment to execute notebooks or scripts. +- Scimetric journal frequency analysis of RCR and metadata terms (opens a Jupyter Notebook) [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/leipzig/metadata-in-rcr/HEAD?filepath=src%2Fscimetric%2Fscimetric.ipynb) +- Create Figure 2 from the paper (R Markdown notebook, open the file `src/timeline/timeline.Rmd` manually in RStudio) [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/leipzig/metadata-in-rcr/HEAD?urlpath=rstudio) +- Create word cloud from cited abstracts (run R script `src/wordcloud/wordcloud.R`) [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/leipzig/metadata-in-rcr/HEAD?urlpath=rstudio) +For development purposes, you can also run `repo2docker` locally in the directory of the repository. +```bash +repo2docker --editable . +``` +## Contribute +Contributions welcome! 
+## License +[![CC0](http://mirrors.creativecommons.org/presskit/buttons/88x31/svg/cc-zero.svg)](https://creativecommons.org/publicdomain/zero/1.0/) diff --git a/data/examples/rmarkdown.rmd b/data/examples/rmarkdown.rmd index ead41e5..00ff369 100644 --- a/data/examples/rmarkdown.rmd +++ b/data/examples/rmarkdown.rmd @@ -1,5 +1,25 @@ --- title: "A title for the analysis" +# author metadata, esp. used for scientific articles +author: + - name: Jeremy Leipzig + footnote: Corresponding author + affiliation: "Metadata Research Center, Drexel University, College of Computing and Informatics, Philadelphia PA, USA" + orcid: "0000-0001-7224-9620" + - name: Daniel Nüst + affiliation: "Institute for Geoinformatics, University of Münster, Germany" + orcid: "0000-0002-0024-5046" + email: daniel.nuest@uni-muenster.de + +# parameters to manipulate workflow; defaults can be changed when compiling the document +params: + year: 2020 + region: "Europe" + printcode: TRUE + data: file.csv + max_n: 42 + +# configuration and styling of different output document formats output: html_document: theme: lumen @@ -7,8 +27,19 @@ output: toc_float: collapsed: false code_folding: show + self_contained: true + pdf_document: + toc: yes + fig_caption: yes + df_print: kable +linkcolor: blue + +# field values can be generated from code +date: "`r format(Sys.time(), '%d %B, %Y')`" --- + + ```{r include=FALSE} knitr::opts_chunk$set(echo=TRUE, message=FALSE, warning=FALSE, fig.width=8, tidy=TRUE) ``` diff --git a/src/cwl/tools/timeline.cwl b/src/cwl/tools/timeline.cwl index 4d2ba30..4faaf4b 100644 --- a/src/cwl/tools/timeline.cwl +++ b/src/cwl/tools/timeline.cwl @@ -61,7 +61,7 @@ doc: $schemas: - - https://schema.org/version/3.9/schema.rdf + - https://schema.org/version/latest/schemaorg-current-https.rdf $namespaces: iana: https://www.iana.org/assignments/media-types/ diff --git a/src/scimetric/scimetric.ipynb b/src/scimetric/scimetric.ipynb index 776c0bf..27f9307 100644 --- 
a/src/scimetric/scimetric.ipynb +++ b/src/scimetric/scimetric.ipynb @@ -1844,7 +1844,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -1855,7 +1855,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -1864,7 +1864,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -3326,7 +3326,7 @@ "Procedia - Social and Behavioral Sciences 0.995346 " ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -3344,7 +3344,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -3471,7 +3471,7 @@ " Erik Mannens 94516.0 0.999760" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -3491,7 +3491,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -3531,12 +3531,12 @@ "