diff --git a/.binder/environment.yml b/.binder/environment.yml new file mode 100644 index 0000000..4327043 --- /dev/null +++ b/.binder/environment.yml @@ -0,0 +1,28 @@ +name: metadatainrcr +channels: + - conda-forge + - bioconda + - defaults + - r +dependencies: + - r-base + - cwltool + - r-readr + - r-readxl + - r-stringr + - r-lubridate + - r-rvest + - r-ggplot2 + - r-wordcloud + - r-tidytext + - r-rmarkdown + - r-ggpubr + - r-ggthemes + - r-here + - r-bibtex + - conda-build + - autopep8 + - entrez-direct + - jupyter + - pandas + - scikit-learn diff --git a/.binder/start b/.binder/start new file mode 100755 index 0000000..b6acd2a --- /dev/null +++ b/.binder/start @@ -0,0 +1,6 @@ +#!/bin/bash + +# source: https://discourse.jupyter.org/t/glibcxx-3-4-26-not-found-from-rstudio/7778/8 +set -e +export LD_LIBRARY_PATH=${NB_PYTHON_PREFIX}/lib:${LD_LIBRARY_PATH} +exec "$@" \ No newline at end of file diff --git a/.gitignore b/.gitignore index 0911757..a001607 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,25 @@ __pycache__ data/lens/citespace/ .ipynb_checkpoints src/condabuilds/ -src/timeline.html \ No newline at end of file +src/timeline.html +.ipython/ + +.local/ + +.cache/ + +.conda/ + +.jupyter/ + +.rstudio/ + +.subversion/ + +.bash_history + +.bashrc + +.jupyter-server-log.txt + +src/timeline/timeline.html diff --git a/.here b/.here new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index de173e5..21c1b20 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,31 @@ # The Role of Metadata in Reproducible Computational Research + This is a supplemental resource to Leipzig et al. 
"The Role of Metadata in Reproducible Computational Research" https://arxiv.org/pdf/2006.08589.pdf ## Organization + ``` ├───data/ │ ├───examples/ Examples of metadata standards │ ├───lens/ Search exports for scimetric journal analysis │ └───standards.tsv Raw standards table ├───src/ +│ ├───cwl/tools/ CWL configuration to produce the timeline plot +│ ├───manuscript/ Manuscript revision document │ ├───secrets/ │ │ └───api.template.py Replace this with api.py using your NCBI/NCBO keys -│ ├───ncbo_ontologies.py Scimetric ontology popularity analysis -│ ├───scimetric.ipynb Scimetric journal meta/rcr frequency analysis -│ ├───timeline.R Produces the RCR case study timeline in the paper +│ ├───ontologies/ Scimetric ontology popularity analysis +│ ├───repotutils/ Scripts for automating management of this repository +│ ├───scimetric/ Scimetric journal meta/rcr frequency analysis in a Jupyter Notebook +│ ├───timeline/ R Markdown document to produce the RCR case study timeline in the paper, incl. helper files for execution with CWL (wrapper script, Dockerfile) │ ├───wget2jsonld.py Helper script to convert wget output to jsonld -│ └───wordcloud.R Produces word cloud from cited abstracts +│ └───wordcloud/ R script to produce word cloud from cited abstracts ├───LICENSE The LICENSE file ├───README.md What you are looking at ├───environment.osx.yaml OSX pinned Conda depenencies ├───environment.unpinned.yaml Unpinned Conda depenencies └───ro-crate-metadata.jsonld RO Crate config +└───.binder Environment configuration files for usage with Binder (mybinder.org) ``` @@ -97,23 +103,34 @@ https://stackoverflow.com/questions/1740341/what-is-the-difference-between-rdf-a ## How to generate the timeline for this article Install [cwltool](https://github.com/common-workflow-language/cwltool) + ``` pip install cwltool cwltool src/cwl/tools/timeline.cwl --reportfile timeline.html ``` -## Contribute - -Contributions welcome! 
- -## License +Note that the tool requires Docker for running the computing environment; see the file `timeline/Dockerfile` for the definition of the image used in the `.cwl` file. -[![CC0](http://mirrors.creativecommons.org/presskit/buttons/88x31/svg/cc-zero.svg)](https://creativecommons.org/publicdomain/zero/1.0/) +## Run on Binder +[MyBinder](https://mybinder.org/) is a tool for creating executable computing environments based on standard and widely used dependency management files. +You can easily run important parts of the analysis for the manuscript by clicking on the badges below. +Binder will create a container using the environment configuration from the directory `.binder/` and provide you with an interactive environment to execute notebooks or scripts. +- Scimetric journal frequency analysis of RCR and metadata terms (opens a Jupyter Notebook) [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/leipzig/metadata-in-rcr/HEAD?filepath=src%2Fscimetric%2Fscimetric.ipynb) +- Create Figure 2 from the paper (R Markdown notebook, open the file `src/timeline/timeline.Rmd` manually in RStudio) [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/leipzig/metadata-in-rcr/HEAD?urlpath=rstudio) +- Create word cloud from cited abstracts (run R script `src/wordcloud/wordcloud.R`) [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/leipzig/metadata-in-rcr/HEAD?urlpath=rstudio) +For development purposes, you can also run `repo2docker` locally in the directory of the repository. +```bash +repo2docker --editable . +``` +## Contribute +Contributions welcome! 
+## License +[![CC0](http://mirrors.creativecommons.org/presskit/buttons/88x31/svg/cc-zero.svg)](https://creativecommons.org/publicdomain/zero/1.0/) diff --git a/data/examples/rmarkdown.rmd b/data/examples/rmarkdown.rmd index ead41e5..00ff369 100644 --- a/data/examples/rmarkdown.rmd +++ b/data/examples/rmarkdown.rmd @@ -1,5 +1,25 @@ --- title: "A title for the analysis" +# author metadata, esp. used for scientific articles +author: + - name: Jeremy Leipzig + footnote: Corresponding author + affiliation: "Metadata Research Center, Drexel University, College of Computing and Informatics, Philadelphia PA, USA" + orcid: "0000-0001-7224-9620" + - name: Daniel Nüst + affiliation: "Institute for Geoinformatics, University of Münster, Germany" + orcid: "0000-0002-0024-5046" + email: daniel.nuest@uni-muenster.de + +# parameters to manipulate workflow; defaults can be changed when compiling the document +params: + year: 2020 + region: "Europe" + printcode: TRUE + data: file.csv + max_n: 42 + +# configuration and styling of different output document formats output: html_document: theme: lumen @@ -7,8 +27,19 @@ output: toc_float: collapsed: false code_folding: show + self_contained: true + pdf_document: + toc: yes + fig_caption: yes + df_print: kable +linkcolor: blue + +# field values can be generated from code +date: "`r format(Sys.time(), '%d %B, %Y')`" --- + + ```{r include=FALSE} knitr::opts_chunk$set(echo=TRUE, message=FALSE, warning=FALSE, fig.width=8, tidy=TRUE) ``` diff --git a/src/cwl/tools/timeline.cwl b/src/cwl/tools/timeline.cwl index 4d2ba30..4faaf4b 100644 --- a/src/cwl/tools/timeline.cwl +++ b/src/cwl/tools/timeline.cwl @@ -61,7 +61,7 @@ doc: $schemas: - - https://schema.org/version/3.9/schema.rdf + - https://schema.org/version/latest/schemaorg-current-https.rdf $namespaces: iana: https://www.iana.org/assignments/media-types/ diff --git a/src/scimetric/scimetric.ipynb b/src/scimetric/scimetric.ipynb index 776c0bf..27f9307 100644 --- 
a/src/scimetric/scimetric.ipynb +++ b/src/scimetric/scimetric.ipynb @@ -1844,7 +1844,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -1855,7 +1855,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -1864,7 +1864,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -3326,7 +3326,7 @@ "Procedia - Social and Behavioral Sciences 0.995346 " ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -3344,7 +3344,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -3471,7 +3471,7 @@ " Erik Mannens 94516.0 0.999760" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -3491,7 +3491,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -3531,12 +3531,12 @@ " 0.999607\n", " \n", " \n", - " Mihai Pop\n", + " Ludmila Prokunina-Olsson\n", " 4425.5\n", " 0.999017\n", " \n", " \n", - " Ludmila Prokunina-Olsson\n", + " Mihai Pop\n", " 4425.5\n", " 0.999017\n", " \n", @@ -3556,12 +3556,12 @@ " 0.997837\n", " \n", " \n", - " Patricia Porter-Gill\n", + " Iain Hrynaszkiewicz\n", " 4420.0\n", " 0.996854\n", " \n", " \n", - " Iain Hrynaszkiewicz\n", + " Patricia Porter-Gill\n", " 4420.0\n", " 0.996854\n", " \n", @@ -3571,7 +3571,7 @@ " 0.996854\n", " \n", " \n", - " Philippe Rocca-Serra\n", + " Scott C. Edmunds\n", " 4416.0\n", " 0.995281\n", " \n", @@ -3581,17 +3581,17 @@ " 0.995281\n", " \n", " \n", - " Steffen Neumann\n", + " Philippe Rocca-Serra\n", " 4416.0\n", " 0.995281\n", " \n", " \n", - " Scott C. 
Edmunds\n", + " Markus Rupp\n", " 4416.0\n", " 0.995281\n", " \n", " \n", - " Markus Rupp\n", + " Steffen Neumann\n", " 4416.0\n", " 0.995281\n", " \n", @@ -3603,22 +3603,22 @@ " rcr_auth_cnt rcr_auth_scaled\n", "Victoria Stodden 4428.0 1.000000\n", "Jean-Luc Starck 4427.0 0.999607\n", - "Mihai Pop 4425.5 0.999017\n", "Ludmila Prokunina-Olsson 4425.5 0.999017\n", + "Mihai Pop 4425.5 0.999017\n", "Roger D. Peng 4424.0 0.998427\n", "Wei Tang 4422.5 0.997837\n", "Susanna-Assunta Sansone 4422.5 0.997837\n", - "Patricia Porter-Gill 4420.0 0.996854\n", "Iain Hrynaszkiewicz 4420.0 0.996854\n", + "Patricia Porter-Gill 4420.0 0.996854\n", "Yi-Ping Fu 4420.0 0.996854\n", - "Philippe Rocca-Serra 4416.0 0.995281\n", - "David L. Donoho 4416.0 0.995281\n", - "Steffen Neumann 4416.0 0.995281\n", "Scott C. Edmunds 4416.0 0.995281\n", - "Markus Rupp 4416.0 0.995281" + "David L. Donoho 4416.0 0.995281\n", + "Philippe Rocca-Serra 4416.0 0.995281\n", + "Markus Rupp 4416.0 0.995281\n", + "Steffen Neumann 4416.0 0.995281" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -3974,7 +3974,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.2" + "version": "3.7.10" } }, "nbformat": 4, diff --git a/src/timeline/timeline.Rmd b/src/timeline/timeline.Rmd index cf46b88..172defa 100644 --- a/src/timeline/timeline.Rmd +++ b/src/timeline/timeline.Rmd @@ -7,9 +7,10 @@ output: theme: paper toc: false code_folding: hide +params: + useLive: true --- - ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) library(dplyr) @@ -24,11 +25,12 @@ library(ggpubr) ``` ## Fetching and processing data from Awesome Reproducible Research -There is some munging that needs to be done to access the publication dates, clean up the sizes, simplify approaches into their main goals, and classify by macrofield + +There is some munging that needs to be done to access the publication dates, clean up the sizes, 
simplify approaches into their main goals, and classify by macrofield. + ```{r fetchandprocess} #try to parse the github repo as it is now vs a commit we know will work -useLive<-TRUE -if(useLive){ +if(params$useLive){ url<-"https://raw.githubusercontent.com/leipzig/awesome-reproducible-research/master/readme.md" }else{ url<-"https://raw.githubusercontent.com/leipzig/awesome-reproducible-research/98a6fbaafd70eb80b2bc9c7987bb4618b5d0bd2b/readme.md" @@ -79,11 +81,14 @@ ggplot(timeline) + ``` ## Session + ```{r session} sessionInfo() ``` ## Git commit -```{bash git, engine.opts='-l'} -git -C / show --summary + +```{r git} +# (bash cells don't play nice with conda in Binder) +cat(system2("git", c("show", "--summary"), stdout = TRUE), sep = "\n") ``` diff --git a/src/wordcloud/wordcloud.R b/src/wordcloud/wordcloud.R index 2db0f24..ade274b 100644 --- a/src/wordcloud/wordcloud.R +++ b/src/wordcloud/wordcloud.R @@ -1,31 +1,45 @@ -library(bib2df) -library(dplyr) -library(tidytext) -library(stringr) -library(wordcloud) -library(knitr) -library(readr) +library("bibtex") +library("dplyr") +library("tidytext") +library("stringr") +library("wordcloud") +library("knitr") +library("readr") +library("here") -pal <- brewer.pal(8,"Dark2") +pal <- brewer.pal(8, "Dark2") -useFullText<-TRUE +useFullText <- FALSE -if(useFullText==TRUE){ - #full text from pdfs - readr::read_file("../data/citations/tokens.txt.gz") %>% - stringr::str_replace_all("'","") %>% - stringr::str_replace_all("\\[","") %>% - stringr::str_replace_all("\\]","") %>% - stringr::str_replace_all(" ","") %>% - stringr::str_split(pattern=',',simplify = TRUE) %>% +if (useFullText == TRUE) { + #full text from pdfs, cannot be shared publicly + readr::read_file("../data/citations/tokens.txt.gz") %>% + stringr::str_replace_all("'", "") %>% + stringr::str_replace_all("\\[", "") %>% + stringr::str_replace_all("\\]", "") %>% + stringr::str_replace_all(" ", "") %>% + stringr::str_split(pattern = ",", simplify = TRUE) %>% 
stringr::str_to_lower() -> tokenvec - data.frame(word=tokenvec) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% ungroup() -> tokens_clean -}else{ + data.frame(word = tokenvec) %>% + anti_join(stop_words) %>% + count(word, sort = TRUE) %>% + ungroup() -> tokens_clean +} else { #just the abstracts - path<-"../data/citations/metadata-in-rcr-refs.bib" - df <- bib2df(path) - df %>% dplyr::filter(!is.na(ABSTRACT)) %>% unnest_tokens(word,ABSTRACT) %>% anti_join(stop_words) %>% count(word, sort = TRUE) %>% ungroup() -> tokens_clean + path <- here::here("data/citations/metadata-in-rcr-refs.bib") + bib <- bibtex::read.bib(path) + df <- data.frame(`ABSTRACT` = unlist( + sapply(bib, function(b) { b$abstract }))) + df %>% dplyr::filter(!is.na(ABSTRACT)) %>% + unnest_tokens(word, ABSTRACT) %>% + anti_join(stop_words) %>% + count(word, sort = TRUE) %>% + ungroup() -> tokens_clean } tokens_clean %>% -with(wordcloud(word, n, random.order = FALSE, max.words = 100, colors=pal)) -> word_cloud + with(wordcloud(word, + n, + random.order = FALSE, + max.words = 100, + colors = pal)) -> word_cloud