From 85d567272f6f06bc7fc896c1052d28fd815a950b Mon Sep 17 00:00:00 2001 From: VNMabus Date: Sun, 1 Sep 2024 00:02:57 +0200 Subject: [PATCH 1/7] First version of the paper. --- .github/workflows/draft-pdf.yml | 28 +++++++ paper/.gitignore | 2 + paper/paper.bib | 53 +++++++++++++ paper/paper.md | 130 ++++++++++++++++++++++++++++++++ 4 files changed, 213 insertions(+) create mode 100644 .github/workflows/draft-pdf.yml create mode 100644 paper/.gitignore create mode 100644 paper/paper.bib create mode 100644 paper/paper.md diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml new file mode 100644 index 0000000..e189889 --- /dev/null +++ b/.github/workflows/draft-pdf.yml @@ -0,0 +1,28 @@ +name: Draft PDF +on: + push: + paths: + - paper/** + - .github/workflows/draft-pdf.yml + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: paper/paper.md + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. 
Note, this should be the same directory as the input + # paper.md + path: paper/paper.pdf \ No newline at end of file diff --git a/paper/.gitignore b/paper/.gitignore new file mode 100644 index 0000000..811db69 --- /dev/null +++ b/paper/.gitignore @@ -0,0 +1,2 @@ +/jats/ +/paper.pdf diff --git a/paper/paper.bib b/paper/paper.bib new file mode 100644 index 0000000..af0952b --- /dev/null +++ b/paper/paper.bib @@ -0,0 +1,53 @@ +@misc{diaz-vico+ramos-carreno_2022_scikitdatasets, + title = {{{scikit-datasets}}: {{Scikit-learn-compatible}} Datasets}, + author = {{D{\'i}az-Vico}, David and {Ramos-Carre{\~n}o}, Carlos}, + year = {2022}, + month = mar, + doi = {10.5281/zenodo.6383047}, + url = {https://github.com/daviddiazvico/scikit-datasets}, + copyright = {MIT} +} + +@misc{fajardo_2018_pyreadr, + title = {Pyreadr}, + author = {Fajardo, Otto}, + year = {2018}, + month = dec, + doi = {10.5281/zenodo.7110170}, + url = {https://github.com/ofajardo/pyreadr} +} + +@misc{gautier_2024_rpy2, + title = {Rpy2: {{R}} in {{Python}}}, + author = {Gautier, Laurent}, + year = {2024}, + publisher = {GitHub}, + url = {https://github.com/rpy2/rpy2} +} + +@software{pandasdevelopmentteam_2020_pandasdev, + title = {{{pandas-dev/pandas}}: {{pandas}}}, + author = {Pandas Development Team}, + year = {2020}, + month = feb, + publisher = {Zenodo}, + doi = {10.5281/zenodo.3509134}, + url = {https://doi.org/10.5281/zenodo.3509134}, + version = {latest} +} + +@article{ramos-carreno+_2024_scikitfda, + title = {Scikit-Fda: {{A Python Package}} for {{Functional Data Analysis}}}, + shorttitle = {Scikit-Fda}, + author = {{Ramos-Carre{\~n}o}, Carlos and Torrecilla, Jos{\'e} Luis and {Carbajo-Berrocal}, Miguel and Marcos, Pablo and Su{\'a}rez, Alberto}, + year = {2024}, + month = may, + journal = {Journal of Statistical Software}, + volume = {109}, + pages = {1--37}, + issn = {1548-7660}, + doi = {10.18637/jss.v109.i02}, + abstract = {The library scikit-fda is a Python package for functional data analysis 
(FDA). It provides a comprehensive set of tools for representation, preprocessing, and exploratory analysis of functional data. The library is built upon and integrated in Python's scientific ecosystem. In particular, it conforms to the scikit-learn application programming interface so as to take advantage of the functionality for machine learning provided by this package: Pipelines, model selection, and hyperparameter tuning, among others. The scikit-fda package has been released as free and open-source software under a 3-clause BSD license and is open to contributions from the FDA community. The library's extensive documentation includes step-by-step tutorials and detailed examples of use.}, + copyright = {Copyright (c) 2024 Carlos Ramos-Carre{\~n}o, Jos{\'e} Luis Torrecilla, Miguel Carbajo-Berrocal, Pablo Marcos, Alberto Su{\'a}rez}, + langid = {english} +} \ No newline at end of file diff --git a/paper/paper.md b/paper/paper.md new file mode 100644 index 0000000..7625599 --- /dev/null +++ b/paper/paper.md @@ -0,0 +1,130 @@ +--- +title: 'rdata: Read R datasets from Python' +tags: + - Python + - R + - datasets + - rda + - rds +authors: + - name: Carlos Ramos-Carreño + orcid: 0000-0003-2566-7058 + affiliation: 1 + - name: Tuomas Rossi + orcid: 0000-0002-8713-4559 + affiliation: 2 +affiliations: + - name: Universidad Autónoma de Madrid, Spain + index: 1 + - name: CSC - IT Center for Science Ltd, Finland + index: 2 +date: 31 August 2024 +bibliography: paper.bib + +--- + +# Summary + +Research work usually requires the analysis and processing of data from different sources. +Traditionally statisticians and other research professionals have been using R for this task, and have compiled a huge amount of datasets in the Rda and Rds formats, native to this programming language. +As these formats contain internally the representation of R objects, they cannot be directly used from Python, another widely used language for data analysis and processing. 
+The library `rdata` allows to load and convert these datasets to Python objects, without the need of exporting them to other intermediate formats which may not keep all the original information. +This library has minimal dependencies, ensuring that it can be used in contexts where an R installation is not available. +Thus, the library `rdata` facilitates data interchange, enabling the usage of the same datasets in both languages (e.g. for reproducibility, comparisons of results against methods in both languages, or migration of processing pipelines to Python). + +# Statement of need + +The datasets from the CRAN repository are stored in the R specific format RData. +In Python, there were a few packages that could parse this file format, albeit all of them presented some limitations. + +The package `rpy2` [@gautier_2024_rpy2] can be used to interact with R from Python. +This includes the ability to load data in the RData format, and to convert these data to equivalent Python objects. +Although this is arguably the best package to achieve interaction between both languages, it has many disadvantages if one wants to use it just to load RData datasets. +In the first place, the package requires an R installation, as it relies in launching an R interpreter and communicating with it. +Secondly, launching R just to load data is inefficient, both in time and memory. +Finally, this package inherits the GPL license from the R language, which is not compatible with most Python packages, typically released under more permissive licenses. + +The recent package `pyreadr` [@fajardo_2018_pyreadr] also provides functionality to read some R datasets. +It relies in the C library `librdata` in order to perform the parsing of the RData format. +This adds an additional dependency from C building tools, and requires that the package is compiled for all the desired operating systems. 
+Moreover, this package is limited by the functionalities available in `librdata`, which at the moment of writing +does not include the parsing of common objects such as R lists and S4 objects. +The license can also be a problem, as it is part of the GPL family and does not allow commercial use. + +As existing solutions were unsuitable for our needs, the package `rdata` was developed to parse data in the RData format. +This is a small, extensible and very complete implementation in pure Python of a RData parser, that is able to read and convert most datasets in the CRAN repository to equivalent Python objects. +It has a permissive license and can be extended to support additional conversions from custom R classes. + +The package `rdata` has been designed as a pure Python package with minimal dependencies, so that it can be easily integrated inside other libraries and applications. +It currently powers the functionality offered in the `scikit-datasets` package [@diaz-vico+ramos-carreno_2022_scikitdatasets] for loading datasets from the CRAN repository of R packages. +This functionality is used for fetching the functional datasets provided in the `scikit-fda` library [@ramos-carreno+_2024_scikitfda], whose development was the main reason for the creation of the `rdata` package itself. + +# Features + +The package `rdata` is intended to be both flexible and easy to use. +In order to be flexible, the parsing of the RData format and the conversion of the parsed structures to appropriate Python objects have been splitted in two steps. +This allows advanced users to perform custom conversions without losing information. +Most users, however, will want to use the default conversion routine, which attempts to convert data +to a standard Python representation which preserves most part of the information. 
+ +```python +import rdata + +converted = rdata.read_rda("dataset.rda") +converted +``` + +This is equivalent to the following code, in which the two steps have been performed separatedly. + +```python +import rdata + +parsed = rdata.parser.parse_file("dataset.rda") +converted = rdata.conversion.convert(parsed) +``` + +The function `parse_file()` of the parser module is used to parse the RData file, returning a tree-like structure of Python objects that contains a representation of the basic R objects conforming the dataset. +The function `convert()` of the conversion module transforms that representation to the final Python objects, such as lists, dictionaries or dataframes, that users can manipulate. + +Advanced users will probably require loading datasets which contain non standard S3 or S4 classes, translating each of them to a custom Python class. +This is easy to achieve using `rdata` by simply creating a constructor function that receives the converted object representation and its attributes, and returns a Python object of the desired type. +As an example, consider the following simple code that constructs a `Pandas` [@pandasdevelopmentteam_2020_pandasdev] `Categorical` object from the internal representation of an R `factor`. + +```python +import pandas + + +def factor_constructor(obj, attrs): + values = [attrs['levels'][i - 1] if i >= 0 else None for i in obj] + + return pandas.Categorical(values, attrs['levels'], ordered=False) +``` + +Then, a dictionary containing as keys the original class names to convert and as values the constructor functions can be passed as the constructor_dict parameter of the `read_rda()` (or `convert()` if we do it in two steps) function. 
+In the previous example, this could be done using the following code: + +```python +converted = rdata.read_rda( + "dataset.rda", + constructor_dict={"factor": factor_constructor}, +) +``` + +When the default conversion routine is being executed, if an object belonging to an S3 or S4 class is found, the appropriate constructor will be called passing to it the partially constructed object. +If no constructor is available for that class, a warning will be emitted and the constructor of the most immediate parent class available will be called. +If there are no constructors for any of the parent classes, the basic underlying Python object will be left without transformation. + +By default, a dictionary named `DEFAULT_CLASS_MAP` is passed to `convert()` including constructors for commonly used classes, such as `data.frame`, `ordered` or the aforementioned `factor`. +In case anyone wants different conversions for basic R objects, it would be enough to create a subclass of the `Converter` class. +Several utility functions, such as the routines `convert_char()` and `convert_list()`, are exposed by the conversion module in order for users to be able to reuse them for that purpose. + +# Ongoing work + + + +# Acknowledgements + +The authors acknowledge financial support from the Spanish Ministry of Education and Innovation, projects PID2019-106827GB-I00 / AEI / 10.13039/501100011033 and PID2019-109387GB-I00. +This work was also supported by an FPU grant (Formación de Profesorado Universitario) from the Spanish Ministry of Science, Innovation and Universities(MICIU) with reference FPU18/00047. 
+ +# References \ No newline at end of file From e074b3f783ad79084454fe2990b852d45e78fe81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Sun, 8 Sep 2024 00:04:48 +0200 Subject: [PATCH 2/7] Apply suggestions from code review Co-authored-by: Tuomas Rossi <34502776+trossi@users.noreply.github.com> --- paper/paper.bib | 21 +++++++++++++++++---- paper/paper.md | 31 +++++++++++++++++++------------ 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index af0952b..f53d323 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -8,12 +8,12 @@ @misc{diaz-vico+ramos-carreno_2022_scikitdatasets copyright = {MIT} } -@misc{fajardo_2018_pyreadr, +@misc{fajardo_2024_pyreadr, title = {Pyreadr}, author = {Fajardo, Otto}, - year = {2018}, - month = dec, - doi = {10.5281/zenodo.7110170}, + year = {2024}, + month = jul, + doi = {10.5281/zenodo.13132498}, url = {https://github.com/ofajardo/pyreadr} } @@ -50,4 +50,17 @@ @article{ramos-carreno+_2024_scikitfda abstract = {The library scikit-fda is a Python package for functional data analysis (FDA). It provides a comprehensive set of tools for representation, preprocessing, and exploratory analysis of functional data. The library is built upon and integrated in Python's scientific ecosystem. In particular, it conforms to the scikit-learn application programming interface so as to take advantage of the functionality for machine learning provided by this package: Pipelines, model selection, and hyperparameter tuning, among others. The scikit-fda package has been released as free and open-source software under a 3-clause BSD license and is open to contributions from the FDA community. 
The library's extensive documentation includes step-by-step tutorials and detailed examples of use.}, copyright = {Copyright (c) 2024 Carlos Ramos-Carre{\~n}o, Jos{\'e} Luis Torrecilla, Miguel Carbajo-Berrocal, Pablo Marcos, Alberto Su{\'a}rez}, langid = {english} +} + +@article{rahman+_2024_hmschpc, + title = {Accelerating joint species distribution modelling with {Hmsc-HPC} by {GPU} porting}, + author = {Rahman, Anis Ur and Tikhonov, Gleb and Oksanen, Jari and Rossi, Tuomas and Ovaskainen, Otso}, + year = {2024}, + month = sep, + journal = {PLOS Computational Biology}, + volume = {20}, + number = {9}, + pages = {e1011914}, + doi = {10.1371/journal.pcbi.1011914}, + abstract = {Joint species distribution modelling (JSDM) is a widely used statistical method that analyzes combined patterns of all species in a community, linking empirical data to ecological theory and enhancing community-wide prediction tasks. However, fitting JSDMs to large datasets is often computationally demanding and time-consuming. Recent studies have introduced new statistical and machine learning techniques to provide more scalable fitting algorithms, but extending these to complex JSDM structures that account for spatial dependencies or multi-level sampling designs remains challenging. In this study, we aim to enhance JSDM scalability by leveraging high-performance computing (HPC) resources for an existing fitting method. Our work focuses on the Hmsc R-package, a widely used JSDM framework that supports the integration of various dataset types into a single comprehensive model. We developed a GPU-compatible implementation of its model-fitting algorithm using Python and the TensorFlow library. Despite these changes, our enhanced framework retains the original user interface of the Hmsc R-package. We evaluated the performance of the proposed implementation across various model configurations and dataset sizes. 
Our results show a significant increase in model fitting speed for most models compared to the baseline Hmsc R-package. For the largest datasets, we achieved speed-ups of over 1000 times, demonstrating the substantial potential of GPU porting for previously CPU-bound JSDM software. This advancement opens promising opportunities for better utilizing the rapidly accumulating new biodiversity data resources for inference and prediction.}, } \ No newline at end of file diff --git a/paper/paper.md b/paper/paper.md index 7625599..5985467 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -1,5 +1,5 @@ --- -title: 'rdata: Read R datasets from Python' +title: 'rdata: A Python library for R datasets' tags: - Python - R @@ -14,11 +14,11 @@ authors: orcid: 0000-0002-8713-4559 affiliation: 2 affiliations: - - name: Universidad Autónoma de Madrid, Spain - index: 1 - - name: CSC - IT Center for Science Ltd, Finland - index: 2 -date: 31 August 2024 + - name: Universidad Autónoma de Madrid, Spain + index: 1 + - name: CSC – IT Center for Science Ltd., Finland + index: 2 +date: 4 September 2024 bibliography: paper.bib --- @@ -26,10 +26,11 @@ bibliography: paper.bib # Summary Research work usually requires the analysis and processing of data from different sources. -Traditionally statisticians and other research professionals have been using R for this task, and have compiled a huge amount of datasets in the Rda and Rds formats, native to this programming language. +Traditionally in statistical computing, R language has been widely used for this task, and a huge amount of datasets have been compiled in the Rda and Rds formats, native to this programming language. As these formats contain internally the representation of R objects, they cannot be directly used from Python, another widely used language for data analysis and processing. 
The library `rdata` allows to load and convert these datasets to Python objects, without the need of exporting them to other intermediate formats which may not keep all the original information. This library has minimal dependencies, ensuring that it can be used in contexts where an R installation is not available. +The capability to write data in Rda and Rds formats is also under development. Thus, the library `rdata` facilitates data interchange, enabling the usage of the same datasets in both languages (e.g. for reproducibility, comparisons of results against methods in both languages, or migration of processing pipelines to Python). # Statement of need @@ -44,7 +45,7 @@ In the first place, the package requires an R installation, as it relies in laun Secondly, launching R just to load data is inefficient, both in time and memory. Finally, this package inherits the GPL license from the R language, which is not compatible with most Python packages, typically released under more permissive licenses. -The recent package `pyreadr` [@fajardo_2018_pyreadr] also provides functionality to read some R datasets. +The package `pyreadr` [@fajardo_2024_pyreadr] also provides functionality to read and write some R datasets. It relies in the C library `librdata` in order to perform the parsing of the RData format. This adds an additional dependency from C building tools, and requires that the package is compiled for all the desired operating systems. Moreover, this package is limited by the functionalities available in `librdata`, which at the moment of writing @@ -52,7 +53,7 @@ does not include the parsing of common objects such as R lists and S4 objects. The license can also be a problem, as it is part of the GPL family and does not allow commercial use. As existing solutions were unsuitable for our needs, the package `rdata` was developed to parse data in the RData format. 
-This is a small, extensible and very complete implementation in pure Python of a RData parser, that is able to read and convert most datasets in the CRAN repository to equivalent Python objects. +This is a small, extensible, efficient, and very complete implementation in pure Python of a RData parser, that is able to read and convert most datasets in the CRAN repository to equivalent Python objects. It has a permissive license and can be extended to support additional conversions from custom R classes. The package `rdata` has been designed as a pure Python package with minimal dependencies, so that it can be easily integrated inside other libraries and applications. @@ -120,11 +121,17 @@ Several utility functions, such as the routines `convert_char()` and `convert_li # Ongoing work - +To broaden the utility of the `rdata` library to data processing pipelines with steps in both R and Python, we are currently extending the library with the capability to write compatible Python objects to RData files. +As an example, such a pipeline is present in the Hmsc-HPC code [@rahman+_2024_hmschpc], the continuous development of which has been driving the ongoing work on the writing functionality in the `rdata` library. +The writing of RData files is implemented as a two-step process similar to reading: first, the Python object is converted to the tree-like intermediate representation used in parsing, and then this intermediate representation is written to a RData file. +Currently, the writing functionality supporting common types is available at the development branch of the `rdata` library. # Acknowledgements -The authors acknowledge financial support from the Spanish Ministry of Education and Innovation, projects PID2019-106827GB-I00 / AEI / 10.13039/501100011033 and PID2019-109387GB-I00. -This work was also supported by an FPU grant (Formación de Profesorado Universitario) from the Spanish Ministry of Science, Innovation and Universities(MICIU) with reference FPU18/00047. 
+This work has received funding +from the Spanish Ministry of Education and Innovation, projects PID2019-106827GB-I00 / AEI / 10.13039/501100011033 and PID2019-109387GB-I00, +from an FPU grant (Formación de Profesorado Universitario) from the Spanish Ministry of Science, Innovation and Universities(MICIU) with reference FPU18/00047, +and from the European Union's Horizon Europe research and innovation programme under grant agreement No 101057437 (BioDT project, [https://doi.org/10.3030/101057437](https://doi.org/10.3030/101057437)). +Views and opinions expressed are those of the author(s) only and do not necessarily reflect those of the European Union or the European Commission. Neither the European Union nor the European Commission can be held responsible for them. # References \ No newline at end of file From 51745361ec020376a279e1f8ffca8f9435a6f2fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Sun, 22 Sep 2024 17:21:14 +0200 Subject: [PATCH 3/7] Apply suggestions from code review Accepted changes from code review: - Updated references - Set Zenodo DOIs to "all versions" DOI - Add references to NumPy and Pandas Co-authored-by: Tuomas Rossi <34502776+trossi@users.noreply.github.com> --- paper/paper.bib | 48 ++++++++++++++++++++++++++++++++++++++++-------- paper/paper.md | 6 +++--- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/paper/paper.bib b/paper/paper.bib index f53d323..a169ad1 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -1,8 +1,8 @@ -@misc{diaz-vico+ramos-carreno_2022_scikitdatasets, +@misc{diaz-vico+ramos-carreno_2023_scikitdatasets, title = {{{scikit-datasets}}: {{Scikit-learn-compatible}} Datasets}, author = {{D{\'i}az-Vico}, David and {Ramos-Carre{\~n}o}, Carlos}, - year = {2022}, - month = mar, + year = {2023}, + month = aug, doi = {10.5281/zenodo.6383047}, url = {https://github.com/daviddiazvico/scikit-datasets}, copyright = {MIT} @@ -13,7 +13,8 @@ @misc{fajardo_2024_pyreadr author = {Fajardo, 
Otto}, year = {2024}, month = jul, - doi = {10.5281/zenodo.13132498}, + publisher = {Zenodo}, + doi = {10.5281/zenodo.7110169}, url = {https://github.com/ofajardo/pyreadr} } @@ -25,11 +26,42 @@ @misc{gautier_2024_rpy2 url = {https://github.com/rpy2/rpy2} } -@software{pandasdevelopmentteam_2020_pandasdev, - title = {{{pandas-dev/pandas}}: {{pandas}}}, - author = {Pandas Development Team}, +@article{harris+_2020_numpy, + title = {Array programming with {NumPy}}, + author = {Charles R. Harris and K. Jarrod Millman and St{\'{e}}fan J. + van der Walt and Ralf Gommers and Pauli Virtanen and David + Cournapeau and Eric Wieser and Julian Taylor and Sebastian + Berg and Nathaniel J. Smith and Robert Kern and Matti Picus + and Stephan Hoyer and Marten H. van Kerkwijk and Matthew + Brett and Allan Haldane and Jaime Fern{\'{a}}ndez del + R{\'{i}}o and Mark Wiebe and Pearu Peterson and Pierre + G{\'{e}}rard-Marchant and Kevin Sheppard and Tyler Reddy and + Warren Weckesser and Hameer Abbasi and Christoph Gohlke and + Travis E. 
Oliphant}, year = {2020}, - month = feb, + month = sep, + journal = {Nature}, + volume = {585}, + number = {7825}, + pages = {357--362}, + doi = {10.1038/s41586-020-2649-2}, +} + +@inproceedings{mckinney_2010_pandas, + author = {{W}es {M}c{K}inney}, + title = {{D}ata {S}tructures for {S}tatistical {C}omputing in {P}ython}, + booktitle = {{P}roceedings of the 9th {P}ython in {S}cience {C}onference}, + pages = {56 - 61}, + year = {2010}, + editor = {{S}t\'efan van der {W}alt and {J}arrod {M}illman}, + doi = {10.25080/Majora-92bf1922-00a}, +} + +@software{pandasdevelopmentteam_2024_pandasdev, + title = {{{pandas-dev/pandas}}: {{pandas}}}, + author = {{The Pandas Development Team}}, + year = {2024}, + month = apr, publisher = {Zenodo}, doi = {10.5281/zenodo.3509134}, url = {https://doi.org/10.5281/zenodo.3509134}, diff --git a/paper/paper.md b/paper/paper.md index 5985467..1eb3cfa 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -53,11 +53,11 @@ does not include the parsing of common objects such as R lists and S4 objects. The license can also be a problem, as it is part of the GPL family and does not allow commercial use. As existing solutions were unsuitable for our needs, the package `rdata` was developed to parse data in the RData format. -This is a small, extensible, efficient, and very complete implementation in pure Python of a RData parser, that is able to read and convert most datasets in the CRAN repository to equivalent Python objects. +This is a small, extensible, efficient, and very complete implementation in pure Python of a RData parser, that is able to read and convert most datasets in the CRAN repository to equivalent Python objects, such as the built-in types of The Python Standard Library, NumPy arrays [@harris+_2020_numpy], or Pandas dataframes [@mckinney_2010_pandas; @pandasdevelopmentteam_2024_pandasdev]. It has a permissive license and can be extended to support additional conversions from custom R classes. 
The package `rdata` has been designed as a pure Python package with minimal dependencies, so that it can be easily integrated inside other libraries and applications. -It currently powers the functionality offered in the `scikit-datasets` package [@diaz-vico+ramos-carreno_2022_scikitdatasets] for loading datasets from the CRAN repository of R packages. +It currently powers the functionality offered in the `scikit-datasets` package [@diaz-vico+ramos-carreno_2023_scikitdatasets] for loading datasets from the CRAN repository of R packages. This functionality is used for fetching the functional datasets provided in the `scikit-fda` library [@ramos-carreno+_2024_scikitfda], whose development was the main reason for the creation of the `rdata` package itself. # Features @@ -89,7 +89,7 @@ The function `convert()` of the conversion module transforms that representation Advanced users will probably require loading datasets which contain non standard S3 or S4 classes, translating each of them to a custom Python class. This is easy to achieve using `rdata` by simply creating a constructor function that receives the converted object representation and its attributes, and returns a Python object of the desired type. -As an example, consider the following simple code that constructs a `Pandas` [@pandasdevelopmentteam_2020_pandasdev] `Categorical` object from the internal representation of an R `factor`. +As an example, consider the following simple code that constructs a `Pandas` [@pandasdevelopmentteam_2024_pandasdev] `Categorical` object from the internal representation of an R `factor`. 
```python import pandas From 577d96f2858aaf50e0bae4b9625f55299caa1a12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Mon, 7 Oct 2024 21:32:09 +0200 Subject: [PATCH 4/7] Apply suggestions from code review Co-authored-by: Tuomas Rossi <34502776+trossi@users.noreply.github.com> --- paper/paper.md | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/paper/paper.md b/paper/paper.md index 1eb3cfa..1f110dc 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -26,34 +26,34 @@ bibliography: paper.bib # Summary Research work usually requires the analysis and processing of data from different sources. -Traditionally in statistical computing, R language has been widely used for this task, and a huge amount of datasets have been compiled in the Rda and Rds formats, native to this programming language. +Traditionally in statistical computing, the R language has been widely used for this task, and a huge number of datasets have been compiled in the Rda and Rds formats, native to this programming language. As these formats contain internally the representation of R objects, they cannot be directly used from Python, another widely used language for data analysis and processing.
+Thus, the library `rdata` facilitates data interchange, enabling the usage of the same datasets in both languages (e.g. for reproducibility, comparisons of results against methods in both languages, or the creation of complex processing pipelines that involve steps in both R and Python). # Statement of need -The datasets from the CRAN repository are stored in the R specific format RData. -In Python, there were a few packages that could parse this file format, albeit all of them presented some limitations. +The datasets from the CRAN repository are stored in the R specific formats Rda and Rds. +In Python, there were a few packages that could parse these file formats, albeit all of them presented some limitations. The package `rpy2` [@gautier_2024_rpy2] can be used to interact with R from Python. -This includes the ability to load data in the RData format, and to convert these data to equivalent Python objects. -Although this is arguably the best package to achieve interaction between both languages, it has many disadvantages if one wants to use it just to load RData datasets. +This includes the ability to load data in the Rda and Rds formats, and to convert these data to equivalent Python objects. +Although this is arguably the best package to achieve interaction between both languages, it has many disadvantages if one wants to use it just to load R datasets. In the first place, the package requires an R installation, as it relies in launching an R interpreter and communicating with it. Secondly, launching R just to load data is inefficient, both in time and memory. Finally, this package inherits the GPL license from the R language, which is not compatible with most Python packages, typically released under more permissive licenses. The package `pyreadr` [@fajardo_2024_pyreadr] also provides functionality to read and write some R datasets. -It relies in the C library `librdata` in order to perform the parsing of the RData format. 
+It relies on the C library `librdata` in order to perform the parsing of the R data files. This adds an additional dependency from C building tools, and requires that the package is compiled for all the desired operating systems. Moreover, this package is limited by the functionalities available in `librdata`, which at the moment of writing does not include the parsing of common objects such as R lists and S4 objects. The license can also be a problem, as it is part of the GPL family and does not allow commercial use. -As existing solutions were unsuitable for our needs, the package `rdata` was developed to parse data in the RData format. -This is a small, extensible, efficient, and very complete implementation in pure Python of a RData parser, that is able to read and convert most datasets in the CRAN repository to equivalent Python objects, such as the built-in types of The Python Standard Library, NumPy arrays [@harris+_2020_numpy], or Pandas dataframes [@mckinney_2010_pandas; @pandasdevelopmentteam_2024_pandasdev]. +As existing solutions were unsuitable for our needs, the package `rdata` was developed to parse data in the Rda and Rds formats. +This is a small, extensible, efficient, and very complete implementation in pure Python of an R data parser, that is able to read and convert most datasets in the CRAN repository to equivalent Python objects, such as the built-in types of the Python standard library, NumPy arrays [@harris+_2020_numpy], or Pandas dataframes [@mckinney_2010_pandas; @pandasdevelopmentteam_2024_pandasdev]. It has a permissive license and can be extended to support additional conversions from custom R classes. The package `rdata` has been designed as a pure Python package with minimal dependencies, so that it can be easily integrated inside other libraries and applications.
@@ -63,19 +63,18 @@ This functionality is used for fetching the functional datasets provided in the # Features The package `rdata` is intended to be both flexible and easy to use. -In order to be flexible, the parsing of the RData format and the conversion of the parsed structures to appropriate Python objects have been splitted in two steps. +In order to be flexible, the parsing of the R data file formats and the conversion of the parsed structures to appropriate Python objects have been split into two steps. This allows advanced users to perform custom conversions without losing information. -Most users, however, will want to use the default conversion routine, which attempts to convert data -to a standard Python representation which preserves most part of the information. +Most users, however, will want to use the default conversion routine, which attempts to convert data to a standard Python representation that preserves most of the information. +Converting an Rda dataset to Python objects using the package `rdata` can be easily done as follows: ```python import rdata converted = rdata.read_rda("dataset.rda") -converted ``` -This is equivalent to the following code, in which the two steps have been performed separatedly. +This is equivalent to the following code, in which the two steps are performed separately: ```python import rdata @@ -84,20 +83,18 @@ parsed = rdata.parser.parse_file("dataset.rda") converted = rdata.conversion.convert(parsed) ``` -The function `parse_file()` of the parser module is used to parse the RData file, returning a tree-like structure of Python objects that contains a representation of the basic R objects conforming the dataset.
The function `convert()` of the conversion module transforms that representation to the final Python objects, such as lists, dictionaries or dataframes, that users can manipulate. Advanced users will probably require loading datasets which contain non standard S3 or S4 classes, translating each of them to a custom Python class. -This is easy to achieve using `rdata` by simply creating a constructor function that receives the converted object representation and its attributes, and returns a Python object of the desired type. -As an example, consider the following simple code that constructs a `Pandas` [@pandasdevelopmentteam_2024_pandasdev] `Categorical` object from the internal representation of an R `factor`. +This can be achieved using `rdata` by creating a constructor function that receives the converted object representation and its attributes, and returns a Python object of the desired type. +As an example, consider the following short code that constructs a `Pandas` [@pandasdevelopmentteam_2024_pandasdev] `Categorical` object from the internal representation of an R `factor`. ```python import pandas - def factor_constructor(obj, attrs): values = [attrs['levels'][i - 1] if i >= 0 else None for i in obj] - return pandas.Categorical(values, attrs['levels'], ordered=False) ``` @@ -116,15 +113,15 @@ If no constructor is available for that class, a warning will be emitted and the If there are no constructors for any of the parent classes, the basic underlying Python object will be left without transformation. By default, a dictionary named `DEFAULT_CLASS_MAP` is passed to `convert()` including constructors for commonly used classes, such as `data.frame`, `ordered` or the aforementioned `factor`. -In case anyone wants different conversions for basic R objects, it would be enough to create a subclass of the `Converter` class. +In case the user desires different conversions for basic R objects, it would be enough to create a subclass of the `Converter` class. 
Several utility functions, such as the routines `convert_char()` and `convert_list()`, are exposed by the conversion module in order for users to be able to reuse them for that purpose. # Ongoing work -To broaden the utility of the `rdata` library to data processing pipelines with steps in both R and Python, we are currently extending the library with the capability to write compatible Python objects to RData files. +To broaden the utility of the `rdata` library for data processing pipelines with steps in both R and Python, we are currently extending the library with the capability to write compatible Python objects to Rda and Rds files. As an example, such a pipeline is present in the Hmsc-HPC code [@rahman+_2024_hmschpc], the continuous development of which has been driving the ongoing work on the writing functionality in the `rdata` library. -The writing of RData files is implemented as a two-step process similar to reading: first, the Python object is converted to the tree-like intermediate representation used in parsing, and then this intermediate representation is written to a RData file. -Currently, the writing functionality supporting common types is available at the development branch of the `rdata` library. +The writing of Rda and Rds files is implemented as a two-step process similar to reading: first, the Python object is converted to the tree-like intermediate representation used in parsing, and then this intermediate representation is written to a file of the chosen format. +Currently, the writing functionality supporting common types is available in the development branch of the `rdata` library. 
# Acknowledgements From 118f469e8fe621dee483b555d21dbede173d34b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Thu, 10 Oct 2024 21:34:50 +0200 Subject: [PATCH 5/7] Update paper/paper.md Co-authored-by: Tuomas Rossi <34502776+trossi@users.noreply.github.com> --- paper/paper.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 1f110dc..837769d 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -119,7 +119,8 @@ Several utility functions, such as the routines `convert_char()` and `convert_li # Ongoing work To broaden the utility of the `rdata` library for data processing pipelines with steps in both R and Python, we are currently extending the library with the capability to write compatible Python objects to Rda and Rds files. -As an example, such a pipeline is present in the Hmsc-HPC code [@rahman+_2024_hmschpc], the continuous development of which has been driving the ongoing work on the writing functionality in the `rdata` library. +As an example, such a pipeline is present in the Hmsc-HPC code [@rahman+_2024_hmschpc]. +The continuous development of this code has also been driving the ongoing work on the writing functionality of the `rdata` library. The writing of Rda and Rds files is implemented as a two-step process similar to reading: first, the Python object is converted to the tree-like intermediate representation used in parsing, and then this intermediate representation is written to a file of the chosen format. Currently, the writing functionality supporting common types is available in the development branch of the `rdata` library. 
From d4bfa294e5317ddcb6a1dc9ac893055a87354fd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Fri, 11 Oct 2024 12:29:42 +0200 Subject: [PATCH 6/7] Update paper/paper.md Co-authored-by: Tuomas Rossi <34502776+trossi@users.noreply.github.com> --- paper/paper.md | 1 + 1 file changed, 1 insertion(+) diff --git a/paper/paper.md b/paper/paper.md index 837769d..9ba0009 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -131,5 +131,6 @@ from the Spanish Ministry of Education and Innovation, projects PID2019-106827GB from an FPU grant (Formación de Profesorado Universitario) from the Spanish Ministry of Science, Innovation and Universities(MICIU) with reference FPU18/00047, and from the European Union's Horizon Europe research and innovation programme under grant agreement No 101057437 (BioDT project, [https://doi.org/10.3030/101057437](https://doi.org/10.3030/101057437)). Views and opinions expressed are those of the author(s) only and do not necessarily reflect those of the European Union or the European Commission. Neither the European Union nor the European Commission can be held responsible for them. +The authors gratefully acknowledge the use of the computational facilities provided by Centro de Computación Científica (CCC) at Universidad Autónoma de Madrid and by CSC – IT Center for Science, Finland. 
# References \ No newline at end of file From b8b2aacd89ec67c3b6d390a05256977ada093917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Ramos=20Carre=C3=B1o?= Date: Fri, 25 Oct 2024 01:18:00 +0200 Subject: [PATCH 7/7] Update paper/paper.md Co-authored-by: Tuomas Rossi <34502776+trossi@users.noreply.github.com> --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index 9ba0009..d84e905 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -35,7 +35,7 @@ Thus, the library `rdata` facilitates data interchange, enabling the usage of th # Statement of need -The datasets from the CRAN repository are stored in the R specific formats Rda and Rds. +The datasets of the R programming language, such as those from the CRAN repository, are often stored in the R-specific formats Rda and Rds. In Python, there were a few packages that could parse these file formats, albeit all of them presented some limitations. The package `rpy2` [@gautier_2024_rpy2] can be used to interact with R from Python.