Skip to content

Commit 1bb3326

Browse files
committed
Try putting R code in tests and referencing that in yaml
1 parent a0b41b9 commit 1bb3326

File tree

6 files changed

+225
-15
lines changed

6 files changed

+225
-15
lines changed

.Rbuildignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ man-roxygen/*
66
^README\.Rmd$
77
^README\.html$
88
^CONTRIBUTING\.md$
9+
tests/.*_ghaction.R
910
^vignettes/figure$
1011
^vignettes/figure/.+$
1112
\.Rmd2$

.github/workflows/R-CMD-check-dev.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ jobs:
4747
rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
4848
shell: Rscript {0}
4949

50+
- name: Test live dataverse (vignette)
51+
run: |
52+
Rscript tests/B-search_ghaction.R
53+
Rscript tests/C-download_ghaction.R
54+
5055
- name: Test coverage
5156
run: covr::codecov()
5257
shell: Rscript {0}

tests/B-search_ghaction.R

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
## ----knitr_options, echo=FALSE, results="hide"----------------------------------------------
2+
options(width = 120)
3+
knitr::opts_chunk$set(results = "hold")
4+
5+
6+
## -------------------------------------------------------------------------------------------
7+
library("dataverse")
8+
Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
9+
dataverse_search("Gary King")[c("name")]
10+
11+
12+
## -------------------------------------------------------------------------------------------
13+
dataverse_search("Gary King", start = 6, per_page = 20)[c("name")]
14+
15+
16+
## -------------------------------------------------------------------------------------------
17+
ei <- dataverse_search(author = "Gary King", title = "Ecological Inference", type = "dataset", per_page = 20)
18+
# fields returned
19+
names(ei)
20+
# names of datasets
21+
ei$name
22+

tests/C-download_ghaction.R

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
## ----knitr_options, echo=FALSE, results="hide"----------------------------------------------
2+
options(width = 120)
3+
knitr::opts_chunk$set(results = "hold")
4+
5+
6+
## -------------------------------------------------------------------------------------------
7+
Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
8+
9+
10+
## -------------------------------------------------------------------------------------------
11+
library("dataverse")
12+
library("tibble") # to see dataframes in tidyverse-form
13+
14+
15+
## ----echo=FALSE, message=FALSE,include=FALSE------------------------------------------------
16+
energy <- get_dataframe_by_name(
17+
filename = "comprehensiveJapanEnergy.tab",
18+
dataset = "10.7910/DVN/ARKOTI",
19+
server = "dataverse.harvard.edu")
20+
21+
22+
## ----eval=FALSE-----------------------------------------------------------------------------
23+
## energy <- get_dataframe_by_name(
24+
## filename = "comprehensiveJapanEnergy.tab",
25+
## dataset = "10.7910/DVN/ARKOTI",
26+
## server = "dataverse.harvard.edu")
27+
28+
29+
## -------------------------------------------------------------------------------------------
30+
head(energy)
31+
32+
33+
## -------------------------------------------------------------------------------------------
34+
library(readr)
35+
energy <- get_dataframe_by_name(
36+
filename = "comprehensiveJapanEnergy.tab",
37+
dataset = "10.7910/DVN/ARKOTI",
38+
server = "dataverse.harvard.edu",
39+
.f = function(x) read.delim(x, sep = "\t"))
40+
41+
head(energy)
42+
43+
44+
## ----message=FALSE--------------------------------------------------------------------------
45+
argentina_tab <- get_dataframe_by_name(
46+
filename = "alpl2013.tab",
47+
dataset = "10.7910/DVN/ARKOTI",
48+
server = "dataverse.harvard.edu")
49+
50+
51+
## -------------------------------------------------------------------------------------------
52+
str(argentina_tab$polling_place)
53+
54+
55+
## -------------------------------------------------------------------------------------------
56+
argentina_dta <- get_dataframe_by_name(
57+
filename = "alpl2013.tab",
58+
dataset = "10.7910/DVN/ARKOTI",
59+
server = "dataverse.harvard.edu",
60+
original = TRUE,
61+
.f = haven::read_dta)
62+
63+
64+
## -------------------------------------------------------------------------------------------
65+
str(argentina_dta$polling_place)
66+
67+
68+
## -------------------------------------------------------------------------------------------
69+
str(dataset_metadata("10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu"),
70+
max.level = 2)
71+
72+
73+
## ----eval = FALSE---------------------------------------------------------------------------
74+
## code3 <- get_file("chapter03.R", "doi:10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu")
75+
## writeBin(code3, "chapter03.R")
76+

vignettes/B-search.Rmd

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,26 +17,92 @@ knitr::opts_chunk$set(results = "hold")
1717

1818
Searching for data within Dataverse is quite easy using the `dataverse_search()` function. The simplest searches simply consist of a query string:
1919

20-
```{r}
20+
```{r, eval=FALSE}
2121
library("dataverse")
2222
Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
2323
dataverse_search("Gary King")[c("name")]
2424
```
2525

26+
```{r}
27+
## name
28+
## 1 004_informal_food_retail_Nigeria_2018.tab
29+
## 2 00592Belle-Stress-PaperData-Subject_King_ChildIs.PDF
30+
## 3 00592Belle-Stress-PaperData-Subject_King_ChildO.PDF
31+
## 4 00592Belle-Stress-PaperData-Subject_King_Coping.PDF
32+
## 5 00592Belle-Stress-PaperData-Subject_King_Discrimination.PDF
33+
## 6 00592Belle-Stress-PaperData-Subject_King_LifeCs.PDF
34+
## 7 00592Belle-Stress-PaperData-Subject_King_LifeE.PDF
35+
## 8 00592Belle-Stress-PaperData-Subject_KingAndMeunier_Parenting.PDF
36+
## 9 00698McArthur-King-BoxCoverSheets.pdf
37+
## 10 00698McArthur-King-MemoOfAgreement.pdf
38+
```
39+
40+
2641
The results are paginated, so users can rely upon the `per_page` and `start` arguments to request subsequent pages of results. We'll start at 6 to show that we retrieve the last five results from the previous query plus 15 more (due to `per_page = 20`):
2742

28-
```{r}
43+
```{r, eval=FALSE}
2944
dataverse_search("Gary King", start = 6, per_page = 20)[c("name")]
3045
```
3146

47+
```{r}
48+
# 10 of 3676 results retrieved
49+
## name
50+
## 1 004_informal_food_retail_Nigeria_2018.tab
51+
## 2 00698McArthur-King-BoxCoverSheets.pdf
52+
## 3 00698McArthur-King-MemoOfAgreement.pdf
53+
## 4 00698McArthur-King-StudyDescription.pdf
54+
## 5 01 ReadMe Unlocking history through automated virtual unfolding of sealed documents imaged by X-ray microtomography
55+
## 6 01_ReadMe_The_Spiral_Locked_Letters_of_Elizabeth_I_and_Mary_Queen_of_Scots
56+
## 7 03 Brienne Collection letterlocking data: Images folder 02/16, DB-0874_2–DB-0903
57+
## 8 03 Brienne Collection letterlocking data: Images folder 04/16, DB-0988–DB-1109_03
58+
## 9 03 Brienne Collection letterlocking data: Images folder 06/16, DB-1241_02–DB-1339_06
59+
## 10 03 Brienne Collection letterlocking data: Images folder 08/16, DB-1455_02–DB-1564_01
60+
```
61+
62+
3263
More complicated searches can specify metadata fields like `title` and restrict results to a specific `type` of Dataverse object (a "dataverse", "dataset", or "file"):
3364

34-
```{r}
65+
```{r, eval=FALSE}
3566
ei <- dataverse_search(author = "Gary King", title = "Ecological Inference", type = "dataset", per_page = 20)
3667
# fields returned
3768
names(ei)
3869
# names of datasets
3970
ei$name
4071
```
4172

73+
```{r}
74+
## [1] "name" "type" "url" "global_id"
75+
## [5] "description" "published_at" "publisher" "citationHtml"
76+
## [9] "identifier_of_dataverse" "name_of_dataverse" "citation" "storageIdentifier"
77+
## [13] "keywords" "subjects" "fileCount" "versionId"
78+
## [17] "versionState" "majorVersion" "minorVersion" "createdAt"
79+
## [21] "updatedAt" "contacts" "authors" "publications"
80+
## [1] "01 ReadMe Unlocking history through automated virtual unfolding of sealed documents imaged by X-ray microtomography"
81+
## [2] "01_ReadMe_The_Spiral_Locked_Letters_of_Elizabeth_I_and_Mary_Queen_of_Scots"
82+
## [3] "03 Brienne Collection letterlocking data: Images folder 02/16, DB-0874_2–DB-0903"
83+
## [4] "03 Brienne Collection letterlocking data: Images folder 04/16, DB-0988–DB-1109_03"
84+
## [5] "03 Brienne Collection letterlocking data: Images folder 06/16, DB-1241_02–DB-1339_06"
85+
## [6] "03 Brienne Collection letterlocking data: Images folder 08/16, DB-1455_02–DB-1564_01"
86+
## [7] "03 Brienne Collection letterlocking data: Images folder 12/16, DB-1868–DB-1963_03"
87+
## [8] "03 Brienne Collection letterlocking data: Images folder 14/16, DB-2064_01–2155_03"
88+
## [9] "03 Spiral-lock figures"
89+
## [10] "07 Letterlocking Categories and Formats Chart"
90+
## [11] "10 Foldable: Launch Little Book of Locks (UH6089), with Categories and Formats Chart. Letterlocking Instructional Resources"
91+
## [12] "10 Million International Dyadic Events"
92+
## [13] "1479 data points of covid19 policy response times"
93+
## [14] "2016 Census of Population: ADA and DA Maps for Kings County Nova Scotia"
94+
## [15] "3D Dust map from Green et al. (2015)"
95+
## [16] "3D dust map from Green et al. (2017)"
96+
## [17] "3D dust map from Green et al. (2019)"
97+
## [18] "A 1D Lyman-alpha Profile Camera for Plasma Edge Neutral Studies on the DIII-D Tokamak"
98+
## [19] "A Comparative Analysis of Brazil's Foreign Policy Drivers Towards the USA: Comment on Amorim Neto (2011)"
99+
## [20] "A Critique of Dyadic Design"
100+
## 16 1998 Jewish Community Study of the Coachella Valley, California
101+
## 17 2002 State Legislative Survey
102+
## 18 2007 White Sands Dune Field lidar topographic data
103+
## 19 2008 White Sands Dune Field lidar topographic data
104+
## 20 2012 STATA Data.tab
105+
106+
```
107+
42108
Once datasets and files are identified, it is easy to download and use them directly in R. See the ["Data Download" vignette](C-download.html) for details.

vignettes/C-download.Rmd

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,28 +42,35 @@ library("tibble") # to see dataframes in tidyverse-form
4242

4343
First, we retrieve a plain-text file like this dataset on electricity consumption by [Wakiyama et al. (2014)](https://doi.org/10.7910/DVN/ARKOTI/GN1MRT). Taking the file name and dataset DOI from this entry,
4444

45-
```{r, echo=FALSE, message=FALSE,include=FALSE}
45+
46+
```{r, eval=FALSE}
4647
energy <- get_dataframe_by_name(
4748
filename = "comprehensiveJapanEnergy.tab",
4849
dataset = "10.7910/DVN/ARKOTI",
4950
server = "dataverse.harvard.edu")
5051
```
5152

5253
```{r, eval=FALSE}
53-
energy <- get_dataframe_by_name(
54-
filename = "comprehensiveJapanEnergy.tab",
55-
dataset = "10.7910/DVN/ARKOTI",
56-
server = "dataverse.harvard.edu")
54+
head(energy)
5755
```
5856

5957
```{r}
60-
head(energy)
58+
## # A tibble: 6 × 10
59+
## time date dummy temp temp2 all large house kepco tepco
60+
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
61+
## 1 1 8-Jan 0 5.9 34.8 95792389 35194957 26190714 13357735 26960899
62+
## 2 2 8-Feb 0 5.5 30.3 95156901 35322031 24224097 13315027 27189705
63+
## 3 3 8-Mar 0 10.7 114. 91034047 36474192 21391965 12805831 24495519
64+
## 4 4 8-Apr 0 14.7 216. 84087552 34949622 18494473 11494328 23540356
65+
## 5 5 8-May 0 18.5 342. 82742929 35417089 17923760 11589061 22848737
66+
## 6 6 8-Jun 0 21.3 454. 82180013 36692291 15205229 11360771 22487441
6167
```
6268

69+
6370
These `get_dataframe_*` functions, introduced in v0.3.0, read the data directly into an R environment through whatever R function is supplied by `.f`. The default of the `get_dataframe_*` functions is to read in such data by `readr::read_tsv()`. The `.f` function can be changed to modify the read-in settings. For example, the following modification is a base-R equivalent to read in the ingested data.
6471

6572

66-
```{r}
73+
```{r, eval=FALSE}
6774
library(readr)
6875
energy <- get_dataframe_by_name(
6976
filename = "comprehensiveJapanEnergy.tab",
@@ -74,6 +81,16 @@ energy <- get_dataframe_by_name(
7481
head(energy)
7582
```
7683

84+
```{r}
85+
## time date dummy temp temp2 all large house kepco tepco
86+
## 1 1 8-Jan 0 5.9 34.8 95792389 35194957 26190714 13357735 26960899
87+
## 2 2 8-Feb 0 5.5 30.3 95156901 35322031 24224097 13315027 27189705
88+
## 3 3 8-Mar 0 10.7 114.5 91034047 36474192 21391965 12805831 24495519
89+
## 4 4 8-Apr 0 14.7 216.1 84087552 34949622 18494473 11494328 23540356
90+
## 5 5 8-May 0 18.5 342.3 82742929 35417089 17923760 11589061 22848737
91+
## 6 6 8-Jun 0 21.3 453.7 82180013 36692291 15205229 11360771 22487441
92+
```
93+
7794

7895
The dataverse package can also download datasets that are _drafts_ (i.e. versions not released publicly), as long as the user of the dataset provides their appropriate DATAVERSE_KEY. Users may need to modify the metadata of a datafile, such as adding a descriptive label, for the data downloading to work properly in this case. This is because the file identifier UNF, which the read function relies on, may only appear after metadata has been added.
7996

@@ -83,7 +100,7 @@ The dataverse package can also download datasets that are _drafts_ (i.e. version
83100

84101
If a file is displayed on dataverse as a `.tab` file like the survey data by [Alvarez et al. (2013)](https://doi.org/10.7910/DVN/ARKOTI/A8YRMP), it is likely that Dataverse [ingested](https://guides.dataverse.org/en/latest/user/tabulardataingest/index.html) the file to a plain-text, tab-delimited format.
85102

86-
```{r, message=FALSE}
103+
```{r, message=FALSE,eval=FALSE}
87104
argentina_tab <- get_dataframe_by_name(
88105
filename = "alpl2013.tab",
89106
dataset = "10.7910/DVN/ARKOTI",
@@ -93,13 +110,17 @@ argentina_tab <- get_dataframe_by_name(
93110

94111
However, ingested files may not retain important dataset attributes. For example, Stata and SPSS datasets encode value labels onto numeric values. Factor variables in R dataframes encode levels, not only labels. A plain-text ingested file will discard such information. For example, the `polling_place` variable in this data is only given by numbers, although the original data labelled these numbers with informative values.
95112

96-
```{r}
113+
```{r,eval=FALSE}
97114
str(argentina_tab$polling_place)
98115
```
99116

117+
```{r}
118+
## num [1:1475] 31 31 31 31 31 31 31 31 31 31 ...
119+
```
120+
100121
When ingesting, Dataverse retains an `original` version that preserves these attributes but may not be readable on some platforms. The `get_dataframe_*` functions have an argument that can be set to `original = TRUE`. In this case we know that `alpl2013.tab` was originally a Stata dta file, so we can run:
101122

102-
```{r}
123+
```{r, eval=FALSE}
103124
argentina_dta <- get_dataframe_by_name(
104125
filename = "alpl2013.tab",
105126
dataset = "10.7910/DVN/ARKOTI",
@@ -110,10 +131,17 @@ argentina_dta <- get_dataframe_by_name(
110131

111132
Now we see that labels are read in through `haven`'s labelled variables class:
112133

113-
```{r}
134+
```{r, eval=FALSE}
114135
str(argentina_dta$polling_place)
115136
```
116137

138+
```{r}
139+
## dbl+lbl [1:1475] 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 3...
140+
## @ label : chr "polling_place"
141+
## @ format.stata: chr "%9.0g"
142+
## @ labels : Named num [1:37] 1 2 3 4 5 6 7 8 9 10 ...
143+
## ..- attr(*, "names")= chr [1:37] "E.E.T." "Escuela Juan Bautista Alberdi" "Escuela Juan Carlos Dávalos" "Escuela Bernardino de Rivadavia" ...
144+
```
117145

118146

119147
Users should pick `.f` and `original` based on their existing knowledge of the file. If the original file is a `.sav` SPSS file, `.f` can be `haven::read_sav`. If it is a `.Rds` file, use `readRDS` or `readr::read_rds`. In fact, because the raw data is read in as a binary, there is no limitation to the file types `get_dataframe_*` can read in, as far as the dataverse package is concerned.
@@ -138,11 +166,23 @@ This shows that there are indeed 32 files, a mix of .R code files and tab- and c
138166

139167
You can also retrieve more extensive metadata using `dataset_metadata()`:
140168

141-
```{r}
169+
```{r, eval=FALSE}
142170
str(dataset_metadata("10.7910/DVN/ARKOTI", server = "dataverse.harvard.edu"),
143171
max.level = 2)
144172
```
145173

174+
```{r}
175+
## List of 3
176+
## $ displayName: chr "Citation Metadata"
177+
## $ name : chr "citation"
178+
## $ fields :'data.frame': 7 obs. of 4 variables:
179+
## ..$ typeName : chr [1:7] "title" "author" "datasetContact" "dsDescription" ...
180+
## ..$ multiple : logi [1:7] FALSE TRUE TRUE TRUE TRUE FALSE ...
181+
## ..$ typeClass: chr [1:7] "primitive" "compound" "compound" "compound" ...
182+
## ..$ value :List of 7
183+
```
184+
185+
146186
## Retrieving Scripts and Other Files
147187

148188
If the file you want to retrieve is not data, you may want to use the more primitive function, `get_file`, which gets the file data as a raw binary file. See the help page examples of `get_file()` that use the `base::writeBin()` function for details on how to write and read these binary files instead.

0 commit comments

Comments
 (0)