Merge pull request #12 from gaelforget/v0p1p3b

V0p1p3b
gdcc · Jun 19, 2022 · fc3c144 · fc3c144
2 parents cc50484 + 9c48eed
commit fc3c144
Show file tree

Hide file tree

Showing 7 changed files with 94 additions and 83 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Dataverse"
 uuid = "9c0b9be8-e31e-490f-90fe-77697562404d"
 authors = ["gaelforget <gforget@mit.edu> and contributors"]
-version = "0.1.2"
+version = "0.1.3"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"

diff --git a/docs/make.jl b/docs/make.jl
@@ -19,7 +19,6 @@ makedocs(;
     ),
     pages=[
         "Home" => "index.md",
-        "APIs" => "dataverse_access.md",
     ],
 )
 

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -21,5 +21,5 @@ This package is about interfaces to the [Dataverse](https://dataverse.org) proje
 ## Functionalities
 
 ```@autodocs
-Modules = [pyDataverse, Dataverse.downloads]
+Modules = [pyDataverse, DataverseDownloads]
 ```
diff --git a/src/Dataverse.jl b/src/Dataverse.jl
@@ -4,9 +4,6 @@ include("pyDataverse.jl")
 export pyDataverse
 
 include("downloads.jl")
-export get_from_dataverse, dataverse_lists, example_lists
-get_from_dataverse=downloads.get_from_dataverse
-dataverse_lists=downloads.dataverse_lists
-example_lists=downloads.example_lists
+export DataverseDownloads
 
 end
diff --git a/src/downloads.jl b/src/downloads.jl
@@ -1,20 +1,19 @@
-module downloads
+module DataverseDownloads
 
-using Downloads, OceanStateEstimation, CSV, DataFrames
+import Dataverse.pyDataverse.dataset_file_list
+using Downloads, DataFrames
 
 ##
 
 """
-    get_from_dataverse(lst::String,nam::String,pth::String)
+    download_files(lists::NamedTuple,nam::String,pth::String)
 
 ```
-lst=example_lists.OCCA_list
-nams=example_lists.OCCA_files.name
-[get_from_dataverse(lst,string(nam),tempdir()) for nam in nams[:]]
+lst=DataverseDownloads.download_urls(dataset_file_list(:OCCA_clim))
+DataverseDownloads.download_files(lst,lst.name[1],tempdir())
 ```
 """
-function get_from_dataverse(lst::String,nam::String,pth::String)
-    lists=dataverse_lists(lst)
+function download_files(lists::NamedTuple,nam::String,pth::String)
     ii = findall([occursin("$nam", lists.name[i]) for i=1:length(lists.ID)])
     for i in ii
         nam1=Downloads.download(lists.URL[i])
@@ -30,35 +29,19 @@ function get_from_dataverse(lst::String,nam::String,pth::String)
 end
 
 """
-    dataverse_lists(lst::String)
+    download_urls(df::DataFrame)
 
-Read and derive lists (ID,name,URL) from csv file (ID,name) and return as tuple
-
-```
-lst=example_lists.OCCA_list
-ECCO_files=dataverse_lists(lst)
-```
+Add download URL (using df.id) and return as NamedTuple.
 """
-function dataverse_lists(lst::String)
-    tmp=readlines(lst)
-    ID=[parse(Int,tmp[j][1:findfirst(isequal(','),tmp[j])-1]) for j=2:length(tmp)]
-    name=[tmp[j][findfirst(isequal(','),tmp[j])+1:end] for j=2:length(tmp)]
+download_urls(df::DataFrame) = begin
     tmp="https://dataverse.harvard.edu/api/access/datafile/"
-    URL=[tmp*"$(ID[j])" for j=1:length(ID)]
-    return (ID=ID,name=name,URL=URL)
+    URL=[tmp*"$(df.id[j])" for j=1:length(df.id)]
+    (ID=df.id,name=df.filename,URL=URL)
 end
 
 ##
 
-pth=dirname(pathof(OceanStateEstimation))
-
-OCCA_list=joinpath(pth,"../examples/OCCA_climatology.csv")
-OCCA_files=dataverse_lists(OCCA_list)
-
-ECCO_list=joinpath(pth,"../examples/nctiles_climatology.csv")
-ECCO_files=dataverse_lists(ECCO_list)
-
-example_lists=( OCCA_list=OCCA_list,OCCA_files=OCCA_files,
-                ECCO_list=ECCO_list,ECCO_files=ECCO_files)
+OCCA_files()=download_urls(dataset_file_list(:OCCA_clim))
+ECCO_files()=download_urls(dataset_file_list(:ECCO_clim))
 
 end
diff --git a/src/pyDataverse.jl b/src/pyDataverse.jl
@@ -22,30 +22,32 @@ end
 """
     demo(option::String)
 
-- call `demo_basic` if `option=="basic"`
-- call `demo_ECCO` if `option=="ECCO"`
-"""
-function demo(option="basic")
-    if option=="basic"
-        demo_basic()
-    elseif option=="ECCO"
-        demo_ECCO()
+- call `demo_download` if `option=="download"`
+- call `demo_metadata` if `option=="metadata"`
+"""
+function demo(option="download")
+    if option=="download"
+        demo_download()
+    elseif option=="metadata"
+        demo_metadata()
+    else
+        println("unknown option")
     end
 end
 
 """
-    demo_basic(;path=tempdir(),DOI = "doi:10.7910/DVN/KBHLOD")
+    demo_download(;path=tempdir(),DOI = "doi:10.7910/DVN/KBHLOD")
 
 Replicate the workflow example from 
 
 <https://pydataverse.readthedocs.io/en/latest/user/basic-usage.html#download-and-save-a-dataset-to-disk>    
 
 ```
-pyDataverse.demo_basic()
+pyDataverse.demo_download()
 ```
 """
-function demo_basic(;path=tempdir(),DOI = "doi:10.7910/DVN/KBHLOD")
-    (DataAccessApi,NativeApi)=pyDataverse.APIs()
+function demo_download(;path=tempdir(),DOI = "doi:10.7910/DVN/KBHLOD")
+    (DataAccessApi,NativeApi)=pyDataverse.APIs(do_install=false)
     dataset = NativeApi.get_dataset(DOI)
     files_list = dataset.json()["data"]["latestVersion"]["files"]
     filenames=String[]
@@ -59,11 +61,7 @@ function demo_basic(;path=tempdir(),DOI = "doi:10.7910/DVN/KBHLOD")
     filenames
 end
 
-"""
-    tree_children_to_DataFrame(files)
-
-Convert output of e.g. `tree[1]["children"]` to DataFrame. See notebook for a more complete example.
-"""
+#Deprecated : see `dataset_file_list`+`files_to_DataFrame` instead
 function tree_children_to_DataFrame(files)	
 	nf=length(files)
 	filename=[files[ff]["filename"] for ff in 1:nf]
@@ -73,40 +71,79 @@ function tree_children_to_DataFrame(files)
 end
 
 """
-    dataset_children_to_DataFrame(files)
+    files_to_DataFrame(files)
 
-Convert output of e.g. `dataset.json()["data"]["latestVersion"]["files"]` to DataFrame. See notebook for a more complete example.
+Convert output from `dataset.json()["data"]["latestVersion"]["files"]` to DataFrame. 
+
+See notebook for a more complete example.
 """
-function dataset_files_to_DataFrame(files)	
+function files_to_DataFrame(files)	
 	nf=length(files)
 	filename=[files[ff]["dataFile"]["filename"] for ff in 1:nf]
 	filesize=[files[ff]["dataFile"]["filesize"] for ff in 1:nf]
+	id=[files[ff]["dataFile"]["id"] for ff in 1:nf]    
 	pidURL=[files[ff]["dataFile"]["pidURL"] for ff in 1:nf]
-	DataFrame(filename=filename,filesize=filesize,pidURL=pidURL)
+	DataFrame(filename=filename,filesize=filesize,id=id,pidURL=pidURL)
 end
 
 
 """
-    demo_ECCO()
+    demo_metadata()
+
+```
+pyDataverse.demo_metadata()
+```
+"""
+function demo_metadata()
+    df1=dataset_file_list(:OCCA_clim)
+	df2=dataverse_file_list(:ECCOv4r2)
+    df1,df2
+end
+
+##
+
+"""
+    dataset_file_list(nam::Symbol)
+
+Look up the DOI for `nam` (`:OCCA_clim` or `:ECCO_clim`) in the list of demo data sets and return the result of `dataset_file_list(DOI)`.
 
 ```
-pyDataverse.demo_ECCO()
+dataset_file_list(:OCCA_clim)
 ```
 """
-function demo_ECCO()
-    (DataAccessApi,NativeApi)=pyDataverse.APIs()
+function dataset_file_list(nam::Symbol)
+    (DataAccessApi,NativeApi)=pyDataverse.APIs(do_install=false)
+    DOI=(OCCA_clim="doi:10.7910/DVN/RNXA2A",ECCO_clim="doi:10.7910/DVN/3HPRZI")
+    dataset_file_list(DOI[nam])
+end
 
-    DOI="doi:10.7910/DVN/AVVGYX"    
+"""
+    dataset_file_list(DOI::String)
+
+Use `NativeApi.get_dataset` to derive the list of files (name, etc) via `files_to_DataFrame`.
+
+```
+dataset_file_list("doi:10.7910/DVN/ODM2IQ")
+```
+"""
+function dataset_file_list(DOI::String)
+    (DataAccessApi,NativeApi)=pyDataverse.APIs(do_install=false)
     dataset = NativeApi.get_dataset(DOI)
     dataset_files = dataset.json()["data"]["latestVersion"]["files"]
-    df1=dataset_files_to_DataFrame(dataset_files)
+    files_to_DataFrame(dataset_files)
+end
 
-    NAME="ECCOv4r2"
-    tree = NativeApi.get_children(NAME, children_types= ["datasets", "datafiles"])
-    files=tree[1]["children"]
-	df2=tree_children_to_DataFrame(files)
+"""
+    dataverse_file_list(nam::Symbol=:ECCOv4r2)
 
-    df1,df2
+- Use `NativeApi.get_children` to get the tree of datasets
+- Loop through and return vector of `dataset_file_list` output
+"""
+function dataverse_file_list(nam::Symbol=:ECCOv4r2)
+    (DataAccessApi,NativeApi)=pyDataverse.APIs(do_install=false)
+    tree = NativeApi.get_children(string(nam), children_types= ["datasets", "datafiles"])
+    #[tree_children_to_DataFrame(leaf["children"]) for leaf in tree]
+    [dataset_file_list(leaf["pid"]) for leaf in tree]
 end
 
 end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -6,21 +6,16 @@ using UUIDs
 pyDataverse.APIs(do_install=true)
 
 @testset "Dataverse.jl" begin
-    lst=example_lists.OCCA_list
-    nams=example_lists.OCCA_files.name
-
-    pth=joinpath(tempdir(),string(UUIDs.uuid4())) 
+    lst=DataverseDownloads.OCCA_files()
+    pth=joinpath(tempdir(),string(UUIDs.uuid4()))
     mkdir(pth)
+    DataverseDownloads.download_files(lst,lst.name[1],pth)
+    @test isfile(joinpath(pth,lst.name[1]))
 
-    nam=nams[2]
-    get_from_dataverse(lst,string(nam),pth)
-    @test isfile(joinpath(pth,nam))
-
-    tmp=pyDataverse.demo("basic")
+    tmp=pyDataverse.demo("download")
     @test isfile(tmp[1])
 
-    df1,df2=pyDataverse.demo("ECCO")
-    @test size(df1,1)==3
-    @test size(df2,1)==12
-
+    df1,df2=pyDataverse.demo("metadata")
+    @test size(df1,1)==56
+    @test size(df2,1)==11
 end
-Original file line number
+Diff line change
@@ Expand Up / @@ -19,7 +19,6 @@ makedocs(; @@
         ),
         pages=[
             "Home" => "index.md",
-            "APIs" => "dataverse_access.md",
         ],
     )
@@ Expand Down @@