From acb3e3dd61c8ba66ce72d88a5782a911b6bd3b9f Mon Sep 17 00:00:00 2001 From: Daniel Loos Date: Wed, 22 May 2024 11:14:24 +0200 Subject: [PATCH 01/10] Init doc refactoring --- docs/src/.vitepress/config.mts | 105 +++++++++--------- docs/src/UserGuide/chunk.md | 1 + docs/src/UserGuide/compute.md | 16 +++ .../src/UserGuide/{ => etc}/applyfunctions.md | 0 .../{ => etc}/create_cube_from_function.md | 0 docs/src/UserGuide/{ => etc}/creating.md | 0 docs/src/UserGuide/{ => etc}/distributed.md | 0 .../{ => etc}/indexing_subsetting.md | 0 docs/src/UserGuide/{ => etc}/saving.md | 0 docs/src/UserGuide/{ => etc}/setchuncks.md | 0 .../{HowdoI/howdoi.md => UserGuide/faq.md} | 3 +- docs/src/UserGuide/{group_by.md => group.md} | 2 +- docs/src/UserGuide/openZarr.md | 21 ---- docs/src/UserGuide/{openNetCDF.md => read.md} | 33 +++++- docs/src/UserGuide/subset.md | 1 + docs/src/UserGuide/types.md | 33 ++++++ docs/src/UserGuide/write.md | 5 + .../src/{HowdoI => development}/contribute.md | 0 docs/src/{ => development}/contributors.md | 0 .../{tutorial.md => other_tutorials.md} | 2 +- 20 files changed, 144 insertions(+), 78 deletions(-) create mode 100644 docs/src/UserGuide/chunk.md create mode 100644 docs/src/UserGuide/compute.md rename docs/src/UserGuide/{ => etc}/applyfunctions.md (100%) rename docs/src/UserGuide/{ => etc}/create_cube_from_function.md (100%) rename docs/src/UserGuide/{ => etc}/creating.md (100%) rename docs/src/UserGuide/{ => etc}/distributed.md (100%) rename docs/src/UserGuide/{ => etc}/indexing_subsetting.md (100%) rename docs/src/UserGuide/{ => etc}/saving.md (100%) rename docs/src/UserGuide/{ => etc}/setchuncks.md (100%) rename docs/src/{HowdoI/howdoi.md => UserGuide/faq.md} (99%) rename docs/src/UserGuide/{group_by.md => group.md} (99%) delete mode 100644 docs/src/UserGuide/openZarr.md rename docs/src/UserGuide/{openNetCDF.md => read.md} (62%) create mode 100644 docs/src/UserGuide/subset.md create mode 100644 docs/src/UserGuide/types.md create mode 100644 
docs/src/UserGuide/write.md rename docs/src/{HowdoI => development}/contribute.md (100%) rename docs/src/{ => development}/contributors.md (100%) rename docs/src/tutorials/{tutorial.md => other_tutorials.md} (98%) diff --git a/docs/src/.vitepress/config.mts b/docs/src/.vitepress/config.mts index 6e3a6c5a..9c810715 100644 --- a/docs/src/.vitepress/config.mts +++ b/docs/src/.vitepress/config.mts @@ -11,16 +11,17 @@ export default defineConfig({ cleanUrls: true, outDir: 'REPLACE_ME_DOCUMENTER_VITEPRESS', // This is required for MarkdownVitepress to work correctly... ignoreDeadLinks: true, - + markdown: { math: true, config(md) { md.use(tabsMarkdownPlugin), - md.use(mathjax3) + md.use(mathjax3) }, theme: { light: "github-light", - dark: "github-dark"} + dark: "github-dark" + } }, themeConfig: { outline: 'deep', @@ -35,25 +36,22 @@ export default defineConfig({ nav: [ { text: 'Home', link: '/' }, { text: 'Getting Started', link: '/getting_started' }, - { text: 'User Guide', + { + text: 'User Guide', items: [ - { text: 'Creating YAXArrays and Datasets', link: '/UserGuide/creating' }, - { text: 'Indexing and subsetting', link: '/UserGuide/indexing_subsetting' }, - { text: 'Saving YAXArrays and Datasets', link: '/UserGuide/saving' }, - { text: 'Setting chunks size', link: '/UserGuide/setchuncks' }, - { text: 'Apply functions on YAXArrays', link: '/UserGuide/applyfunctions' }, - { text: 'Create Cube from function', link: '/UserGuide/create_cube_from_function' }, - { text: 'Group by', link: '/UserGuide/group_by' }, - { text: 'Distributed computing', link: '/UserGuide/distributed' }, - { text: 'Open NetCDF', link: '/UserGuide/openNetCDF' }, - { text: 'Open Zarr (Store)', link: '/UserGuide/openZarr' }, - ]}, - { text: 'Tutorials', + { text: 'Read and Write', link: '/UserGuide/read_and_write' }, + { text: 'Compute', link: '/UserGuide/compute' }, + { text: 'FAQ', link: '/UserGuide/faq' }, + ] + }, + { + text: 'Tutorials', items: [ { text: 'Overview', link: '/tutorials/tutorial' 
}, { text: 'Plotting maps', link: '/tutorials/plottingmaps' }, { text: 'Mean Seasonal Cycle', link: '/tutorials/mean_seasonal_cycle' }, - { text: 'ESDL studies', + { + text: 'ESDL studies', items: [ { text: 'ESDL study 1', link: 'https://github.com/JuliaDataCubes/YAXArrays.jl/blob/master/docs/src/tutorials/esdl/examples_from_esdl_study_1.jl' }, { text: 'ESDL study 2', link: 'https://github.com/JuliaDataCubes/YAXArrays.jl/blob/master/docs/src/tutorials/esdl/examples_from_esdl_study_2.jl' }, @@ -61,46 +59,53 @@ export default defineConfig({ { text: 'ESDL study 4', link: 'https://github.com/JuliaDataCubes/YAXArrays.jl/blob/master/docs/src/tutorials/esdl/examples_from_esdl_study_4.jl' }, ] }, - ]}, - { text: 'How do I?', - items: [ - { text: 'How do I ...', link: '/HowdoI/howdoi' }, - { text: 'Contribute to docs', link: '/HowdoI/contribute' }, - { text: 'Contributors', link: '/contributors' } - ]}, + { text: 'Other Tutorials', link: '/tutorials/other_tutorials' }, + ] + }, + { + text: 'Development', + items: [ + { text: 'Contribute', link: 'development/contribute' }, + { text: 'Contributors', link: 'development/contributors' } + ] + }, ], sidebar: [ { text: 'Getting Started', link: '/getting_started' }, - { text: 'User Guide', + { + text: 'User Guide', items: [ - { text: 'Creating YAXArrays and Datasets', link: '/UserGuide/creating' }, - { text: 'Indexing and subsetting', link: '/UserGuide/indexing_subsetting' }, - { text: 'Saving YAXArrays and Datasets', link: '/UserGuide/saving' }, - { text: 'Setting chunks size', link: '/UserGuide/setchuncks' }, - { text: 'Apply functions on YAXArrays', link: '/UserGuide/applyfunctions' }, - { text: 'Create Cube from function', link: '/UserGuide/create_cube_from_function' }, - { text: 'Group by', link: '/UserGuide/group_by' }, - { text: 'Distributed computing', link: '/UserGuide/distributed' }, - { text: 'Open NetCDF', link: '/UserGuide/openNetCDF' }, - { text: 'Open Zarr (Store)', link: '/UserGuide/openZarr' }, - ]}, - { text: 
'Tutorials', + { text: 'Types', link: '/UserGuide/types' }, + { text: 'Read', link: '/UserGuide/read' }, + { text: 'Write', link: '/UserGuide/write' }, + { text: 'Subset', link: '/UserGuide/subset' }, + { text: 'Compute', link: '/UserGuide/compute' }, + { text: 'Group', link: '/UserGuide/group' }, + { text: 'Chunk', link: '/UserGuide/chunk' }, + { text: 'FAQ', link: '/UserGuide/faq' } + ] + }, + { + text: 'Tutorials', items: [ - { text: 'Overview', link: '/tutorials/tutorial' }, { text: 'Plotting maps', link: '/tutorials/plottingmaps' }, - { text: 'Mean Seasonal Cycle', link: '/tutorials/mean_seasonal_cycle' } - ]}, - { text: 'How do I?', - items: [ - { text: 'How do I ...', link: '/HowdoI/howdoi' }, - { text: 'Contribute to docs', link: '/HowdoI/contribute' }, - ]}, - { text: 'Contributors', link: '/contributors' }, - { text: 'API', - items: [ - { text: 'API Reference', link: 'api' }, - ]}, + { text: 'Mean Seasonal Cycle', link: '/tutorials/mean_seasonal_cycle' }, + { text: 'Other Tutorials', link: '/tutorials/other_tutorials' }, + ] + }, + { + text: 'Development', + items: [ + { text: 'Contribute', link: 'development/contribute' }, + { text: 'Contributors', link: 'development/contributors' } + ] + }, { + text: 'API', + items: [ + { text: 'API Reference', link: 'api' }, + ] + }, ], editLink: { pattern: 'https://github.com/JuliaDataCubes/YAXArrays.jl/edit/master/docs/src/:path' diff --git a/docs/src/UserGuide/chunk.md b/docs/src/UserGuide/chunk.md new file mode 100644 index 00000000..fe73532e --- /dev/null +++ b/docs/src/UserGuide/chunk.md @@ -0,0 +1 @@ +# Chunk YAXArrays \ No newline at end of file diff --git a/docs/src/UserGuide/compute.md b/docs/src/UserGuide/compute.md new file mode 100644 index 00000000..81353891 --- /dev/null +++ b/docs/src/UserGuide/compute.md @@ -0,0 +1,16 @@ +# Compute YAXArrays + +This section describes how to create new YAXArrays by performing arithmetic operations. 
+ +## Arithmetics + +## map + +## mapslices + +## mapCube + + +## Distributed Computation + +parallel \ No newline at end of file diff --git a/docs/src/UserGuide/applyfunctions.md b/docs/src/UserGuide/etc/applyfunctions.md similarity index 100% rename from docs/src/UserGuide/applyfunctions.md rename to docs/src/UserGuide/etc/applyfunctions.md diff --git a/docs/src/UserGuide/create_cube_from_function.md b/docs/src/UserGuide/etc/create_cube_from_function.md similarity index 100% rename from docs/src/UserGuide/create_cube_from_function.md rename to docs/src/UserGuide/etc/create_cube_from_function.md diff --git a/docs/src/UserGuide/creating.md b/docs/src/UserGuide/etc/creating.md similarity index 100% rename from docs/src/UserGuide/creating.md rename to docs/src/UserGuide/etc/creating.md diff --git a/docs/src/UserGuide/distributed.md b/docs/src/UserGuide/etc/distributed.md similarity index 100% rename from docs/src/UserGuide/distributed.md rename to docs/src/UserGuide/etc/distributed.md diff --git a/docs/src/UserGuide/indexing_subsetting.md b/docs/src/UserGuide/etc/indexing_subsetting.md similarity index 100% rename from docs/src/UserGuide/indexing_subsetting.md rename to docs/src/UserGuide/etc/indexing_subsetting.md diff --git a/docs/src/UserGuide/saving.md b/docs/src/UserGuide/etc/saving.md similarity index 100% rename from docs/src/UserGuide/saving.md rename to docs/src/UserGuide/etc/saving.md diff --git a/docs/src/UserGuide/setchuncks.md b/docs/src/UserGuide/etc/setchuncks.md similarity index 100% rename from docs/src/UserGuide/setchuncks.md rename to docs/src/UserGuide/etc/setchuncks.md diff --git a/docs/src/HowdoI/howdoi.md b/docs/src/UserGuide/faq.md similarity index 99% rename from docs/src/HowdoI/howdoi.md rename to docs/src/UserGuide/faq.md index 36f24844..8b46eccb 100644 --- a/docs/src/HowdoI/howdoi.md +++ b/docs/src/UserGuide/faq.md @@ -1,4 +1,5 @@ -# How do I do it? 
+# Frequently Asked Questions (FAQ) + The purpose of this section is to do a collection of small convinient pieces of code on how to do simple things. diff --git a/docs/src/UserGuide/group_by.md b/docs/src/UserGuide/group.md similarity index 99% rename from docs/src/UserGuide/group_by.md rename to docs/src/UserGuide/group.md index b63787cb..389efbe0 100644 --- a/docs/src/UserGuide/group_by.md +++ b/docs/src/UserGuide/group.md @@ -1,4 +1,4 @@ -# GroupBy +# Group YAXArrays and Datasets The following examples will use the `groupby` function to calculate temporal and spatial averages. diff --git a/docs/src/UserGuide/openZarr.md b/docs/src/UserGuide/openZarr.md deleted file mode 100644 index f1ffc743..00000000 --- a/docs/src/UserGuide/openZarr.md +++ /dev/null @@ -1,21 +0,0 @@ -# Opening a Zarr directory from a store - -````@example open_zarr -using Zarr, YAXArrays -store ="gs://cmip6/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp585/r1i1p1f1/3hr/tas/gn/v20190710/" -```` - -Open and select the `tas` variable, - -````@ansi open_zarr -g = open_dataset(zopen(store, consolidated=true)) -```` - -get variable - -````@ansi open_zarr -c = g["tas"] -```` - -After this operate on it as usual. - diff --git a/docs/src/UserGuide/openNetCDF.md b/docs/src/UserGuide/read.md similarity index 62% rename from docs/src/UserGuide/openNetCDF.md rename to docs/src/UserGuide/read.md index 6c25bde3..0a662a02 100644 --- a/docs/src/UserGuide/openNetCDF.md +++ b/docs/src/UserGuide/read.md @@ -1,8 +1,12 @@ -# Opening NetCDF files +# Read YAXArrays and Datasets + +Here we learn how to open files into arrays and datasets. + +## NetCDF In this example we are going to use a `NetCDF` file. To open a single data file we first need to load the appropriate backend package via `using NetCDF`. 
-## File with one variable +### File with one variable ````@example open_nc using YAXArrays, NetCDF @@ -17,7 +21,7 @@ nothing # hide c = Cube(filename) ```` -## File with multiple variables, mixed dimensions +### File with multiple variables, mixed dimensions When the dataset contains variables with different dimensions you should use `open_dataset` as in @@ -44,4 +48,25 @@ or c["tas"] ```` -Note that their output is a YAXArray. \ No newline at end of file +Note that their output is a YAXArray. + +## Zarr + +````@example open_zarr +using Zarr, YAXArrays +store ="gs://cmip6/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp585/r1i1p1f1/3hr/tas/gn/v20190710/" +```` + +Open and select the `tas` variable, + +````@ansi open_zarr +g = open_dataset(zopen(store, consolidated=true)) +```` + +get variable + +````@ansi open_zarr +c = g["tas"] +```` + +After this operate on it as usual. diff --git a/docs/src/UserGuide/subset.md b/docs/src/UserGuide/subset.md new file mode 100644 index 00000000..081ba334 --- /dev/null +++ b/docs/src/UserGuide/subset.md @@ -0,0 +1 @@ +# Subset YAXArrays and Datasets \ No newline at end of file diff --git a/docs/src/UserGuide/types.md b/docs/src/UserGuide/types.md new file mode 100644 index 00000000..fe48e83c --- /dev/null +++ b/docs/src/UserGuide/types.md @@ -0,0 +1,33 @@ +# Types + +This section describes the data structures used to work with n-dimensional arrays in YAXArrays. + +## YAXArray + +An `Array` stores a sequence of ordered elements of the same type usually across multiple dimensions or axes. +For example, one can measure temperature across all time points of the time dimension or brightness values of a picture across X and Y dimensions. +A one dimensional array is called `Vector` and a two dimensional array is called a `Matrix`. +In many Machine Learning libraries, arrays are also called tensors. 
+Arrays are designed to store dense spatial-temporal data stored in a grid, whereas a collection of sparse points is usually stored in data frames or relational databases. + +A `DimArray` as defined by [DimensionalData.jl](https://rafaqz.github.io/DimensionalData.jl/dev/) adds names to the dimensions and their axes ticks for a given `Array`. +These names can be used to access the data, e.g., by date instead of just by integer position. + +A `YAXArray` is a subtype of an `AbstractDimArray` and adds functions to load and process the named arrays. +For example, it can also handle very large arrays stored on disk that are too big to fit in memory. +In addition, it provides functions for parallel computation. + +## Dataset + +A `Dataset` is an ordered dictionary of `YAXArrays` that usually share dimensions. +For example, it can bundle arrays storing temperature and precipitation that are measured at the same time points and the same locations. +One can also store a picture in a Dataset with three arrays containing brightness values for red, green, and blue, respectively. +Internally, those arrays are still separated, which allows choosing a different element type for each array. +Analogous to the [NetCDF Data Model](https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html), a Dataset usually represents variables belonging to the same group. + +## Cube + +A `Cube` is just a `YAXArray` in which arrays from a dataset are combined together by introducing a new dimension containing labels of which array the corresponding element came from. +Unlike a `Dataset`, all arrays must have the same element type to be converted into a cube. +This data structure is useful when we want to use all variables at once. +For example, the arrays temperature and precipitation are combined into a single cube. 
\ No newline at end of file diff --git a/docs/src/UserGuide/write.md b/docs/src/UserGuide/write.md new file mode 100644 index 00000000..3554c935 --- /dev/null +++ b/docs/src/UserGuide/write.md @@ -0,0 +1,5 @@ +# Write YAXArrays and Datasets + +## NetCDF + +## Zarr \ No newline at end of file diff --git a/docs/src/HowdoI/contribute.md b/docs/src/development/contribute.md similarity index 100% rename from docs/src/HowdoI/contribute.md rename to docs/src/development/contribute.md diff --git a/docs/src/contributors.md b/docs/src/development/contributors.md similarity index 100% rename from docs/src/contributors.md rename to docs/src/development/contributors.md diff --git a/docs/src/tutorials/tutorial.md b/docs/src/tutorials/other_tutorials.md similarity index 98% rename from docs/src/tutorials/tutorial.md rename to docs/src/tutorials/other_tutorials.md index 1cc30494..258921bb 100644 --- a/docs/src/tutorials/tutorial.md +++ b/docs/src/tutorials/other_tutorials.md @@ -1,4 +1,4 @@ -# YAXArray tutorial +# Other tutorials If you are interested in learning how to work with YAXArrays for different use cases you can follow along one of the following tutorials. 
- Currently the overview tutorial is located at [ESDLTutorials Repository](https://github.com/JuliaDataCubes/ESDLTutorials) From b28c5589a5579c80c034c1f8eb47a88a94beda26 Mon Sep 17 00:00:00 2001 From: Daniel Loos Date: Wed, 22 May 2024 11:28:28 +0200 Subject: [PATCH 02/10] Make get started consistent --- docs/src/.vitepress/config.mts | 8 ++------ docs/src/{getting_started.md => get_started.md} | 0 2 files changed, 2 insertions(+), 6 deletions(-) rename docs/src/{getting_started.md => get_started.md} (100%) diff --git a/docs/src/.vitepress/config.mts b/docs/src/.vitepress/config.mts index 9c810715..38f2fbba 100644 --- a/docs/src/.vitepress/config.mts +++ b/docs/src/.vitepress/config.mts @@ -72,7 +72,8 @@ export default defineConfig({ ], sidebar: [ - { text: 'Getting Started', link: '/getting_started' }, + { text: 'Get Started', link: '/get_started' }, + { text: 'API Reference', link: 'api' }, { text: 'User Guide', items: [ @@ -100,11 +101,6 @@ export default defineConfig({ { text: 'Contribute', link: 'development/contribute' }, { text: 'Contributors', link: 'development/contributors' } ] - }, { - text: 'API', - items: [ - { text: 'API Reference', link: 'api' }, - ] }, ], editLink: { diff --git a/docs/src/getting_started.md b/docs/src/get_started.md similarity index 100% rename from docs/src/getting_started.md rename to docs/src/get_started.md From a58dcacede7906e912b5d4ffaeb058a648e493b7 Mon Sep 17 00:00:00 2001 From: Daniel Loos Date: Wed, 22 May 2024 16:54:04 +0200 Subject: [PATCH 03/10] Update documentation --- docs/Project.toml | 1 + docs/src/.vitepress/config.mts | 14 +- docs/src/UserGuide/chunk.md | 91 ++++++++++++- docs/src/UserGuide/combine.md | 33 +++++ docs/src/UserGuide/compute.md | 88 ++++++++++++- docs/src/UserGuide/create.md | 35 +++++ docs/src/UserGuide/etc/applyfunctions.md | 75 ----------- docs/src/UserGuide/etc/indexing_subsetting.md | 90 ------------- docs/src/UserGuide/etc/saving.md | 107 --------------- docs/src/UserGuide/etc/setchuncks.md | 90 
------------- docs/src/UserGuide/read.md | 79 ++++------- docs/src/UserGuide/subset.md | 123 +++++++++++++++++- docs/src/UserGuide/types.md | 2 +- docs/src/UserGuide/write.md | 114 +++++++++++++++- docs/src/api.md | 4 + docs/src/index.md | 6 +- 16 files changed, 525 insertions(+), 427 deletions(-) create mode 100644 docs/src/UserGuide/combine.md create mode 100644 docs/src/UserGuide/create.md delete mode 100644 docs/src/UserGuide/etc/applyfunctions.md delete mode 100644 docs/src/UserGuide/etc/indexing_subsetting.md delete mode 100644 docs/src/UserGuide/etc/saving.md delete mode 100644 docs/src/UserGuide/etc/setchuncks.md diff --git a/docs/Project.toml b/docs/Project.toml index e584de44..43a7c871 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,6 +1,7 @@ [deps] BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" Bonito = "824d6782-a2ef-11e9-3a09-e5662e0c26f8" +CFTime = "179af706-886a-5703-950a-314cd64e0468" CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" diff --git a/docs/src/.vitepress/config.mts b/docs/src/.vitepress/config.mts index 38f2fbba..18ad08a6 100644 --- a/docs/src/.vitepress/config.mts +++ b/docs/src/.vitepress/config.mts @@ -35,13 +35,19 @@ export default defineConfig({ }, nav: [ { text: 'Home', link: '/' }, - { text: 'Getting Started', link: '/getting_started' }, + { text: 'Get Started', link: '/get_started' }, { text: 'User Guide', items: [ - { text: 'Read and Write', link: '/UserGuide/read_and_write' }, + { text: 'Read', link: '/UserGuide/read' }, + { text: 'Create', link: '/UserGuide/create' }, + { text: 'Write', link: '/UserGuide/write' }, + { text: 'Subset', link: '/UserGuide/subset' }, { text: 'Compute', link: '/UserGuide/compute' }, - { text: 'FAQ', link: '/UserGuide/faq' }, + { text: 'Group', link: '/UserGuide/group' }, + { text: 'Combine', link: '/UserGuide/combine' }, + { text: 'Chunk', link: '/UserGuide/chunk' }, + { text: 
'FAQ', link: '/UserGuide/faq' } ] }, { @@ -78,11 +84,13 @@ export default defineConfig({ text: 'User Guide', items: [ { text: 'Types', link: '/UserGuide/types' }, + { text: 'Create', link: '/UserGuide/create' }, { text: 'Read', link: '/UserGuide/read' }, { text: 'Write', link: '/UserGuide/write' }, { text: 'Subset', link: '/UserGuide/subset' }, { text: 'Compute', link: '/UserGuide/compute' }, { text: 'Group', link: '/UserGuide/group' }, + { text: 'Combine', link: '/UserGuide/combine' }, { text: 'Chunk', link: '/UserGuide/chunk' }, { text: 'FAQ', link: '/UserGuide/faq' } ] diff --git a/docs/src/UserGuide/chunk.md b/docs/src/UserGuide/chunk.md index fe73532e..da7874fb 100644 --- a/docs/src/UserGuide/chunk.md +++ b/docs/src/UserGuide/chunk.md @@ -1 +1,90 @@ -# Chunk YAXArrays \ No newline at end of file +# Chunk YAXArrays + +> [!IMPORTANT] +> Thinking about chunking is important when it comes to analyzing your data, because in most situations the data will not fit into memory, hence having the fastest read access to it is crucial for your workflows. For example, for geo-spatial data do you want fast access on time or space, or... think about it. + +To determine the chunk size of the array representation on disk, +call the `setchunks` function prior to saving. + +## Chunking YAXArrays + +````@example chunks +using YAXArrays, Zarr +a = YAXArray(rand(10,20)) +a_chunked = setchunks(a, (5,10)) +a_chunked.chunks +```` +And the saved file is also split into chunks. + +````@example chunks +f = tempname() +savecube(a_chunked, f, backend=:zarr) +Cube(f).chunks +```` + +Alternatively chunk sizes can be given by dimension name, so the following results in the same chunks: + +````@example chunks +a_chunked = setchunks(a, (Dim_2=10, Dim_1=5)) +a_chunked.chunks +```` + +## Chunking Datasets +`setchunks` can also be applied to a `Dataset`. + +### Set Chunks by Axis + +Set chunk size for each axis occurring in a `Dataset`. 
This will be applied to all variables in the dataset: + +````@example chunks +using YAXArrays, Zarr +ds = Dataset(x = YAXArray(rand(10,20)), y = YAXArray(rand(10)), z = YAXArray(rand(10,20,5))) +dschunked = setchunks(ds, Dict("Dim_1"=>5, "Dim_2"=>10, "Dim_3"=>2)) +Cube(dschunked).chunks +```` + +Saving... + +````@example chunks +f = tempname() +savedataset(dschunked, path=f, driver=:zarr) +```` + +### Set chunking by Variable + +The following will set the chunk size for each Variable separately +and results in exactly the same chunking as the example above + +````@example chunks +using YAXArrays, Zarr +ds = Dataset(x = YAXArray(rand(10,20)), y = YAXArray(rand(10)), z = YAXArray(rand(10,20,5))) +dschunked = setchunks(ds,(x = (5,10), y = Dict("Dim_1"=>5), z = (Dim_1 = 5, Dim_2 = 10, Dim_3 = 2))) +Cube(dschunked).chunks +```` + +saving... + +````@example chunks +f = tempname() +savedataset(dschunked, path=f, driver=:zarr) +```` + +### Set chunking for all variables + +The following code snippet only works when all member variables of the dataset have the same shape and sets the output chunks for all arrays. + +````@example chunks +using YAXArrays, Zarr +ds = Dataset(x = YAXArray(rand(10,20)), y = YAXArray(rand(10,20)), z = YAXArray(rand(10,20))) +dschunked = setchunks(ds,(5,10)) +Cube(dschunked).chunks +```` + +saving... + +````@example chunks +f = tempname() +savedataset(dschunked, path=f, driver=:zarr) +```` + +Suggestions on how to improve or add to these examples is welcome. diff --git a/docs/src/UserGuide/combine.md b/docs/src/UserGuide/combine.md new file mode 100644 index 00000000..7d31afb1 --- /dev/null +++ b/docs/src/UserGuide/combine.md @@ -0,0 +1,33 @@ +# Combine YAXArrays + +Data is often scattered across multiple files and corresponding arrays, e.g. one file per time step. +This section describes methods on how to combine them into a single YAXArray. 
+ +## Concatenate YAXArrays along an existing dimension + +Here we use `cat` to combine two arrays consisting of data from the first and the second half of a year into one single array containing the whole year. +We glue the arrays along the first dimension using `dims = 1`. +The resulting array `whole_year` still has one dimension, i.e. time, but with 12 instead of 6 elements. + +````@example cat +using YAXArrays + +first_half = YAXArray((Dim{:time}(1:6),), rand(6)) +second_half = YAXArray((Dim{:time}(7:12),), rand(6)) +whole_year = cat(first_half, second_half, dims = 1) +```` + +## Combine YAXArrays along a new dimension + +Here we use `concatenatecubes` to combine two arrays of different variables that share the same time dimension. +The resulting array `combined` has an additional dimension `variable` indicating from which array the element values originate. + +````@example concatenatecubes +using YAXArrays + +temperature = YAXArray((Dim{:time}(1:6),), rand(6)) +precipitation = YAXArray((Dim{:time}(1:6),), rand(6)) +cubes = [temperature,precipitation] +var_axis = Dim{:variable}(["temp", "prep"]) +combined = concatenatecubes(cubes, var_axis) +```` \ No newline at end of file diff --git a/docs/src/UserGuide/compute.md b/docs/src/UserGuide/compute.md index 81353891..01d7aa3a 100644 --- a/docs/src/UserGuide/compute.md +++ b/docs/src/UserGuide/compute.md @@ -1,16 +1,102 @@ # Compute YAXArrays -This section describes how to create new YAXArrays by performing arithmetic operations. +This section describes how to create new YAXArrays by performing operations on them. + +- Use [arithmetics](#Arithmetics) to add or multiply numbers to each element of an array +- Use [map](#map) to apply more complex functions to every element of an array +- Use [mapslices](#mapslices) to reduce a dimension, e.g. 
to get the mean over all time steps +- Use [mapCube](#mapCube) to apply complex functions on an array that may change any dimensions + + +Let's start by creating an example dataset: + +````@example compute +using YAXArrays +using Dates + +axlist = ( + Dim{:time}(Date("2022-01-01"):Day(1):Date("2022-01-30")), + Dim{:lon}(range(1, 10, length=10)), + Dim{:lat}(range(1, 5, length=15)), +) +data = rand(30, 10, 15) +properties = Dict(:origin => "user guide") +a = YAXArray(axlist, data, properties) +```` + +## Modify elements of a YAXArray + +````@example compute +a[1,2,3] +```` + +````@example compute +a[1,2,3] = 42 +```` + +````@example compute +a[1,2,3] +```` + +::: warning + +Some arrays, e.g. those saved in cloud object storage, are immutable, making any modification of the data impossible. + +::: + ## Arithmetics +Add a value to all elements of an array and save it as a new array: + +````@example compute +a2 = a .+ 5 +```` + +````@example compute +a2[1,2,3] == a[1,2,3] + 5 +```` + ## map +Apply a function on every element of an array individually: + +````@example compute +offset = 5 +map(a) do x + (x + offset) / 2 * 3 +end +```` + +This keeps all dimensions unchanged. +Note that here we cannot access neighboring elements. +In this case, we can use `mapslices` or `mapCube` instead. +Each element of the array is processed individually. + +The code runs very fast, because `map` applies the function lazily. +Actual computation will be performed only on demand, e.g. when elements are explicitly requested or further computations are performed. + + ## mapslices +Reduce the time dimension by calculating the average value of all time points: + +````@example compute +import Statistics: mean +mapslices(mean, a, dims="Time") +```` +There is no time dimension left, because there is only one value left after averaging all time steps. 
+We can also calculate spatial means resulting in one value per time step: + +````@example compute +import Statistics: mean +mapslices(mean, a, dims=("lat", "lon")) +```` + ## mapCube + ## Distributed Computation parallel \ No newline at end of file diff --git a/docs/src/UserGuide/create.md b/docs/src/UserGuide/create.md new file mode 100644 index 00000000..7e973192 --- /dev/null +++ b/docs/src/UserGuide/create.md @@ -0,0 +1,35 @@ +# Create YAXArrays and Datasets + +## Create a YAXArray + +We can create a new YAXArray by filling the values directly: + +````@example create +using YAXArrays +a1 = YAXArray(rand(10, 20, 5)) +```` + +We can also specify the dimensions with custom names enabling easier access: + +````@example create +using Dates + +axlist = ( + Dim{:time}(Date("2022-01-01"):Day(1):Date("2022-01-30")), + Dim{:lon}(range(1, 10, length=10)), + Dim{:lat}(range(1, 5, length=15)), +) +data2 = rand(30, 10, 15) +properties = Dict(:origin => "user guide") +a2 = YAXArray(axlist, data2, properties) +```` + +## Create a Dataset + +````@example create +data3 = rand(30, 10, 15) +a3 = YAXArray(axlist, data3, properties) + +arrays = Dict(:a2 => a2, :a3 => a3) +ds = Dataset(; properties, arrays...) +```` \ No newline at end of file diff --git a/docs/src/UserGuide/etc/applyfunctions.md b/docs/src/UserGuide/etc/applyfunctions.md deleted file mode 100644 index 85d5d6df..00000000 --- a/docs/src/UserGuide/etc/applyfunctions.md +++ /dev/null @@ -1,75 +0,0 @@ -# How to apply functions on YAXArrays - -To apply user defined functions on a YAXArray data type we can use the [`map`](@ref) function, -[`mapslices`](@ref) function or the [`mapCube`](@ref) function. Which of these functions should -be used depends on the layout of the data that the user defined function should be applied on. - -## Apply a function on every element of a datacube - -The `map` function can be used to apply a function on every entry of a YAXArray without taking -the dimensions into account. 
This will lazily register the mapped function which is applied when -the YAXArray is either accessed or when more involved computations are made. - -If we set up a dummy data cube which has all numbers between 1 and 10000. - -````@example applyF -using YAXArrays -using DimensionalData -axes = (Dim{:Lon}(1:10), Dim{:Lat}(1:10), Dim{:Time}(1:100)) -original = YAXArray(axes, reshape(1:10000, (10,10,100))) -nothing # hide -```` - -with one at the first position: - -````@ansi applyF -original[1,:,1] -```` -now we can substract `1` from all elements of this cube - -````@ansi applyF -substracted = map(x-> x-1, original) -```` - -`substracted` is a cube of the same size as `original`, and the applied function is registered, -so that it is applied as soon as the elements of `substracted` are either accessed or further used -in other computations. - -````@ansi applyF -substracted[1,:,1] -```` - -## Apply a function along dimensions of a single cube - -If an function should work along a certain dimension of the data you can use the `mapslices` function -to easily apply this function. This doesn't give you the flexibility of the `mapCube` function but it -is easier to use for simple functions. - -If we set up a dummy data cube which has all numbers between 1 and 10000. - -````@ansi applyF -axes = (Dim{:Lon}(1:10), Dim{:Lat}(1:10), Dim{:Time}(1:100)) -original = YAXArray(axes, reshape(1:10000, (10,10,100))) -```` - -and then we would like to compute the sum over the Time dimension: - -````@ansi applyF -timesum = mapslices(sum, original, dims="Time") -```` - -this reduces over the time dimension and gives us the following values - -````@ansi applyF -timesum[:,:] -```` - -You can also apply a function along multiple dimensions of the same data cube. 
- -````@ansi applyF -lonlatsum = mapslices(sum, original, dims=("Lon", "Lat")) -```` - -## Multiple input cubes to a function - -TODO \ No newline at end of file diff --git a/docs/src/UserGuide/etc/indexing_subsetting.md b/docs/src/UserGuide/etc/indexing_subsetting.md deleted file mode 100644 index 1651017e..00000000 --- a/docs/src/UserGuide/etc/indexing_subsetting.md +++ /dev/null @@ -1,90 +0,0 @@ -# Indexing, subsetting and selectors - -All these operations are done via [`DimensionalData.jl`](https://rafaqz.github.io/DimensionalData.jl/dev/). - -````@example indexing -using YAXArrays, Dates -```` - -## Define a toy cube - -````@ansi indexing -t = Date("2020-01-01"):Month(1):Date("2022-12-31") -axes = (Dim{:lon}(-9:10), Dim{:lat}(-5:15), Dim{:time}(t)) -c = YAXArray(axes, reshape(1:20*21*36, (20, 21, 36))) -```` - -A very convinient selector is `lookup`, getting for example the values for `lon` and `time`. - -## lookup - -````@example indexing -lon = lookup(c, :lon) -```` - -````@example indexing -tempo = lookup(c, :time) -```` - - -## Selectors - -### `At` value - -````@ansi indexing -c[time = At(DateTime("2021-05-01"))] -```` - -### `At` vector of values - -````@ansi indexing -c[time = At([DateTime("2021-05-01"), DateTime("2021-06-01")])] -```` - -similarly for any of the spatial dimensions: - -````@ansi indexing -c[lon = At([-9,-5])] -```` - -### `At` values with tolerance (`atol`, `rtol`) - -````@ansi indexing -c[lon = At([-10, 11]; atol = 1)] -```` -## Subsetting - -This is also done with selectors, see the following examples - -### Between - -Altought a `Between(a,b)` function is available in `DimensionalData`, is recommended to use instead the `a .. b` notation: - -````@ansi indexing -c[lon = -9 .. -7] # close interval, all points included. -```` - -More selectors from DimensionalData are available, such as `Touches`, `Near`, `Where` and `Contains`. 
- - -### Open/Close Intervals - -````@example indexing -using IntervalSets -```` - -````@ansi indexing -c[lon = OpenInterval(-9, -7)] -```` - -````@ansi indexing -c[lon = ClosedInterval(-9, -7)] -```` -````@ansi indexing -c[lon =Interval{:open,:closed}(-9,-7)] -```` -````@ansi indexing -c[lon =Interval{:closed,:open}(-9,-7)] -```` - -See tutorials for use cases. \ No newline at end of file diff --git a/docs/src/UserGuide/etc/saving.md b/docs/src/UserGuide/etc/saving.md deleted file mode 100644 index e48e0d11..00000000 --- a/docs/src/UserGuide/etc/saving.md +++ /dev/null @@ -1,107 +0,0 @@ -# Saving YAXArrays and Datasets - -Is possible to save datasets and `YAXArray` directly to `zarr` files. - -## Saving a YAXArray to Zarr - -One can save any `YAXArray` using the `savecube` function. -Simply add a path as an argument and the cube will be saved. - -````@example saveYAX -using YAXArrays, Zarr -a = YAXArray(rand(10,20)) -savecube(a, "our_yax.zarr", driver=:zarr) -nothing # hide -```` - - -## Saving a YAXArray to NetCDF - -Saving to NetCDF works exactly the same way: - -````@example saveYAX -using YAXArrays, Zarr, NetCDF -a = YAXArray(rand(10,20)) -savecube(a, "our_yax.nc", driver=:netcdf) -nothing # hide -```` - -## Saving a Dataset - -Saving Datasets can be done using the `savedataset` function. - -````@example saveDataset -using YAXArrays, Zarr -ds = Dataset(x = YAXArray(rand(10,20)), y = YAXArray(rand(10))) -f = "our_dataset.zarr" -savedataset(ds, path=f, driver=:zarr) -nothing # hide -```` - -## Overwriting a Dataset -If a path already exists, an error will be thrown. Set `overwrite=true` to delete the existing dataset - -````@example saveDataset -savedataset(ds, path=f, driver=:zarr, overwrite=true) -nothing # hide -```` - -::: danger - -Again, setting `overwrite` will delete all your previous saved data. 
- -::: - -Look at the doc string for more information - -````@docs -savedataset -```` - -## Appending to a Dataset - -New variables can be added to an existing dataset using the `append=true` keyword. - -````@example saveDataset -ds2 = Dataset(z = YAXArray(rand(10,20,5))) -savedataset(ds2, path=f, backend=:zarr, append=true) -nothing # hide -```` - -````@ansi saveDataset -open_dataset(f, driver=:zarr) -```` - -## Datacube Skeleton without the actual data -Sometimes one merely wants to create a datacube "Skeleton" on disk and gradually fill it with data. Here we make use of `FillArrays` to create a `YAXArray` and write only the axis data and array metadata to disk, while no actual array data is copied: - -````@example saveDataset -using YAXArrays, Zarr, FillArrays -```` - -create the `Zeros` array - -````@ansi saveDataset -a = YAXArray(Zeros(Union{Missing, Int32}, 10, 20)) -```` - -and save them as - -````@example saveDataset -r = savecube(a, "skeleton.zarr", driver=:zarr, skeleton=true) -nothing # hide -```` - -and check that all the values are `missing` - -````@example saveDataset -all(ismissing,r[:,:]) -```` - -If using `FillArrays` is not possible, using the `zeros` function works as well, though it does allocate the array in memory. - -::: info - -The `skeleton` argument is also available for `savedataset`. - -::: diff --git a/docs/src/UserGuide/etc/setchuncks.md b/docs/src/UserGuide/etc/setchuncks.md deleted file mode 100644 index a02068c4..00000000 --- a/docs/src/UserGuide/etc/setchuncks.md +++ /dev/null @@ -1,90 +0,0 @@ -# Set chunks - -> [!IMPORTANT] -> Thinking about chunking is important when it comes to analyzing your data, because in most situations this will not fit into memory, hence having the fastest read access to it is crucial for your workflows. For example, for geo-spatial data do you want fast access on time or space, or... think about it. 
- -To determine the chunk size of the array representation on disk, -call the `setchunks` function prior to saving. - -## Chunking YAXArrays - -````@example chunks -using YAXArrays, Zarr -a = YAXArray(rand(10,20)) -a_chunked = setchunks(a, (5,10)) -a_chunked.chunks -```` -And the saved file is also splitted into Chunks. - -````@example chunks -f = tempname() -savecube(a_chunked, f, backend=:zarr) -Cube(f).chunks -```` - -Alternatively chunk sizes can be given by dimension name, so the following results in the same chunks: - -````@example chunks -a_chunked = setchunks(a, (Dim_2=10, Dim_1=5)) -a_chunked.chunks -```` - -## Chunking Datasets -Setchunks can also be applied to a `Dataset`. - -### Set Chunks by Axis - -Set chunk size for each axis occuring in a `Dataset`. This will be applied to all variables in the dataset: - -````@example chunks -using YAXArrays, Zarr -ds = Dataset(x = YAXArray(rand(10,20)), y = YAXArray(rand(10)), z = YAXArray(rand(10,20,5))) -dschunked = setchunks(ds, Dict("Dim_1"=>5, "Dim_2"=>10, "Dim_3"=>2)) -Cube(dschunked).chunks -```` - -Saving... - -````@example chunks -f = tempname() -savedataset(dschunked, path=f, driver=:zarr) -```` - -### Set chunking by Variable - -The following will set the chunk size for each Variable separately -and results in exactly the same chunking as the example above - -````@example chunks -using YAXArrays, Zarr -ds = Dataset(x = YAXArray(rand(10,20)), y = YAXArray(rand(10)), z = YAXArray(rand(10,20,5))) -dschunked = setchunks(ds,(x = (5,10), y = Dict("Dim_1"=>5), z = (Dim_1 = 5, Dim_2 = 10, Dim_3 = 2))) -Cube(dschunked).chunks -```` - -saving... - -````@example chunks -f = tempname() -savedataset(dschunked, path=f, driver=:zarr) -```` - -### Set chunking for all variables - -The following code snippet only works when all member variables of the dataset have the same shape and sets the output chunks for all arrays. 
- -````@example chunks -using YAXArrays, Zarr -ds = Dataset(x = YAXArray(rand(10,20)), y = YAXArray(rand(10,20)), z = YAXArray(rand(10,20))) -dschunked = setchunks(ds,(5,10)) -Cube(dschunked).chunks -```` - -saving... - -````@example chunks -f = tempname() -savedataset(dschunked, path=f, driver=:zarr) -```` - -Suggestions on how to improve or add to these examples is welcome. diff --git a/docs/src/UserGuide/read.md b/docs/src/UserGuide/read.md index 0a662a02..05b6d983 100644 --- a/docs/src/UserGuide/read.md +++ b/docs/src/UserGuide/read.md @@ -1,72 +1,45 @@ # Read YAXArrays and Datasets -Here we learn how to open files into arrays and datasets. +Here we learn how to open files as arrays and datasets. -## NetCDF -In this example we are going to use a `NetCDF` file. To open a single data file we first need to load the appropriate backend package via `using NetCDF`. +## Read Zarr -### File with one variable +Open a Zarr store as a `Dataset`: -````@example open_nc -using YAXArrays, NetCDF -using DiskArrays -using Downloads -url = "https://www.unidata.ucar.edu/software/netcdf/examples/tos_O1_2001-2002.nc" -filename = Downloads.download(url, "tos_O1_2001-2002.nc") # you pick your own path -nothing # hide +````@example read_zarr +using YAXArrays +using Zarr +path="gs://cmip6/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp585/r1i1p1f1/3hr/tas/gn/v20190710/" +store = zopen(path, consolidated=true) +ds = open_dataset(store) ```` -````@ansi open_nc -c = Cube(filename) -```` - -### File with multiple variables, mixed dimensions +We can set `path` to a URL, a local directory, or in this case to a cloud object storage path. -When the dataset contains variables with different dimensions you should use `open_dataset` as in +A zarr store may contain multiple arrays. 
+Individual arrays can be accessed using subsetting: -````@example open_nc -path2file = "https://www.unidata.ucar.edu/software/netcdf/examples/sresa1b_ncar_ccsm3-example.nc" -filename = Downloads.download(path2file, "sresa1b_ncar_ccsm3-example.nc") -c = open_dataset(filename) -nothing # hide +````@example read_zarr +ds.tas ```` -````@ansi open_nc -c -```` +## Read NetCDF -Afterwards, selecting a variable as usual works, i.e. +Open a NetCDF file as a `Dataset`: -````@ansi open_nc -c["ua"] -```` +````@example read_netcdf +using YAXArrays +using NetCDF +using Downloads: download -or - -````@ansi open_nc -c["tas"] +path = download("https://www.unidata.ucar.edu/software/netcdf/examples/tos_O1_2001-2002.nc", "example.nc") +ds = open_dataset(path) ```` -Note that their output is a YAXArray. - -## Zarr +A NetCDF file may contain multiple arrays. +Individual arrays can be accessed using subsetting: -````@example open_zarr -using Zarr, YAXArrays -store ="gs://cmip6/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp585/r1i1p1f1/3hr/tas/gn/v20190710/" +````@example read_netcdf +ds.tos ```` - -Open and select the `tas` variable, - -````@ansi open_zarr -g = open_dataset(zopen(store, consolidated=true)) -```` - -get variable - -````@ansi open_zarr -c = g["tas"] -```` - -After this operate on it as usual. diff --git a/docs/src/UserGuide/subset.md b/docs/src/UserGuide/subset.md index 081ba334..fd53855b 100644 --- a/docs/src/UserGuide/subset.md +++ b/docs/src/UserGuide/subset.md @@ -1 +1,122 @@ -# Subset YAXArrays and Datasets \ No newline at end of file +# Subset YAXArrays and Datasets + +The dimensions or axes of an `YAXArray` are named making it easier to select or query certain ranges of an array. 
+Let's open an example `Dataset` used to select certain elements: + +````@example subset +using YAXArrays +using NetCDF +using Downloads: download + +path = download("https://www.unidata.ucar.edu/software/netcdf/examples/tos_O1_2001-2002.nc", "example.nc") +ds = open_dataset(path) +```` + +## Select a YAXArray + +Get the sea surface temperature of the `Dataset`: + +````@example subset +tos = ds.tos +```` + +which is the same as: + +````@example subset +tos = ds.cubes[:tos] +```` + +## Select elements + +Using positional integer indexing: + +````@example subset +tos[lon = 1, lat = 1] +```` + +Same but using named indexing: + +````@example subset +tos[lon = At(1), lat = At(-79.5)] +```` + +Using special types: + +````@example subset +using CFTime +time1 = DateTime360Day(2001,01,16) +tos[time = At(time1)] +```` + +## Select ranges + +Here we subset an interval of a dimension using positional integer indexing. + +````@example subset +tos[lon = 1:10, lat = 1:10] +```` + +Same but using named indexing: + +````@example subset +tos[lon = At(1.0:2:19), lat = At(-79.5:1:-70.5)] +```` + +Read more about the `At` selector in the package `DimensionalData`. +Get values within a tolerance: + +````@example subset +tos[lon = At(1:10; atol = 1)] +```` + +## Closed and open intervals + +Although a `Between(a,b)` function is available in `DimensionalData`, it is recommended to use the `a .. b` notation instead: + +````@example subset +tos[lon = 90 .. 180] +```` + +This describes a closed interval in which all points are included. +More selectors from DimensionalData are available, such as `Touches`, `Near`, `Where` and `Contains`. + + +````@example subset +using IntervalSets +```` + +````@ansi subset +tos[lon = OpenInterval(90, 180)] +```` + +````@ansi subset +tos[lon = ClosedInterval(90, 180)] +```` +````@ansi subset +tos[lon =Interval{:open,:closed}(90,180)] +```` +````@ansi subset +tos[lon =Interval{:closed,:open}(90,180)] +```` + +See tutorials for use cases. 
+ +## Get a dimension + +Get values, e.g., axis tick labels, of a dimension that can be used for subsetting: + +````@example subset +collect(tos.lat) +```` + +These values are defined as lookups in the package `DimensionalData`: + +````@example subset +lookup(tos, :lon) +```` + +which is equivalent to: + +````@example subset +tos.lon.val +```` \ No newline at end of file diff --git a/docs/src/UserGuide/types.md b/docs/src/UserGuide/types.md index fe48e83c..b1077dc5 100644 --- a/docs/src/UserGuide/types.md +++ b/docs/src/UserGuide/types.md @@ -10,7 +10,7 @@ A one dimensional array is called `Vector` and a two dimensional array is called In many Machine Learning libraries, arrays are also called tensors. Arrays are designed to store dense spatial-temporal data stored in a grid, whereas a collection of sparse points is usually stored in data frames or relational databases. -A `DimArray` as defined by (DimensionalData.jl)(https://rafaqz.github.io/DimensionalData.jl/dev/) adds names to the dimensions and their axes ticks for a given `Array`. +A `DimArray` as defined by [DimensionalData.jl](https://rafaqz.github.io/DimensionalData.jl/dev/) adds names to the dimensions and their axes ticks for a given `Array`. These names can be used to access the data, e.g., by date instead of just by integer position. A `YAXArray` is a subtype of a `AbstractDimArray` and adds functions to load and process the named arrays. 
diff --git a/docs/src/UserGuide/write.md b/docs/src/UserGuide/write.md index 3554c935..f2f310ea 100644 --- a/docs/src/UserGuide/write.md +++ b/docs/src/UserGuide/write.md @@ -1,5 +1,115 @@ # Write YAXArrays and Datasets -## NetCDF +Create an example Dataset: + +````@example write +using YAXArrays +using NetCDF +using Downloads: download + +path = download("https://www.unidata.ucar.edu/software/netcdf/examples/tos_O1_2001-2002.nc", "example.nc") +ds = open_dataset(path) +```` + +## Write Zarr + +Save a single YAXArray to a directory: + +````@example write +using Zarr +savecube(ds.tos, "tos.zarr", driver=:zarr) +nothing # hide +```` + +Save an entire Dataset to a directory: + +````@example write +savedataset(ds, path="ds.zarr", driver=:zarr) +nothing # hide +```` + +## Write NetCDF + +Save a single YAXArray to a file: + +````@example write +using NetCDF +savecube(ds.tos, "tos.nc", driver=:netcdf) +nothing # hide +```` + +Save an entire Dataset to a file: + +````@example write +savedataset(ds, path="ds.nc", driver=:netcdf) +nothing # hide +```` + +## Overwrite a Dataset +If a path already exists, an error will be thrown. Set `overwrite=true` to delete the existing dataset. + +````@example write +savedataset(ds, path="ds.zarr", driver=:zarr, overwrite=true) +nothing # hide +```` + +::: danger + +Again, setting `overwrite` will delete all your previously saved data. + +::: + +Look at the doc string for more information. + +````@docs +savedataset +```` + +## Append to a Dataset + +New variables can be added to an existing dataset using the `append=true` keyword. + +````@example write +ds2 = Dataset(z = YAXArray(rand(10,20,5))) +savedataset(ds2, path="ds.zarr", backend=:zarr, append=true) +nothing # hide +```` + +````@ansi write +open_dataset("ds.zarr", driver=:zarr) +```` + +## Save Skeleton +Sometimes one merely wants to create a datacube "Skeleton" on disk and gradually fill it with data. 
Here we make use of `FillArrays` to create a `YAXArray` and write only the axis data and array metadata to disk, while no actual array data is copied: + +````@example write +using YAXArrays, Zarr, FillArrays +```` + +create the `Zeros` array + +````@ansi write +a = YAXArray(Zeros(Union{Missing, Int32}, 10, 20)) +```` + +and save them as + +````@example write +r = savecube(a, "skeleton.zarr", driver=:zarr, skeleton=true) +nothing # hide +```` + +and check that all the values are `missing` + +````@example write +all(ismissing,r[:,:]) +```` + +If using `FillArrays` is not possible, using the `zeros` function works as well, though it does allocate the array in memory. + +::: info + +The `skeleton` argument is also available for `savedataset`. + +::: -## Zarr \ No newline at end of file diff --git a/docs/src/api.md b/docs/src/api.md index 56a450a3..0d9839ff 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -1,3 +1,7 @@ +# API Reference + +This section describes all available functions of this package. 
+ ## Public API ```@meta DocTestSetup= quote diff --git a/docs/src/index.md b/docs/src/index.md index 347660a4..af70d0e6 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -11,9 +11,9 @@ hero: src: /logo.png alt: VitePress actions: - - theme: brand - text: Getting Started - link: /getting_started + - theme: alt + text: Get Started + link: /get_started - theme: alt text: View on Github link: https://github.com/JuliaDataCubes/YAXArrays.jl From c868ffacf21a812839a88da15e18d0ac90080d60 Mon Sep 17 00:00:00 2001 From: Daniel Loos Date: Thu, 23 May 2024 14:23:04 +0200 Subject: [PATCH 04/10] Integrate distributed, creating and cube from func --- docs/src/.vitepress/config.mts | 15 +- docs/src/UserGuide/combine.md | 9 +- docs/src/UserGuide/compute.md | 130 ++++++++++++- docs/src/UserGuide/convert.md | 31 +++ docs/src/UserGuide/create.md | 11 ++ .../etc/create_cube_from_function.md | 81 -------- docs/src/UserGuide/etc/creating.md | 183 ------------------ docs/src/UserGuide/etc/distributed.md | 67 ------- docs/src/UserGuide/read.md | 2 +- docs/src/UserGuide/{subset.md => select.md} | 4 +- docs/src/UserGuide/types.md | 19 +- 11 files changed, 193 insertions(+), 359 deletions(-) create mode 100644 docs/src/UserGuide/convert.md delete mode 100644 docs/src/UserGuide/etc/create_cube_from_function.md delete mode 100644 docs/src/UserGuide/etc/creating.md delete mode 100644 docs/src/UserGuide/etc/distributed.md rename docs/src/UserGuide/{subset.md => select.md} (96%) diff --git a/docs/src/.vitepress/config.mts b/docs/src/.vitepress/config.mts index 18ad08a6..e6f86a4d 100644 --- a/docs/src/.vitepress/config.mts +++ b/docs/src/.vitepress/config.mts @@ -40,13 +40,14 @@ export default defineConfig({ text: 'User Guide', items: [ { text: 'Read', link: '/UserGuide/read' }, - { text: 'Create', link: '/UserGuide/create' }, { text: 'Write', link: '/UserGuide/write' }, - { text: 'Subset', link: '/UserGuide/subset' }, + { text: 'Convert', link: '/UserGuide/convert' }, + { text: 'Create', 
link: '/UserGuide/create' }, + { text: 'Select', link: '/UserGuide/select' }, { text: 'Compute', link: '/UserGuide/compute' }, + { text: 'Chunk', link: '/UserGuide/chunk' }, { text: 'Group', link: '/UserGuide/group' }, { text: 'Combine', link: '/UserGuide/combine' }, - { text: 'Chunk', link: '/UserGuide/chunk' }, { text: 'FAQ', link: '/UserGuide/faq' } ] }, @@ -83,15 +84,15 @@ export default defineConfig({ { text: 'User Guide', items: [ - { text: 'Types', link: '/UserGuide/types' }, - { text: 'Create', link: '/UserGuide/create' }, { text: 'Read', link: '/UserGuide/read' }, { text: 'Write', link: '/UserGuide/write' }, - { text: 'Subset', link: '/UserGuide/subset' }, + { text: 'Convert', link: '/UserGuide/convert' }, + { text: 'Create', link: '/UserGuide/create' }, + { text: 'Select', link: '/UserGuide/select' }, { text: 'Compute', link: '/UserGuide/compute' }, + { text: 'Chunk', link: '/UserGuide/chunk' }, { text: 'Group', link: '/UserGuide/group' }, { text: 'Combine', link: '/UserGuide/combine' }, - { text: 'Chunk', link: '/UserGuide/chunk' }, { text: 'FAQ', link: '/UserGuide/faq' } ] }, diff --git a/docs/src/UserGuide/combine.md b/docs/src/UserGuide/combine.md index 7d31afb1..73bde3c5 100644 --- a/docs/src/UserGuide/combine.md +++ b/docs/src/UserGuide/combine.md @@ -3,7 +3,7 @@ Data is often scattered across multiple files and corresponding arrays, e.g. one file per time step. This section describes methods on how to combine them into a single YAXArray. -## Concatenate YAXArrays along an existing dimension +## `cat` along an existing dimension Here we use `cat` to combine two arrays consisting of data from the first and the second half of a year into one single array containing the whole year. 
We glue the arrays along the first dimension using `dims = 1`: @@ -17,10 +17,11 @@ second_half = YAXArray((Dim{:time}(7:12),), rand(6)) whole_year = cat(first_half, second_half, dims = 1) ```` -## Combine YAXArrays along a new dimension +## `concatenatecubes` to a new dimension -Here we use `concatenatecubes` to combine two arrays of different variables that share the same time dimension. +Here we use `concatenatecubes` to combine two arrays of different variables that have the same dimensions. The resulting array `combined` has an additional dimension `variable` indicating from which array the element values originates. +Note that using a `Dataset` instead is a more flexible approach in handling different variables. ````@example concatenatecubes using YAXArrays @@ -30,4 +31,4 @@ precipitation = YAXArray((Dim{:time}(1:6),), rand(6)) cubes = [temperature,precipitation] var_axis = Dim{:variable}(["temp", "prep"]) combined = concatenatecubes(cubes, var_axis) -```` \ No newline at end of file +```` diff --git a/docs/src/UserGuide/compute.md b/docs/src/UserGuide/compute.md index 01d7aa3a..c7e98034 100644 --- a/docs/src/UserGuide/compute.md +++ b/docs/src/UserGuide/compute.md @@ -21,7 +21,7 @@ axlist = ( ) data = rand(30, 10, 15) properties = Dict(:origin => "user guide") -a = YAXArray(axlist, data2, properties) +a = YAXArray(axlist, data, properties) ```` ## Modify elements of a YAXArray @@ -57,7 +57,7 @@ a2 = a .+ 5 a2[1,2,3] == a[1,2,3] + 5 ```` -## map +## `map` Apply a function on every element of an array individually: @@ -77,9 +77,9 @@ The code runs very fast, because `map` applies the function lazily. Actual computation will be performed only on demand, e.g. when elements were explicitly requested or further computations were performed. 
-## mapslices +## `mapslices` -Reduce the time dimension by calculating the average value of all time points: +Reduce the time dimension by calculating the average value of all points in time: ````@example compute import Statistics: mean @@ -89,14 +89,130 @@ There is no time dimension left, because there is only one value left after aver We can also calculate spatial means resulting in one value per time step: ````@example compute -import Statistics: mean mapslices(mean, a, dims=("lat", "lon")) ```` -## mapCube +## `mapCube` + +`mapCube` is the most flexible way to apply a function over subsets of an array. +Dimensions may be added or removed. + +Here we transform a raster array with spatial dimension lat and lon into a vector array having just one spatial dimension i.e. region. +First, create the raster array: + +````@example compute_mapcube +using YAXArrays +using DimensionalData +using Dates + +axlist = ( + Dim{:time}(Date("2022-01-01"):Day(1):Date("2022-01-30")), + Dim{:lon}(range(1, 10, length=10)), + Dim{:lat}(range(1, 5, length=15)), +) +data = rand(30, 10, 15) +raster_arr = YAXArray(axlist, data) +```` + +Then, create a Matrix with the same spatial dimensions indicating to which region each point belongs to: + +````@example compute_mapcube +regions_mat = map(Iterators.product(raster_arr.lon, raster_arr.lat)) do (lon, lat) + 1 <= lon < 10 && 1 <= lat < 5 && return "A" + 1 <= lon < 10 && 5 <= lat < 10 && return "B" + 10 <= lon < 15 && 1 <= lat < 5 && return "C" + return "D" +end +regions_mat = DimArray(regions_mat, (raster_arr.lon, raster_arr.lat)) +```` + +which has the same spatial dimensions as the raster array at any given point in time: + +````@example compute_mapcube +DimArray(raster_arr[time = 1]) +```` + +Now we calculate the list of corresponding points for each region. +This will be re-used for each point in time during the final `mapCube`. +In addition, this avoids the allocation of unnecessary memory. 
+ +````@example compute_mapcube +regions = ["A", "B", "C", "D"] +points_of_regions = map(enumerate(regions)) do (i,region) + region => findall(isequal(region), regions_mat) +end |> Dict |> sort +```` + +Finally, we can transform the entire raster array: + +````@example compute_mapcube +vector_array = mapCube( + raster_arr, + indims=InDims("lon", "lat"), + outdims=OutDims(Dim{:region}(regions)) +) do xout, xin + for (region_pos, points) in enumerate(points_of_regions.vals) + # aggregate values of points in the current region at the current date + xout[region_pos] = sum(view(xin, points)) + end +end +```` + +This gives us a vector array with only one spatial dimension, i.e. the region. +Note that we still have 30 points in time. +The transformation was applied for each date separately. +Here, `xin` is a 10x15 array representing a map at a given time and `xout` is a 4 element vector of missing values initially representing the 4 regions at that date. Then, we set each output element to the sum of all corresponding points. ## Distributed Computation -parallel \ No newline at end of file +All map methods apply a function on all elements of all non-input dimensions separately. +This allows each map function call to run in parallel. +For example, we can execute each date of a time series in a different CPU thread during spatial aggregation. 
+ +The following code does a time mean over all grid points using multiple CPUs of a local machine: + +````julia +using YAXArrays +using Dates +using Distributed + +axlist = ( + Dim{:time}(Date("2022-01-01"):Day(1):Date("2022-01-30")), + Dim{:lon}(range(1, 10, length=10)), + Dim{:lat}(range(1, 5, length=15)), +) +data = rand(30, 10, 15) +properties = Dict(:origin => "user guide") +a = YAXArray(axlist, data, properties) + +addprocs(2) + +@everywhere begin + using YAXArrays + using Zarr + using Statistics +end + +@everywhere function mymean(output, pixel) + @show "doing a mean" + output[:] .= mean(pixel) +end + +mapCube(mymean, a, indims=InDims("time"), outdims=OutDims()) +```` + +In the last example, `mapCube` was used to map the `mymean` function. `mapslices` is a convenient function that can replace `mapCube`, where you can omit defining an extra function with the output argument as an input (e.g. `mymean`). It is possible to simply use `mapslices`: + +````julia +mapslices(mean ∘ skipmissing, a, dims="time") +```` + +It is also possible to easily distribute the workload on a cluster, with little modification to the code. To do so, we use the `ClusterManagers` package. + +````julia +using Distributed +using ClusterManagers +addprocs(SlurmManager(10)) +```` \ No newline at end of file diff --git a/docs/src/UserGuide/convert.md b/docs/src/UserGuide/convert.md new file mode 100644 index 00000000..0eddd171 --- /dev/null +++ b/docs/src/UserGuide/convert.md @@ -0,0 +1,31 @@ +# Convert YAXArrays + +This section describes how to convert variables from types of other Julia packages into YAXArrays and vice versa. + + +::: warning + +YAXArrays is designed to work with large datasets that are much larger than the available memory. +However, most types are designed to work in memory. +Those conversions are only possible if the entire dataset fits into memory. +In addition, metadata might be lost during conversion. 
+ +::: + + +## Convert `Base.Array` + +Convert `Base.Array` to `YAXArray`: + +````@example convert +using YAXArrays + +m = rand(5,10) +a = YAXArray(m) +```` + +Convert `YAXArray` to `Base.Array`: + +````@example convert +m2 = collect(a.data) +```` \ No newline at end of file diff --git a/docs/src/UserGuide/create.md b/docs/src/UserGuide/create.md index 7e973192..835c7436 100644 --- a/docs/src/UserGuide/create.md +++ b/docs/src/UserGuide/create.md @@ -1,5 +1,7 @@ # Create YAXArrays and Datasets +This section describes how to create arrays and datasets by filling values directly. + ## Create a YAXArray We can create a new YAXArray by filling the values directly: @@ -9,6 +11,7 @@ using YAXArrays a1 = YAXArray(rand(10, 20, 5)) ```` +The dimensions have only generic names, e.g. `Dim_1` and only integer values. We can also specify the dimensions with custom names enabling easier access: ````@example create @@ -24,6 +27,14 @@ properties = Dict(:origin => "user guide") a2 = YAXArray(axlist, data2, properties) ```` +````@example create +a2.properties +```` + +````@example create +a2.axes +```` + ## Create a Dataset ````@example create diff --git a/docs/src/UserGuide/etc/create_cube_from_function.md b/docs/src/UserGuide/etc/create_cube_from_function.md deleted file mode 100644 index e4b5d535..00000000 --- a/docs/src/UserGuide/etc/create_cube_from_function.md +++ /dev/null @@ -1,81 +0,0 @@ -# Create cube / YAXArray from function - -````@example create_cube -using YAXArrays, Zarr -using Dates -```` - -## Define function in space and time - -````@example create_cube -f(lo, la, t) = (lo + la + Dates.dayofyear(t)) -```` - -Wrap function for mapCube output - -````@example create_cube -function g(xout,lo,la,t) - xout .= f.(lo,la,t) -end -```` - -Note the applied `.` after `f`, this is because we will slice/broadcasted across time. - -## Create Cube's Axes - -We wrap the dimensions of every axis into a YAXArray to use them in the mapCube function. 
- -````@ansi create_cube -lon = YAXArray(Dim{:lon}(range(1, 15))) -lat = YAXArray(Dim{:lat}(range(1, 10))) -```` - -And a time axis - -````@ansi create_cube -tspan = Date("2022-01-01"):Day(1):Date("2022-01-30") -time = YAXArray(Dim{:time}( tspan)) -```` - -## Generate Cube - -The following generates a new `cube` using `mapCube` and saving the output directly to disk. - -````@example create_cube -gen_cube = mapCube(g, (lon, lat, time); - indims = (InDims(), InDims(), InDims("time")), - outdims = OutDims("time", overwrite=true, - path = "my_gen_cube.zarr", backend=:zarr, outtype=Float32), - ## max_cache=1e9 - ) -nothing # hide -```` -!!! warning "time axis is first" - Note that currently the `time` axis in the output cube goes first. - -Check that it is working - -````@ansi create_cube -gen_cube.data[1,:,:] -```` - -## Change output order - -The following generates a new `cube` using `mapCube` and saving the output directly to disk. - -````@example create_cube -gen_cube = mapCube(g, (lon, lat, time); - indims = (InDims("lon"), InDims(), InDims()), - outdims = OutDims("lon", overwrite=true, - path = "my_gen_cube.zarr", backend=:zarr, outtype=Float32), - ## max_cache=1e9 - ) -nothing # hide -```` - -!!! info "slicing dim" - Note that now the broadcasted dimension is `lon`. - -````@ansi create_cube -gen_cube.data[:, :, 1] -```` \ No newline at end of file diff --git a/docs/src/UserGuide/etc/creating.md b/docs/src/UserGuide/etc/creating.md deleted file mode 100644 index 305774ee..00000000 --- a/docs/src/UserGuide/etc/creating.md +++ /dev/null @@ -1,183 +0,0 @@ -# Creating YAXArrays and Datasets - -Here, we use `YAXArray` when the variables share dimensions and `Dataset` otherwise. - -## Creating a YAXArray - -````@example creating -using YAXArrays -using DimensionalData: DimensionalData as DD -using DimensionalData -```` - -````@ansi creating -a = YAXArray(rand(10, 20, 5)) -```` - -if no names are defined then default ones will be used, i.e. `Dim_1`, `Dim_2`. 
- -Get data from each Dimension with - -````@example creating -a.Dim_1 -```` - -or with - -````@example creating -getproperty(a, :Dim_1) -```` - -or even better with the `DD` `lookup` function - -````@example creating -lookup(a, :Dim_1) -```` - -## Creating a YAXArray with named axis - -The two most used axis are `RangeAxis` and `CategoricalAxis`. Here, we use a combination of them to create a `time`, `lon` and `lat` axis and a Categorical Axis for two variables. - -### Axis definitions - -````@ansi creating -using Dates -axlist = ( - Dim{:time}(Date("2022-01-01"):Day(1):Date("2022-01-30")), - Dim{:lon}(range(1, 10, length=10)), - Dim{:lat}(range(1, 5, length=15)), - Dim{:Variable}(["var1", "var2"]) - ) -```` - -And the corresponding data - -````@example creating -data = rand(30, 10, 15, 2); -nothing # hide -```` - -then, the `YAXArray` is - -````@ansi creating -ds = YAXArray(axlist, data) -```` - -### Select variables - -````@ansi creating -ds[Variable = At("var1"), lon = DD.Between(1,2.1)] -```` - -!!! info - Please note that selecting elements in YAXArrays is done via the `DimensionalData.jl` syntax. - For more information checkout the [docs](https://rafaqz.github.io/DimensionalData.jl/dev/). - - -````@ansi creating -subset = ds[ - time = DD.Between( Date("2022-01-01"), Date("2022-01-10")), - lon=DD.Between(1,2), - Variable = At("var2") - ] -```` - -### Properties / Attributes - -You might also want to add additional properties to your YAXArray. This can be done via a Dictionary, namely - -````@example creating -props = Dict( - "time" => "days", - "lon" => "longitude", - "lat" => "latitude", - "var1" => "first variable", - "var2" => "second variable", -); -nothing # hide -```` - -Then the `yaxarray` with properties is assemble with - -````@ansi creating -ds = YAXArray(axlist, data, props) -```` - -Access these properties with - -````@example creating -ds.properties -```` - -Note that this properties are shared for both variables `var1` and `var2`. 
-Namely, this are global properties for your `YAXArray`. -However, in most cases you will want to pass properties for each variable, here we will do this via Datasets. - -## Creating a Dataset - -Let's define first some range axis - -````@ansi creating -axs = ( - Dim{:lon}(range(0,1, length=10)), - Dim{:lat}(range(0,1, length=5)), -) -```` - -And two toy random `YAXArrays` to assemble our dataset - -````@ansi creating -t2m = YAXArray(axs, rand(10,5), Dict("units" => "K", "reference" => "your references")) -prec = YAXArray(axs, rand(10,5), Dict("units" => "mm", "reference" => "your references")) -```` - -Then the `Dataset` is assembled as - -````@ansi creating -ds = Dataset(t2m=t2m, prec= prec, num = YAXArray(rand(10)), - properties = Dict("space"=>"lon/lat", "reference" => "your global references")) -```` - -::: tip - -Note that the YAXArrays used not necessarily shared the same dimensions. -Hence, using a Dataset is more versatile than a plain YAXArray. - -::: - -## Selected Variables in a Data Cube - -Being able to collect variables that share dimensions into a data cube is possible with - -````@ansi creating -c = Cube(ds[["t2m", "prec"]]) -```` - -or simply the one that does not share all dimensions - -````@ansi creating -Cube(ds[["num"]]) -```` - -### Variable properties - -Access to variables properties is done via - -````@example creating -Cube(ds[["t2m"]]).properties -```` - -and - -````@example creating -Cube(ds[["prec"]]).properties -```` - -Note also that the global properties for the Dataset are accessed with - -````@example creating -ds.properties -```` - -Saving and different chunking modes are discussed [here](/UserGuide/setchuncks). 
- diff --git a/docs/src/UserGuide/etc/distributed.md b/docs/src/UserGuide/etc/distributed.md deleted file mode 100644 index 9000fb0f..00000000 --- a/docs/src/UserGuide/etc/distributed.md +++ /dev/null @@ -1,67 +0,0 @@ -# Distributed Computing - -## How to calculate a time mean - -````@example distributed -using YAXArrays, Statistics, Zarr -using DimensionalData -using Dates -axlist = ( - Dim{:time}(Date("2022-01-01"):Day(1):Date("2022-01-10")), - Dim{:lon}(range(1, 10, length=10)), - Dim{:lat}(range(1, 5, length=15)), - Dim{:Variable}(["var1", "var2"]) - ) -# # And the corresponding data -data = rand(10, 10, 15, 2) -nothing # hide -```` - -````@ansi distributed -ds = YAXArray(axlist, data) -```` - -````@example distributed -c = ds[Variable = At("var1")] -mapslices(mean ∘ skipmissing, c, dims="Time") -```` - -## Distributed calculations - -It is possible to distribute the calculations over multiple process. The following code -does a time mean over all grid points using multiple CPU over a local machine. - - -````julia -using Distributed -addprocs(2) -@everywhere begin - using NetCDF - using YAXArrays - using Statistics - using Zarr -end -@everywhere function mymean(output, pixel) - @show "doing a mean" - output[:] .= mean(pixel) -end -indims = InDims("time") -outdims = OutDims() -resultcube = mapCube(mymean, c, indims=indims, outdims=outdims) -```` - - -In the last example, `mapCube` was used to map the `mymean` function. `mapslices` is a convenient function that can replace `mapCube`, where you can omit defining an extra function with the output argument as an input (e.g. `mymean`). It is possible to simply use `mapslice` - -````@ansi distributed -resultcube = mapslices(mean ∘ skipmissing, c, dims="time") -```` - -It is also possible to distribute easily the workload on a cluster, with little modification to the code. To do so, we use the `ClusterManagers` package. 
- -````julia -using Distributed -using ClusterManagers -addprocs(SlurmManager(10)) -```` - diff --git a/docs/src/UserGuide/read.md b/docs/src/UserGuide/read.md index 05b6d983..ca495abe 100644 --- a/docs/src/UserGuide/read.md +++ b/docs/src/UserGuide/read.md @@ -42,4 +42,4 @@ Individual arrays can be accessed using subsetting: ````@example read_netcdf ds.tos -```` +```` \ No newline at end of file diff --git a/docs/src/UserGuide/subset.md b/docs/src/UserGuide/select.md similarity index 96% rename from docs/src/UserGuide/subset.md rename to docs/src/UserGuide/select.md index fd53855b..9afdaf9c 100644 --- a/docs/src/UserGuide/subset.md +++ b/docs/src/UserGuide/select.md @@ -1,6 +1,6 @@ -# Subset YAXArrays and Datasets +# Select YAXArrays and Datasets -The dimensions or axes of an `YAXArray` are named making it easier to select or query certain ranges of an array. +The dimensions or axes of an `YAXArray` are named making it easier to subset or query certain ranges of an array. Let's open an example `Dataset` used to select certain elements: ````@example subset diff --git a/docs/src/UserGuide/types.md b/docs/src/UserGuide/types.md index b1077dc5..9f9698db 100644 --- a/docs/src/UserGuide/types.md +++ b/docs/src/UserGuide/types.md @@ -10,7 +10,7 @@ A one dimensional array is called `Vector` and a two dimensional array is called In many Machine Learning libraries, arrays are also called tensors. Arrays are designed to store dense spatial-temporal data stored in a grid, whereas a collection of sparse points is usually stored in data frames or relational databases. -A `DimArray` as defined by [DimensionalData.jl](https://rafaqz.github.io/DimensionalData.jl/dev/) adds names to the dimensions and their axes ticks for a given `Array`. +A `DimArray` as defined by [DimensionalData.jl](https://rafaqz.github.io/DimensionalData.jl/dev/dimarrays) adds names to the dimensions and their axes ticks for a given `Array`. 
These names can be used to access the data, e.g., by date instead of just by integer position. A `YAXArray` is a subtype of a `AbstractDimArray` and adds functions to load and process the named arrays. @@ -19,15 +19,20 @@ In addition, it provides functions for parallel computation. ## Dataset -A `Dataset` is an ordered dictionary of `YAXArrays` that usually share dimensios. +A `Dataset` is an ordered dictionary of `YAXArrays` that usually share dimensions. For example, it can bundle arrays storing temperature and precipitation that are measured at the same time points and the same locations. -One also can store a picture in a Dataset with three arrays containing brightness values for red green and blue, respectiveley. +One also can store a picture in a Dataset with three arrays containing brightness values for red green and blue, respectively. Internally, those arrays are still separated allowing to chose different element types for each array. Analog to the (NetCDF Data Model)[https://docs.unidata.ucar.edu/netcdf-c/current/netcdf_data_model.html], a Dataset usually represents variables belonging to the same group. -## Cube +## (Data) Cube -A `Cube` is just a `YAXArray` in which arrays from a dataset are combined together by introducing a new dimension containing labels of which array the corresponding element came from. +A (Data) Cube is just a `YAXArray` in which arrays from a dataset are combined together by introducing a new dimension containing labels of which array the corresponding element came from. Unlike a `Dataset`, all arrays must have the same element type to be converted into a cube. -This data structure is usefull when we want to use all variables at once. -For example, the arrays temperature and precipitation are combnined into a single cube. \ No newline at end of file +This data structure is useful when we want to use all variables at once. 
+For example, the arrays temperature and precipitation, which are measured at the same locations and dates, can be combined into a single cube. +A more formal definition of Data Cubes is given in [Mahecha et al. 2020](https://doi.org/10.5194/esd-11-201-2020). + +## Dimension + +A `Dimension` or axis as defined by [DimensionalData.jl](https://rafaqz.github.io/DimensionalData.jl/dev/dimensions) adds tick labels, e.g., to each row or column of an array. Its name is used to access particular subsets of that array. \ No newline at end of file From ddd811c216fc8e2d1ef8daef4ae5cabff6f4867e Mon Sep 17 00:00:00 2001 From: Daniel Loos Date: Thu, 23 May 2024 14:48:06 +0200 Subject: [PATCH 05/10] Add PR welcome message --- docs/src/development/contribute.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/src/development/contribute.md b/docs/src/development/contribute.md index 2dc0613d..b8babc78 100644 --- a/docs/src/development/contribute.md +++ b/docs/src/development/contribute.md @@ -1,5 +1,8 @@ +# Contribute to YAXArrays.jl -# Contribute to Documentation +Pull requests and bug reports are always welcome at the [YAXArrays.jl GitHub repository](https://github.com/JuliaDataCubes/YAXArrays.jl).
+ +## Contribute to Documentation Contributing with examples can be done by first creating a new file example [here](https://github.com/JuliaDataCubes/YAXArrays.jl/tree/master/docs/examples/UserGuide) @@ -19,7 +22,7 @@ Your new entry should look like: ::: -## Build docs locally +### Build docs locally If you want to take a look at the docs locally before doing a PR follow the next steps: From 69d31bf6faa1c561f36c96b0e030ba20f72c173c Mon Sep 17 00:00:00 2001 From: Daniel Loos Date: Thu, 23 May 2024 14:48:30 +0200 Subject: [PATCH 06/10] Add read GDAL --- docs/src/UserGuide/read.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/src/UserGuide/read.md b/docs/src/UserGuide/read.md index ca495abe..ea6d82bc 100644 --- a/docs/src/UserGuide/read.md +++ b/docs/src/UserGuide/read.md @@ -42,4 +42,17 @@ Individual arrays can be accessed using subsetting: ````@example read_netcdf ds.tos +```` + +## Read GDAL (GeoTIFF, GeoJSON) + +All GDAL compatible files can be read as a `YAXArrays.Dataset` after loading [ArchGDAL](https://yeesian.com/ArchGDAL.jl/latest/): + +````@example read_gdal +using YAXArrays +using ArchGDAL +using Downloads: download + +path = download("https://github.com/yeesian/ArchGDALDatasets/raw/307f8f0e584a39a050c042849004e6a2bd674f99/gdalworkshop/world.tif", "world.tif") +ds = open_dataset(path) ```` \ No newline at end of file From e5216d9fac94cd200c9c250d1029418695805505 Mon Sep 17 00:00:00 2001 From: Daniel Loos Date: Thu, 23 May 2024 15:16:38 +0200 Subject: [PATCH 07/10] Add convert Raster --- docs/Project.toml | 1 + docs/src/UserGuide/convert.md | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/docs/Project.toml b/docs/Project.toml index 43a7c871..eefa1884 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -22,6 +22,7 @@ MultivariateStats = "6f286f6a-111f-5878-ab1e-185364afe411" NetCDF = "30363a11-5582-574a-97bb-aa9a979735b9" OnlineStats = "a15396b6-48d5-5d58-9928-6d29437db91e" PlotUtils = 
"995b91a9-d308-5afd-9ec6-746e21dbc043" +Rasters = "a3a2b9e3-a471-40c9-b274-f788e487c689" SkipNan = "aed68c70-c8b0-4309-8cd1-d392a74f991a" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" WGLMakie = "276b4fcb-3e11-5398-bf8b-a0c2d153d008" diff --git a/docs/src/UserGuide/convert.md b/docs/src/UserGuide/convert.md index 0eddd171..21ed20b8 100644 --- a/docs/src/UserGuide/convert.md +++ b/docs/src/UserGuide/convert.md @@ -28,4 +28,21 @@ Convert `YAXArray` to `Base.Array`: ````@example convert m2 = collect(a.data) +```` + +## Convert `Raster` + +A `Raster` as defined in [Rasters.jl](https://rafaqz.github.io/Rasters.jl/stable/) has a same supertype of a `YAXArray`, i.e. `AbstractDimArray`, allowing easy conversion between those types: + +````@example convert +using Rasters + +lon, lat = X(25:1:30), Y(25:1:30) +time = Ti(2000:2024) +ras = Raster(rand(lon, lat, time)) +a = YAXArray(dims(ras), ras.data) +```` + +````@example convert +ras2 = Raster(a) ```` \ No newline at end of file From 852d4f13d912b56521f19bad19149163e3c64718 Mon Sep 17 00:00:00 2001 From: Daniel Loos Date: Thu, 23 May 2024 15:54:01 +0200 Subject: [PATCH 08/10] Simplify get started --- docs/src/UserGuide/read.md | 3 +- docs/src/get_started.md | 86 ++++++++++++++------------------------ 2 files changed, 33 insertions(+), 56 deletions(-) diff --git a/docs/src/UserGuide/read.md b/docs/src/UserGuide/read.md index ea6d82bc..cf4024bc 100644 --- a/docs/src/UserGuide/read.md +++ b/docs/src/UserGuide/read.md @@ -1,7 +1,6 @@ # Read YAXArrays and Datasets -Here we learn how to open files as arrays and datasets. - +This section describes how to read files, URLs, and directories into YAXArrays and datasets. ## Read Zarr diff --git a/docs/src/get_started.md b/docs/src/get_started.md index 0acfdb66..13757f72 100644 --- a/docs/src/get_started.md +++ b/docs/src/get_started.md @@ -14,89 +14,67 @@ Alternatively, you can also do import Pkg; Pkg.add("YAXArrays") ``` -:::tip - -The Julia Compiler is always improving. 
As such, we recommend using the latest stable -version of Julia. - -::: - - ## Quickstart +Create a simple array from random numbers given the size of each dimension or axis: + ```@example quickstart using YAXArrays -``` -You may check the installed version with: - -```julia -pkg> st YAXArrays +a = YAXArray(rand(2,3)) ``` -Let's assemble a `YAXArray` with 4 dimensions i.e. time, x,y and a variable dimension with two variables. +Assemble a more complex `YAXArray` with 4 dimensions, i.e. time, x, y and a variable type: ```@example quickstart -using YAXArrays, DimensionalData +using DimensionalData + +# axes or dimensions with name and tick values axlist = ( Dim{:time}(range(1, 20, length=20)), X(range(1, 10, length=10)), Y(range(1, 5, length=15)), - Dim{:Variable}(["var1", "var2"])) -# and the corresponding data. -data = rand(20, 10, 15, 2); -nothing # hide -``` - -::: info - -With `YAXArrays.jl 0.5` we switched the underlying data type to be a subtype of the DimensionalData.jl types. Therefore the indexing with named dimensions changed to the DimensionalData syntax. See the [`DimensionalData.jl docs`](https://rafaqz.github.io/DimensionalData.jl/stable/). 
+ Dim{:variable}(["temperature", "precipitation"]) +) -::: - -You can also add additional properties via a Dictionary, namely +# the actual data matching the dimensions defined in axlist +data = rand(20, 10, 15, 2) -```@example quickstart +# metadata about the array props = Dict( - "time" => "days", - "x" => "lon", - "y" => "lat", - "var1" => "one of your variables", - "var2" => "your second variable", + "origin" => "YAXArrays.jl example", + "x" => "longitude", + "y" => "latitude", ); -nothing # hide + +a2 = YAXArray(axlist, data, props) ``` -And our first YAXArray is built with: +Get the temperature map at the first point in time: -```@ansi quickstart -ds = YAXArray(axlist, data, props) +```@example quickstart +a2[variable=At("temperature"), time=1].data ``` -## Getting data from a YAXArray +Get more details at the [select page](UserGuide/select) -For axis can be via `.` +## Updates -```@example quickstart -ds.X -``` +:::tip -or better yet via `lookup` +The Julia Compiler is always improving. As such, we recommend using the latest stable +version of Julia. -```@example quickstart -lookup(ds, :X) -``` +::: -note that also the `.data` field can be use +You may check the installed version with: -```@example quickstart -lookup(ds, :X).data +```julia +pkg> st YAXArrays ``` -The data for one variables, i.e. `var1` can be accessed via: +::: info -```@ansi quickstart -ds[Variable=At("var1")] -``` +With `YAXArrays.jl 0.5` we switched the underlying data type to be a subtype of the DimensionalData.jl types. Therefore the indexing with named dimensions changed to the DimensionalData syntax. See the [`DimensionalData.jl docs`](https://rafaqz.github.io/DimensionalData.jl/stable/). -and again, you can use the `.data` field to actually get the data. 
\ No newline at end of file From b29a9adfe1f891ecc33075dd24226642c5b690fe Mon Sep 17 00:00:00 2001 From: Daniel Loos Date: Thu, 23 May 2024 16:18:13 +0200 Subject: [PATCH 09/10] Fix types entry --- docs/src/.vitepress/config.mts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/.vitepress/config.mts b/docs/src/.vitepress/config.mts index e6f86a4d..8aced75e 100644 --- a/docs/src/.vitepress/config.mts +++ b/docs/src/.vitepress/config.mts @@ -39,6 +39,7 @@ export default defineConfig({ { text: 'User Guide', items: [ + { text: 'Types', link: '/UserGuide/types' }, { text: 'Read', link: '/UserGuide/read' }, { text: 'Write', link: '/UserGuide/write' }, { text: 'Convert', link: '/UserGuide/convert' }, @@ -84,6 +85,7 @@ export default defineConfig({ { text: 'User Guide', items: [ + { text: 'Types', link: '/UserGuide/types' }, { text: 'Read', link: '/UserGuide/read' }, { text: 'Write', link: '/UserGuide/write' }, { text: 'Convert', link: '/UserGuide/convert' }, From 778f635eeb4ab468b816767857b2521694e0ba0b Mon Sep 17 00:00:00 2001 From: Daniel Loos Date: Thu, 23 May 2024 16:34:02 +0200 Subject: [PATCH 10/10] Refactor convert DimArray from FAQ to convert --- docs/src/UserGuide/convert.md | 28 ++++++++++++++++++++- docs/src/UserGuide/faq.md | 46 ----------------------------------- 2 files changed, 27 insertions(+), 47 deletions(-) diff --git a/docs/src/UserGuide/convert.md b/docs/src/UserGuide/convert.md index 21ed20b8..4c3684cc 100644 --- a/docs/src/UserGuide/convert.md +++ b/docs/src/UserGuide/convert.md @@ -45,4 +45,30 @@ a = YAXArray(dims(ras), ras.data) ````@example convert ras2 = Raster(a) -```` \ No newline at end of file +```` + +## Convert `DimArray` + +A `DimArray` as defined in [DimensionalData.jl](https://rafaqz.github.io/DimensionalData.jl/dev/dimarrays) has the same supertype as a `YAXArray`, i.e. `AbstractDimArray`, allowing easy conversion between those types.
+ +Convert `DimArray` to `YAXArray`: + +````@example convert +using DimensionalData +using YAXArrayBase + +dim_arr = rand(X(1:5), Y(10.0:15.0), metadata = Dict{String, Any}()) +a = yaxconvert(YAXArray, dim_arr) +```` + +Convert `YAXArray` to `DimArray`: + +````@example convert +dim_arr2 = yaxconvert(DimArray, a) +```` + +::: info + +At the moment there is no support to save a DimArray directly into disk as a `NetCDF` or a `Zarr` file. + +::: \ No newline at end of file diff --git a/docs/src/UserGuide/faq.md b/docs/src/UserGuide/faq.md index 8b46eccb..b292ca31 100644 --- a/docs/src/UserGuide/faq.md +++ b/docs/src/UserGuide/faq.md @@ -191,52 +191,6 @@ We can also use more than one criteria for grouping the values. In the next exam fitcube = cubefittable(t, Mean, :values, by=(:classes, :time)) ```` -## convertions types `DimArray` & `YAXArray` - -````@example howdoi -using YAXArrays, YAXArrayBase -using DimensionalData -```` - -### `DimArray` to `YAXArray` - -````@ansi howdoi -dim_arr = rand(X(1:5), Y(10.0:15.0), metadata = Dict{String, Any}()) -```` - -!!! warning "metadata" - Note the `metadata` argument. Needed by `yaxconvert`. - -````@ansi howdoi -yax_arr = yaxconvert(YAXArray, dim_arr) -```` - -And saving it: - -````@example howdoi -using Zarr, NetCDF -savecube(yax_arr, "yax_arr.nc", driver=:netcdf, overwrite=true); -nothing # hide -```` - -or as a `zarr` file - -````@example howdoi -savecube(yax_arr, "yax_arr.zarr", driver=:zarr); -nothing # hide -```` - -And going back to the DimArray type is also possible. - -### `YAXArray` into a `DimArray` - -````@ansi howdoi -dim_arr = yaxconvert(DimArray, yax_arr) -```` - -at the moment there is no support to save a DimArray directly into disk as a `NetCDF` or a `Zarr` file. - - ## How do I assing variable names to `YAXArrays` in a `Dataset` ### One variable name