diff --git a/arkouda/categorical.py b/arkouda/categorical.py
index ee5ef37027..6c521f8c7d 100644
--- a/arkouda/categorical.py
+++ b/arkouda/categorical.py
@@ -808,10 +808,8 @@ def to_hdf(
         file_type: str = "distribute",
     ) -> str:
         """
-        Save the Categorical object to HDF5. The result is a collection of HDF5 files,
-        one file per locale of the arkouda server, where each filename starts
-        with prefix_path and dataset. Each locale saves its chunk of the Categorical to its
-        corresponding file.
+        Save the Categorical object to HDF5.
+        The object can be saved to a collection of files or a single file.
 
         Parameters
         ----------
@@ -834,18 +832,24 @@ def to_hdf(
 
         Raises
         ------
-        ValueError
-            Raised if the lengths of columns and values differ, or the mode is
-            neither 'truncate' nor 'append'
-        TypeError
-            Raised if prefix_path, dataset, or mode is not a str
-
+        RuntimeError
+            Raised if a server-side error is thrown saving the Categorical
+
         Notes
         -----
-        Important implementation notes: (1) Strings state is saved as two datasets
-        within an hdf5 group: one for the string characters and one for the
-        segments corresponding to the start of each string, (2) the hdf5 group is named
-        via the dataset parameter.
+        - The prefix_path must be visible to the arkouda server and the user must
+          have write permission.
+        - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
+          ranges from 0 to ``numLocales`` for ``file_type='distribute'``. Otherwise,
+          the file name will be ``prefix_path``.
+        - If any of the output files already exist and the mode is 'truncate',
+          they will be overwritten. If the mode is 'append' and the number of
+          output files is less than the number of locales or a dataset with the
+          same name already exists, a ``RuntimeError`` will result.
+        - Any file extension can be used. The file I/O does not rely on the
+          extension to determine the file format.
+
+        See Also
+        --------
+        to_parquet
         """
         result = []
         comp_dict = {k: v for k, v in self._get_components_dict().items() if v is not None}
@@ -878,9 +882,11 @@ def to_parquet(
         compression: Optional[str] = None,
     ) -> str:
         """
-        Save the Categorical object to Parquet. The result is a collection of Parquet files,
+        Save the Categorical object to Parquet. This functionality is not currently
+        supported and will raise a RuntimeError; support is in development.
+        The result is a collection of files,
         one file per locale of the arkouda server, where each filename starts
-        with prefix_path and dataset. Each locale saves its chunk of the Categorical to its
+        with prefix_path. Each locale saves its chunk of the array to its
         corresponding file.
 
         Parameters
         ----------
@@ -903,11 +909,24 @@ def to_parquet(
 
         Raises
         ------
-        ValueError
-            Raised if the lengths of columns and values differ, or the mode is
-            neither 'truncate' nor 'append'
-        TypeError
-            Raised if prefix_path, dataset, or mode is not a str
+        RuntimeError
+            Raised on call due to compatibility issues between Categorical and Parquet.
+
+        Notes
+        -----
+        - The prefix_path must be visible to the arkouda server and the user must
+          have write permission.
+        - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
+          ranges from 0 to ``numLocales``.
+        - 'append' write mode is supported, but is not efficient.
+        - If any of the output files already exist and the mode is 'truncate',
+          they will be overwritten. If the mode is 'append' and the number of
+          output files is less than the number of locales or a dataset with the
+          same name already exists, a ``RuntimeError`` will result.
+        - Any file extension can be used. The file I/O does not rely on the
+          extension to determine the file format.
+
+        See Also
+        --------
+        to_hdf
         """
         # due to the possibility that components will be different sizes,
         # writing to Parquet is not supported at this time
diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py
index 9844f92f60..729f22607f 100644
--- a/arkouda/dataframe.py
+++ b/arkouda/dataframe.py
@@ -1475,12 +1475,21 @@ def to_hdf(self, path, index=False, columns=None, file_type="distribute"):
         file_type: str (single | distribute)
             Default: distribute
             Whether to save to a single file or distribute across Locales
-
+
+        Returns
+        -------
+        None
+
+        Raises
+        ------
+        RuntimeError
+            Raised if a server-side error is thrown saving the DataFrame
+
         Notes
         -----
         This method saves one file per locale of the arkouda server. All
         files are prefixed by the path argument and suffixed by their
         locale number.
+
+        See Also
+        --------
+        to_parquet, load
         """
         from arkouda.io import to_hdf
@@ -1503,12 +1512,21 @@ def to_parquet(self, path, index=False, columns=None, compression: Optional[str]
             Default None
             Provide the compression type to use when writing the file.
             Supported values: snappy, gzip, brotli, zstd, lz4
-
+
+        Returns
+        -------
+        None
+
+        Raises
+        ------
+        RuntimeError
+            Raised if a server-side error is thrown saving the DataFrame
+
         Notes
         -----
         This method saves one file per locale of the arkouda server. All
         files are prefixed by the path argument and suffixed by their
         locale number.
+
+        See Also
+        --------
+        to_hdf, load
         """
         from arkouda.io import to_parquet
diff --git a/arkouda/index.py b/arkouda/index.py
index 2b9227a2d7..1c60d90a1f 100644
--- a/arkouda/index.py
+++ b/arkouda/index.py
@@ -202,6 +202,44 @@ def to_hdf(
         mode: str = "truncate",
         file_type: str = "distribute",
     ) -> str:
+        """
+        Save the Index to HDF5.
+        The object can be saved to a collection of files or a single file.
+
+        Parameters
+        ----------
+        prefix_path : str
+            Directory and filename prefix that all output files share
+        dataset : str
+            Name of the dataset to create in files (must not already exist)
+        mode : str {'truncate' | 'append'}
+            By default, truncate (overwrite) output files, if they exist.
+            If 'append', attempt to create new dataset in existing files.
+        file_type : str ("single" | "distribute")
+            Default: "distribute"
+            When set to single, dataset is written to a single file.
+            When distribute, dataset is written on a file per locale.
+            This is only supported by HDF5 files and will have no impact on Parquet files.
+
+        Returns
+        -------
+        string message indicating result of save operation
+
+        Raises
+        ------
+        RuntimeError
+            Raised if a server-side error is thrown saving the Index
+
+        Notes
+        -----
+        - The prefix_path must be visible to the arkouda server and the user must
+          have write permission.
+        - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
+          ranges from 0 to ``numLocales`` for ``file_type='distribute'``. Otherwise,
+          the file name will be ``prefix_path``.
+        - If any of the output files already exist and the mode is 'truncate',
+          they will be overwritten. If the mode is 'append' and the number of
+          output files is less than the number of locales or a dataset with the
+          same name already exists, a ``RuntimeError`` will result.
+        - Any file extension can be used. The file I/O does not rely on the
+          extension to determine the file format.
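+
+        Examples
+        --------
+        A minimal sketch mirroring the ``pdarray.to_hdf`` examples; the index
+        values and the path are illustrative:
+        >>> idx = ak.Index(ak.arange(25))
+        >>> idx.to_hdf('path/prefix', dataset='index')
+        Saves the index to numLocales HDF5 files with names of the form
+        ``cwd/path/prefix_LOCALE####``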
+ """ return self.values.to_hdf(prefix_path, dataset=dataset, mode=mode, file_type=file_type) def to_parquet( @@ -211,6 +249,44 @@ def to_parquet( mode: str = "truncate", compression: Optional[str] = None, ): + """ + Save the Index to Parquet. The result is a collection of files, + one file per locale of the arkouda server, where each filename starts + with prefix_path. Each locale saves its chunk of the array to its + corresponding file. + Parameters + ---------- + prefix_path : str + Directory and filename prefix that all output files share + dataset : str + Name of the dataset to create in files (must not already exist) + mode : str {'truncate' | 'append'} + By default, truncate (overwrite) output files, if they exist. + If 'append', attempt to create new dataset in existing files. + compression : str (Optional) + (None | "snappy" | "gzip" | "brotli" | "zstd" | "lz4") + Sets the compression type used with Parquet files + Returns + ------- + string message indicating result of save operation + Raises + ------ + RuntimeError + Raised if a server-side error is thrown saving the pdarray + Notes + ----- + - The prefix_path must be visible to the arkouda server and the user must + have write permission. + - Output files have names of the form ``_LOCALE``, where ```` + ranges from 0 to ``numLocales`` for `file_type='distribute'`. + - 'append' write mode is supported, but is not efficient. + - If any of the output files already exist and + the mode is 'truncate', they will be overwritten. If the mode is 'append' + and the number of output files is less than the number of locales or a + dataset with the same name already exists, a ``RuntimeError`` will result. + - Any file extension can be used.The file I/O does not rely on the extension to + determine the file format. + """ return self.values.to_parquet(prefix_path, dataset=dataset, mode=mode, compression=compression) def save( diff --git a/arkouda/io.py b/arkouda/io.py index fe480d68c9..4f53ba0d13 100644 --- a/arkouda/io.py +++ b/arkouda/io.py @@ -837,10 +837,12 @@ def to_parquet( ValueError Raised if (1) the lengths of columns and values differ or (2) the mode is not 'truncate' or 'append' + RuntimeError + Raised if a server-side error is thrown saving the pdarray See Also -------- - save, load_all + to_hdf, load, load_all, read Notes ----- @@ -931,10 +933,12 @@ def to_hdf( ValueError Raised if (1) the lengths of columns and values differ or (2) the mode is not 'truncate' or 'append' + RuntimeError + Raised if a server-side error is thrown saving the pdarray See Also -------- - save, load_all + to_parquet, load, load_all, read Notes ----- @@ -1095,7 +1099,7 @@ def load( See Also -------- - save, load_all, read + to_parquet, to_hdf, load_all, read Notes ----- @@ -1172,7 +1176,7 @@ def load_all( See Also -------- - save_all, load, read + to_parquet, to_hdf, load, read Notes _____ @@ -1273,7 +1277,7 @@ def read( See Also -------- - read, get_datasets, ls, read_parquet, read_hdf + get_datasets, ls, read_parquet, read_hdf Notes ----- diff --git a/arkouda/pdarrayclass.py b/arkouda/pdarrayclass.py index 273b7c57fd..212bff3616 100755 --- a/arkouda/pdarrayclass.py +++ b/arkouda/pdarrayclass.py @@ -1347,6 +1347,54 @@ def to_parquet( mode: str = "truncate", compression: Optional[str] = None, ) -> str: + """ + Save the pdarray to Parquet. The result is a collection of files, + one file per locale of the arkouda server, where each filename starts + with prefix_path. Each locale saves its chunk of the array to its + corresponding file. 
+
+        Parameters
+        ----------
+        prefix_path : str
+            Directory and filename prefix that all output files share
+        dataset : str
+            Name of the dataset to create in files (must not already exist)
+        mode : str {'truncate' | 'append'}
+            By default, truncate (overwrite) output files, if they exist.
+            If 'append', attempt to create new dataset in existing files.
+        compression : str (Optional)
+            (None | "snappy" | "gzip" | "brotli" | "zstd" | "lz4")
+            Sets the compression type used with Parquet files
+
+        Returns
+        -------
+        string message indicating result of save operation
+
+        Raises
+        ------
+        RuntimeError
+            Raised if a server-side error is thrown saving the pdarray
+
+        Notes
+        -----
+        - The prefix_path must be visible to the arkouda server and the user must
+          have write permission.
+        - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
+          ranges from 0 to ``numLocales``.
+        - 'append' write mode is supported, but is not efficient.
+        - If any of the output files already exist and the mode is 'truncate',
+          they will be overwritten. If the mode is 'append' and the number of
+          output files is less than the number of locales or a dataset with the
+          same name already exists, a ``RuntimeError`` will result.
+        - Any file extension can be used. The file I/O does not rely on the
+          extension to determine the file format.
+
+        Examples
+        --------
+        >>> a = ak.arange(25)
+        >>> # Saving without an extension
+        >>> a.to_parquet('path/prefix', dataset='array')
+        Saves the array to numLocales Parquet files with the name
+        ``cwd/path/prefix_LOCALE####``
+        >>> # Saving with an extension (Parquet)
+        >>> a.to_parquet('path/prefix.parquet', dataset='array')
+        Saves the array to numLocales Parquet files with the name
+        ``cwd/path/prefix_LOCALE####.parquet`` where #### is replaced by each locale number
+        """
         from arkouda.io import mode_str_to_int
 
         return cast(
@@ -1372,6 +1420,58 @@ def to_hdf(
         mode: str = "truncate",
         file_type: str = "distribute",
     ) -> str:
+        """
+        Save the pdarray to HDF5.
+        The object can be saved to a collection of files or a single file.
+
+        Parameters
+        ----------
+        prefix_path : str
+            Directory and filename prefix that all output files share
+        dataset : str
+            Name of the dataset to create in files (must not already exist)
+        mode : str {'truncate' | 'append'}
+            By default, truncate (overwrite) output files, if they exist.
+            If 'append', attempt to create new dataset in existing files.
+        file_type : str ("single" | "distribute")
+            Default: "distribute"
+            When set to single, dataset is written to a single file.
+            When distribute, dataset is written on a file per locale.
+            This is only supported by HDF5 files and will have no impact on Parquet files.
+
+        Returns
+        -------
+        string message indicating result of save operation
+
+        Raises
+        ------
+        RuntimeError
+            Raised if a server-side error is thrown saving the pdarray
+
+        Notes
+        -----
+        - The prefix_path must be visible to the arkouda server and the user must
+          have write permission.
+        - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
+          ranges from 0 to ``numLocales`` for ``file_type='distribute'``. Otherwise,
+          the file name will be ``prefix_path``.
+        - If any of the output files already exist and the mode is 'truncate',
+          they will be overwritten. If the mode is 'append' and the number of
+          output files is less than the number of locales or a dataset with the
+          same name already exists, a ``RuntimeError`` will result.
+        - Any file extension can be used. The file I/O does not rely on the
+          extension to determine the file format.
+
+        Examples
+        --------
+        >>> a = ak.arange(25)
+        >>> # Saving without an extension
+        >>> a.to_hdf('path/prefix', dataset='array')
+        Saves the array to numLocales HDF5 files with the name
+        ``cwd/path/prefix_LOCALE####``
+        >>> # Saving with an extension (HDF5)
+        >>> a.to_hdf('path/prefix.h5', dataset='array')
+        Saves the array to numLocales HDF5 files with the name
+        ``cwd/path/prefix_LOCALE####.h5`` where #### is replaced by each locale number
+        >>> # Saving to a single file
+        >>> a.to_hdf('path/prefix.hdf5', dataset='array', file_type='single')
+        Saves the array to a single HDF5 file on the root node:
+        ``cwd/path/prefix.hdf5``
+        """
         from arkouda.io import file_type_to_int, mode_str_to_int
 
         return cast(
@@ -1403,7 +1503,8 @@ def save(
         DEPRECATED
         Save the pdarray to HDF5 or Parquet. The result is a collection of files,
         one file per locale of the arkouda server, where each filename starts
-        with prefix_path. Each locale saves its chunk of the array to its
+        with prefix_path. HDF5 supports single files, in which case the file name
+        will be exactly the one provided. Each locale saves its chunk of the array to its
         corresponding file.
         Parameters
         ----------
@@ -1442,7 +1543,7 @@ def save(
             is not a string
         See Also
         --------
-        save_all, load, read
+        save_all, load, read, to_parquet, to_hdf
         Notes
         -----
         The prefix_path must be visible to the arkouda server and the user must
diff --git a/arkouda/segarray.py b/arkouda/segarray.py
index 7b6da164aa..5fd37f1e64 100644
--- a/arkouda/segarray.py
+++ b/arkouda/segarray.py
@@ -1010,6 +1010,10 @@ def to_hdf(
         the HDF5 file, not nested under a group.
 
         SegArray is not currently supported by Parquet
+
+        See Also
+        --------
+        load
         """
         self.segments.to_hdf(
             prefix_path, dataset=dataset + segment_suffix, mode=mode, file_type=file_type
@@ -1027,6 +1031,48 @@ def save(
         mode="truncate",
         file_type="distribute",
     ):
+        """
+        DEPRECATED
+        Save the SegArray to HDF5.
+        The object can be saved to a collection of files or a single file.
+
+        Parameters
+        ----------
+        prefix_path : str
+            Directory and filename prefix that all output files share
+        dataset : str
+            Name of the dataset to create in files (must not already exist)
+        mode : str {'truncate' | 'append'}
+            By default, truncate (overwrite) output files, if they exist.
+            If 'append', attempt to create new dataset in existing files.
+        file_type : str ("single" | "distribute")
+            Default: "distribute"
+            When set to single, dataset is written to a single file.
+            When distribute, dataset is written on a file per locale.
+            This is only supported by HDF5 files and will have no impact on Parquet files.
+
+        Returns
+        -------
+        string message indicating result of save operation
+
+        Raises
+        ------
+        RuntimeError
+            Raised if a server-side error is thrown saving the SegArray
+
+        Notes
+        -----
+        - The prefix_path must be visible to the arkouda server and the user must
+          have write permission.
+        - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
+          ranges from 0 to ``numLocales`` for ``file_type='distribute'``. Otherwise,
+          the file name will be ``prefix_path``.
+        - If any of the output files already exist and the mode is 'truncate',
+          they will be overwritten. If the mode is 'append' and the number of
+          output files is less than the number of locales or a dataset with the
+          same name already exists, a ``RuntimeError`` will result.
+        - Any file extension can be used. The file I/O does not rely on the
+          extension to determine the file format.
+
+        See Also
+        --------
+        to_hdf, load
+        """
         from warnings import warn
 
         warn(
             "ak.SegArray.save has been deprecated. Please use ak.SegArray.to_hdf",
diff --git a/arkouda/strings.py b/arkouda/strings.py
index 4f93f1706f..04a6b1ef14 100755
--- a/arkouda/strings.py
+++ b/arkouda/strings.py
@@ -1884,36 +1884,42 @@ def to_parquet(
         compression: Optional[str] = None,
     ) -> str:
         """
-        Save the Strings object to Parquet. The result is a collection of Parquet files,
+        Save the Strings object to Parquet. The result is a collection of files,
         one file per locale of the arkouda server, where each filename starts
-        with prefix_path. Each locale saves its chunk of the Strings array to its
+        with prefix_path. Each locale saves its chunk of the array to its
         corresponding file.
-
         Parameters
         ----------
         prefix_path : str
             Directory and filename prefix that all output files share
         dataset : str
-            The name of the Strings dataset to be written, defaults to strings_array
+            Name of the dataset to create in files (must not already exist)
         mode : str {'truncate' | 'append'}
             By default, truncate (overwrite) output files, if they exist.
-            If 'append', create a new Strings dataset within existing files.
+            If 'append', attempt to create new dataset in existing files.
         compression : str (Optional)
-            Default None
-            Provide the compression type to use when writing the file.
-            Supported values: snappy, gzip, brotli, zstd, lz4
-
+            (None | "snappy" | "gzip" | "brotli" | "zstd" | "lz4")
+            Sets the compression type used with Parquet files
+
         Returns
         -------
-        String message indicating result of save operation
+        string message indicating result of save operation
+
         Raises
         ------
-        ValueError
-            Raised if the lengths of columns and values differ, or the mode is
-            neither 'truncate' nor 'append'
-        TypeError
-            Raised if prefix_path, dataset, or mode is not a str
+        RuntimeError
+            Raised if a server-side error is thrown saving the Strings object
+
+        Notes
+        -----
+        - The prefix_path must be visible to the arkouda server and the user must
+          have write permission.
+        - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
+          ranges from 0 to ``numLocales``.
+        - 'append' write mode is supported, but is not efficient.
+        - If any of the output files already exist and the mode is 'truncate',
+          they will be overwritten. If the mode is 'append' and the number of
+          output files is less than the number of locales or a dataset with the
+          same name already exists, a ``RuntimeError`` will result.
+        - Any file extension can be used. The file I/O does not rely on the
+          extension to determine the file format.
         """
         from arkouda.io import mode_str_to_int
@@ -1941,10 +1947,8 @@ def to_hdf(
         file_type: str = "distribute",
     ) -> str:
         """
-        Save the Strings object to HDF5. The result is a collection of HDF5 files,
-        one file per locale of the arkouda server, where each filename starts
-        with prefix_path. Each locale saves its chunk of the Strings array to its
-        corresponding file.
+        Save the Strings object to HDF5.
+        The object can be saved to a collection of files or a single file.
 
         Parameters
         ----------
@@ -1970,11 +1974,29 @@ def to_hdf(
 
         Raises
         ------
-        ValueError
-            Raised if the lengths of columns and values differ, or the mode is
-            neither 'truncate' nor 'append'
-        TypeError
-            Raised if prefix_path, dataset, or mode is not a str
+        RuntimeError
+            Raised if a server-side error is thrown saving the Strings object
+
+        Notes
+        -----
+        - Strings state is saved as two datasets within an HDF5 group:
+          one for the string characters and one for the segments
+          corresponding to the start of each string.
+        - The HDF5 group is named via the dataset parameter.
+        - Parquet files, by contrast, do not store the segments, only the values.
+        - The prefix_path must be visible to the arkouda server and the user must
+          have write permission.
+        - Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
+          ranges from 0 to ``numLocales`` for ``file_type='distribute'``. Otherwise,
+          the file name will be ``prefix_path``.
+        - If any of the output files already exist and the mode is 'truncate',
+          they will be overwritten. If the mode is 'append' and the number of
+          output files is less than the number of locales or a dataset with the
+          same name already exists, a ``RuntimeError`` will result.
+        - Any file extension can be used. The file I/O does not rely on the
+          extension to determine the file format.
+
+        See Also
+        --------
+        to_parquet
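+
+        Examples
+        --------
+        A minimal sketch; the example strings and the path are illustrative:
+        >>> s = ak.array(['one', 'two', 'three'])
+        >>> s.to_hdf('path/prefix', dataset='strings')
+        Saves the Strings object to numLocales HDF5 files with names of the
+        form ``cwd/path/prefix_LOCALE####``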
+        """
         from arkouda.io import file_type_to_int, mode_str_to_int
 
         return cast(
@@ -2008,9 +2030,10 @@ def save(
     ) -> str:
         """
         DEPRECATED
-        Save the Strings object to HDF5 or Parquet. The result is a collection of
-        files, one file per locale of the arkouda server, where each filename starts
-        with prefix_path. Each locale saves its chunk of the Strings array to its
+        Save the Strings object to HDF5 or Parquet. The result is a collection of files,
+        one file per locale of the arkouda server, where each filename starts
+        with prefix_path. HDF5 supports single files, in which case the file name
+        will be exactly the one provided. Each locale saves its chunk of the array to its
         corresponding file.
         Parameters
         ----------
@@ -2044,7 +2067,8 @@ def save(
         Important implementation notes: (1) Strings state is saved as two datasets
         within an hdf5 group: one for the string characters and one for the
         segments corresponding to the start of each string, (2) the hdf5 group is named
-        via the dataset parameter.
+        via the dataset parameter. (3) Parquet files do not store the segments,
+        only the values.
         """
         from warnings import warn
diff --git a/pydoc/FILEIO.md b/pydoc/FILEIO.md
deleted file mode 100644
index d85f6a436b..0000000000
--- a/pydoc/FILEIO.md
+++ /dev/null
@@ -1,122 +0,0 @@
-# File I/O
-
-*Please Note: This file is being developed in conjunction with updates to our file I/O system. Information is being omitted until updates on each section are completed to avoid confusion.*
-
-## Supported File Types
-
-Arkouda currently supports the file types listed below. The way the data is stored may vary. This file will detail the "schema" each file type is expected to follow. If your file does not follow the detailed "schema", please try using our `import`/`export` tools. *Please Note: The functionality of the `import`/`export` tools is dependent on the size of the data because they only run on the client.*
-
-- HDF5
-- Parquet
-
-## HDF5
-
-### File Formats
-
-HDF5 now supports saving datasets in 2 different file configurations.
-
-- Single File
-- Distributed Files (Default)
-
-When saving to a single file, all the data from an Arkouda object is stored to one file. This file is stored on `LOCALE0`. When saving data to a distributed file system, data is stored in one file per `LOCALE`. Each file contains the portion of data from the object local to the `LOCALE` the file is being written to. Each file can contain multiple datasets/groups and thus can store multiple objects.
-
-### MetaData Attributes
-
-These attributes are required to be set for each group and dataset.
-
-`ObjType`: `int` - Integer representing the type of object stored in the group/dataset. This corresponds to the Chapel `enum ObjType`. Required to properly read each object.
-
-  - 0 = `ArrayView`
-  - 1 = `pdarray`
-  - 2 = `Strings`
-
-`isBool`: `int` - Integer value (0 or 1) representing a boolean value that indicates if the data stored contains boolean values. This is only required to be set when the dataset contains boolean values.
-
-`file_version`: `real(32)` - Real value indicating the formatting version. `0.0` and `1.0` are no longer in use. Should be `2.0`.
-
-`arkouda_version`: `c_string` - String value of the Arkouda version at the time the object was written.
-
-### Supported Arkouda Data Types
-
-While most objects in Arkouda can be saved, there are 3 main datatypes currently supported within HDF5.
-
-- pdarray
-- Strings
-- ArrayView (Import/Export not Supported)
-
-### PDArray/ArrayView Dataset Format
-
-`ArrayView` and `pdarray` objects' storage format is identical. The only difference is that `ArrayView` objects require additional attributes to ensure that they can be read properly. These objects are stored in an HDF5 dataset.
-
-**Structure**
-
-1) Dataset
-   1) Data - ArrayView/pdarray values
-   2) Attributes
-      1) MetaData Attributes
-      2) ArrayView Attributes (If the `ObjType` is equivalent to `ArrayView`)
-
-**ArrayView Attributes**
-
-`Rank`: `int` - Integer representing the number of dimensions in the dataset. This should be stored as the rank of the *unflattened* data, even when storing as a flattened array.
-
-`Shape`: `int array` Integer array storing the size of each dimension. The array should be of length equal to the `Rank`.
-
-### Strings DataSet Format
-
-`Strings` objects are stored within an HDF5 group. This group contains datasets storing the values and segments separately.
-
-**Structure**
-
-1) Group
-   1) Dataset - `values`
-      1) `ObjType` Attribute
-      2) Data - String object's values pdarray data
-   2) DataSet - `segments`
-      1) `ObjType` Attribute
-      2) Data - String object's segments pdarray data
-   3) MetaData Attributes
-
-Each dataset within the group contains the `ObjType` attribute so that they can be read individually as a dataset. The `isBool` attribute is not needed because these objects will never store boolean values.
-
-## Parquet
-
-COMING SOON
-
-## Reading Objects
-
-Arkouda objects can be read from files using the `ak.read()` or `ak.load()` functions. More information on these functions are linked below.
-
-- [ak.load](https://bears-r-us.github.io/arkouda/usage/IO.html#arkouda.load)
-- [ak.load_all](https://bears-r-us.github.io/arkouda/usage/IO.html#arkouda.load_all)
-
-## Writing Objects
-
-*Objects currently being written with file version `v2.0`.*
-
-Additionally, there are `save` functions for individual Arkouda objects. The function definition is detailed below as it is the same for each object type.
-
-```python
-def save(self, filepath: str, dset: str, mode: str = "truncate", file_type: str = "distribute")
-    """
-    Save the current object to hdf5 file
-    Parameters
-    ----------
-    filepath: str
-        Path to the file to write the dataset to
-    dset: str
-        Name of the dataset to write
-    mode: str (truncate | append)
-        Default: truncate
-        Mode to write the dataset in. Truncate will overwrite any existing files.
-        Append will add the dataset to an existing file.
-    file_type: str (single|distribute)
-        efault: distribute
-        Indicates the format to save the file. Single will store in a single file.
-        Distribute will store the date in a file per locale.
-```
diff --git a/pydoc/file_io/HDF5.md b/pydoc/file_io/HDF5.md
new file mode 100644
index 0000000000..f0d7176327
--- /dev/null
+++ b/pydoc/file_io/HDF5.md
@@ -0,0 +1,175 @@
+# HDF5
+
+HDF5 is an extremely flexible format. Because of this, it is important to adhere to these specifications in order for a file to be readable by Arkouda.
+
+More information on HDF5 is available [here](https://www.hdfgroup.org/solutions/hdf5/).
+
+## File Configuration
+
+Arkouda supports saving HDF5 files in 2 ways:
+
+- Single File
+  - All data is pulled local to the processing root node and saved into one file
+- Distributed Files (Default)
+  - Each file contains the portion of the data local to the locale where the file is being written. This results in one file per locale.
+
+*It is important to note that the file schemas are the same in both cases.*
+
+## Supported Arkouda Data Types
+
+While most objects in Arkouda can be saved, the following data types are currently supported within HDF5:
+
+- pdarray
+- Strings
+- ArrayView
+- DataFrame
+- Index
+- Categorical
+
+HDF5 is able to contain any number of objects within the same file.
+
+## MetaData Attributes
+
+All data within the HDF5 file is expected to contain several attributes that aid in determining the data within the object. These attributes are assigned at the `Group` and `Dataset` levels.
+
+`ObjType`: `int`
+> Integer representing the type of object stored in the group/dataset. This corresponds to the Chapel `enum ObjType`. Required to properly read each object.
+>
+> - 0 = `ArrayView`
+> - 1 = `pdarray`
+> - 2 = `Strings`
+
+`isBool`: `int`
+> Integer value (0 or 1) representing a boolean value that indicates if the data stored contains boolean values. This is only required to be set when the dataset contains boolean values.
+
+`file_version`: `real(32)` (Optional)
+> Real value indicating the formatting version. `0.0` and `1.0` are no longer in use. Should be `2.0`.
+
+`arkouda_version`: `c_string` (Optional)
+> String value of the Arkouda version at the time the object was written.
+
+The 2 attributes marked `Optional` are not required for data to be read. Thus, if you are reading data into Arkouda from another source, these can be omitted. However, any dataset written out by Arkouda will include this information.
+
+*Direct support for additional object types is being worked on.*
+
+## Data Schema
+
+This section provides an outline of the expected data schema for each object type. Each example assumes the top level group/dataset is not nested.
+
+When reading array values, the data type of the values is automatically detected and is therefore not required to be included in the metadata.
+
+### ArrayView
+
+`ArrayView` objects require additional attributes to be read properly.
+
+`Rank`: `int`
+> Integer representing the number of dimensions in the dataset. This should be stored as the rank of the *unflattened* data, even when storing as a flattened array.
+
+`Shape`: `int array`
+> Integer array storing the size of each dimension. The array should be of length equal to the `Rank`.
+
+Providing these attributes allows for the ArrayView object to be reconstructed from its values; a sketch follows the outline below.
+
+> 1. Dataset (will have a user provided name. Defaults to 'ArrayView')
+>    1. Attributes
+>       1. ObjType: 0
+>       2. isBool: 0 or 1
+>       3. Rank: `number_of_dimensions`
+>       4. Shape: `array_of_size_rank`
+>       5. file_version: 2.0 (Optional)
+>       6. arkouda_version: 'current_arkouda_version' (Optional)
+>    2. Data - values of the ArrayView.
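+
+To make the schema concrete, the following is a minimal sketch of producing an Arkouda-readable dataset from outside Arkouda. It assumes the `h5py` package; the file name is illustrative and follows the `_LOCALE####` convention described under Data Distribution below. The attribute names and values come directly from the schema above.
+
+```python
+import h5py
+import numpy as np
+
+# A 3x4 ArrayView stored flattened; Rank/Shape describe the unflattened data
+data = np.arange(12, dtype=np.int64)
+
+with h5py.File("prefix_LOCALE0000", "w") as f:
+    dset = f.create_dataset("ArrayView", data=data)
+    dset.attrs["ObjType"] = 0   # 0 = ArrayView
+    dset.attrs["isBool"] = 0    # values are not boolean
+    dset.attrs["Rank"] = 2      # rank of the unflattened data
+    dset.attrs["Shape"] = np.array([3, 4], dtype=np.int64)
+```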
+
+### pdarray
+
+> 1. Dataset (will have a user provided name. Defaults to 'array')
+>    1. Attributes
+>       1. ObjType: 1
+>       2. isBool: 0 or 1
+>       3. file_version: 2.0 (Optional)
+>       4. arkouda_version: 'current_arkouda_version' (Optional)
+>    2. Data - values of the pdarray.
+
+### Strings
+
+`Strings` objects are stored within an HDF5 group. This group contains datasets storing the values and segments separately.
+
+> 1. Group (user provided dataset name. Defaults to 'strings_array')
+>    1. Attributes
+>       1. ObjType: 2
+>       2. file_version: 2.0 (Optional)
+>       3. arkouda_version: 'current_arkouda_version' (Optional)
+>    2. Dataset - Values (user provided dataset name with `_values` appended)
+>       1. Attributes
+>          1. ObjType: 1
+>          2. isBool: 0 or 1
+>          3. file_version: 2.0 (Optional)
+>          4. arkouda_version: 'current_arkouda_version' (Optional)
+>       2. Data - uint8 values representing the string values. Includes null byte termination.
+>    3. Dataset - Offsets (user provided dataset name with `_segments` appended) (Optional)
+>       1. Attributes
+>          1. ObjType: 1
+>          2. isBool: 0
+>          3. file_version: 2.0 (Optional)
+>          4. arkouda_version: 'current_arkouda_version' (Optional)
+>       2. Data - int64 values representing the start index of each string value.
+
+*Please Note - The offsets dataset is not required but can be provided. Strings uses null byte termination and is able to calculate the offsets of its components during reads.*
+
+## Supported Write Modes
+
+**Truncate**
+> When writing to HDF5 in `truncate` mode, any existing HDF5 file with the same name will be overwritten. If no file exists, one will be created. If writing multiple objects, the first is written in `truncate` mode. All subsequent objects will then be appended to the file. The user will be notified of any overwritten files.
+
+**Append**
+> When writing to HDF5 in `append` mode, all datasets will be appended to the file. If no file with the supplied name exists, one will be created. If any datasets being written have a name that is already the name of a dataset within the file, an error will be generated.
+
+## Data Distribution
+
+**Single File**
+> If the user elects to write to a single HDF5 file, all data will be pulled to the processing node and saved to ONE file with the supplied file name. It is important to ensure that the object is small enough to prevent memory exhaustion on the node.
+
+**Distributed Files**
+> If the user elects to write data to distributed files, data will be written to one file per locale. Each file will contain the data from the object local to the locale of that file. File names will be the name provided by the user with the suffix `_LOCALE####`, where `####` will be replaced with the locale number. Because the data is distributed across multiple nodes, there is a much lower risk of memory exhaustion.
+
+## Legacy File Support
+
+Older versions of Arkouda used different schemas for `pdarray` and `Strings` objects (`ArrayView` was not supported). This format does not include the explicit `ObjType` attribute and requires the type to be inferred during processing. Reading these files is still supported by Arkouda. When the data type is `uint8` and the object stored under the user-supplied dataset name is a group containing a dataset named `values`, the object is assumed to be of type Strings.
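+
+## Example
+
+A minimal round trip under the default distributed configuration (a sketch; the path is illustrative and a running arkouda server is assumed):
+
+```python
+import arkouda as ak
+
+ak.connect()  # assumes a server on the default host/port
+
+a = ak.arange(1000)
+
+# default file_type='distribute': one file per locale, /tmp/ak_demo/prefix_LOCALE####
+a.to_hdf("/tmp/ak_demo/prefix", dataset="array")
+
+# load it back by prefix and dataset name
+b = ak.load("/tmp/ak_demo/prefix", dataset="array")
+```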
+
+## API Reference
+
+### pdarray
+
+```{eval-rst}
+- :py:meth:`arkouda.pdarray.to_hdf`
+- :py:meth:`arkouda.pdarray.save`
+```
+
+### Index
+
+```{eval-rst}
+- :py:meth:`arkouda.Index.to_hdf`
+- :py:meth:`arkouda.Index.save`
+```
+
+### DataFrame
+
+```{eval-rst}
+- :py:meth:`arkouda.DataFrame.to_hdf`
+- :py:meth:`arkouda.DataFrame.save`
+- :py:meth:`arkouda.DataFrame.load`
+```
+
+### Strings
+
+```{eval-rst}
+- :py:meth:`arkouda.Strings.to_hdf`
+- :py:meth:`arkouda.Strings.save`
+```
+
+### Categorical
+
+```{eval-rst}
+- :py:meth:`arkouda.Categorical.to_hdf`
+- :py:meth:`arkouda.Categorical.save`
+```
diff --git a/pydoc/file_io/IMPORT_EXPORT.md b/pydoc/file_io/IMPORT_EXPORT.md
new file mode 100644
index 0000000000..109d94d439
--- /dev/null
+++ b/pydoc/file_io/IMPORT_EXPORT.md
@@ -0,0 +1,22 @@
+# Import/Export
+
+Arkouda allows for importing and exporting data in Pandas format, specifically DataFrames. This functionality is currently performed on the client. As a result, it is assumed that the size of data being imported can be handled by the client, because it was written by Pandas. Arkouda natively verifies that the size of data being sent to the client can be handled.
+
+During both import and export operations, the file type is maintained. Thus, if you import/export an HDF5 file and elect to save an appropriately formatted file during the operation, the resulting file will also be HDF5.
+
+This functionality should not be required for Parquet files, but is supported for both HDF5 and Parquet.
+
+## Export
+
+Export takes a file that was saved using Arkouda and reads it into Pandas. The user is able to specify if they would like to save the result to a file that can be read by Pandas and/or return the resulting Pandas object.
+
+## Import
+
+Importing data takes a file that was saved using Pandas and reads it into Arkouda. The user is able to specify if they would like to save the result to a file that can be read by Arkouda and/or return the resulting Arkouda object.
+
+## API Reference
+
+```{eval-rst}
+- :py:func:`arkouda.io.import_data`
+- :py:func:`arkouda.io.export`
+```
diff --git a/pydoc/file_io/PARQUET.md b/pydoc/file_io/PARQUET.md
new file mode 100644
index 0000000000..f236ca883f
--- /dev/null
+++ b/pydoc/file_io/PARQUET.md
@@ -0,0 +1,70 @@
+# Parquet
+
+Parquet is a column-oriented file format that provides more structure than HDF5. While this is extremely beneficial, it does have some limitations within Arkouda at this time due to the requirement that columns have equal sizes.
+
+*We are currently working on providing functionality that eliminates these limitations in order to support more data types being saved to Parquet.*
+
+More information on Parquet can be found [here](https://parquet.apache.org/).
+
+## Supported Arkouda Data Types
+
+- pdarray
+- Index
+- DataFrame
+- Strings
+
+## Compression
+
+Parquet supports 5 compression types:
+
+- Snappy
+- GZip
+- Brotli
+- ZSTD
+- LZ4
+
+Data can also be saved using no compression. Arkouda now supports writing Parquet files with all compression types supported by Parquet.
+
+## Supported Write Modes
+
+**Truncate**
+> When writing to Parquet in `truncate` mode, any existing Parquet file with the same name will be overwritten. If no file exists, one will be created. If writing multiple objects, all corresponding columns will be written to the Parquet file at once.
+
+**Append**
+> When writing to Parquet in `append` mode, all datasets will be appended to the file. If no file with the supplied name exists, one will be created. If any datasets being written have a name that is already the name of a dataset within the file, an error will be generated.
+>
+> *Please Note: appending to a Parquet file is not natively supported and is extremely inefficient. It is recommended to read the file out and call `arkouda.io.to_parquet` on the output with the additional columns added, then write in `truncate` mode, as shown in the sketch below.*
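+
+As a concrete illustration of the recommendation above, a sketch of writing several columns in a single `truncate`-mode call instead of appending. The column names and path are illustrative, a running arkouda server is assumed, and passing a dict of name-to-pdarray mappings to `ak.to_parquet` is an assumption about its column argument:
+
+```python
+import arkouda as ak
+
+ak.connect()  # assumes a server on the default host/port
+
+a = ak.arange(100)
+b = ak.randint(0, 10, 100)
+
+# write both columns at once, one file per locale, with snappy compression
+ak.to_parquet({"a": a, "b": b}, "/tmp/ak_demo/prefix", compression="snappy")
+```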
+
+## API Reference
+
+### pdarray
+
+```{eval-rst}
+- :py:meth:`arkouda.pdarray.to_parquet`
+- :py:meth:`arkouda.pdarray.save`
+```
+
+### Index
+
+```{eval-rst}
+- :py:meth:`arkouda.Index.to_parquet`
+- :py:meth:`arkouda.Index.save`
+```
+
+### DataFrame
+
+```{eval-rst}
+- :py:meth:`arkouda.DataFrame.to_parquet`
+- :py:meth:`arkouda.DataFrame.save`
+- :py:meth:`arkouda.DataFrame.load`
+```
+
+### Strings
+
+```{eval-rst}
+- :py:meth:`arkouda.Strings.to_parquet`
+- :py:meth:`arkouda.Strings.save`
+```
+
+### Categorical
+
+Categorical objects cannot currently be written to Parquet files. This is due to the fact that the components of Categoricals can have different sizes.
diff --git a/pydoc/file_io/io_menu.rst b/pydoc/file_io/io_menu.rst
new file mode 100644
index 0000000000..3933763821
--- /dev/null
+++ b/pydoc/file_io/io_menu.rst
@@ -0,0 +1,44 @@
+.. _io-label:
+
+File I/O
+======================
+
+Arkouda supports reading and writing files in multiple file formats.
+
+Arkouda also supports importing files written by Pandas.
+
+.. toctree::
+    :caption: Supported File Formats:
+    :maxdepth: 1
+
+    HDF5
+    PARQUET
+
+Import/Export Support
+----------------------
+Arkouda supports importing/exporting data in Pandas format. For information, please view the `Import/Export `_ documentation.
+
+.. toctree::
+    :hidden:
+    :maxdepth: 1
+
+    IMPORT_EXPORT
+
+General I/O API
+----------------
+
+Arkouda supplies functions for general I/O interactions. These functions allow for writing one or more Arkouda objects and reading data into Arkouda objects.
+
+Write
+^^^^^^
+- :py:func:`arkouda.io.to_parquet`
+- :py:func:`arkouda.io.to_hdf`
+- :py:func:`arkouda.io.save_all`
+
+Read
+^^^^^
+- :py:func:`arkouda.io.load`
+- :py:func:`arkouda.io.load_all`
+- :py:func:`arkouda.io.read_parquet`
+- :py:func:`arkouda.io.read_hdf`
+- :py:func:`arkouda.io.read`
diff --git a/pydoc/index.rst b/pydoc/index.rst
index 42188d9f51..a0b0f48d86 100644
--- a/pydoc/index.rst
+++ b/pydoc/index.rst
@@ -20,7 +20,7 @@ Arkouda Documentation
    setup/testing
 
-   FILEIO
+   file_io/io_menu
 
    usage
    examples
diff --git a/pydoc/usage.rst b/pydoc/usage.rst
index f1a6aeab41..62a1ef9264 100644
--- a/pydoc/usage.rst
+++ b/pydoc/usage.rst
@@ -9,7 +9,6 @@ Usage
    usage/startup
    usage/pdarray
    usage/creation
-   usage/IO
    usage/arithmetic
    usage/indexing
    usage/histogram
diff --git a/pydoc/usage/IO.rst b/pydoc/usage/IO.rst
index b29604c21b..32e1110f91 100644
--- a/pydoc/usage/IO.rst
+++ b/pydoc/usage/IO.rst
@@ -69,35 +69,6 @@ HDF5/Parquet files can be queried via the server for dataset names and sizes.
 
 .. autofunction:: arkouda.ls_any
 
-Persisting ``pdarray`` data to disk
------------------------------------
-
-Arkouda supports saving pdarrays to HDF5/Parquet files. Unfortunately, arkouda does not yet support writing to a single HDF5 file from multiple locales and must create one output file per locale.
-
-.. autofunction:: arkouda.pdarray.save
-
-.. autofunction:: arkouda.save_all
-
-Loading persisted arrays from disk
------------------------------------
-These functions allow loading ``pdarray`` data persisted with ``save()`` and ``save_all()``.
-
-.. autofunction:: arkouda.load
-
-.. autofunction:: arkouda.load_all
-
-Persisting ``DataFrame`` data to disk
--------------------------------------
-Arkouda supports saving ``DataFrame`` objects to HDF5/Parquet files. This is done by creating a dictionary that maps the column name to the pdarray containing the column data. The column names are treated as datasets in the file.
-
-.. autofunction:: arkouda.DataFrame.save_table
-
-Loading persisted DataFrame data from disk
--------------------------------------------
-This functionality allows the columns be loaded as datasets, which creates a mapping of column names to column data. This structure is supported by the ``DataFrame`` constructor and is used to reconstruct the ``DataFrame``
-
-.. autofunction:: arkouda.DataFrame.load_table
-
 Import/Export
 =============
 Import allows users to import data written by pandas into arkouda. Export allows users to write arkouda data into a format pandas can read. The file formats supported are:
diff --git a/pydoc/usage/dataframe.rst b/pydoc/usage/dataframe.rst
index 1854c90843..1076b08cef 100644
--- a/pydoc/usage/dataframe.rst
+++ b/pydoc/usage/dataframe.rst
@@ -51,7 +51,7 @@ Filter
 .. autofunction:: arkouda.DataFrame.filter_by_ranges
 
 Permutations
------------
+-------------
 .. autofunction:: arkouda.DataFrame.apply_permutation
 
 Sorting
@@ -63,13 +63,13 @@ Sorting
 .. autofunction:: arkouda.DataFrame.sort_values
 
 Tail/Head of Data
-----------
+------------------
 .. autofunction:: arkouda.DataFrame.tail
 .. autofunction:: arkouda.DataFrame.head
 
 Rename Columns
-----------
+---------------
 .. autofunction:: arkouda.DataFrame.rename
 
 Append
 ------
 .. autofunction:: akrouda.DataFrame.append
 
 Concatenate
-----------
+------------
 .. autofunction:: arkouda.DataFrame.concat
 
 Reset Indexes
-----------
+--------------
 .. autofunction:: arkouda.DataFrame.reset_index
 
 Deduplication
-----------
+--------------
 .. autofunction:: arkouda.DataFrame.drop_duplicates
-