Closes #1986 - File I/O Documentation Updates (#2045)
* Updates to File I/O documentation layout. Added Parquet information.

* Updating docstrings.

* clean up

* Addressing review comments.

* Updates from review.
Ethan-DeBandi99 committed Jan 11, 2023
1 parent b023f48 commit 62b3266
Showing 16 changed files with 665 additions and 219 deletions.
61 changes: 40 additions & 21 deletions arkouda/categorical.py
@@ -808,10 +808,8 @@ def to_hdf(
file_type: str = "distribute",
) -> str:
"""
Save the Categorical object to HDF5.
The object can be saved to a collection of files or a single file.
Parameters
----------
@@ -834,18 +832,24 @@ def to_hdf(
Raises
------
ValueError
Raised if the lengths of columns and values differ, or the mode is
neither 'truncate' nor 'append'
TypeError
Raised if prefix_path, dataset, or mode is not a str
RuntimeError
Raised if a server-side error is thrown saving the pdarray
Notes
-----
- The prefix_path must be visible to the arkouda server and the user must
  have write permission.
- Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
  ranges from 0 to ``numLocales`` for `file_type='distribute'`. Otherwise,
  the file name will be `prefix_path`.
- If any of the output files already exist and the mode is 'truncate', they
  will be overwritten. If the mode is 'append' and the number of output files
  is less than the number of locales or a dataset with the same name already
  exists, a ``RuntimeError`` will result.
- Any file extension can be used. The file I/O does not rely on the extension
  to determine the file format.
See Also
--------
to_parquet
"""
result = []
comp_dict = {k: v for k, v in self._get_components_dict().items() if v is not None}
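As a quick orientation to the updated docstring, a minimal usage sketch (assuming a running arkouda server; the paths and dataset name here are illustrative):

```python
import arkouda as ak

ak.connect()  # assumes an arkouda server on the default host/port

# Illustrative data: build a Categorical from a Strings array.
cat = ak.Categorical(ak.array(["low", "high", "low", "medium"]))

# Default: one file per locale, truncating any existing files.
cat.to_hdf("/tmp/cat_data", dataset="severity")

# Alternatively, write everything to a single file.
cat.to_hdf("/tmp/cat_single", dataset="severity", file_type="single")
```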
@@ -878,9 +882,11 @@ def to_parquet(
compression: Optional[str] = None,
) -> str:
"""
This functionality is not currently supported and will raise a RuntimeError.
Support is in development.
Save the Categorical to Parquet. The result is a collection of files,
one file per locale of the arkouda server, where each filename starts
with prefix_path. Each locale saves its chunk of the array to its
corresponding file.
Parameters
@@ -903,11 +909,24 @@
Raises
------
ValueError
Raised if the lengths of columns and values differ, or the mode is
neither 'truncate' nor 'append'
TypeError
Raised if prefix_path, dataset, or mode is not a str
RuntimeError
Raised on every call due to compatibility issues between Categorical and Parquet.
Notes
-----
- The prefix_path must be visible to the arkouda server and the user must
have write permission.
- Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
  ranges from 0 to ``numLocales``.
- 'append' write mode is supported, but is not efficient.
- If any of the output files already exist and
the mode is 'truncate', they will be overwritten. If the mode is 'append'
and the number of output files is less than the number of locales or a
dataset with the same name already exists, a ``RuntimeError`` will result.
- Any file extension can be used. The file I/O does not rely on the extension to
  determine the file format.
See Also
--------
to_hdf
"""
# due to the possibility that components will be different sizes,
# writing to Parquet is not supported at this time
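Because the components can differ in size, `to_parquet` currently always raises; a short sketch of the expected behavior (names and paths illustrative):

```python
import arkouda as ak

ak.connect()

cat = ak.Categorical(ak.array(["a", "b", "a"]))
try:
    cat.to_parquet("/tmp/cat_pq", dataset="vals")
except RuntimeError as err:
    # Expected until Categorical-to-Parquet support lands.
    print(err)
```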
22 changes: 20 additions & 2 deletions arkouda/dataframe.py
@@ -1475,12 +1475,21 @@ def to_hdf(self, path, index=False, columns=None, file_type="distribute"):
file_type: str (single | distribute)
Default: distribute
Whether to save to a single file or distribute across Locales
Returns
-------
None
Raises
------
RuntimeError
Raised if a server-side error is thrown saving the pdarray
Notes
-----
This method saves one file per locale of the arkouda server. All
files are prefixed by the path argument and suffixed by their
locale number.
See Also
--------
to_parquet, load
"""
from arkouda.io import to_hdf

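A minimal sketch of the DataFrame-to-HDF5 flow documented above (illustrative data; a server-visible output path is assumed):

```python
import arkouda as ak

ak.connect()

df = ak.DataFrame({"id": ak.arange(6), "value": ak.randint(0, 10, 6)})

# One file per locale (default), suffixed _LOCALE<i>.
df.to_hdf("/tmp/df_hdf")

# Or collapse the output to a single file.
df.to_hdf("/tmp/df_single", file_type="single")
```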
@@ -1503,12 +1512,21 @@ def to_parquet(self, path, index=False, columns=None, compression: Optional[str]
Default None
Provide the compression type to use when writing the file.
Supported values: snappy, gzip, brotli, zstd, lz4
Returns
-------
None
Raises
------
RuntimeError
Raised if a server-side error is thrown saving the pdarray
Notes
-----
This method saves one file per locale of the arkouda server. All
files are prefixed by the path argument and suffixed by their
locale number.
See Also
--------
to_hdf, load
"""
from arkouda.io import to_parquet

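The Parquet counterpart, with the optional compression parameter (again, data and paths are illustrative):

```python
import arkouda as ak

ak.connect()

df = ak.DataFrame({"id": ak.arange(6), "value": ak.randint(0, 10, 6)})

# Parquet writes one file per locale; compression is optional.
df.to_parquet("/tmp/df_pq", compression="snappy")
```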
76 changes: 76 additions & 0 deletions arkouda/index.py
@@ -202,6 +202,44 @@ def to_hdf(
mode: str = "truncate",
file_type: str = "distribute",
) -> str:
"""
Save the Index to HDF5.
The object can be saved to a collection of files or a single file.
Parameters
----------
prefix_path : str
Directory and filename prefix that all output files share
dataset : str
Name of the dataset to create in files (must not already exist)
mode : str {'truncate' | 'append'}
By default, truncate (overwrite) output files, if they exist.
If 'append', attempt to create new dataset in existing files.
file_type: str ("single" | "distribute")
Default: "distribute"
When set to single, dataset is written to a single file.
When distribute, dataset is written on a file per locale.
This is only supported by HDF5 files and will have no impact on Parquet files.
Returns
-------
string message indicating result of save operation
Raises
-------
RuntimeError
Raised if a server-side error is thrown saving the pdarray
Notes
-----
- The prefix_path must be visible to the arkouda server and the user must
have write permission.
- Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
ranges from 0 to ``numLocales`` for `file_type='distribute'`. Otherwise,
the file name will be `prefix_path`.
- If any of the output files already exist and
the mode is 'truncate', they will be overwritten. If the mode is 'append'
and the number of output files is less than the number of locales or a
dataset with the same name already exists, a ``RuntimeError`` will result.
- Any file extension can be used. The file I/O does not rely on the extension to
  determine the file format.
"""
return self.values.to_hdf(prefix_path, dataset=dataset, mode=mode, file_type=file_type)

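A short sketch of saving an Index to HDF5 under the documented API (the constructor call and paths are illustrative assumptions):

```python
import arkouda as ak

ak.connect()

idx = ak.Index(ak.arange(10))
idx.to_hdf("/tmp/idx_data", dataset="index")
```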
def to_parquet(
@@ -211,6 +249,44 @@
mode: str = "truncate",
compression: Optional[str] = None,
):
"""
Save the Index to Parquet. The result is a collection of files,
one file per locale of the arkouda server, where each filename starts
with prefix_path. Each locale saves its chunk of the array to its
corresponding file.
Parameters
----------
prefix_path : str
Directory and filename prefix that all output files share
dataset : str
Name of the dataset to create in files (must not already exist)
mode : str {'truncate' | 'append'}
By default, truncate (overwrite) output files, if they exist.
If 'append', attempt to create new dataset in existing files.
compression : str (Optional)
(None | "snappy" | "gzip" | "brotli" | "zstd" | "lz4")
Sets the compression type used with Parquet files
Returns
-------
string message indicating result of save operation
Raises
------
RuntimeError
Raised if a server-side error is thrown saving the pdarray
Notes
-----
- The prefix_path must be visible to the arkouda server and the user must
have write permission.
- Output files have names of the form ``<prefix_path>_LOCALE<i>``, where ``<i>``
  ranges from 0 to ``numLocales``.
- 'append' write mode is supported, but is not efficient.
- If any of the output files already exist and
the mode is 'truncate', they will be overwritten. If the mode is 'append'
and the number of output files is less than the number of locales or a
dataset with the same name already exists, a ``RuntimeError`` will result.
- Any file extension can be used. The file I/O does not rely on the extension to
  determine the file format.
"""
return self.values.to_parquet(prefix_path, dataset=dataset, mode=mode, compression=compression)

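And the Parquet variant with compression (illustrative):

```python
import arkouda as ak

ak.connect()

idx = ak.Index(ak.arange(10))
idx.to_parquet("/tmp/idx_pq", dataset="index", compression="gzip")
```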
def save(
14 changes: 9 additions & 5 deletions arkouda/io.py
@@ -837,10 +837,12 @@ def to_parquet(
ValueError
Raised if (1) the lengths of columns and values differ or (2) the mode
is not 'truncate' or 'append'
RuntimeError
Raised if a server-side error is thrown saving the pdarray
See Also
--------
to_hdf, load, load_all, read
Notes
-----
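For the module-level Parquet writer, a hedged sketch (dataset names and paths are illustrative):

```python
import arkouda as ak

ak.connect()

a = ak.arange(100)
b = ak.randint(0, 100, 100)

# Write two named datasets into one set of Parquet files.
ak.to_parquet({"a": a, "b": b}, "/tmp/multi_pq", compression="zstd")
```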
@@ -931,10 +933,12 @@ def to_hdf(
ValueError
Raised if (1) the lengths of columns and values differ or (2) the mode
is not 'truncate' or 'append'
RuntimeError
Raised if a server-side error is thrown saving the pdarray
See Also
--------
to_parquet, load, load_all, read
Notes
-----
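A matching HDF5 sketch, including the documented 'append' mode (illustrative):

```python
import arkouda as ak

ak.connect()

a = ak.arange(100)

ak.to_hdf({"a": a}, "/tmp/multi_hdf")                     # truncate (default)
ak.to_hdf({"b": a * 2}, "/tmp/multi_hdf", mode="append")  # add a dataset
```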
@@ -1095,7 +1099,7 @@ def load(
See Also
--------
to_parquet, to_hdf, load_all, read
Notes
-----
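Reading a single dataset back with `load` (format inference assumed per the docstring defaults; the prefix is illustrative):

```python
import arkouda as ak

ak.connect()

# Reload one dataset written earlier; the file format is inferred.
a = ak.load("/tmp/multi_hdf", dataset="a")
```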
@@ -1172,7 +1176,7 @@ def load_all(
See Also
--------
to_parquet, to_hdf, load, read
Notes
-----
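And `load_all` for every dataset under a prefix (illustrative):

```python
import arkouda as ak

ak.connect()

# Mapping of dataset name -> arkouda object for everything under the prefix.
data = ak.load_all("/tmp/multi_hdf")
```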
@@ -1273,7 +1277,7 @@ def read(
See Also
--------
get_datasets, ls, read_parquet, read_hdf
Notes
-----
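Finally, `read` with an explicit dataset list over the per-locale files (the glob pattern is an illustrative assumption):

```python
import arkouda as ak

ak.connect()

# Explicit datasets over globbed per-locale files.
data = ak.read("/tmp/multi_pq_LOCALE*", datasets=["a", "b"])
```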