diff --git a/docs/specifications/hdf5.Rmd b/docs/specifications/hdf5.Rmd index 071485d..10df744 100644 --- a/docs/specifications/hdf5.Rmd +++ b/docs/specifications/hdf5.Rmd @@ -28,7 +28,7 @@ If not present, the version is assumed to be \"1.0\" for back-compatibility purp cat("This attribute should hold a scalar string dataset containing the value \"", as.character(.version), "\".", sep="") } ``` -This should use a HDF5 string datatype that is compatible with the UTF-8 encoding. +This should use a datatype that can be represented by a UTF-8 encoded string. ```{r, echo=FALSE, results="asis"} if (.version >= package_version("1.3")) { @@ -57,7 +57,7 @@ By providing unique names, users can improve interoperability with native data s An R list is represented as a HDF5 group (`**/`) with the following attributes: - `uzuki_object`, a scalar string dataset containing the value `"list"`. - This should use a HDF5 string datatype that is compatible with the UTF-8 encoding. + This should use a datatype that can be represented by a UTF-8 encoded string. This group should contain a subgroup `**/data` that contains the list elements. Each list element is itself represented by a subgroup that is named after its 0-based position in the list, e.g., `**/data/0` for the first list element. @@ -65,14 +65,14 @@ One subgroup should be present for each integer in `[0, N)`, given a list of len Each list element may be any of the objects described in this specification, including further nested lists. If the list is named, there will additionally be a 1-dimensional `**/names` string dataset of length equal to the number of elements in `**/data`. -This should use a HDF5 string datatype that is compatible with the UTF-8 encoding. +This should use a datatype that can be represented by a UTF-8 encoded string. ### Atomic vectors An atomic vector is represented as a HDF5 group (`**/`) with the following attributes: - `uzuki_object`, a scalar string dataset containing the value `"vector"`. - This should use a HDF5 string datatype that is compatible with the UTF-8 encoding. + This should use a datatype that can be represented by a UTF-8 encoded string. ```{r, echo=FALSE, results="asis"} if (.version == package_version("1.0")) { cat('- `uzuki_type`, a scalar string dataset containing one of `"integer"`, `"boolean"`, `"number"`, `"string"`, `"date"` or `"date-time"`.') @@ -80,7 +80,7 @@ if (.version == package_version("1.0")) { cat('- `uzuki_type`, a scalar string dataset containing one of `"integer"`, `"boolean"`, `"number"` or `"string"`.') } ``` - This should use a HDF5 string datatype that is compatible with the UTF-8 encoding. + This should use a datatype that can be represented by a UTF-8 encoded string. The group should contain an 1-dimensional dataset at `**/data`. Vectors of length 1 may also be represented as a scalar dataset. @@ -116,12 +116,12 @@ This should be a scalar string dataset that specifies constraints to the format - `"date"`: strings should be `YYYY-MM-DD` dates or the placeholder value. - `"date-time"`: strings should be in the Internet Date/Time format ([RFC 3339, Section 5.6](https://www.rfc-editor.org/rfc/rfc3339#section-5.6)) or the placeholder value. -This should use a HDF5 string datatype that is compatible with the UTF-8 encoding.') +This should use a datatype that can be represented by a UTF-8 encoded string.') } ``` The atomic vector's group may also contain `**/names`, a 1-dimensional string dataset of length equal to that of `**/data`. -This should use a HDF5 string datatype that is compatible with the UTF-8 encoding. +This should use a datatype that can be represented by a UTF-8 encoded string. If `**/data` is a scalar, `**/names` should have length 1. ### Representing missing values @@ -136,7 +136,7 @@ If no such attribute is present, it can be assumed that there are no missing val if (.version >= package_version("1.2")) { cat('The datatype of the placeholder attribute should be exactly the same as that of `**/data`, so as to avoid unexpected results upon casting. -The only exception is when `**/data` is a string, in which case the placeholder may be of any string datatype that is compatible with the UTF-8 encoding. +The only exception is when `**/data` is a string, in which case the placeholder may be of any string datatype that can be represented by a UTF-8 encoded string. it is expected that any comparison between the placeholder and strings in `**/data` will be performed bytewise in the same manner as `strcmp`.') } @@ -170,7 +170,7 @@ If no such attribute is present, it can be assumed that there are no missing val A factor is represented as a HDF5 group (`**/`) with the following attributes: - `uzuki_object`, a scalar string dataset containing the value `"vector"`. - This should use a HDF5 string datatype that is compatible with the UTF-8 encoding. + This should use a datatype that can be represented by a UTF-8 encoded string. ```{r, echo=FALSE, results="asis"} if (.version == package_version("1.0")) { cat('- `uzuki_type`, a scalar string dataset containing `"factor"` or `"ordered"`.') @@ -178,7 +178,7 @@ if (.version == package_version("1.0")) { cat('- `uzuki_type`, a scalar string dataset containing `"factor"`.') } ``` - This should use a HDF5 string datatype that is compatible with the UTF-8 encoding. + This should use a datatype that can be represented by a UTF-8 encoded string. The group should contain an 1-dimensional dataset at `**/data`, containing 0-based indices into the levels. This should use a HDF5 integer datatype that can be represented by a 32-bit signed integer. @@ -186,7 +186,7 @@ This should use a HDF5 integer datatype that can be represented by a 32-bit sign Missing values are represented as described above for atomic vectors. The group should contain `**/levels`, a 1-dimensional string dataset that contains the levels for the indices in `**/data`. -This should use a HDF5 string datatype that is compatible with the UTF-8 encoding. +This should use a datatype that can be represented by a UTF-8 encoded string. Values in `**/levels` should be unique. Values in `**/data` should be non-negative (missing values excepted) and less than the length of `**/levels`. @@ -194,7 +194,7 @@ Note that the datatype constraints on `**/data` suggest that there should not be as beyond that, the levels cannot be indexed by elements of `**/data`. The group may also contain `**/names`, a 1-dimensional string dataset of length equal to `data`. -This should use a HDF5 string datatype is compatible with the UTF-8 encoding. +This should use a datatype that can be represented by a UTF-8 encoded string. ```{r, echo=FALSE, results="asis"} if (.version == package_version("1.1")) { @@ -208,14 +208,14 @@ This should be interpreted as a boolean where a non-zero value specifies that we A "nothing" (a.k.a., "null", "none") value is represented as a HDF5 group with the following attributes: - `uzuki_object`, a scalar string dataset containing the value `"nothing"`. - This should use a HDF5 string datatype that is compatible with the UTF-8 encoding. + This should use a datatype that can be represented by a UTF-8 encoded string. ### External object Each external object is represented as a HDF5 group (`**/`) with the following attributes: - `uzuki_object`, a scalar string dataset containing the value `"external"`. - This should use a HDF5 string datatype that is compatible with the UTF-8 encoding. + This should use a datatype that can be represented by a UTF-8 encoded string. This should contain an `**/index` scalar dataset, containing an index that identifies this external object uniquely within the entire list. `**/index` should start at zero and be incremented whenever an external object is encountered. diff --git a/include/uzuki2/parse_hdf5.hpp b/include/uzuki2/parse_hdf5.hpp index 6faf0d4..76aabde 100644 --- a/include/uzuki2/parse_hdf5.hpp +++ b/include/uzuki2/parse_hdf5.hpp @@ -91,7 +91,7 @@ void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, co template void parse_string_like(const H5::DataSet& handle, Host* ptr, Function check, hsize_t buffer_size) try { if (!ritsuko::hdf5::is_utf8_string(handle)) { - throw std::runtime_error("expected a string datatype with a UTF-8 compatible encoding"); + throw std::runtime_error("expected a datatype that can be represented by a UTF-8 encoded string"); } auto missingness = ritsuko::hdf5::open_and_load_optional_string_missing_placeholder(handle, "missing-value-placeholder"); @@ -178,7 +178,7 @@ void extract_names(const H5::Group& handle, Host* ptr, hsize_t buffer_size) try auto nhandle = handle.openDataSet("names"); if (!ritsuko::hdf5::is_utf8_string(nhandle)) { - throw std::runtime_error("expected a string datatype with a UTF-8 compatible encoding"); + throw std::runtime_error("expected a datatype that can be represented by a UTF-8 encoded string"); } size_t len = ptr->size(); @@ -252,7 +252,7 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const } else if (vector_type == "factor" || (version.equals(1, 0) && vector_type == "ordered")) { auto levhandle = ritsuko::hdf5::open_dataset(handle, "levels"); if (!ritsuko::hdf5::is_utf8_string(levhandle)) { - throw std::runtime_error("expected a string datatype with a UTF-8 compatible encoding"); + throw std::runtime_error("expected a datatype that can be represented by a UTF-8 encoded string"); } int32_t levlen = ritsuko::hdf5::get_1d_length(levhandle.getSpace(), false); @@ -300,7 +300,7 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const } else if (handle.exists("format")) { auto fhandle = check_scalar_dataset(handle, "format"); if (!ritsuko::hdf5::is_utf8_string(fhandle)) { - throw std::runtime_error("expected a string datatype with a UTF-8 compatible encoding"); + throw std::runtime_error("expected a datatype that can be represented by a UTF-8 encoded string"); } auto x = ritsuko::hdf5::load_scalar_string_dataset(fhandle); if (x == "date") { diff --git a/tests/src/date.cpp b/tests/src/date.cpp index d0ba449..abcfd15 100644 --- a/tests/src/date.cpp +++ b/tests/src/date.cpp @@ -175,7 +175,7 @@ TEST(Hdf5DateTest, CheckError) { vhandle.createDataSet("format", H5::PredType::NATIVE_INT, H5S_SCALAR); create_dataset(vhandle, "data", { "harry", "ron", "hermoine" }); } - expect_hdf5_error(path, "foo", "string datatype"); + expect_hdf5_error(path, "foo", "can be represented by a UTF-8 encoded string"); { H5::H5File handle(path, H5F_ACC_TRUNC); diff --git a/tests/src/string.cpp b/tests/src/string.cpp index 52d7588..fbeb8e0 100644 --- a/tests/src/string.cpp +++ b/tests/src/string.cpp @@ -164,7 +164,7 @@ TEST(Hdf5StringTest, CheckError) { auto ghandle = vector_opener(handle, "foo", "string"); create_dataset(ghandle, "data", { 1, 2, 3, 4, 5 }, H5::PredType::NATIVE_INT); } - expect_hdf5_error(path, "foo", "expected a string"); + expect_hdf5_error(path, "foo", "UTF-8 encoded string"); { H5::H5File handle(path, H5F_ACC_TRUNC);