From 4ec0a2f60405244500bb091bcc4ea9eb7d14e0ec Mon Sep 17 00:00:00 2001
From: Aaron Lun
Date: Tue, 21 Nov 2023 23:18:59 -0800
Subject: [PATCH] Updated all function calls to use the latest ritsuko. (#15)

By and large, this involves just renaming a few functions and updating the
expected error messages. We also switch to the Stream classes to iterate more
intuitively over the 1-dimensional datasets.
---
 include/uzuki2/parse_hdf5.hpp | 218 +++++++++++++++-------------------
 tests/src/date.cpp            |  18 +++
 tests/src/factor.cpp          |   2 +-
 tests/src/list.cpp            |   4 +-
 4 files changed, 120 insertions(+), 122 deletions(-)

diff --git a/include/uzuki2/parse_hdf5.hpp b/include/uzuki2/parse_hdf5.hpp
index 688f6ef..72ee29f 100644
--- a/include/uzuki2/parse_hdf5.hpp
+++ b/include/uzuki2/parse_hdf5.hpp
@@ -41,14 +41,24 @@ namespace hdf5 {
 /**
  * @cond
  */
-inline H5::DataSet get_scalar_dataset(const H5::Group& handle, const std::string& name, H5T_class_t type_class) try {
-    auto dhandle = ritsuko::hdf5::get_scalar_dataset(handle, name.c_str());
-    if (dhandle.getTypeClass() != type_class) {
-        throw std::runtime_error("dataset has the wrong datatype class");
+template<class Object_>
+std::string check_scalar_string_attribute(const Object_& handle, const char* name) {
+    auto attr = ritsuko::hdf5::open_attribute(handle, name);
+    if (!ritsuko::hdf5::is_scalar(attr) || attr.getTypeClass() != H5T_STRING) {
+        throw std::runtime_error("'" + std::string(name) + "' should be a scalar string attribute");
+    }
+    return ritsuko::hdf5::load_scalar_string_attribute(attr);
+}
+
+inline H5::DataSet check_scalar_dataset(const H5::Group& handle, const char* name) {
+    if (handle.childObjType(name) != H5O_TYPE_DATASET) {
+        throw std::runtime_error("expected '" + std::string(name) + "' to be a dataset");
+    }
+    auto dhandle = handle.openDataSet(name);
+    if (!ritsuko::hdf5::is_scalar(dhandle)) {
+        throw std::runtime_error("expected '" + std::string(name) + "' to be a scalar dataset");
     }
     return dhandle;
-} catch (std::exception& e) {
-    throw std::runtime_error("failed to load scalar dataset at '" + ritsuko::hdf5::get_name(handle) + "/" + name + "'; " + std::string(e.what()));
 }
 
 template<class Host, class Function>
@@ -65,30 +75,23 @@ void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, co
         const char* placeholder_name = "missing-value-placeholder";
         has_missing = handle.attrExists(placeholder_name);
         if (has_missing) {
-            auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ version.lt(1, 2));
+            auto attr = ritsuko::hdf5::open_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ version.lt(1, 2));
             attr.read(H5::PredType::NATIVE_INT32, &missing_value);
         }
     }
 
     hsize_t full_length = ptr->size();
-    auto block_size = ritsuko::hdf5::pick_1d_block_size(handle.getCreatePlist(), full_length, buffer_size);
-    std::vector<int32_t> buffer(block_size);
-    ritsuko::hdf5::iterate_1d_blocks(
-        full_length,
-        block_size,
-        [&](hsize_t counter, hsize_t limit, const H5::DataSpace& mspace, const H5::DataSpace& dspace) -> void {
-            handle.read(buffer.data(), H5::PredType::NATIVE_INT32, mspace, dspace);
-            for (hsize_t i = 0; i < limit; ++i) {
-                auto current = buffer[i];
-                if (has_missing && current == missing_value) {
-                    ptr->set_missing(counter + i);
-                } else {
-                    check(current);
-                    ptr->set(counter + i, current);
-                }
-            }
+    ritsuko::hdf5::Stream1dNumericDataset<int32_t> stream(&handle, full_length, buffer_size);
+    for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
+        auto current = stream.get();
+        if (has_missing && current == missing_value) {
+            ptr->set_missing(i);
+        } else {
+            check(current);
+            ptr->set(i, current);
         }
-    );
+    }
+
 } catch (std::exception& e) {
     throw std::runtime_error("failed to load integer dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
 }
@@ -100,28 +103,22 @@ void parse_string_like(const H5::DataSet& handle, Host* ptr, Function check, hsi
         throw std::runtime_error("expected a string dataset");
     }
 
-    const char* placeholder_name = "missing-value-placeholder";
-    bool has_missing = handle.attrExists(placeholder_name);
-    std::string missing_val;
-    if (has_missing) {
-        auto ahandle = ritsuko::hdf5::get_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ true);
-        missing_val = ritsuko::hdf5::load_scalar_string_attribute(ahandle);
-    }
+    auto missingness = ritsuko::hdf5::load_string_missing_placeholder(handle, "missing-value-placeholder");
+    bool has_missing = missingness.first;
+    std::string missing_val = missingness.second;
 
-    ritsuko::hdf5::load_1d_string_dataset(
-        handle,
-        ptr->size(),
-        buffer_size,
-        [&](size_t i, const char* str, size_t len) -> void {
-            std::string x(str, str + len);
-            if (has_missing && x == missing_val) {
-                ptr->set_missing(i);
-            } else {
-                check(x);
-                ptr->set(i, std::move(x));
-            }
+    hsize_t full_length = ptr->size();
+    ritsuko::hdf5::Stream1dStringDataset stream(&handle, full_length, buffer_size);
+    for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
+        auto x = stream.steal();
+        if (has_missing && x == missing_val) {
+            ptr->set_missing(i);
+        } else {
+            check(x);
+            ptr->set(i, std::move(x));
         }
-    );
+    }
+
 } catch (std::exception& e) {
     throw std::runtime_error("failed to load string dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
 }
@@ -148,7 +145,7 @@ void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const V
         const char* placeholder_name = "missing-value-placeholder";
         has_missing = handle.attrExists(placeholder_name);
         if (has_missing) {
-            auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ version.lt(1, 2));
+            auto attr = ritsuko::hdf5::open_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ version.lt(1, 2));
             attr.read(H5::PredType::NATIVE_DOUBLE, &missing_value);
         }
     }
@@ -166,24 +163,17 @@ void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const V
     };
 
     hsize_t full_length = ptr->size();
-    auto block_size = ritsuko::hdf5::pick_1d_block_size(handle.getCreatePlist(), full_length, buffer_size);
-    std::vector<double> buffer(block_size);
-    ritsuko::hdf5::iterate_1d_blocks(
-        full_length,
-        block_size,
-        [&](hsize_t counter, hsize_t limit, const H5::DataSpace& mspace, const H5::DataSpace& dspace) -> void {
-            handle.read(buffer.data(), H5::PredType::NATIVE_DOUBLE, mspace, dspace);
-            for (hsize_t i = 0; i < limit; ++i) {
-                auto current = buffer[i];
-                if (has_missing && is_missing_value(current)) {
-                    ptr->set_missing(counter + i);
-                } else {
-                    check(current);
-                    ptr->set(counter + i, current);
-                }
-            }
+    ritsuko::hdf5::Stream1dNumericDataset<double> stream(&handle, full_length, buffer_size);
+    for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
+        auto current = stream.get();
+        if (has_missing && is_missing_value(current)) {
+            ptr->set_missing(i);
+        } else {
+            check(current);
+            ptr->set(i, current);
         }
-    );
+    }
+
 } catch (std::exception& e) {
     throw std::runtime_error("failed to load floating-point dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
 }
@@ -206,14 +196,10 @@ void extract_names(const H5::Group& handle, Host* ptr, hsize_t buffer_size) try
         throw std::runtime_error("number of names should be equal to the object length");
     }
 
-    ritsuko::hdf5::load_1d_string_dataset(
-        nhandle,
-        nlen,
-        buffer_size,
-        [&](size_t i, const char* val, size_t len) -> void {
-            ptr->set_name(i, std::string(val, val + len));
-        }
-    );
+    ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nlen, buffer_size);
+    for (size_t i = 0; i < nlen; ++i, stream.next()) {
+        ptr->set_name(i, stream.steal());
+    }
 } catch (std::exception& e) {
     throw std::runtime_error("failed to load names at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
 }
@@ -221,27 +207,25 @@ template<class Provisioner, class Externals>
 std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const Version& version, hsize_t buffer_size) try {
     // Deciding what type we're dealing with.
-    auto object_type = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_object");
+    auto object_type = check_scalar_string_attribute(handle, "uzuki_object");
 
     std::shared_ptr<Base> output;
     if (object_type == "list") {
-        if (!handle.exists("data") || handle.childObjType("data") != H5O_TYPE_GROUP) {
-            throw std::runtime_error("expected a group at 'data'");
-        }
-        auto dhandle = handle.openGroup("data");
+        auto dhandle = ritsuko::hdf5::open_group(handle, "data");
         size_t len = dhandle.getNumObjs();
 
         bool named = handle.exists("names");
         auto lptr = Provisioner::new_List(len, named);
         output.reset(lptr);
 
-        for (size_t i = 0; i < len; ++i) {
-            auto istr = std::to_string(i);
-            if (!dhandle.exists(istr) || dhandle.childObjType(istr) != H5O_TYPE_GROUP) {
-                throw std::runtime_error("expected a group at 'data/" + istr + "'");
+        try {
+            for (size_t i = 0; i < len; ++i) {
+                auto istr = std::to_string(i);
+                auto lhandle = ritsuko::hdf5::open_group(dhandle, istr.c_str());
+                lptr->set(i, parse_inner<Provisioner>(lhandle, ext, version, buffer_size));
             }
-            auto lhandle = dhandle.openGroup(istr);
-            lptr->set(i, parse_inner<Provisioner>(lhandle, ext, version, buffer_size));
+        } catch (std::exception& e) {
+            throw std::runtime_error("failed to parse list contents in 'data'; " + std::string(e.what()));
         }
 
         if (named) {
@@ -249,9 +233,9 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
         }
 
     } else if (object_type == "vector") {
-        auto vector_type = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_type");
+        auto vector_type = check_scalar_string_attribute(handle, "uzuki_type");
 
-        auto dhandle = ritsuko::hdf5::get_dataset(handle, "data");
+        auto dhandle = ritsuko::hdf5::open_dataset(handle, "data");
         size_t len = ritsuko::hdf5::get_1d_length(dhandle.getSpace(), true);
         bool is_scalar = (len == 0);
         if (is_scalar) {
@@ -275,7 +259,7 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
             }, version, buffer_size);
         } else if (vector_type == "factor" || (version.equals(1, 0) && vector_type == "ordered")) {
-            auto levhandle = ritsuko::hdf5::get_dataset(handle, "levels");
+            auto levhandle = ritsuko::hdf5::open_dataset(handle, "levels");
             auto levtype = levhandle.getDataType();
             if (levtype.getClass() != H5T_STRING) {
                 throw std::runtime_error("expected a string dataset for the levels at 'levels'");
             }
@@ -286,9 +270,12 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
             if (vector_type == "ordered") {
                 ordered = true;
             } else if (handle.exists("ordered")) {
-                auto ohandle = get_scalar_dataset(handle, "ordered", H5T_INTEGER);
-                int tmp_ordered = 0;
-                ohandle.read(&tmp_ordered, H5::PredType::NATIVE_INT);
+                auto ohandle = check_scalar_dataset(handle, "ordered");
+                if (ritsuko::hdf5::exceeds_integer_limit(ohandle, 32, true)) {
+                    throw std::runtime_error("'ordered' value cannot be represented by a 32-bit integer");
+                }
+                int32_t tmp_ordered = 0;
+                ohandle.read(&tmp_ordered, H5::PredType::NATIVE_INT32);
                 ordered = tmp_ordered > 0;
             }
 
@@ -301,19 +288,15 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
             }, version, buffer_size);
 
             std::unordered_set<std::string> present;
-            ritsuko::hdf5::load_1d_string_dataset(
-                levhandle,
-                levlen,
-                buffer_size,
-                [&](size_t i, const char* val, size_t len) -> void {
-                    std::string x(val, val + len);
-                    if (present.find(x) != present.end()) {
-                        throw std::runtime_error("levels should be unique");
-                    }
-                    fptr->set_level(i, x);
-                    present.insert(std::move(x));
+            ritsuko::hdf5::Stream1dStringDataset stream(&levhandle, levlen, buffer_size);
+            for (int32_t i = 0; i < levlen; ++i, stream.next()) {
+                auto x = stream.steal();
+                if (present.find(x) != present.end()) {
+                    throw std::runtime_error("levels should be unique");
                 }
-            );
+                fptr->set_level(i, x);
+                present.insert(std::move(x));
+            }
 
         } else if (vector_type == "string" || (version.equals(1, 0) && (vector_type == "date" || vector_type == "date-time"))) {
             StringVector::Format format = StringVector::NONE;
@@ -323,23 +306,20 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
                 } else if (vector_type == "date-time") {
                     format = StringVector::DATETIME;
                 }
+
             } else if (handle.exists("format")) {
-                auto fhandle = get_scalar_dataset(handle, "format", H5T_STRING);
-                ritsuko::hdf5::load_1d_string_dataset(
-                    fhandle,
-                    1,
-                    buffer_size,
-                    [&](size_t, const char* val, size_t len) -> void {
-                        std::string x(val, val + len);
-                        if (x == "date") {
-                            format = StringVector::DATE;
-                        } else if (x == "date-time") {
-                            format = StringVector::DATETIME;
-                        } else {
-                            throw std::runtime_error("unsupported format '" + x + "'");
-                        }
-                    }
-                );
+                auto fhandle = check_scalar_dataset(handle, "format");
+                if (fhandle.getTypeClass() != H5T_STRING) {
+                    throw std::runtime_error("'format' dataset should have a string datatype class");
+                }
+                auto x = ritsuko::hdf5::load_scalar_string_dataset(fhandle);
+                if (x == "date") {
+                    format = StringVector::DATE;
+                } else if (x == "date-time") {
+                    format = StringVector::DATETIME;
+                } else {
+                    throw std::runtime_error("unsupported format '" + x + "'");
+                }
             }
 
             auto sptr = Provisioner::new_String(len, named, is_scalar, format);
@@ -380,7 +360,7 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
         output.reset(Provisioner::new_Nothing());
 
     } else if (object_type == "external") {
-        auto ihandle = ritsuko::hdf5::get_dataset(handle, "index");
+        auto ihandle = ritsuko::hdf5::open_dataset(handle, "index");
         if (ritsuko::hdf5::exceeds_integer_limit(ihandle, 32, true)) {
             throw std::runtime_error("external index at 'index' cannot be represented by a 32-bit signed integer");
         }
@@ -475,7 +455,7 @@ template<class Provisioner, class Externals>
 ParsedList parse(const H5::Group& handle, Externals ext, Options options = Options()) {
     Version version;
     if (handle.attrExists("uzuki_version")) {
-        auto ver_str = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_version");
+        auto ver_str = check_scalar_string_attribute(handle, "uzuki_version");
         auto vraw = ritsuko::parse_version_string(ver_str.c_str(), ver_str.size(), /* skip_patch = */ true);
         version.major = vraw.major;
         version.minor = vraw.minor;
@@ -530,7 +510,7 @@ ParsedList parse(const H5::Group& handle, Options options = Options()) {
 template<class Provisioner, class Externals>
 ParsedList parse(const std::string& file, const std::string& name, Externals ext, Options options = Options()) {
     H5::H5File handle(file, H5F_ACC_RDONLY);
-    return parse<Provisioner>(handle.openGroup(name), std::move(ext), std::move(options));
+    return parse<Provisioner>(ritsuko::hdf5::open_group(handle, name.c_str()), std::move(ext), std::move(options));
 }
 
 /**
@@ -551,7 +531,7 @@ ParsedList parse(const std::string& file, const std::string& name, Externals ext
 template<class Provisioner>
 ParsedList parse(const std::string& file, const std::string& name, Options options = Options()) {
     H5::H5File handle(file, H5F_ACC_RDONLY);
-    return parse<Provisioner>(handle.openGroup(name), uzuki2::DummyExternals(0), std::move(options));
+    return parse<Provisioner>(ritsuko::hdf5::open_group(handle, name.c_str()), uzuki2::DummyExternals(0), std::move(options));
 }
 
 /**
diff --git a/tests/src/date.cpp b/tests/src/date.cpp
index 043fd7d..d0ba449 100644
--- a/tests/src/date.cpp
+++ b/tests/src/date.cpp
@@ -159,6 +159,24 @@ TEST(Hdf5DateTest, CheckError) {
     }
     expect_hdf5_error(path, "foo", "dates should follow");
 
+    {
+        H5::H5File handle(path, H5F_ACC_TRUNC);
+        auto vhandle = vector_opener(handle, "foo", "string");
+        add_version(vhandle, "1.1");
+        write_string(vhandle, "format", "foobar");
+        create_dataset(vhandle, "data", { "harry", "ron", "hermoine" });
+    }
+    expect_hdf5_error(path, "foo", "unsupported format");
+
+    {
+        H5::H5File handle(path, H5F_ACC_TRUNC);
+        auto vhandle = vector_opener(handle, "foo", "string");
+        add_version(vhandle, "1.1");
+        vhandle.createDataSet("format", H5::PredType::NATIVE_INT, H5S_SCALAR);
+        create_dataset(vhandle, "data", { "harry", "ron", "hermoine" });
+    }
+    expect_hdf5_error(path, "foo", "string datatype");
+
     {
         H5::H5File handle(path, H5F_ACC_TRUNC);
         auto vhandle = vector_opener(handle, "foo", "date");
diff --git a/tests/src/factor.cpp b/tests/src/factor.cpp
index fecee53..a94be29 100644
--- a/tests/src/factor.cpp
+++ b/tests/src/factor.cpp
@@ -199,7 +199,7 @@ TEST(Hdf5FactorTest, CheckError) {
         create_dataset(vhandle, "levels", { "Malcolm", "Tony", "John" });
         write_scalar(vhandle, "ordered", 1.2, H5::PredType::NATIVE_DOUBLE);
     }
-    expect_hdf5_error(path, "blub", "wrong datatype");
+    expect_hdf5_error(path, "blub", "cannot be represented");
 
     {
         H5::H5File handle(path, H5F_ACC_TRUNC);
diff --git a/tests/src/list.cpp b/tests/src/list.cpp
index 3e11630..1c8a371 100644
--- a/tests/src/list.cpp
+++ b/tests/src/list.cpp
@@ -96,7 +96,7 @@ TEST(Hdf5ListTest, CheckError) {
         auto dhandle = ghandle.createGroup("data");
         nothing_opener(dhandle, "1");
     }
-    expect_hdf5_error(path, "foo", "expected a group at 'data/0'");
+    expect_hdf5_error(path, "foo", "expected a group at '0'");
 
     {
         H5::H5File handle(path, H5F_ACC_TRUNC);
@@ -104,7 +104,7 @@
         auto dhandle = ghandle.createGroup("data");
         create_dataset(dhandle, "0", { 1, 2, 3 }, H5::PredType::NATIVE_INT);
     }
-    expect_hdf5_error(path, "foo", "expected a group at 'data/0'");
+    expect_hdf5_error(path, "foo", "expected a group at '0'");
 }
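
Note on the Stream1d* iteration pattern (illustrative only, not part of the patch): the commit
replaces the iterate_1d_blocks()/load_1d_string_dataset() callbacks with streaming iteration.
The sketch below shows that pattern in isolation, assuming the same constructor and get()/next()
signatures used in the diff above; the helper name read_1d_int32 and the "ritsuko/hdf5/hdf5.hpp"
umbrella include are assumptions for the example.

    // Illustrative sketch only -- not part of the patch.
    #include "ritsuko/hdf5/hdf5.hpp" // assumed umbrella header for the Stream1d* classes
    #include "H5Cpp.h"
    #include <cstdint>
    #include <vector>

    // Hypothetical helper: read a 1-dimensional 32-bit integer dataset into memory.
    std::vector<int32_t> read_1d_int32(const H5::DataSet& handle, hsize_t buffer_size) {
        // As in the patch, a scalar dataset reports a length of zero here.
        hsize_t full_length = ritsuko::hdf5::get_1d_length(handle.getSpace(), true);
        std::vector<int32_t> output;
        output.reserve(full_length);

        // get() returns the current element; next() advances the stream, which is
        // expected to refill its internal buffer in blocks of at most 'buffer_size'.
        ritsuko::hdf5::Stream1dNumericDataset<int32_t> stream(&handle, full_length, buffer_size);
        for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
            output.push_back(stream.get());
        }
        return output;
    }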