Skip to content

Commit

Permalink
Updated all function calls to use the latest ritsuko. (#15)
Browse files Browse the repository at this point in the history
By and large, this involves just renaming a few functions and updating the
expected error messages. We also switch to the Stream classes to iterate more
intuitively over the 1-dimensional datasets.
  • Loading branch information
LTLA authored Nov 22, 2023
1 parent 5f8d3f0 commit 4ec0a2f
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 122 deletions.
218 changes: 99 additions & 119 deletions include/uzuki2/parse_hdf5.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,24 @@ namespace hdf5 {
/**
* @cond
*/
inline H5::DataSet get_scalar_dataset(const H5::Group& handle, const std::string& name, H5T_class_t type_class) try {
auto dhandle = ritsuko::hdf5::get_scalar_dataset(handle, name.c_str());
if (dhandle.getTypeClass() != type_class) {
throw std::runtime_error("dataset has the wrong datatype class");
// Opens the attribute called 'name' on 'handle' (via ritsuko::hdf5::open_attribute),
// verifies that it is a scalar with a string datatype class, and returns the
// value produced by ritsuko::hdf5::load_scalar_string_attribute for it.
//
// handle: any HDF5 object accepted by ritsuko::hdf5::open_attribute.
// name: name of the attribute; also quoted in the error message below.
//
// Throws std::runtime_error if the attribute is not scalar or is not of the
// H5T_STRING class. Any failure raised by the ritsuko helpers themselves is
// propagated unchanged.
template<class Object_>
std::string check_scalar_string_attribute(const Object_& handle, const char* name) {
    auto attribute = ritsuko::hdf5::open_attribute(handle, name);

    // The datatype class is only queried once the attribute is known to be scalar.
    const bool is_scalar_string = ritsuko::hdf5::is_scalar(attribute) && attribute.getTypeClass() == H5T_STRING;
    if (is_scalar_string) {
        return ritsuko::hdf5::load_scalar_string_attribute(attribute);
    }

    std::string message = "'";
    message += name;
    message += "' should be a scalar string attribute";
    throw std::runtime_error(message);
}

inline H5::DataSet check_scalar_dataset(const H5::Group& handle, const char* name) {
if (handle.childObjType(name) != H5O_TYPE_DATASET) {
throw std::runtime_error("expected '" + std::string(name) + "' to be a dataset");
}
auto dhandle = handle.openDataSet(name);
if (!ritsuko::hdf5::is_scalar(dhandle)) {
throw std::runtime_error("expected '" + std::string(name) + "'to be a scalar dataset");
}
return dhandle;
} catch (std::exception& e) {
throw std::runtime_error("failed to load scalar dataset at '" + ritsuko::hdf5::get_name(handle) + "/" + name + "'; " + std::string(e.what()));
}

template<class Host, class Function>
Expand All @@ -65,30 +75,23 @@ void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, co
const char* placeholder_name = "missing-value-placeholder";
has_missing = handle.attrExists(placeholder_name);
if (has_missing) {
auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ version.lt(1, 2));
auto attr = ritsuko::hdf5::open_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ version.lt(1, 2));
attr.read(H5::PredType::NATIVE_INT32, &missing_value);
}
}

hsize_t full_length = ptr->size();
auto block_size = ritsuko::hdf5::pick_1d_block_size(handle.getCreatePlist(), full_length, buffer_size);
std::vector<int32_t> buffer(block_size);
ritsuko::hdf5::iterate_1d_blocks(
full_length,
block_size,
[&](hsize_t counter, hsize_t limit, const H5::DataSpace& mspace, const H5::DataSpace& dspace) -> void {
handle.read(buffer.data(), H5::PredType::NATIVE_INT32, mspace, dspace);
for (hsize_t i = 0; i < limit; ++i) {
auto current = buffer[i];
if (has_missing && current == missing_value) {
ptr->set_missing(counter + i);
} else {
check(current);
ptr->set(counter + i, current);
}
}
ritsuko::hdf5::Stream1dNumericDataset<int32_t> stream(&handle, full_length, buffer_size);
for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
auto current = stream.get();
if (has_missing && current == missing_value) {
ptr->set_missing(i);
} else {
check(current);
ptr->set(i, current);
}
);
}

} catch (std::exception& e) {
throw std::runtime_error("failed to load integer dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
}
Expand All @@ -100,28 +103,22 @@ void parse_string_like(const H5::DataSet& handle, Host* ptr, Function check, hsi
throw std::runtime_error("expected a string dataset");
}

const char* placeholder_name = "missing-value-placeholder";
bool has_missing = handle.attrExists(placeholder_name);
std::string missing_val;
if (has_missing) {
auto ahandle = ritsuko::hdf5::get_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ true);
missing_val = ritsuko::hdf5::load_scalar_string_attribute(ahandle);
}
auto missingness = ritsuko::hdf5::load_string_missing_placeholder(handle, "missing-value-placeholder");
bool has_missing = missingness.first;
std::string missing_val = missingness.second;

ritsuko::hdf5::load_1d_string_dataset(
handle,
ptr->size(),
buffer_size,
[&](size_t i, const char* str, size_t len) -> void {
std::string x(str, str + len);
if (has_missing && x == missing_val) {
ptr->set_missing(i);
} else {
check(x);
ptr->set(i, std::move(x));
}
hsize_t full_length = ptr->size();
ritsuko::hdf5::Stream1dStringDataset stream(&handle, full_length, buffer_size);
for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
auto x = stream.steal();
if (has_missing && x == missing_val) {
ptr->set_missing(i);
} else {
check(x);
ptr->set(i, std::move(x));
}
);
}

} catch (std::exception& e) {
throw std::runtime_error("failed to load string dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
}
Expand All @@ -148,7 +145,7 @@ void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const V
const char* placeholder_name = "missing-value-placeholder";
has_missing = handle.attrExists(placeholder_name);
if (has_missing) {
auto attr = ritsuko::hdf5::get_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ version.lt(1, 2));
auto attr = ritsuko::hdf5::open_missing_placeholder_attribute(handle, placeholder_name, /* type_class_only = */ version.lt(1, 2));
attr.read(H5::PredType::NATIVE_DOUBLE, &missing_value);
}
}
Expand All @@ -166,24 +163,17 @@ void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const V
};

hsize_t full_length = ptr->size();
auto block_size = ritsuko::hdf5::pick_1d_block_size(handle.getCreatePlist(), full_length, buffer_size);
std::vector<double> buffer(block_size);
ritsuko::hdf5::iterate_1d_blocks(
full_length,
block_size,
[&](hsize_t counter, hsize_t limit, const H5::DataSpace& mspace, const H5::DataSpace& dspace) -> void {
handle.read(buffer.data(), H5::PredType::NATIVE_DOUBLE, mspace, dspace);
for (hsize_t i = 0; i < limit; ++i) {
auto current = buffer[i];
if (has_missing && is_missing_value(current)) {
ptr->set_missing(counter + i);
} else {
check(current);
ptr->set(counter + i, current);
}
}
ritsuko::hdf5::Stream1dNumericDataset<double> stream(&handle, full_length, buffer_size);
for (hsize_t i = 0; i < full_length; ++i, stream.next()) {
auto current = stream.get();
if (has_missing && is_missing_value(current)) {
ptr->set_missing(i);
} else {
check(current);
ptr->set(i, current);
}
);
}

} catch (std::exception& e) {
throw std::runtime_error("failed to load floating-point dataset at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
}
Expand All @@ -206,52 +196,46 @@ void extract_names(const H5::Group& handle, Host* ptr, hsize_t buffer_size) try
throw std::runtime_error("number of names should be equal to the object length");
}

ritsuko::hdf5::load_1d_string_dataset(
nhandle,
nlen,
buffer_size,
[&](size_t i, const char* val, size_t len) -> void {
ptr->set_name(i, std::string(val, val + len));
}
);
ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nlen, buffer_size);
for (size_t i = 0; i < nlen; ++i, stream.next()) {
ptr->set_name(i, stream.steal());
}
} catch (std::exception& e) {
throw std::runtime_error("failed to load names at '" + ritsuko::hdf5::get_name(handle) + "'; " + std::string(e.what()));
}

template<class Provisioner, class Externals>
std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const Version& version, hsize_t buffer_size) try {
// Deciding what type we're dealing with.
auto object_type = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_object");
auto object_type = check_scalar_string_attribute(handle, "uzuki_object");
std::shared_ptr<Base> output;

if (object_type == "list") {
if (!handle.exists("data") || handle.childObjType("data") != H5O_TYPE_GROUP) {
throw std::runtime_error("expected a group at 'data'");
}
auto dhandle = handle.openGroup("data");
auto dhandle = ritsuko::hdf5::open_group(handle, "data");
size_t len = dhandle.getNumObjs();

bool named = handle.exists("names");
auto lptr = Provisioner::new_List(len, named);
output.reset(lptr);

for (size_t i = 0; i < len; ++i) {
auto istr = std::to_string(i);
if (!dhandle.exists(istr) || dhandle.childObjType(istr) != H5O_TYPE_GROUP) {
throw std::runtime_error("expected a group at 'data/" + istr + "'");
try {
for (size_t i = 0; i < len; ++i) {
auto istr = std::to_string(i);
auto lhandle = ritsuko::hdf5::open_group(dhandle, istr.c_str());
lptr->set(i, parse_inner<Provisioner>(lhandle, ext, version, buffer_size));
}
auto lhandle = dhandle.openGroup(istr);
lptr->set(i, parse_inner<Provisioner>(lhandle, ext, version, buffer_size));
} catch (std::exception& e) {
throw std::runtime_error("failed to parse list contents in 'data'; " + std::string(e.what()));
}

if (named) {
extract_names(handle, lptr, buffer_size);
}

} else if (object_type == "vector") {
auto vector_type = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_type");
auto vector_type = check_scalar_string_attribute(handle, "uzuki_type");

auto dhandle = ritsuko::hdf5::get_dataset(handle, "data");
auto dhandle = ritsuko::hdf5::open_dataset(handle, "data");
size_t len = ritsuko::hdf5::get_1d_length(dhandle.getSpace(), true);
bool is_scalar = (len == 0);
if (is_scalar) {
Expand All @@ -275,7 +259,7 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
}, version, buffer_size);

} else if (vector_type == "factor" || (version.equals(1, 0) && vector_type == "ordered")) {
auto levhandle = ritsuko::hdf5::get_dataset(handle, "levels");
auto levhandle = ritsuko::hdf5::open_dataset(handle, "levels");
auto levtype = levhandle.getDataType();
if (levtype.getClass() != H5T_STRING) {
throw std::runtime_error("expected a string dataset for the levels at 'levels'");
Expand All @@ -286,9 +270,12 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
if (vector_type == "ordered") {
ordered = true;
} else if (handle.exists("ordered")) {
auto ohandle = get_scalar_dataset(handle, "ordered", H5T_INTEGER);
int tmp_ordered = 0;
ohandle.read(&tmp_ordered, H5::PredType::NATIVE_INT);
auto ohandle = check_scalar_dataset(handle, "ordered");
if (ritsuko::hdf5::exceeds_integer_limit(ohandle, 32, true)) {
throw std::runtime_error("'ordered' value cannot be represented by a 32-bit integer");
}
int32_t tmp_ordered = 0;
ohandle.read(&tmp_ordered, H5::PredType::NATIVE_INT32);
ordered = tmp_ordered > 0;
}

Expand All @@ -301,19 +288,15 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
}, version, buffer_size);

std::unordered_set<std::string> present;
ritsuko::hdf5::load_1d_string_dataset(
levhandle,
levlen,
buffer_size,
[&](size_t i, const char* val, size_t len) -> void {
std::string x(val, val + len);
if (present.find(x) != present.end()) {
throw std::runtime_error("levels should be unique");
}
fptr->set_level(i, x);
present.insert(std::move(x));
ritsuko::hdf5::Stream1dStringDataset stream(&levhandle, levlen, buffer_size);
for (int32_t i = 0; i < levlen; ++i, stream.next()) {
auto x = stream.steal();
if (present.find(x) != present.end()) {
throw std::runtime_error("levels should be unique");
}
);
fptr->set_level(i, x);
present.insert(std::move(x));
}

} else if (vector_type == "string" || (version.equals(1, 0) && (vector_type == "date" || vector_type == "date-time"))) {
StringVector::Format format = StringVector::NONE;
Expand All @@ -323,23 +306,20 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
} else if (vector_type == "date-time") {
format = StringVector::DATETIME;
}

} else if (handle.exists("format")) {
auto fhandle = get_scalar_dataset(handle, "format", H5T_STRING);
ritsuko::hdf5::load_1d_string_dataset(
fhandle,
1,
buffer_size,
[&](size_t, const char* val, size_t len) -> void {
std::string x(val, val + len);
if (x == "date") {
format = StringVector::DATE;
} else if (x == "date-time") {
format = StringVector::DATETIME;
} else {
throw std::runtime_error("unsupported format '" + x + "'");
}
}
);
auto fhandle = check_scalar_dataset(handle, "format");
if (fhandle.getTypeClass() != H5T_STRING) {
throw std::runtime_error("'format' dataset should have a string datatype class");
}
auto x = ritsuko::hdf5::load_scalar_string_dataset(fhandle);
if (x == "date") {
format = StringVector::DATE;
} else if (x == "date-time") {
format = StringVector::DATETIME;
} else {
throw std::runtime_error("unsupported format '" + x + "'");
}
}

auto sptr = Provisioner::new_String(len, named, is_scalar, format);
Expand Down Expand Up @@ -380,7 +360,7 @@ std::shared_ptr<Base> parse_inner(const H5::Group& handle, Externals& ext, const
output.reset(Provisioner::new_Nothing());

} else if (object_type == "external") {
auto ihandle = ritsuko::hdf5::get_dataset(handle, "index");
auto ihandle = ritsuko::hdf5::open_dataset(handle, "index");
if (ritsuko::hdf5::exceeds_integer_limit(ihandle, 32, true)) {
throw std::runtime_error("external index at 'index' cannot be represented by a 32-bit signed integer");
}
Expand Down Expand Up @@ -475,7 +455,7 @@ template<class Provisioner, class Externals>
ParsedList parse(const H5::Group& handle, Externals ext, Options options = Options()) {
Version version;
if (handle.attrExists("uzuki_version")) {
auto ver_str = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_version");
auto ver_str = check_scalar_string_attribute(handle, "uzuki_version");
auto vraw = ritsuko::parse_version_string(ver_str.c_str(), ver_str.size(), /* skip_patch = */ true);
version.major = vraw.major;
version.minor = vraw.minor;
Expand Down Expand Up @@ -530,7 +510,7 @@ ParsedList parse(const H5::Group& handle, Options options = Options()) {
template<class Provisioner, class Externals>
ParsedList parse(const std::string& file, const std::string& name, Externals ext, Options options = Options()) {
H5::H5File handle(file, H5F_ACC_RDONLY);
return parse<Provisioner>(handle.openGroup(name), std::move(ext), std::move(options));
return parse<Provisioner>(ritsuko::hdf5::open_group(handle, name.c_str()), std::move(ext), std::move(options));
}

/**
Expand All @@ -551,7 +531,7 @@ ParsedList parse(const std::string& file, const std::string& name, Externals ext
template<class Provisioner>
ParsedList parse(const std::string& file, const std::string& name, Options options = Options()) {
H5::H5File handle(file, H5F_ACC_RDONLY);
return parse<Provisioner>(handle.openGroup(name), uzuki2::DummyExternals(0), std::move(options));
return parse<Provisioner>(ritsuko::hdf5::open_group(handle, name.c_str()), uzuki2::DummyExternals(0), std::move(options));
}

/**
Expand Down
18 changes: 18 additions & 0 deletions tests/src/date.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,24 @@ TEST(Hdf5DateTest, CheckError) {
}
expect_hdf5_error(path, "foo", "dates should follow");

{
H5::H5File handle(path, H5F_ACC_TRUNC);
auto vhandle = vector_opener(handle, "foo", "string");
add_version(vhandle, "1.1");
write_string(vhandle, "format", "foobar");
create_dataset(vhandle, "data", { "harry", "ron", "hermoine" });
}
expect_hdf5_error(path, "foo", "unsupported format");

{
H5::H5File handle(path, H5F_ACC_TRUNC);
auto vhandle = vector_opener(handle, "foo", "string");
add_version(vhandle, "1.1");
vhandle.createDataSet("format", H5::PredType::NATIVE_INT, H5S_SCALAR);
create_dataset(vhandle, "data", { "harry", "ron", "hermoine" });
}
expect_hdf5_error(path, "foo", "string datatype");

{
H5::H5File handle(path, H5F_ACC_TRUNC);
auto vhandle = vector_opener(handle, "foo", "date");
Expand Down
2 changes: 1 addition & 1 deletion tests/src/factor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ TEST(Hdf5FactorTest, CheckError) {
create_dataset(vhandle, "levels", { "Malcolm", "Tony", "John" });
write_scalar(vhandle, "ordered", 1.2, H5::PredType::NATIVE_DOUBLE);
}
expect_hdf5_error(path, "blub", "wrong datatype");
expect_hdf5_error(path, "blub", "cannot be represented");

{
H5::H5File handle(path, H5F_ACC_TRUNC);
Expand Down
Loading

0 comments on commit 4ec0a2f

Please sign in to comment.