From ad65ef8a71a909482f4621f7bfa2e0b7434367d4 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 17 Nov 2023 23:43:41 -0800 Subject: [PATCH] Throw an error if the top-level object is not a list. This makes sense as uzuki2 is, after all, designed to load lists, not assorted vectors or factors. Nonetheless, it can be disabled with a new 'strict_list' option if the caller doesn't mind dealing with non-list outputs. This option requires a new class for the HDF5 parsers. While we're making such a class, we took the opportunity to allow users to modify the HDF5 buffer size, rather than hard-coding it into the various functions. --- include/uzuki2/parse_hdf5.hpp | 94 ++++++++++++++++++++++------------- include/uzuki2/parse_json.hpp | 10 ++++ tests/src/external.cpp | 14 ++++-- tests/src/list.cpp | 14 +++--- tests/src/misc.cpp | 14 ++++++ tests/src/utils.h | 15 +++++- 6 files changed, 115 insertions(+), 46 deletions(-) diff --git a/include/uzuki2/parse_hdf5.hpp b/include/uzuki2/parse_hdf5.hpp index 6f0e5ff..688f6ef 100644 --- a/include/uzuki2/parse_hdf5.hpp +++ b/include/uzuki2/parse_hdf5.hpp @@ -52,7 +52,7 @@ inline H5::DataSet get_scalar_dataset(const H5::Group& handle, const std::string } template -void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, const Version& version) try { +void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, const Version& version, hsize_t buffer_size) try { if (ritsuko::hdf5::exceeds_integer_limit(handle, 32, true)) { throw std::runtime_error("dataset cannot be represented by 32-bit signed integers"); } @@ -71,7 +71,7 @@ void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, co } hsize_t full_length = ptr->size(); - auto block_size = ritsuko::hdf5::pick_1d_block_size(handle.getCreatePlist(), full_length, /* buffer_size = */ 10000); + auto block_size = ritsuko::hdf5::pick_1d_block_size(handle.getCreatePlist(), full_length, buffer_size); std::vector buffer(block_size); ritsuko::hdf5::iterate_1d_blocks( full_length, @@ -94,7 +94,7 @@ void parse_integer_like(const H5::DataSet& handle, Host* ptr, Function check, co } template -void parse_string_like(const H5::DataSet& handle, Host* ptr, Function check) try { +void parse_string_like(const H5::DataSet& handle, Host* ptr, Function check, hsize_t buffer_size) try { auto dtype = handle.getDataType(); if (dtype.getClass() != H5T_STRING) { throw std::runtime_error("expected a string dataset"); @@ -111,7 +111,7 @@ void parse_string_like(const H5::DataSet& handle, Host* ptr, Function check) try ritsuko::hdf5::load_1d_string_dataset( handle, ptr->size(), - /* buffer_size = */ 10000, + buffer_size, [&](size_t i, const char* str, size_t len) -> void { std::string x(str, str + len); if (has_missing && x == missing_val) { @@ -127,7 +127,7 @@ void parse_string_like(const H5::DataSet& handle, Host* ptr, Function check) try } template -void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const Version& version) try { +void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const Version& version, hsize_t buffer_size) try { if (version.lt(1, 3)) { if (handle.getTypeClass() != H5T_FLOAT) { throw std::runtime_error("expected a floating-point dataset"); @@ -166,7 +166,7 @@ void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const V }; hsize_t full_length = ptr->size(); - auto block_size = ritsuko::hdf5::pick_1d_block_size(handle.getCreatePlist(), full_length, /* buffer_size = */ 10000); + auto block_size = ritsuko::hdf5::pick_1d_block_size(handle.getCreatePlist(), full_length, buffer_size); std::vector buffer(block_size); ritsuko::hdf5::iterate_1d_blocks( full_length, @@ -189,7 +189,7 @@ void parse_numbers(const H5::DataSet& handle, Host* ptr, Function check, const V } template -void extract_names(const H5::Group& handle, Host* ptr) try { +void extract_names(const H5::Group& handle, Host* ptr, hsize_t buffer_size) try { if (handle.childObjType("names") != H5O_TYPE_DATASET) { throw std::runtime_error("expected a dataset"); } @@ -208,8 +208,8 @@ void extract_names(const H5::Group& handle, Host* ptr) try { ritsuko::hdf5::load_1d_string_dataset( nhandle, - nlen, - /* buffer_size = */ 10000, + nlen, + buffer_size, [&](size_t i, const char* val, size_t len) -> void { ptr->set_name(i, std::string(val, val + len)); } @@ -219,7 +219,7 @@ void extract_names(const H5::Group& handle, Host* ptr) try { } template -std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const Version& version) try { +std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const Version& version, hsize_t buffer_size) try { // Deciding what type we're dealing with. auto object_type = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_object"); std::shared_ptr output; @@ -241,11 +241,11 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const throw std::runtime_error("expected a group at 'data/" + istr + "'"); } auto lhandle = dhandle.openGroup(istr); - lptr->set(i, parse_inner(lhandle, ext, version)); + lptr->set(i, parse_inner(lhandle, ext, version, buffer_size)); } if (named) { - extract_names(handle, lptr); + extract_names(handle, lptr, buffer_size); } } else if (object_type == "vector") { @@ -263,7 +263,7 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (vector_type == "integer") { auto iptr = Provisioner::new_Integer(len, named, is_scalar); output.reset(iptr); - parse_integer_like(dhandle, iptr, [](int32_t) -> void {}, version); + parse_integer_like(dhandle, iptr, [](int32_t) -> void {}, version, buffer_size); } else if (vector_type == "boolean") { auto bptr = Provisioner::new_Boolean(len, named, is_scalar); @@ -272,7 +272,7 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (x != 0 && x != 1) { throw std::runtime_error("boolean values should be 0 or 1"); } - }, version); + }, version, buffer_size); } else if (vector_type == "factor" || (version.equals(1, 0) && vector_type == "ordered")) { auto levhandle = ritsuko::hdf5::get_dataset(handle, "levels"); @@ -298,13 +298,13 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (x < 0 || x >= levlen) { throw std::runtime_error("factor codes should be non-negative and less than the number of levels"); } - }, version); + }, version, buffer_size); std::unordered_set present; ritsuko::hdf5::load_1d_string_dataset( levhandle, levlen, - /* buffer_size = */ 10000, + buffer_size, [&](size_t i, const char* val, size_t len) -> void { std::string x(val, val + len); if (present.find(x) != present.end()) { @@ -328,7 +328,7 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const ritsuko::hdf5::load_1d_string_dataset( fhandle, 1, - /* buffer_size = */ 10000, + buffer_size, [&](size_t, const char* val, size_t len) -> void { std::string x(val, val + len); if (x == "date") { @@ -345,27 +345,27 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const auto sptr = Provisioner::new_String(len, named, is_scalar, format); output.reset(sptr); if (format == StringVector::NONE) { - parse_string_like(dhandle, sptr, [](const std::string&) -> void {}); + parse_string_like(dhandle, sptr, [](const std::string&) -> void {}, buffer_size); } else if (format == StringVector::DATE) { parse_string_like(dhandle, sptr, [&](const std::string& x) -> void { if (!ritsuko::is_date(x.c_str(), x.size())) { throw std::runtime_error("dates should follow YYYY-MM-DD formatting"); } - }); + }, buffer_size); } else if (format == StringVector::DATETIME) { parse_string_like(dhandle, sptr, [&](const std::string& x) -> void { if (!ritsuko::is_rfc3339(x.c_str(), x.size())) { throw std::runtime_error("date-times should follow the Internet Date/Time format"); } - }); + }, buffer_size); } } else if (vector_type == "number") { auto dptr = Provisioner::new_Number(len, named, is_scalar); output.reset(dptr); - parse_numbers(dhandle, dptr, [](double) -> void {}, version); + parse_numbers(dhandle, dptr, [](double) -> void {}, version, buffer_size); } else { throw std::runtime_error("unknown vector type '" + vector_type + "'"); @@ -373,7 +373,7 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const if (named) { auto vptr = static_cast(output.get()); - extract_names(handle, vptr); + extract_names(handle, vptr, buffer_size); } } else if (object_type == "nothing") { @@ -411,12 +411,28 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const * @endcond */ +/** + * @brief Options for HDF5 file parsing. + */ +struct Options { + /** + * Buffer size, in terms of the number of elements, to use for reading data from HDF5 datasets. + */ + hsize_t buffer_size = 10000; + + /** + * Whether to throw an error if the top-level R object is not an R list. + */ + bool strict_list = true; +}; + /** * @tparam Provisioner A class namespace defining static methods for creating new `Base` objects. * @tparam Externals Class describing how to resolve external references for type `EXTERNAL`. * * @param handle Handle for a HDF5 group corresponding to the list. * @param ext Instance of an external reference resolver class. + * @param options Optional parameters. * * @return A `ParsedList` containing a pointer to the root `Base` object. * Depending on `Provisioner`, this may contain references to all nested objects. @@ -456,7 +472,7 @@ std::shared_ptr parse_inner(const H5::Group& handle, Externals& ext, const * - `size_t size()`, which returns the number of available external references. */ template -ParsedList parse(const H5::Group& handle, Externals ext) { +ParsedList parse(const H5::Group& handle, Externals ext, Options options = Options()) { Version version; if (handle.attrExists("uzuki_version")) { auto ver_str = ritsuko::hdf5::load_scalar_string_attribute(handle, "uzuki_version"); @@ -466,8 +482,13 @@ ParsedList parse(const H5::Group& handle, Externals ext) { } ExternalTracker etrack(std::move(ext)); - auto ptr = parse_inner(handle, etrack, version); + auto ptr = parse_inner(handle, etrack, version, options.buffer_size); + + if (options.strict_list && ptr->type() != LIST) { + throw std::runtime_error("top-level object should represent an R list"); + } etrack.validate(); + return ParsedList(std::move(ptr), std::move(version)); } @@ -478,6 +499,7 @@ ParsedList parse(const H5::Group& handle, Externals ext) { * @tparam Provisioner A class namespace defining static methods for creating new `Base` objects. * * @param handle Handle for a HDF5 group corresponding to the list. + * @param options Optional parameters. * * @return A `ParsedList` containing a pointer to the root `Base` object. * Depending on `Provisioner`, this may contain references to all nested objects. @@ -485,8 +507,8 @@ ParsedList parse(const H5::Group& handle, Externals ext) { * Any invalid representations in `contents` will cause an error to be thrown. */ template -ParsedList parse(const H5::Group& handle) { - return parse(handle, uzuki2::DummyExternals(0)); +ParsedList parse(const H5::Group& handle, Options options = Options()) { + return parse(handle, uzuki2::DummyExternals(0), std::move(options)); } /** @@ -498,6 +520,7 @@ ParsedList parse(const H5::Group& handle) { * @param file Path to a HDF5 file. * @param name Name of the HDF5 group containing the list in `file`. * @param ext Instance of an external reference resolver class. + * @param options Optional parameters. * * @return A `ParsedList` containing a pointer to the root `Base` object. * Depending on `Provisioner`, this may contain references to all nested objects. @@ -505,9 +528,9 @@ ParsedList parse(const H5::Group& handle) { * Any invalid representations in `contents` will cause an error to be thrown. */ template -ParsedList parse(const std::string& file, const std::string& name, Externals ext) { +ParsedList parse(const std::string& file, const std::string& name, Externals ext, Options options = Options()) { H5::H5File handle(file, H5F_ACC_RDONLY); - return parse(handle.openGroup(name), std::move(ext)); + return parse(handle.openGroup(name), std::move(ext), std::move(options)); } /** @@ -518,6 +541,7 @@ ParsedList parse(const std::string& file, const std::string& name, Externals ext * * @param file Path to a HDF5 file. * @param name Name of the HDF5 group containing the list in `file`. + * @param options Optional parameters. * * @return A `ParsedList` containing a pointer to the root `Base` object. * Depending on `Provisioner`, this may contain references to all nested objects. @@ -525,9 +549,9 @@ ParsedList parse(const std::string& file, const std::string& name, Externals ext * Any invalid representations in `contents` will cause an error to be thrown. */ template -ParsedList parse(const std::string& file, const std::string& name) { +ParsedList parse(const std::string& file, const std::string& name, Options options = Options()) { H5::H5File handle(file, H5F_ACC_RDONLY); - return parse(handle.openGroup(name), uzuki2::DummyExternals(0)); + return parse(handle.openGroup(name), uzuki2::DummyExternals(0), std::move(options)); } /** @@ -538,10 +562,11 @@ ParsedList parse(const std::string& file, const std::string& name) { * @param name Name of the HDF5 group corresponding to `handle`. * Only used for error messages. * @param num_external Expected number of external references. + * @param options Optional parameters. */ -inline void validate(const H5::Group& handle, int num_external = 0) { +inline void validate(const H5::Group& handle, int num_external = 0, Options options = Options()) { DummyExternals ext(num_external); - parse(handle, ext); + parse(handle, ext, std::move(options)); return; } @@ -552,10 +577,11 @@ inline void validate(const H5::Group& handle, int num_external = 0) { * @param file Path to a HDF5 file. * @param name Name of the HDF5 group containing the list in `file`. * @param num_external Expected number of external references. + * @param options Optional parameters. */ -inline void validate(const std::string& file, const std::string& name, int num_external = 0) { +inline void validate(const std::string& file, const std::string& name, int num_external = 0, Options options = Options()) { DummyExternals ext(num_external); - parse(file, name, ext); + parse(file, name, ext, std::move(options)); return; } diff --git a/include/uzuki2/parse_json.hpp b/include/uzuki2/parse_json.hpp index 99303ca..b7e2fbb 100644 --- a/include/uzuki2/parse_json.hpp +++ b/include/uzuki2/parse_json.hpp @@ -395,6 +395,11 @@ struct Options { * If true, an extra thread is used to avoid blocking I/O operations. */ bool parallel = false; + + /** + * Whether to throw an error if the top-level R object is not an R list. + */ + bool strict_list = true; }; /** @@ -443,7 +448,12 @@ ParsedList parse(byteme::Reader& reader, Externals ext, Options options = Option ExternalTracker etrack(std::move(ext)); auto output = parse_object(contents.get(), etrack, "", version); + + if (options.strict_list && output->type() != LIST) { + throw std::runtime_error("top-level object should represent an R list"); + } etrack.validate(); + return ParsedList(std::move(output), std::move(version)); } diff --git a/tests/src/external.cpp b/tests/src/external.cpp index 87b74e1..b3a30c1 100644 --- a/tests/src/external.cpp +++ b/tests/src/external.cpp @@ -8,6 +8,8 @@ TEST(Hdf5ExternalTest, SimpleLoading) { auto path = "TEST-external.h5"; + uzuki2::hdf5::Options opt; + opt.strict_list = false; // Simple stuff works correctly. { @@ -17,7 +19,7 @@ TEST(Hdf5ExternalTest, SimpleLoading) { } { DefaultExternals ext(1); - auto parsed = uzuki2::hdf5::parse(path, "foo", ext); + auto parsed = uzuki2::hdf5::parse(path, "foo", ext, opt); EXPECT_EQ(parsed->type(), uzuki2::EXTERNAL); auto stuff = static_cast(parsed.get()); @@ -36,7 +38,7 @@ TEST(Hdf5ExternalTest, SimpleLoading) { } { DefaultExternals ext(2); - auto parsed = uzuki2::hdf5::parse(path, "foo", ext); + auto parsed = uzuki2::hdf5::parse(path, "foo", ext, opt); EXPECT_EQ(parsed->type(), uzuki2::LIST); auto list = static_cast(parsed.get()); @@ -50,9 +52,11 @@ TEST(Hdf5ExternalTest, SimpleLoading) { void expect_hdf5_external_error(std::string path, std::string name, std::string msg, int num_expected) { H5::H5File file(path, H5F_ACC_RDONLY); + uzuki2::hdf5::Options opt; + opt.strict_list = false; EXPECT_ANY_THROW({ try { - uzuki2::hdf5::validate(file.openGroup(name), num_expected); + uzuki2::hdf5::validate(file.openGroup(name), num_expected, std::move(opt)); } catch (std::exception& e) { EXPECT_THAT(e.what(), ::testing::HasSubstr(msg)); throw; @@ -105,7 +109,9 @@ TEST(Hdf5ExternalTest, CheckErrors) { auto load_json_with_externals(std::string x, int num_externals) { DefaultExternals ext(num_externals); - return uzuki2::json::parse_buffer(reinterpret_cast(x.c_str()), x.size(), ext); + uzuki2::json::Options opt; + opt.strict_list = false; + return uzuki2::json::parse_buffer(reinterpret_cast(x.c_str()), x.size(), ext, std::move(opt)); } TEST(JsonExternalTest, SimpleLoading) { diff --git a/tests/src/list.cpp b/tests/src/list.cpp index b13e5bd..3e11630 100644 --- a/tests/src/list.cpp +++ b/tests/src/list.cpp @@ -19,7 +19,7 @@ TEST(Hdf5ListTest, SimpleLoading) { create_dataset(vhandle, "data", { 1, 2, 3, 4, 5 }, H5::PredType::NATIVE_INT); } { - auto parsed = load_hdf5(path, "foo"); + auto parsed = load_hdf5_strict(path, "foo"); EXPECT_EQ(parsed->type(), uzuki2::LIST); auto stuff = static_cast(parsed.get()); @@ -41,7 +41,7 @@ TEST(Hdf5ListTest, SimpleLoading) { create_dataset(ghandle, "names", { "bruce", "alfred" }); } { - auto parsed = load_hdf5(path, "foo"); + auto parsed = load_hdf5_strict(path, "foo"); EXPECT_EQ(parsed->type(), uzuki2::LIST); auto stuff = static_cast(parsed.get()); @@ -66,7 +66,7 @@ TEST(Hdf5ListTest, NestedLoading) { } { - auto parsed = load_hdf5(path, "foo"); + auto parsed = load_hdf5_strict(path, "foo"); EXPECT_EQ(parsed->type(), uzuki2::LIST); auto stuff = static_cast(parsed.get()); @@ -111,7 +111,7 @@ TEST(Hdf5ListTest, CheckError) { TEST(JsonListTest, SimpleLoading) { // Simple stuff works correctly. { - auto parsed = load_json("{ \"type\":\"list\", \"values\": [ { \"type\": \"nothing\" }, { \"type\": \"integer\", \"values\": [ 1, 2, 3 ] } ] }"); + auto parsed = load_json_strict("{ \"type\":\"list\", \"values\": [ { \"type\": \"nothing\" }, { \"type\": \"integer\", \"values\": [ 1, 2, 3 ] } ] }"); EXPECT_EQ(parsed->type(), uzuki2::LIST); auto stuff = static_cast(parsed.get()); @@ -128,7 +128,7 @@ TEST(JsonListTest, SimpleLoading) { // Works with names. { - auto parsed = load_json("{ \"type\":\"list\", \"values\": [ { \"type\": \"nothing\" }, { \"type\": \"integer\", \"values\": [ 1, 2, 3 ] } ], \"names\": [ \"X\", \"Y\" ] }"); + auto parsed = load_json_strict("{ \"type\":\"list\", \"values\": [ { \"type\": \"nothing\" }, { \"type\": \"integer\", \"values\": [ 1, 2, 3 ] } ], \"names\": [ \"X\", \"Y\" ] }"); EXPECT_EQ(parsed->type(), uzuki2::LIST); auto stuff = static_cast(parsed.get()); @@ -139,7 +139,7 @@ TEST(JsonListTest, SimpleLoading) { // Works if empty. { - auto parsed = load_json("{ \"type\":\"list\", \"values\": [] }"); + auto parsed = load_json_strict("{ \"type\":\"list\", \"values\": [] }"); EXPECT_EQ(parsed->type(), uzuki2::LIST); auto stuff = static_cast(parsed.get()); EXPECT_EQ(stuff->size(), 0); @@ -147,7 +147,7 @@ TEST(JsonListTest, SimpleLoading) { } TEST(JsonListTest, NestedLoading) { - auto parsed = load_json("{ \"type\":\"list\", \"values\": [ { \"type\": \"nothing\" }, { \"type\": \"list\", \"values\": [ { \"type\": \"nothing\" } ] } ] }"); + auto parsed = load_json_strict("{ \"type\":\"list\", \"values\": [ { \"type\": \"nothing\" }, { \"type\": \"list\", \"values\": [ { \"type\": \"nothing\" } ] } ] }"); EXPECT_EQ(parsed->type(), uzuki2::LIST); auto stuff = static_cast(parsed.get()); diff --git a/tests/src/misc.cpp b/tests/src/misc.cpp index ebdf5d1..2749374 100644 --- a/tests/src/misc.cpp +++ b/tests/src/misc.cpp @@ -41,6 +41,20 @@ TEST(Hdf5AttributeTest, CheckError) { expect_hdf5_error(path, "whee", "unknown vector type"); } +TEST(TopLevelList, CheckErrors) { + auto path = "TEST-date.h5"; + + { + H5::H5File handle(path, H5F_ACC_TRUNC); + std::map attrs; + attrs["uzuki_object"] = "nothing"; + super_group_opener(handle, "whee", attrs); + } + expect_hdf5_error(path, "whee", "top-level object should represent an R list"); + + expect_json_error("{ \"type\":\"nothing\" }", "top-level object should represent an R list"); +} + class JsonFileTest : public ::testing::TestWithParam > {}; TEST_P(JsonFileTest, Chunking) { diff --git a/tests/src/utils.h b/tests/src/utils.h index 2be851d..eccb9df 100644 --- a/tests/src/utils.h +++ b/tests/src/utils.h @@ -158,13 +158,26 @@ inline H5::DataSet create_dataset(const H5::Group& parent, const std::string& na } inline auto load_hdf5(std::string name, std::string group) { + uzuki2::hdf5::Options opt; + opt.strict_list = false; + return uzuki2::hdf5::parse(name, group, std::move(opt)); +} + +inline auto load_hdf5_strict(std::string name, std::string group) { return uzuki2::hdf5::parse(name, group); } inline auto load_json(std::string x, bool parallel = false) { uzuki2::json::Options opt; opt.parallel = parallel; - return uzuki2::json::parse_buffer(reinterpret_cast(x.c_str()), x.size(), opt); + opt.strict_list = false; + return uzuki2::json::parse_buffer(reinterpret_cast(x.c_str()), x.size(), std::move(opt)); +} + +inline auto load_json_strict(std::string x, bool parallel = false) { + uzuki2::json::Options opt; + opt.parallel = parallel; + return uzuki2::json::parse_buffer(reinterpret_cast(x.c_str()), x.size(), std::move(opt)); } inline void expect_hdf5_error(std::string file, std::string name, std::string msg) {