diff --git a/src/metadata.cpp b/src/metadata.cpp new file mode 100644 index 00000000..216c7d7c --- /dev/null +++ b/src/metadata.cpp @@ -0,0 +1,246 @@ +/* + * Copyright 2023 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include "metadata.h" + +#include +#include +#include + +#include +#include + + +namespace zim +{ + +namespace +{ + +const bool MANDATORY = true; +const bool OPTIONAL = false; + +const std::string LANGS_REGEXP = "\\w{3}(,\\w{3})*"; +const std::string DATE_REGEXP = R"(\d\d\d\d-\d\d-\d\d)"; + +// PNG regexp has to be defined in such a tricky way because it includes +// a NUL character +const char PNG_REGEXP_DATA[] = "^\x89\x50\x4e\x47\x0d\x0a\x1a\x0a(.|\\s|\0)+"; +const std::string PNG_REGEXP(PNG_REGEXP_DATA, sizeof(PNG_REGEXP_DATA)-1); + +bool matchRegex(const std::string& regexStr, const std::string& text) +{ + const std::regex regex(regexStr); + return std::regex_match(text.begin(), text.end(), regex); +} + +size_t getTextLength(const std::string& utf8EncodedString) +{ + return icu::UnicodeString::fromUTF8(utf8EncodedString).length(); +} + +class MetadataComplexCheckBase +{ +public: + const std::string description; + const MetadataComplexCheckBase* const prev; + +public: // functions + explicit MetadataComplexCheckBase(const std::string& desc); + + 
MetadataComplexCheckBase(const MetadataComplexCheckBase&) = delete; + MetadataComplexCheckBase(MetadataComplexCheckBase&&) = delete; + void operator=(const MetadataComplexCheckBase&) = delete; + void operator=(MetadataComplexCheckBase&&) = delete; + + virtual ~MetadataComplexCheckBase(); + + virtual bool checkMetadata(const Metadata& m) const = 0; + + static const MetadataComplexCheckBase* getLastCheck() { return last; } + +private: // functions + static const MetadataComplexCheckBase* last; +}; + +const MetadataComplexCheckBase* MetadataComplexCheckBase::last = nullptr; + +MetadataComplexCheckBase::MetadataComplexCheckBase(const std::string& desc) + : description(desc) + , prev(last) +{ + last = this; +} + +MetadataComplexCheckBase::~MetadataComplexCheckBase() +{ + // Ideally, we should de-register this object from the list of live objects. + // However, in the current implementation MetadataComplexCheckBase objects + // are only constructed in static storage and the list of active objects + // isn't supposed to be accessed after any MetadataComplexCheckBase object + // has been destroyed as part of program termination clean-up actions. +} + +#define ADD_METADATA_COMPLEX_CHECK(DESC, CLSNAME) \ +class CLSNAME : public MetadataComplexCheckBase \ +{ \ +public: \ + CLSNAME() : MetadataComplexCheckBase(DESC) {} \ + bool checkMetadata(const Metadata& data) const override; \ +}; \ + \ +const CLSNAME CONCAT(obj, CLSNAME); \ + \ +bool CLSNAME::checkMetadata(const Metadata& data) const \ +/* should be followed by the check body */ + + + +#define CONCAT(X, Y) X##Y +#define GENCLSNAME(UUID) CONCAT(MetadataComplexCheck, UUID) + +#define METADATA_ASSERT(DESC) ADD_METADATA_COMPLEX_CHECK(DESC, GENCLSNAME(__LINE__)) + + +#include "metadata_constraints.cpp" + +// This function is intended for pretty printing of regexps with non-printable +// characters. 
+// In a general purpose/rigorous version we should escape the escape symbol +// (backslash) too, but that doesn't play well with the purpose stated above. +std::string escapeNonPrintableChars(const std::string& s) +{ + std::ostringstream os; + os << std::hex; + for (const char c : s) { + if (std::isprint(static_cast<unsigned char>(c))) { // std::isprint() is undefined for negative char values (e.g. '\x89') + os << c; + } else { + const unsigned int charVal = static_cast<unsigned char>(c); // two hex digits per byte, no sign extension + os << "\\x" << std::setw(2) << std::setfill('0') << charVal; + } + } + return os.str(); +} + +Metadata::Errors concat(Metadata::Errors e1, const Metadata::Errors& e2) +{ + e1.insert(e1.end(), e2.begin(), e2.end()); + return e1; +} + +} // unnamed namespace + +const Metadata::ReservedMetadataTable& Metadata::reservedMetadataInfo = reservedMetadataInfoTable; + +const Metadata::ReservedMetadataRecord& +Metadata::getReservedMetadataRecord(const std::string& name) +{ + for ( const auto& x : reservedMetadataInfo ) { + if ( x.name == name ) + return x; + } + + throw std::out_of_range(name + " is not a reserved metadata name"); +} + +bool Metadata::has(const std::string& name) const +{ + return data.find(name) != data.end(); +} + +const std::string& Metadata::operator[](const std::string& name) const +{ + return data.at(name); +} + +void Metadata::set(const std::string& name, const std::string& value) +{ + data[name] = value; +} + +bool Metadata::valid() const +{ + return check().empty(); +} + +Metadata::Errors Metadata::checkMandatoryMetadata() const +{ + Errors errors; + for ( const auto& rmr : reservedMetadataInfo ) { + if ( rmr.isMandatory && data.find(rmr.name) == data.end() ) { + errors.push_back("Missing mandatory metadata: " + rmr.name ); + } + } + + return errors; +} + +Metadata::Errors Metadata::checkSimpleConstraints() const +{ + Errors errors; + for ( const auto& nv : data ) { + const auto& name = nv.first; + const auto& value = nv.second; + try { + const auto& rmr = getReservedMetadataRecord(name); + if ( rmr.minLength != 0 && getTextLength(value) < rmr.minLength ) { + 
std::ostringstream oss; + oss << name << " must contain at least " << rmr.minLength << " characters"; + errors.push_back(oss.str()); + } + if ( rmr.maxLength != 0 && getTextLength(value) > rmr.maxLength ) { + std::ostringstream oss; + oss << name << " must contain at most " << rmr.maxLength << " characters"; + errors.push_back(oss.str()); + } + if ( !rmr.regex.empty() && !matchRegex(rmr.regex, value) ) { + const std::string regex = escapeNonPrintableChars(rmr.regex); + errors.push_back(name + " doesn't match regex: " + regex); + } + } catch ( const std::out_of_range& ) { + // ignore non-reserved metadata + } + } + return errors; +} + +Metadata::Errors Metadata::checkComplexConstraints() const +{ + Errors errors; + const MetadataComplexCheckBase* c = MetadataComplexCheckBase::getLastCheck(); + for ( ; c != nullptr ; c = c->prev ) { + if ( ! c->checkMetadata(*this) ) { + errors.push_back(c->description); + } + } + return errors; +} + +Metadata::Errors Metadata::check() const +{ + const Errors e1 = checkMandatoryMetadata(); + const Errors e2 = checkSimpleConstraints(); + if ( !e1.empty() || !e2.empty() ) + return concat(e1, e2); + + return checkComplexConstraints(); +} + +} // namespace zim diff --git a/src/metadata.h b/src/metadata.h new file mode 100644 index 00000000..f57655d8 --- /dev/null +++ b/src/metadata.h @@ -0,0 +1,77 @@ +/* + * Copyright 2023 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_METADATA_H +#define OPENZIM_METADATA_H + +#include +#include +#include + +namespace zim +{ + +class Metadata +{ + typedef std::map KeyValueMap; + +public: // types + struct ReservedMetadataRecord + { + const std::string name; + const bool isMandatory; + const size_t minLength; + const size_t maxLength; + const std::string regex; + }; + + typedef std::vector ReservedMetadataTable; + + typedef std::vector Errors; + + typedef KeyValueMap::const_iterator Iterator; + +public: // data + static const ReservedMetadataTable& reservedMetadataInfo; + +public: // functions + void set(const std::string& name, const std::string& value); + bool has(const std::string& name) const; + const std::string& operator[](const std::string& name) const; + + bool valid() const; + Errors check() const; + + static const ReservedMetadataRecord& getReservedMetadataRecord(const std::string& name); + + Iterator begin() const { return data.begin(); } + Iterator end() const { return data.end(); } + +private: // functions + Errors checkMandatoryMetadata() const; + Errors checkSimpleConstraints() const; + Errors checkComplexConstraints() const; + +private: // data + KeyValueMap data; +}; + +} // namespace zim + +#endif // OPENZIM_METADATA_H diff --git a/src/metadata_constraints.cpp b/src/metadata_constraints.cpp new file mode 100644 index 00000000..48c8e1c4 --- /dev/null +++ b/src/metadata_constraints.cpp @@ -0,0 +1,32 @@ +const Metadata::ReservedMetadataTable reservedMetadataInfoTable = { + // name isMandatory minLength maxLength regex + { "Name", MANDATORY, 1, 0, "" }, + { "Title", MANDATORY, 1, 30, "" }, + { "Language", MANDATORY, 3, 0, LANGS_REGEXP }, + { "Creator", MANDATORY, 1, 0, "" }, + { "Publisher", MANDATORY, 1, 0, "" }, + { "Date", MANDATORY, 
10, 10, DATE_REGEXP }, + { "Description", MANDATORY, 1, 80, "" }, + { "LongDescription", OPTIONAL, 0, 4000, "" }, + { "License", OPTIONAL, 0, 0, "" }, + { "Tags", OPTIONAL, 0, 0, "" }, + { "Relation", OPTIONAL, 0, 0, "" }, + { "Flavour", OPTIONAL, 0, 0, "" }, + { "Source", OPTIONAL, 0, 0, "" }, + { "Counter", OPTIONAL, 0, 0, "" }, + { "Scraper", OPTIONAL, 0, 0, "" }, + + { + "Illustration_48x48@1", + MANDATORY, + 0, // There are no constraints on the illustration metadata size + 0, // in order to avoid decoding it as UTF-8 encoded text + PNG_REGEXP + }, +}; + +METADATA_ASSERT("LongDescription shouldn't be shorter than Description") +{ + return !data.has("LongDescription") + || data["LongDescription"].size() >= data["Description"].size(); +} diff --git a/src/zimcheck/checks.cpp b/src/zimcheck/checks.cpp index ab48330c..d2abf2bf 100644 --- a/src/zimcheck/checks.cpp +++ b/src/zimcheck/checks.cpp @@ -2,6 +2,7 @@ #include "checks.h" #include "../tools.h" #include "../concurrent_cache.h" +#include "../metadata.h" #include #include @@ -49,7 +50,7 @@ std::unordered_map> errormapping = { { TestType::CHECKSUM, {LogTag::ERROR, "Invalid checksum"}}, { TestType::INTEGRITY, {LogTag::ERROR, "Invalid low-level structure"}}, { TestType::EMPTY, {LogTag::ERROR, "Empty articles"}}, - { TestType::METADATA, {LogTag::ERROR, "Missing metadata entries"}}, + { TestType::METADATA, {LogTag::ERROR, "Metadata errors"}}, { TestType::FAVICON, {LogTag::ERROR, "Favicon"}}, { TestType::MAIN_PAGE, {LogTag::ERROR, "Missing mainpage"}}, { TestType::REDUNDANT, {LogTag::WARNING, "Redundant data found"}}, @@ -73,7 +74,7 @@ std::unordered_map msgTable = { { MsgId::DANGLING_LINKS, { TestType::URL_INTERNAL, "The following links:\n{{#links}}- {{&value}}\n{{/links}}({{&normalized_link}}) were not found in article {{&path}}" } }, { MsgId::EXTERNAL_LINK, { TestType::URL_EXTERNAL, "{{&link}} is an external dependence in article {{&path}}" } }, { MsgId::REDUNDANT_ITEMS, { TestType::REDUNDANT, "{{&path1}} and 
{{&path2}}" } }, - { MsgId::MISSING_METADATA, { TestType::METADATA, "{{&metadata_type}}" } }, + { MsgId::METADATA, { TestType::METADATA, "{{&error}}" } }, { MsgId::REDIRECT_LOOP, { TestType::REDIRECT, "Redirect loop exists from entry {{&entry_path}}\n" } }, { MsgId::MISSING_FAVICON, { TestType::FAVICON, "Favicon is missing" } } }; @@ -266,21 +267,13 @@ void test_integrity(const std::string& filename, ErrorLogger& reporter) { void test_metadata(const zim::Archive& archive, ErrorLogger& reporter) { - reporter.infoMsg("[INFO] Searching for metadata entries..."); - static const char* const test_meta[] = { - "Title", - "Creator", - "Publisher", - "Date", - "Description", - "Language"}; - auto existing_metadata = archive.getMetadataKeys(); - auto begin = existing_metadata.begin(); - auto end = existing_metadata.end(); - for (auto &meta : test_meta) { - if (std::find(begin, end, meta) == end) { - reporter.addMsg(MsgId::MISSING_METADATA, {{"metadata_type", meta}}); - } + reporter.infoMsg("[INFO] Checking metadata..."); + zim::Metadata metadata; + for ( const auto& key : archive.getMetadataKeys() ) { + metadata.set(key, archive.getMetadata(key)); + } + for (const auto &error : metadata.check()) { + reporter.addMsg(MsgId::METADATA, {{"error", error}}); } } diff --git a/src/zimcheck/checks.h b/src/zimcheck/checks.h index 14ef33e9..f5942fed 100644 --- a/src/zimcheck/checks.h +++ b/src/zimcheck/checks.h @@ -52,7 +52,7 @@ enum class MsgId { CHECKSUM, MAIN_PAGE, - MISSING_METADATA, + METADATA, EMPTY_ENTRY, OUTOFBOUNDS_LINK, EMPTY_LINKS, diff --git a/src/zimcheck/meson.build b/src/zimcheck/meson.build index c08af030..08c6789e 100644 --- a/src/zimcheck/meson.build +++ b/src/zimcheck/meson.build @@ -21,6 +21,7 @@ executable('zimcheck', 'checks.cpp', 'json_tools.cpp', '../tools.cpp', + '../metadata.cpp', include_directories : inc, dependencies: [libzim_dep, thread_dep], install: true) diff --git a/src/zimwriterfs/meson.build b/src/zimwriterfs/meson.build index 6f746884..a4f44012 
100644 --- a/src/zimwriterfs/meson.build +++ b/src/zimwriterfs/meson.build @@ -3,6 +3,7 @@ sources = [ 'zimwriterfs.cpp', 'tools.cpp', '../tools.cpp', + '../metadata.cpp', 'zimcreatorfs.cpp' ] diff --git a/src/zimwriterfs/zimwriterfs.cpp b/src/zimwriterfs/zimwriterfs.cpp index f5ea6948..ed6cba6d 100644 --- a/src/zimwriterfs/zimwriterfs.cpp +++ b/src/zimwriterfs/zimwriterfs.cpp @@ -33,6 +33,7 @@ #include #include "zimcreatorfs.h" +#include "../metadata.h" #include "../tools.h" #include "../version.h" #include "tools.h" @@ -81,30 +82,68 @@ bool thereAreMissingArguments() || illustration.empty(); } -bool checkDescriptionLengths() { - if (description.empty()) { - std::cerr << "Description metadata should not be empty." << std::endl; - return false; +// Builds a zim::Metadata from the option variables (language, title, ...), the +// generation date and, when an illustration is configured, that file's content. +zim::Metadata makeMetadata() { + zim::Metadata metadata; + + metadata.set("Language", language); + metadata.set("Publisher", publisher); + metadata.set("Creator", creator); + metadata.set("Title", title); + metadata.set("Description", description); + // LongDescription is optional: an empty value must not be stored, otherwise + // the "LongDescription shouldn't be shorter than Description" check fails. + if ( !longDescription.empty() ) { + metadata.set("LongDescription", longDescription); + } + metadata.set("Name", name); + metadata.set("Source", source); + metadata.set("Flavour", flavour); + metadata.set("Scraper", scraper); + metadata.set("Tags", tags); + metadata.set("Date", generateDate()); + if ( !illustration.empty() ) { + const auto data = getFileContent(directoryPath + "/" + illustration); + metadata.set("Illustration_48x48@1", data); } - if (!longDescription.empty() && longDescription.length() < description.length()) { - std::cerr << "Long description should not be shorter than the short description." << std::endl; - return false; - } + return metadata; +} - if (description.length() > 80) { - std::cerr << "Description length exceeds the 80 character limit." << std::endl; - return false; - } - if (!longDescription.empty() && longDescription.length() > 4000) { - std::cerr << "Long description length exceeds the 4000 character limit." 
<< std::endl; - return false; +// Reports every metadata requirement violation on stderr. +// Returns true if the metadata passed all checks. +bool checkMetadata(const zim::Metadata& metadata) +{ + const auto errors = metadata.check(); + + if ( !errors.empty() ) { + std::cerr << "Metadata doesn't meet the following requirements:\n"; + for ( const auto& err : errors ) { + std::cerr << " " << err << std::endl; + } } - return true; + return errors.empty(); +} + +// Adds all entries of the metadata to the ZIM file being created. +void addMetadata(ZimCreatorFS& zimCreator, const zim::Metadata& metadata) +{ + for ( const auto& kv : metadata ) { + if ( kv.first == "Illustration_48x48@1" ) { + // Go through addIllustration(), as was done before zim::Metadata was + // introduced. NOTE(review): addMetadata() is presumed to store the + // PNG data with a text MIME type - confirm against libzim. + zimCreator.addIllustration(48, kv.second); + } else { + zimCreator.addMetadata(kv.first, kv.second); + } + } +} + + } // Global flags @@ -246,7 +285,7 @@ void parse_args(int argc, char** argv) do { c = getopt_long( - argc, argv, "hVvijxuw:I:t:d:c:l:p:r:e:n:m:J:UB", long_options, &option_index); + argc, argv, "hVvijxuw:I:t:d:c:l:p:r:e:n:m:J:UBL:", long_options, &option_index); if (c != -1) { switch (c) { @@ -328,11 +367,6 @@ void parse_args(int argc, char** argv) } } while (c != -1); - if ( !checkDescriptionLengths() ) { - exit(1); - } - - while (optind < argc) { if (directoryPath.empty()) { directoryPath = argv[optind++]; @@ -390,7 +424,7 @@ void parse_args(int argc, char** argv) } } -void create_zim() +void create_zim(const zim::Metadata& metadata) { ZimCreatorFS zimCreator(directoryPath); zimCreator.configVerbose(isVerbose()) @@ -422,26 +456,12 @@ void create_zim() zimCreator.startZimCreation(zimPath); - zimCreator.addMetadata("Language", language); - zimCreator.addMetadata("Publisher", publisher); - zimCreator.addMetadata("Creator", creator); - zimCreator.addMetadata("Title", title); - zimCreator.addMetadata("Description", description); - zimCreator.addMetadata("Name", name); - zimCreator.addMetadata("Source", source); - zimCreator.addMetadata("Flavour", flavour); - zimCreator.addMetadata("Scraper", scraper); - zimCreator.addMetadata("Tags", tags); - zimCreator.addMetadata("Date", generateDate()); + addMetadata(zimCreator, metadata); if ( !welcome.empty() ) { zimCreator.setMainPath(welcome); } - if ( !illustration.empty() ) { - zimCreator.addIllustration(48, 
getFileContent(directoryPath + "/" + illustration)); - } - /* Directory visitor */ zimCreator.visitDirectory(directoryPath); @@ -474,7 +494,14 @@ int main(int argc, char** argv) try { parse_args(argc, argv); - create_zim(); + + const zim::Metadata metadata = makeMetadata(); + + if ( !checkMetadata(metadata) ) { + exit(1); + } + + create_zim(metadata); } catch(std::exception &e) { std::cerr << "zimwriterfs: " << e.what() << std::endl; diff --git a/test/data/zimfiles/bad_checksum.zim b/test/data/zimfiles/bad_checksum.zim index 6f6314ae..d2391e77 100644 Binary files a/test/data/zimfiles/bad_checksum.zim and b/test/data/zimfiles/bad_checksum.zim differ diff --git a/test/data/zimfiles/create_test_zimfiles b/test/data/zimfiles/create_test_zimfiles index bb1c8152..a4ed30a1 100755 --- a/test/data/zimfiles/create_test_zimfiles +++ b/test/data/zimfiles/create_test_zimfiles @@ -33,7 +33,8 @@ make__good__zim() --no-uuid \ -w main.html \ -I favicon.png \ - -l en \ + -n good_zimfile \ + -l eng \ -t "Test ZIM file" \ -d "N/A" \ -c "N/A" \ @@ -87,6 +88,7 @@ make__poor__zim() --no-uuid \ -w "" \ -I "" \ + -n poor_zimfile \ -l en \ -t "" \ -d "" \ diff --git a/test/data/zimfiles/good.zim b/test/data/zimfiles/good.zim index cd9da5b9..99ea0989 100644 Binary files a/test/data/zimfiles/good.zim and b/test/data/zimfiles/good.zim differ diff --git a/test/data/zimfiles/poor.zim b/test/data/zimfiles/poor.zim index a617f620..bf3d8501 100644 Binary files a/test/data/zimfiles/poor.zim and b/test/data/zimfiles/poor.zim differ diff --git a/test/meson.build b/test/meson.build index 3669d109..07eeb99f 100644 --- a/test/meson.build +++ b/test/meson.build @@ -1,6 +1,7 @@ gtest_dep = dependency('gtest', main:true, fallback:['gtest', 'gtest_main_dep'], required:false) tests = [ + 'metadata-test', 'tools-test', 'zimwriterfs-zimcreatorfs', 'zimcheck-test' @@ -10,8 +11,9 @@ zimwriter_srcs = [ '../src/zimwriterfs/tools.cpp', '../src/zimwriterfs/zimcreatorfs.cpp', '../src/tools.cpp'] -tests_src_map = { 
'zimcheck-test' : ['../src/zimcheck/zimcheck.cpp', '../src/zimcheck/checks.cpp', '../src/zimcheck/json_tools.cpp', '../src/tools.cpp'], +tests_src_map = { 'zimcheck-test' : ['../src/zimcheck/zimcheck.cpp', '../src/zimcheck/checks.cpp', '../src/zimcheck/json_tools.cpp', '../src/tools.cpp', '../src/metadata.cpp'], 'tools-test' : zimwriter_srcs, + 'metadata-test' : ['../src/metadata.cpp'], 'zimwriterfs-zimcreatorfs' : zimwriter_srcs } if gtest_dep.found() and not meson.is_cross_build() diff --git a/test/metadata-test.cpp b/test/metadata-test.cpp new file mode 100644 index 00000000..fe5110d7 --- /dev/null +++ b/test/metadata-test.cpp @@ -0,0 +1,228 @@ +#include "../src/metadata.h" + +#include "gtest/gtest.h" + +std::string fakePNG() +{ + return "\x89PNG\r\n\x1a\n" + std::string(100, 'x'); +} + +TEST(Metadata, isDefaultConstructible) +{ + zim::Metadata m; + (void)m; // suppress compiler's warning about an unused variable +} + + +TEST(Metadata, detectsAbsenceOfMandatoryEntries) +{ + zim::Metadata m; + + ASSERT_FALSE(m.valid()); + ASSERT_EQ(m.check(), + zim::Metadata::Errors({ + "Missing mandatory metadata: Name", + "Missing mandatory metadata: Title", + "Missing mandatory metadata: Language", + "Missing mandatory metadata: Creator", + "Missing mandatory metadata: Publisher", + "Missing mandatory metadata: Date", + "Missing mandatory metadata: Description", + "Missing mandatory metadata: Illustration_48x48@1", + }) + ); + + m.set("Description", "Any nonsense is better than nothing"); + m.set("Date", "2020-20-20"); + m.set("Creator", "Demiurge"); + m.set("Name", "wikipedia_py_all"); + + ASSERT_FALSE(m.valid()); + ASSERT_EQ(m.check(), + zim::Metadata::Errors({ + "Missing mandatory metadata: Title", + "Missing mandatory metadata: Language", + "Missing mandatory metadata: Publisher", + "Missing mandatory metadata: Illustration_48x48@1", + }) + ); + + m.set("Title", "Chief Executive Officer"); + m.set("Publisher", "Zangak"); + m.set("Language", "py3"); + 
m.set("Illustration_48x48@1", fakePNG()); + + ASSERT_TRUE(m.valid()); + ASSERT_TRUE(m.check().empty()); +} + +zim::Metadata makeValidMetadata() +{ + zim::Metadata m; + + m.set("Description", "Any nonsense is better than nothing"); + m.set("Date", "2020-20-20"); + m.set("Creator", "Demiurge"); + m.set("Name", "wikipedia_py_all"); + m.set("Title", "Chief Executive Officer"); + m.set("Publisher", "Zangak"); + m.set("Language", "py3"); + m.set("Illustration_48x48@1", fakePNG()); + + return m; +} + +TEST(Metadata, nonReservedMetadataIsNotAProblem) +{ + zim::Metadata m = makeValidMetadata(); + m.set("NonReservedMetadata", ""); + ASSERT_TRUE(m.valid()); +} + +TEST(Metadata, minSizeConstraints) +{ + zim::Metadata m = makeValidMetadata(); + m.set("Title", ""); + ASSERT_FALSE(m.valid()); + ASSERT_EQ(m.check(), + zim::Metadata::Errors({ + "Title must contain at least 1 characters" + }) + ); + m.set("Title", "t"); + ASSERT_TRUE(m.valid()); +} + +TEST(Metadata, maxSizeConstraints) +{ + zim::Metadata m = makeValidMetadata(); + m.set("Title", std::string(31, 'a')); + ASSERT_FALSE(m.valid()); + ASSERT_EQ(m.check(), + zim::Metadata::Errors({ + "Title must contain at most 30 characters" + }) + ); + m.set("Title", std::string(30, 'a')); + ASSERT_TRUE(m.valid()); +} + +TEST(Metadata, regexpConstraints) +{ + zim::Metadata m = makeValidMetadata(); + m.set("Date", "YYYY-MM-DD"); + ASSERT_FALSE(m.valid()); + ASSERT_EQ(m.check(), + zim::Metadata::Errors({ + "Date doesn't match regex: \\d\\d\\d\\d-\\d\\d-\\d\\d" + }) + ); + m.set("Date", "1234-56-78"); // Yes, such a date is considered valid + // by the current simple regex + ASSERT_TRUE(m.valid()); + + m.set("Language", "fre,"); + ASSERT_FALSE(m.valid()); + ASSERT_EQ(m.check(), + zim::Metadata::Errors({ + "Language doesn't match regex: \\w{3}(,\\w{3})*" + }) + ); + + m.set("Language", "fre,nch"); + ASSERT_TRUE(m.valid()); + + m.set("Illustration_48x48@1", "zimdata/favicon.png"); + ASSERT_EQ(m.check(), + zim::Metadata::Errors({ + 
"Illustration_48x48@1 doesn't match regex: ^\\x89PNG\\x0d\\x0a\\x1a\\x0a(.|\\s|\\x00)+" + }) + ); +} + +TEST(Metadata, pngRegexp) +{ + const std::string PNG_HEADER = "\x89PNG\r\n\x1a\n"; + zim::Metadata m = makeValidMetadata(); + { + m.set("Illustration_48x48@1", PNG_HEADER + 'A'); + ASSERT_TRUE(m.valid()); + } + { + m.set("Illustration_48x48@1", PNG_HEADER + '\n'); + ASSERT_TRUE(m.valid()); + } + { + m.set("Illustration_48x48@1", PNG_HEADER + '\0'); + ASSERT_TRUE(m.valid()); + } +} + + +TEST(Metadata, complexConstraints) +{ + zim::Metadata m = makeValidMetadata(); + m.set("Description", "Short description"); + m.set("LongDescription", "Long description"); + ASSERT_FALSE(m.valid()); + ASSERT_EQ(m.check(), + zim::Metadata::Errors({ + "LongDescription shouldn't be shorter than Description" + }) + ); +} + +TEST(Metadata, mandatoryMetadataAndSimpleChecksAreRunUnconditionally) +{ + zim::Metadata m; + + m.set("Description", "Blablabla"); + m.set("Date", "2020-20-20"); + m.set("Creator", "Demiurge"); + m.set("Name", "wikipedia_js_maxi"); + m.set("Title", "A title that is too long to read for a five year old"); + m.set("Publisher", "Zangak"); + m.set("Language", "js"); + //m.set("Illustration_48x48@1", ""); + + ASSERT_FALSE(m.valid()); + ASSERT_EQ(m.check(), + zim::Metadata::Errors({ + "Missing mandatory metadata: Illustration_48x48@1", + "Language must contain at least 3 characters", + "Language doesn't match regex: \\w{3}(,\\w{3})*", + "Title must contain at most 30 characters" + }) + ); +} + +TEST(Metadata, complexChecksAreRunOnlyIfMandatoryMetadataRequirementsAreMet) +{ + zim::Metadata m; + + m.set("Description", "Blablabla"); + m.set("LongDescription", "Blabla"); + m.set("Date", "2020-20-20"); + m.set("Creator", "TED"); + m.set("Name", "TED_bodylanguage"); + //m.set("Title", ""); + m.set("Publisher", "Kiwix"); + m.set("Language", "bod,yla,ngu,age"); + m.set("Illustration_48x48@1", fakePNG()); + + ASSERT_FALSE(m.valid()); + ASSERT_EQ(m.check(), + 
zim::Metadata::Errors({ + "Missing mandatory metadata: Title", + }) + ); + + m.set("Title", "Blabluba"); + + ASSERT_FALSE(m.valid()); + ASSERT_EQ(m.check(), + zim::Metadata::Errors({ + "LongDescription shouldn't be shorter than Description" + }) + ); +} diff --git a/test/zimcheck-test.cpp b/test/zimcheck-test.cpp index 1005655c..c13f54b7 100644 --- a/test/zimcheck-test.cpp +++ b/test/zimcheck-test.cpp @@ -27,7 +27,7 @@ TEST(zimfilechecks, test_checksum) TEST(zimfilechecks, test_metadata) { - std::string fn = "data/zimfiles/wikibooks_be_all_nopic_2017-02.zim"; + std::string fn = "data/zimfiles/good.zim"; zim::Archive archive(fn); ErrorLogger logger; @@ -244,9 +244,9 @@ void test_zimcheck_single_option(std::vector optionAliases, CapturedStdout zimcheck_output; CapturedStderr zimcheck_stderr; const CmdLine cmdline{"zimcheck", opt, zimfile}; - ASSERT_EQ(expected_exit_code, zimcheck(cmdline)) << cmdline; - ASSERT_EQ(expected_stderr, std::string(zimcheck_stderr)) << cmdline; - ASSERT_EQ(expected_stdout, std::string(zimcheck_output)) << cmdline; + EXPECT_EQ(expected_exit_code, zimcheck(cmdline)) << cmdline; + EXPECT_EQ(expected_stderr, std::string(zimcheck_stderr)) << cmdline; + EXPECT_EQ(expected_stdout, std::string(zimcheck_output)) << cmdline; } } @@ -293,7 +293,7 @@ TEST(zimcheck, metadata_goodzimfile) const std::string expected_output( "[INFO] Checking zim file data/zimfiles/good.zim" "\n" "[INFO] Zimcheck version is " VERSION "\n" - "[INFO] Searching for metadata entries..." "\n" + "[INFO] Checking metadata..." "\n" "[INFO] Overall Test Status: Pass" "\n" "[INFO] Total time taken by zimcheck: <3 seconds." "\n" ); @@ -413,7 +413,7 @@ const std::string ALL_CHECKS_OUTPUT_ON_GOODZIMFILE( "[INFO] Zimcheck version is " VERSION "\n" "[INFO] Verifying ZIM-archive structure integrity..." "\n" "[INFO] Avoiding redundant checksum test (already performed by the integrity check)." "\n" - "[INFO] Searching for metadata entries..." "\n" + "[INFO] Checking metadata..." 
"\n" "[INFO] Searching for Favicon..." "\n" "[INFO] Searching for main page..." "\n" "[INFO] Verifying Articles' content..." "\n" @@ -527,10 +527,13 @@ TEST(zimcheck, metadata_poorzimfile) const std::string expected_stdout( "[INFO] Checking zim file data/zimfiles/poor.zim" "\n" "[INFO] Zimcheck version is " VERSION "\n" - "[INFO] Searching for metadata entries..." "\n" - "[ERROR] Missing metadata entries:" "\n" - " Title" "\n" - " Description" "\n" + "[INFO] Checking metadata..." "\n" + "[ERROR] Metadata errors:" "\n" + " Missing mandatory metadata: Title" "\n" + " Missing mandatory metadata: Description" "\n" + " Missing mandatory metadata: Illustration_48x48@1" "\n" + " Language must contain at least 3 characters" "\n" + " Language doesn't match regex: \\w{3}(,\\w{3})*" "\n" "[INFO] Overall Test Status: Fail" "\n" "[INFO] Total time taken by zimcheck: <3 seconds." "\n" ); @@ -709,7 +712,7 @@ const std::string ALL_CHECKS_OUTPUT_ON_POORZIMFILE( "[INFO] Zimcheck version is " VERSION "\n" "[INFO] Verifying ZIM-archive structure integrity..." "\n" "[INFO] Avoiding redundant checksum test (already performed by the integrity check)." "\n" - "[INFO] Searching for metadata entries..." "\n" + "[INFO] Checking metadata..." "\n" "[INFO] Searching for Favicon..." "\n" "[INFO] Searching for main page..." "\n" "[INFO] Verifying Articles' content..." "\n" @@ -718,9 +721,12 @@ const std::string ALL_CHECKS_OUTPUT_ON_POORZIMFILE( "[INFO] Checking for redirect loops..." 
"\n" "[ERROR] Empty articles:" "\n" " Entry empty.html is empty" "\n" - "[ERROR] Missing metadata entries:" "\n" - " Title" "\n" - " Description" "\n" + "[ERROR] Metadata errors:" "\n" + " Missing mandatory metadata: Title" "\n" + " Missing mandatory metadata: Description" "\n" + " Missing mandatory metadata: Illustration_48x48@1" "\n" + " Language must contain at least 3 characters" "\n" + " Language doesn't match regex: \\w{3}(,\\w{3})*" "\n" "[ERROR] Favicon:" "\n" " Favicon is missing" "\n" "[ERROR] Missing mainpage:" "\n" @@ -832,14 +838,32 @@ TEST(zimcheck, json_poorzimfile) " {" "\n" " \"check\" : \"metadata\"," "\n" " \"level\" : \"ERROR\"," "\n" - " \"message\" : \"Title\"," "\n" - " \"metadata_type\" : \"Title\"" "\n" + " \"message\" : \"Missing mandatory metadata: Title\"," "\n" + " \"error\" : \"Missing mandatory metadata: Title\"" "\n" + " }," "\n" + " {" "\n" + " \"check\" : \"metadata\"," "\n" + " \"level\" : \"ERROR\"," "\n" + " \"message\" : \"Missing mandatory metadata: Description\"," "\n" + " \"error\" : \"Missing mandatory metadata: Description\"" "\n" + " }," "\n" + " {" "\n" + " \"check\" : \"metadata\"," "\n" + " \"level\" : \"ERROR\"," "\n" + " \"message\" : \"Missing mandatory metadata: Illustration_48x48@1\"," "\n" + " \"error\" : \"Missing mandatory metadata: Illustration_48x48@1\"" "\n" + " }," "\n" + " {" "\n" + " \"check\" : \"metadata\"," "\n" + " \"level\" : \"ERROR\"," "\n" + " \"message\" : \"Language must contain at least 3 characters\"," "\n" + " \"error\" : \"Language must contain at least 3 characters\"" "\n" " }," "\n" " {" "\n" " \"check\" : \"metadata\"," "\n" " \"level\" : \"ERROR\"," "\n" - " \"message\" : \"Description\"," "\n" - " \"metadata_type\" : \"Description\"" "\n" + " \"message\" : \"Language doesn't match regex: \\\\w{3}(,\\\\w{3})*\"," "\n" + " \"error\" : \"Language doesn't match regex: \\\\w{3}(,\\\\w{3})*\"" "\n" " }," "\n" " {" "\n" " \"check\" : \"favicon\"," "\n"