From 832b039c17222655a57bca789841cbf3b0d85496 Mon Sep 17 00:00:00 2001 From: Marcin Wojdyr Date: Thu, 9 Jan 2025 13:28:40 +0100 Subject: [PATCH] Ddl: category keys can be implicit --- include/gemmi/ddl.hpp | 2 -- src/ddl.cpp | 38 +++++++++++++++++++---------------- tests/mmcif_pdbx_v50_frag.dic | 25 +++++++++++++++++++++++ tests/test_cif.py | 11 ++++++---- 4 files changed, 53 insertions(+), 23 deletions(-) diff --git a/include/gemmi/ddl.hpp b/include/gemmi/ddl.hpp index 36d974e0a..5cbfcbe60 100644 --- a/include/gemmi/ddl.hpp +++ b/include/gemmi/ddl.hpp @@ -61,8 +61,6 @@ struct GEMMI_DLL Ddl { std::vector parents_; // storage for DDL2 _item_linked.child_name -> _item_linked.parent_name std::map item_parents_; - // counter that allows to limit the number of errors - mutable int missing_category_key_errors = 0; cif::Block* find_rules(const std::string& name) const { auto iter = name_index_.find(to_lower(name)); diff --git a/src/ddl.cpp b/src/ddl.cpp index 4e0d98f8e..c678d6ee5 100644 --- a/src/ddl.cpp +++ b/src/ddl.cpp @@ -263,6 +263,8 @@ void Ddl::check_mandatory_items(const cif::Block& b) const { // make a list of items in each category in the block std::map> categories; auto add_category = [&](const std::string& tag) { + if (!find_rules(tag)) // ignore unknown tags + return; size_t pos = tag.find('.'); if (pos != std::string::npos) categories[to_lower(tag.substr(0, pos+1))].push_back(to_lower(tag.substr(pos+1))); @@ -274,6 +276,7 @@ void Ddl::check_mandatory_items(const cif::Block& b) const { for (const std::string& tag : item.loop.tags) add_category(tag); } + // go over categories and check if nothing is missing for (const auto& cat : categories) { size_t n = cat.first.size(); @@ -287,29 +290,30 @@ void Ddl::check_mandatory_items(const cif::Block& b) const { if (use_context) if (const std::string* ct = cat_block->find_value("_pdbx_category_context.type")) logger.mesg(br(b), "category indicated as ", *ct, ": ", cat_name); - // check key items - for (const std::string& v : cat_block->find_values("_category_key.name")) { - std::string key = cif::as_string(v); - if (!gemmi::istarts_with(key, cat.first)) - logger.level<3>("inconsistent dictionary: wrong _category_key for ", cat_name); - if (!gemmi::in_vector(to_lower(key.substr(n)), cat.second)) { - // In mmcif_pdbx_v50.dic a category key is missing so often, - // that this generates about 20,000 errors, resulting in too much text. - if (missing_category_key_errors < 20 || logger.threshold >= 7) - warn(b, "missing category key: ", key); - else if (missing_category_key_errors == 20) - logger.level<3>("(Increase verbosity to show all missing-category-key errors.)"); - missing_category_key_errors++; - } - } + std::vector implicit_items; // check mandatory items for (auto i = name_index_.lower_bound(cat.first); i != name_index_.end() && gemmi::starts_with(i->first, cat.first); ++i) { - for (auto row : i->second->find("_item.", {"name", "mandatory_code"})) - if (row.str(1)[0] == 'y' && iequal(row.str(0), i->first) && + for (auto row : i->second->find("_item.", {"name", "mandatory_code"})) { + // mandatory_code can be one of: yes, not, implicit, implicit-ordinal + char mc0 = row.str(1)[0]; + if (mc0 == 'y' && iequal(row.str(0), i->first) && !gemmi::in_vector(i->first.substr(n), cat.second)) warn(b, "missing mandatory tag: ", i->first); + else if (mc0 == 'i') + implicit_items.push_back(gemmi::to_lower(row.str(0))); + } + } + // check key items + for (const std::string& v : cat_block->find_values("_category_key.name")) { + std::string key = gemmi::to_lower(cif::as_string(v)); + if (!gemmi::starts_with(key, cat.first)) + logger.level<3>("inconsistent dictionary: wrong _category_key for ", cat_name); + if (!gemmi::in_vector(key.substr(n), cat.second) && + // check if the key item is implicit (a feature is used in mmcif_ddl.dic) + !in_vector(key, implicit_items)) + warn(b, "missing category key: ", key); } } } diff --git a/tests/mmcif_pdbx_v50_frag.dic b/tests/mmcif_pdbx_v50_frag.dic index 1067695b4..8b1126197 100644 --- a/tests/mmcif_pdbx_v50_frag.dic +++ b/tests/mmcif_pdbx_v50_frag.dic @@ -898,6 +898,31 @@ save_atom_site ; save_ +save__atom_site.id + _item_description.description +; The value of _atom_site.id must uniquely identify a record in the + ATOM_SITE list. + +; + + # + loop_ + _item.name + _item.category_id + _item.mandatory_code + "_atom_site.id" atom_site yes + "_atom_site_anisotrop.id" atom_site_anisotrop yes + "_geom_angle.atom_site_id_1" geom_angle yes + # + # + _item_aliases.alias_name "_atom_site_label" + _item_aliases.dictionary cif_core.dic + _item_aliases.version 2.0.1 + # + # + _item_type.code code + # +save_ # save__atom_site.attached_hydrogens diff --git a/tests/test_cif.py b/tests/test_cif.py index 07677f8ac..e4977f9cc 100755 --- a/tests/test_cif.py +++ b/tests/test_cif.py @@ -401,20 +401,23 @@ def test_validation(self): self.assertEqual(msg_list, ['[dummy_block] unknown tag _custom_tag', '[dummy_block] unknown tag _another_one', + '[dummy_block] missing mandatory tag: _atom_site.id', '[dummy_block] missing category key: _atom_site.id']) + # try out set_logger() counter = 0 def incr_counter(_): nonlocal counter counter += 1 ddl.set_logger((incr_counter, 7)) ddl.validate_cif(doc) - self.assertEqual(counter, 3) + self.assertEqual(counter, len(msg_list)) msg_list = [] ddl.set_logger(lambda msg: msg_list.append(msg)) doc = cif.read_string(""" data_b2 + _something.unexpected here loop_ _atom_site.id _atom_site.auth_asym_id @@ -427,10 +430,10 @@ def incr_counter(_): """) ddl.validate_cif(doc) self.assertEqual(msg_list, [ - '[b2] unknown tag _atom_site.id', # 1 - "string:3 [b2] _atom_site.auth_asym_id: " + '[b2] unknown tag _something.unexpected', # 1 + "string:4 [b2] _atom_site.auth_asym_id: " "'hey hey' does not match the code regex", # 2 - 'string:3 [b2] _atom_site.attached_hydrogens: ' + 'string:4 [b2] _atom_site.attached_hydrogens: ' 'value out of expected range: -1', # 3 '[b2] category atom_site has 1 duplicated key:\n id=2', # 4 ])