diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index a57b85a65..66881a761 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -35,6 +35,9 @@ from azul import ( JSON, ) +from azul.collections import ( + deep_dict_merge, +) from azul.indexer import ( BundleFQID, BundlePartition, @@ -428,7 +431,29 @@ def reconcile_inner_entities(cls, ) -> tuple[JSON, BundleFQID]: this_entity, this_bundle = this that_entity, that_bundle = that - return that if that_bundle.version > this_bundle.version else this + if this_entity.keys() == that_entity.keys(): + return that if that_bundle.version > this_bundle.version else this + else: + assert entity_type == 'datasets', (entity_type, this, that) + expected_keys = cls.field_types()[entity_type].keys() + # There will be one contribution for a DUOS stub, and many redundant + # contributions (one per non-duos bundle) for the dataset metadata + # from BigQuery. Once the stub has been merged with a single main + # contribution to consolidate all expected fields, we can disregard + # the other contributions as usual. + if this_entity.keys() == expected_keys: + return this + elif that_entity.keys() == expected_keys: + return that + else: + assert this_bundle.version == that_bundle.version, (this, that) + assert this_entity.keys() < expected_keys, this + assert that_entity.keys() < expected_keys, that + merged = deep_dict_merge((this_entity, that_entity)) + assert merged.keys() == expected_keys, (this, that) + # We can safely discard that_bundle because only the version is + # used by the caller, and we know the versions are equal. + return merged, this_bundle class ActivityTransformer(BaseTransformer):