From e96f3b3f379cdb463366e62a222edc96dbe4d200 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Wed, 13 Aug 2025 14:14:30 -0700 Subject: [PATCH 01/24] :warning: UNTESTED: move wikibase user agent into config --- src/config.py | 9 +++++++++ src/wiki.py | 15 ++++----------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/config.py b/src/config.py index 66c96b8..950b154 100644 --- a/src/config.py +++ b/src/config.py @@ -1,6 +1,8 @@ from dataclasses import dataclass from typing import Optional from dotenv import load_dotenv +from wikibaseintegrator.wbi_config import config as wbi_config +from wikibaseintegrator import WikibaseIntegrator load_dotenv() @@ -15,3 +17,10 @@ class airtable: api_key: str = os.environ["AIRTABLE_API_KEY"] base_id: str = os.environ["AIRTABLE_BASE_ID"] + + +wbi_config["USER_AGENT"] = ( + "AutomationDev/0.1 (https://www.wikidata.org/wiki/User:TECCLESTON-TECH" +) + +wbi = WikibaseIntegrator() diff --git a/src/wiki.py b/src/wiki.py index 389c364..fa087d3 100644 --- a/src/wiki.py +++ b/src/wiki.py @@ -7,13 +7,7 @@ from random import sample import ctfg - -from wikibaseintegrator import WikibaseIntegrator -from wikibaseintegrator.wbi_config import config as wbi_config - -wbi_config["USER_AGENT"] = ( - "AutomationDev/0.1 (https://www.wikidata.org/wiki/User:TECCLESTON-TECH" -) +from wikibaseintegrator.entities.property import PropertyEntity def get_matches( @@ -44,9 +38,6 @@ def get_matches( return wiki_matches -wbi = WikibaseIntegrator() - - def summarize_matches(wiki_matches): count_of_counts = defaultdict(int) for x in wiki_matches.values(): @@ -72,9 +63,11 @@ def get_jsons(matched_items: list[ctfg.Listing]): log("Getting wikidata json for confirmed matches...") matched_wikis = { - x: wbi.item.get(x.wikidata_item.qid).get_json() + x: config.wbi.item.get(x.wikidata_item.qid).get_json() for x in sample(matched_items, min(50, len(matched_items))) } + log("Example wikidata json:") + pprint(list(matched_wikis.values())[0]) return 
matched_wikis From e5445e4ff1ee069718559af6a4d39dede0703dad Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Wed, 13 Aug 2025 14:16:31 -0700 Subject: [PATCH 02/24] fix datatype for property id --- src/ctfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ctfg.py b/src/ctfg.py index 34c000e..c0ba82c 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -16,7 +16,7 @@ class WikidataProperty(Model): - pid_num = F.IntegerField("PID") + pid = F.SingleLineTextField("PID") label = F.SingleLineTextField("Label") description = F.MultilineTextField("Description") statements = F.LinkField("Statements", "Wikidata Statements") From a622bc7167b8058744f98c316ca8adf2eeace3e1 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Wed, 13 Aug 2025 14:27:16 -0700 Subject: [PATCH 03/24] reorder ctfg classes into strict hierarchy (commenting out backlinks) --- src/ctfg.py | 71 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/src/ctfg.py b/src/ctfg.py index c0ba82c..9db3dd2 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -19,7 +19,7 @@ class WikidataProperty(Model): pid = F.SingleLineTextField("PID") label = F.SingleLineTextField("Label") description = F.MultilineTextField("Description") - statements = F.LinkField("Statements", "Wikidata Statements") + # statements = F.LinkField("Statements", "Wikidata Statements") class Meta: api_key = api_key @@ -27,12 +27,54 @@ class Meta: table_name = "Wikidata Properties" + +class WikidataStatement(Model): + uuid = F.SingleLineTextField("Identifier") + property = F.SingleLinkField("Wikidata Property", WikidataProperty) + datatype = F.SingleLineTextField("Data Type") + values = F.LinkField("Value Attributes", WikidataStatementValueAttribute) + # item = F.SingleLinkField("Wikidata Item", WikidataItem) + + class Meta: + api_key = api_key + base_id = base_id + table_name = "Wikidata Statements" + + @staticmethod + def from_wiki_statement(m: dict, keep_unknowns: bool = 
False): + keyMapping = { + "uuid": "id", + "property": "property", + "datatype": "datatype", + } + mappable = {k: m[v] for k, v in keyMapping.items()} + + def parseStatement(statement: dict) -> WikidataStatement: + uuid: str = statement["id"] + + # Ignore alternatives, qualifiers, and references + statement = statement["mainsnak"] + + statement["uuid"] = uuid + statement["value_json"] = dumps(statement["value"]) + statement["value"] = WikidataStatementValue.from_wiki_dict( + uuid, statement["value"] + ) + statement["property"] = WikidataProperty.from_wikidata_id( + statement["property"] + ) + + return WikidataStatement(**statement) + + return WikidataItem(**mappable) + + class WikidataItem(Model): qid = F.SingleLineTextField("QID") label = F.SingleLineTextField("Label") description = F.MultilineTextField("Description") - statements = F.LinkField("Statements", "Wikidata Statements") - listings = F.LinkField("Listing Suggestions", "Listing") + statements = F.LinkField("Statements", WikidataStatement) + # listings = F.LinkField("Listing Suggestions", "Listing") # listing = F.LinkField("Listing Official", "Listing") url = F.UrlField("Wikidata Page", readonly=True) @@ -47,24 +89,12 @@ def from_wiki_match(m: dict, keep_unknowns: bool = False): "qid": "id", "label": "label", "description": "description", + # "statements": "statements", } mappable = {k: m[v] for k, v in keyMapping.items()} return WikidataItem(**mappable) -class WikidataStatement(Model): - qid_num = F.IntegerField("QID") - label = F.SingleLineTextField("Label") - description = F.MultilineTextField("Description") - property = F.SingleLinkField("Wikidata Property", WikidataProperty) - item = F.SingleLinkField("Wikidata Item", WikidataItem) - - class Meta: - api_key = api_key - base_id = base_id - table_name = "Wikidata Statements" - - class Listing(Model): name = F.SingleLineTextField("Project name") wikidata_item = F.SingleLinkField("Wikidata Item Official", WikidataItem) @@ -90,15 +120,6 @@ def 
deploy_fields() -> None: WikidataItem.description: { "field_type": "singleLineText", }, - WikidataItem.listings: { - "field_type": "multipleRecordLinks", - "description": "CTFG listings that are suspected to match this wikidata item", - "options": { - "linkedTableId": Listing.meta.table.id, - # "isReversed": True, - # "prefersSingleRecordLink": True, - }, - }, WikidataItem.url: { "field_type": "singleLineText", }, From 77fe0c686752f2a64cb854ec21ad663560eed5b5 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Fri, 15 Aug 2025 13:08:15 -0700 Subject: [PATCH 04/24] :warning: BROKE: pull linked data (properties and values, but not saving yet) --- src/config.py | 2 + src/ctfg.py | 112 ++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 87 insertions(+), 27 deletions(-) diff --git a/src/config.py b/src/config.py index 950b154..667e408 100644 --- a/src/config.py +++ b/src/config.py @@ -24,3 +24,5 @@ class airtable: ) wbi = WikibaseIntegrator() + +LANGUAGE_CODE="en" diff --git a/src/ctfg.py b/src/ctfg.py index 9db3dd2..21067ec 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -1,3 +1,4 @@ +from json import dumps from typing import Any from util import * from pyairtable import Api @@ -26,13 +27,69 @@ class Meta: base_id = base_id table_name = "Wikidata Properties" + # !need to memoize this! 
+ @staticmethod + def from_wikidata_id(pid: str): + p = config.wbi.property.get(pid) + return WikidataProperty( + pid=p.id, + label=str(p.labels.values.get(config.LANGUAGE_CODE)), + description=str(p.descriptions.get(config.LANGUAGE_CODE)), + ) + + +class WikidataStatementValueAttribute(Model): + uuid = F.SingleLineTextField("Identifier") + key = F.SingleLineTextField("Value Attribute Key") + value = F.SingleLineTextField("Value Attribute Value") + # statement = F.LinkField("Statement", "Wikidata Statements") + + class Meta: + api_key = api_key + base_id = base_id + table_name = "Wikidata Statement Value Attributes" + + +class WikidataStatementValue(Model): + uuid = F.SingleLineTextField("Identifier") + type = F.SingleLineTextField("Wikidata Type") + json = F.MultilineTextField("Wikidata Value JSON") + attributes = F.LinkField("Attributes", WikidataStatementValueAttribute) + + class Meta: + api_key = api_key + base_id = base_id + table_name = "Wikidata Statement Value Attributes" + + @staticmethod + def parse_value_attributes( + value: dict | str, + ) -> list[WikidataStatementValueAttribute]: + if isinstance(value, str): + value = {"key": "string", "value": value} + + return [ + WikidataStatementValueAttribute(key=str(k), value=str(v)) + for k, v in value.items() + ] + + @staticmethod + def from_wiki_dict(uuid: str, datavalue: dict): + return WikidataStatementValue( + uuid=uuid, + type=datavalue["type"], + json=dumps(datavalue["value"]), + attributes=WikidataStatementValue.parse_value_attributes( + datavalue["value"] + ), + ) class WikidataStatement(Model): uuid = F.SingleLineTextField("Identifier") property = F.SingleLinkField("Wikidata Property", WikidataProperty) datatype = F.SingleLineTextField("Data Type") - values = F.LinkField("Value Attributes", WikidataStatementValueAttribute) + value = F.SingleLinkField("Value", WikidataStatementValue) # item = F.SingleLinkField("Wikidata Item", WikidataItem) class Meta: @@ -41,32 +98,23 @@ class Meta: table_name = 
"Wikidata Statements" @staticmethod - def from_wiki_statement(m: dict, keep_unknowns: bool = False): - keyMapping = { - "uuid": "id", - "property": "property", - "datatype": "datatype", - } - mappable = {k: m[v] for k, v in keyMapping.items()} + def from_wiki_statement(statement: dict): + uuid: str = statement["id"] - def parseStatement(statement: dict) -> WikidataStatement: - uuid: str = statement["id"] + # Ignore alternatives, qualifiers, and references + statement = statement["mainsnak"] - # Ignore alternatives, qualifiers, and references - statement = statement["mainsnak"] + property = WikidataProperty.from_wikidata_id(statement["property"]) + datavalue = statement.get("datavalue", None) + value = ( + WikidataStatementValue.from_wiki_dict(uuid, datavalue) + if datavalue + else None + ) - statement["uuid"] = uuid - statement["value_json"] = dumps(statement["value"]) - statement["value"] = WikidataStatementValue.from_wiki_dict( - uuid, statement["value"] - ) - statement["property"] = WikidataProperty.from_wikidata_id( - statement["property"] - ) - - return WikidataStatement(**statement) - - return WikidataItem(**mappable) + return WikidataStatement( + uuid=uuid, property=property, datatype=statement["datatype"], value=value + ) class WikidataItem(Model): @@ -85,14 +133,24 @@ class Meta: @staticmethod def from_wiki_match(m: dict, keep_unknowns: bool = False): + keyMapping = { "qid": "id", "label": "label", "description": "description", - # "statements": "statements", } mappable = {k: m[v] for k, v in keyMapping.items()} - return WikidataItem(**mappable) + + claims = config.wbi.item.get(mappable["qid"]).claims.get_json() + pprint(claims) + + import pdb + + # pdb.set_trace() + statements = [ + WikidataStatement.from_wiki_statement(s) for p in claims.values() for s in p + ] + return WikidataItem(**mappable, statements=statements) class Listing(Model): @@ -213,7 +271,7 @@ def upsert_matches(wiki_matches: dict[Listing, list[dict[str, Any]]]): wiki_items = list(set([x 
for y in with_wiki_items.values() for x in y])) log("Example item") - pprint(wiki_items[0].to_record()) + # pprint(wiki_items[0].to_record()) WikidataItem.batch_save(wiki_items) for x, matches in with_wiki_items.items(): From 621dd3a16352c9b3354e6daa247cfd404dc0f87e Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Fri, 15 Aug 2025 13:09:26 -0700 Subject: [PATCH 05/24] add optional bot login --- src/config.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/config.py b/src/config.py index 667e408..76ab773 100644 --- a/src/config.py +++ b/src/config.py @@ -19,10 +19,17 @@ class airtable: base_id: str = os.environ["AIRTABLE_BASE_ID"] -wbi_config["USER_AGENT"] = ( - "AutomationDev/0.1 (https://www.wikidata.org/wiki/User:TECCLESTON-TECH" -) +WIKIDATA_BOT_USERNAME = os.getenv("WIKIDATA_BOT_USERNAME", None) +WIKIDATA_BOT_PW = os.getenv("WIKIDATA_BOT_PW", None) + +if WIKIDATA_BOT_USERNAME and WIKIDATA_BOT_PW: + wbi = WikibaseIntegrator() + wbi.login(user=WIKIDATA_BOT_USERNAME, password=WIKIDATA_BOT_PW) +else: + wbi_config["USER_AGENT"] = ( + "AutomationDev/0.1 (https://www.wikidata.org/wiki/User:TECCLESTON-TECH" + ) + wbi = WikibaseIntegrator() -wbi = WikibaseIntegrator() -LANGUAGE_CODE="en" +LANGUAGE_CODE = "en" From 50787350c5efc5ade75dd1eab0cde471b1a16f12 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Fri, 15 Aug 2025 13:47:07 -0700 Subject: [PATCH 06/24] resume providing an (improved user agent) --- src/config.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/config.py b/src/config.py index 76ab773..3286b5f 100644 --- a/src/config.py +++ b/src/config.py @@ -22,14 +22,15 @@ class airtable: WIKIDATA_BOT_USERNAME = os.getenv("WIKIDATA_BOT_USERNAME", None) WIKIDATA_BOT_PW = os.getenv("WIKIDATA_BOT_PW", None) + +wbi_config["USER_AGENT"] = ( + "CTFG-Wikidata/0.1 (https://www.wikidata.org/wiki/User:TECCLESTON-TECH, https://github.com/sprblm/CTFG-Wikidata)" +) + if WIKIDATA_BOT_USERNAME and 
WIKIDATA_BOT_PW: - wbi = WikibaseIntegrator() - wbi.login(user=WIKIDATA_BOT_USERNAME, password=WIKIDATA_BOT_PW) -else: - wbi_config["USER_AGENT"] = ( - "AutomationDev/0.1 (https://www.wikidata.org/wiki/User:TECCLESTON-TECH" - ) - wbi = WikibaseIntegrator() + wbi_login.Login(user=WIKIDATA_BOT_USERNAME, password=WIKIDATA_BOT_PW) + +wbi = WikibaseIntegrator() LANGUAGE_CODE = "en" From 9e1b0e99ef4d469ee5c856588679c1406a35f5f3 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Fri, 15 Aug 2025 13:47:46 -0700 Subject: [PATCH 07/24] add import missing from new login (2 commits back) --- src/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/config.py b/src/config.py index 3286b5f..592879a 100644 --- a/src/config.py +++ b/src/config.py @@ -2,6 +2,7 @@ from typing import Optional from dotenv import load_dotenv from wikibaseintegrator.wbi_config import config as wbi_config +from wikibaseintegrator import wbi_login from wikibaseintegrator import WikibaseIntegrator load_dotenv() From 015e0a041ab61590d925c76f89dda944d0ae6dc8 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Fri, 15 Aug 2025 13:48:28 -0700 Subject: [PATCH 08/24] memoize wikidata property pull --- src/ctfg.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/ctfg.py b/src/ctfg.py index 21067ec..f1c4e1c 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -6,6 +6,7 @@ import pickle from collections import defaultdict from pprint import pprint +from functools import lru_cache api_key = config.airtable.api_key base_id = config.airtable.base_id @@ -27,7 +28,7 @@ class Meta: base_id = base_id table_name = "Wikidata Properties" - # !need to memoize this! 
+ @lru_cache(maxsize=None) @staticmethod def from_wikidata_id(pid: str): p = config.wbi.property.get(pid) @@ -277,3 +278,4 @@ def upsert_matches(wiki_matches: dict[Listing, list[dict[str, Any]]]): for x, matches in with_wiki_items.items(): x.wikidata_suggestions = matches Listing.batch_save(list(with_wiki_items.keys())) + From 1a01a5da90d6cb99fd90df905af40231afbe35f7 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Fri, 15 Aug 2025 13:52:07 -0700 Subject: [PATCH 09/24] pretty print value json --- src/ctfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ctfg.py b/src/ctfg.py index f1c4e1c..7ccbcc8 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -79,7 +79,7 @@ def from_wiki_dict(uuid: str, datavalue: dict): return WikidataStatementValue( uuid=uuid, type=datavalue["type"], - json=dumps(datavalue["value"]), + json=dumps(datavalue["value"], indent=2), attributes=WikidataStatementValue.parse_value_attributes( datavalue["value"] ), From 13678e3739492081af84f0430aa57c3a9e0c8334 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Fri, 15 Aug 2025 21:31:28 -0700 Subject: [PATCH 10/24] start recursive batch saving --- src/ctfg.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++++---- src/sync.py | 2 +- 2 files changed, 70 insertions(+), 6 deletions(-) diff --git a/src/ctfg.py b/src/ctfg.py index 7ccbcc8..3d91c13 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -1,5 +1,5 @@ from json import dumps -from typing import Any +from typing import Any, Self, TypeAlias from util import * from pyairtable import Api import os @@ -7,6 +7,7 @@ from collections import defaultdict from pprint import pprint from functools import lru_cache +from collections import defaultdict api_key = config.airtable.api_key base_id = config.airtable.base_id @@ -27,17 +28,22 @@ class Meta: api_key = api_key base_id = base_id table_name = "Wikidata Properties" + memoize = True @lru_cache(maxsize=None) @staticmethod def from_wikidata_id(pid: str): p = config.wbi.property.get(pid) - 
return WikidataProperty( + converted = WikidataProperty( pid=p.id, label=str(p.labels.values.get(config.LANGUAGE_CODE)), description=str(p.descriptions.get(config.LANGUAGE_CODE)), ) + return converted + @classmethod + def recursive_save(cls, siblings: list[Self]): + cls.batch_save(siblings) class WikidataStatementValueAttribute(Model): uuid = F.SingleLineTextField("Identifier") @@ -85,6 +91,18 @@ def from_wiki_dict(uuid: str, datavalue: dict): ), ) + def children(self) -> set[WikidataStatementValueAttribute]: + return set(self.attributes) + + @classmethod + def next_generation(cls, siblings: list[Self]) -> list: + return list(set(c for sib in siblings for c in sib.children())) + + @classmethod + def recursive_save(cls, siblings: list[Self]): + WikidataStatementValueAttribute.batch_save(cls.next_generation(siblings)) + cls.batch_save(siblings) + class WikidataStatement(Model): uuid = F.SingleLineTextField("Identifier") @@ -117,6 +135,29 @@ def from_wiki_statement(statement: dict): uuid=uuid, property=property, datatype=statement["datatype"], value=value ) + def children(self) -> dict[TypeAlias, set]: + res: dict[TypeAlias, set] = {} + if self.property: + res[WikidataProperty] = {self.property} + if self.value: + res[WikidataStatementValue] = {self.property} + return res + + @classmethod + def next_generation(cls, siblings: list[Self]) -> dict[TypeAlias, set[Model]]: + agg: defaultdict[TypeAlias, set] = defaultdict(set) + for sib in siblings: + child_dict = sib.children() + for typ, kids in child_dict.items(): + agg[typ].union(kids) + return agg + + @classmethod + def recursive_save(cls, siblings: list[Self]): + for typ, kids in cls.next_generation(siblings).items(): + typ.recursive_save(kids) + cls.batch_save(siblings) + class WikidataItem(Model): qid = F.SingleLineTextField("QID") @@ -153,6 +194,18 @@ def from_wiki_match(m: dict, keep_unknowns: bool = False): ] return WikidataItem(**mappable, statements=statements) + def children(self) -> set[WikidataStatement]: + 
return set(self.statements) + + @classmethod + def next_generation(cls, siblings: list[Self]) -> list: + return list(set(c for sib in siblings for c in sib.children())) + + @classmethod + def recursive_save(cls, siblings: list[Self]): + WikidataStatement.recursive_save(cls.next_generation(siblings)) + cls.batch_save(siblings) + class Listing(Model): name = F.SingleLineTextField("Project name") @@ -165,6 +218,18 @@ class Meta: base_id = base_id table_name = "Listings" + def children(self) -> set[WikidataItem]: + return set(self.wikidata_suggestions) + + @classmethod + def next_generation(cls, siblings: list[Self]) -> list: + return list(set(c for sib in siblings for c in sib.children())) + + @classmethod + def recursive_save(cls, siblings: list[Self]): + WikidataItem.recursive_save(cls.next_generation(siblings)) + cls.batch_save(siblings) + def deploy_fields() -> None: log("Deploying missing fields (as necessary)") @@ -273,9 +338,8 @@ def upsert_matches(wiki_matches: dict[Listing, list[dict[str, Any]]]): log("Example item") # pprint(wiki_items[0].to_record()) - WikidataItem.batch_save(wiki_items) + # WikidataItem.recursive_save(wiki_items) for x, matches in with_wiki_items.items(): x.wikidata_suggestions = matches - Listing.batch_save(list(with_wiki_items.keys())) - + Listing.recursive_save(list(with_wiki_items.keys())) diff --git a/src/sync.py b/src/sync.py index 123fee9..57df347 100644 --- a/src/sync.py +++ b/src/sync.py @@ -10,7 +10,7 @@ types = ctfg.summarize_types(items) (unmatched_items, matched_items) = ctfg.partition_matched(items) -wiki_matches = wiki.get_matches(unmatched_items, max_attempts=15) +wiki_matches = wiki.get_matches(unmatched_items, max_attempts=10) wiki_match_histogram = wiki.summarize_matches(wiki_matches) matched_wikis = wiki.get_jsons(matched_items) From 695a4d27f1e9c026a85db6498bdecd34e2adf4f8 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Fri, 15 Aug 2025 21:33:22 -0700 Subject: [PATCH 11/24] resolve type warning about Optional field 
--- src/wiki.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/wiki.py b/src/wiki.py index fa087d3..02a9405 100644 --- a/src/wiki.py +++ b/src/wiki.py @@ -65,6 +65,7 @@ def get_jsons(matched_items: list[ctfg.Listing]): matched_wikis = { x: config.wbi.item.get(x.wikidata_item.qid).get_json() for x in sample(matched_items, min(50, len(matched_items))) + if x.wikidata_item } log("Example wikidata json:") pprint(list(matched_wikis.values())[0]) From 29e1a66443d02f5b8576c4fa31def25acd28bc83 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sat, 16 Aug 2025 07:35:37 -0700 Subject: [PATCH 12/24] save directly and immediately (not in batch) --- src/ctfg.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/ctfg.py b/src/ctfg.py index 3d91c13..d60801d 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -39,12 +39,14 @@ def from_wikidata_id(pid: str): label=str(p.labels.values.get(config.LANGUAGE_CODE)), description=str(p.descriptions.get(config.LANGUAGE_CODE)), ) + converted.save() return converted @classmethod def recursive_save(cls, siblings: list[Self]): cls.batch_save(siblings) + class WikidataStatementValueAttribute(Model): uuid = F.SingleLineTextField("Identifier") key = F.SingleLineTextField("Value Attribute Key") @@ -75,14 +77,16 @@ def parse_value_attributes( if isinstance(value, str): value = {"key": "string", "value": value} - return [ + attributes = [ WikidataStatementValueAttribute(key=str(k), value=str(v)) for k, v in value.items() ] + WikidataStatementValueAttribute.batch_save(attributes) + return attributes @staticmethod def from_wiki_dict(uuid: str, datavalue: dict): - return WikidataStatementValue( + result = WikidataStatementValue( uuid=uuid, type=datavalue["type"], json=dumps(datavalue["value"], indent=2), @@ -90,6 +94,8 @@ def from_wiki_dict(uuid: str, datavalue: dict): datavalue["value"] ), ) + result.save() + return result def children(self) -> set[WikidataStatementValueAttribute]: return set(self.attributes) 
@@ -131,9 +137,11 @@ def from_wiki_statement(statement: dict): else None ) - return WikidataStatement( + result = WikidataStatement( uuid=uuid, property=property, datatype=statement["datatype"], value=value ) + result.save() + return result def children(self) -> dict[TypeAlias, set]: res: dict[TypeAlias, set] = {} @@ -192,7 +200,9 @@ def from_wiki_match(m: dict, keep_unknowns: bool = False): statements = [ WikidataStatement.from_wiki_statement(s) for p in claims.values() for s in p ] - return WikidataItem(**mappable, statements=statements) + result = WikidataItem(**mappable, statements=statements) + result.save() + return result def children(self) -> set[WikidataStatement]: return set(self.statements) @@ -342,4 +352,4 @@ def upsert_matches(wiki_matches: dict[Listing, list[dict[str, Any]]]): for x, matches in with_wiki_items.items(): x.wikidata_suggestions = matches - Listing.recursive_save(list(with_wiki_items.keys())) + Listing.batch_save(list(with_wiki_items.keys())) From 821586a66870e77d95ffa38ac36463608e8a67ad Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sat, 16 Aug 2025 07:39:51 -0700 Subject: [PATCH 13/24] fix typo in table name --- src/ctfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ctfg.py b/src/ctfg.py index d60801d..52d36e2 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -68,7 +68,7 @@ class WikidataStatementValue(Model): class Meta: api_key = api_key base_id = base_id - table_name = "Wikidata Statement Value Attributes" + table_name = "Wikidata Statement Value" @staticmethod def parse_value_attributes( From 589771d524fda4e6c770aca99d00b7dfa03e34e9 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sat, 16 Aug 2025 07:45:57 -0700 Subject: [PATCH 14/24] fix and simplify default string value attribute --- src/ctfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ctfg.py b/src/ctfg.py index 52d36e2..7f415ac 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -75,7 +75,7 @@ def parse_value_attributes( value: 
dict | str, ) -> list[WikidataStatementValueAttribute]: if isinstance(value, str): - value = {"key": "string", "value": value} + value = {"string": value} attributes = [ WikidataStatementValueAttribute(key=str(k), value=str(v)) From 5b4dbf210d82cd9c2b03b3e91d24df012e6b4d05 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sat, 16 Aug 2025 07:46:21 -0700 Subject: [PATCH 15/24] remove old debugger --- src/ctfg.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/ctfg.py b/src/ctfg.py index 7f415ac..e63172a 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -194,9 +194,6 @@ def from_wiki_match(m: dict, keep_unknowns: bool = False): claims = config.wbi.item.get(mappable["qid"]).claims.get_json() pprint(claims) - import pdb - - # pdb.set_trace() statements = [ WikidataStatement.from_wiki_statement(s) for p in claims.values() for s in p ] From d6be886d5dcb5270be6b4b01958615998f7109a7 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sat, 16 Aug 2025 21:05:33 -0700 Subject: [PATCH 16/24] start passing a concatenated identifier to value attributes --- src/ctfg.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/ctfg.py b/src/ctfg.py index e63172a..6f6781f 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -72,13 +72,15 @@ class Meta: @staticmethod def parse_value_attributes( - value: dict | str, + value: dict | str, base_id: str ) -> list[WikidataStatementValueAttribute]: if isinstance(value, str): value = {"string": value} attributes = [ - WikidataStatementValueAttribute(key=str(k), value=str(v)) + WikidataStatementValueAttribute( + uuid=base_id + str(k), key=str(k), value=str(v) + ) for k, v in value.items() ] WikidataStatementValueAttribute.batch_save(attributes) @@ -91,7 +93,7 @@ def from_wiki_dict(uuid: str, datavalue: dict): type=datavalue["type"], json=dumps(datavalue["value"], indent=2), attributes=WikidataStatementValue.parse_value_attributes( - datavalue["value"] + datavalue["value"], uuid ), ) result.save() From 
386e75ddac80a7b3f3924f46661f125a17d57f19 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sat, 16 Aug 2025 21:07:03 -0700 Subject: [PATCH 17/24] remove unused recursive_save logic (for now) --- src/ctfg.py | 65 +---------------------------------------------------- 1 file changed, 1 insertion(+), 64 deletions(-) diff --git a/src/ctfg.py b/src/ctfg.py index 6f6781f..cc7c924 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -1,5 +1,5 @@ from json import dumps -from typing import Any, Self, TypeAlias +from typing import Any from util import * from pyairtable import Api import os @@ -42,10 +42,6 @@ def from_wikidata_id(pid: str): converted.save() return converted - @classmethod - def recursive_save(cls, siblings: list[Self]): - cls.batch_save(siblings) - class WikidataStatementValueAttribute(Model): uuid = F.SingleLineTextField("Identifier") @@ -99,18 +95,6 @@ def from_wiki_dict(uuid: str, datavalue: dict): result.save() return result - def children(self) -> set[WikidataStatementValueAttribute]: - return set(self.attributes) - - @classmethod - def next_generation(cls, siblings: list[Self]) -> list: - return list(set(c for sib in siblings for c in sib.children())) - - @classmethod - def recursive_save(cls, siblings: list[Self]): - WikidataStatementValueAttribute.batch_save(cls.next_generation(siblings)) - cls.batch_save(siblings) - class WikidataStatement(Model): uuid = F.SingleLineTextField("Identifier") @@ -145,29 +129,6 @@ def from_wiki_statement(statement: dict): result.save() return result - def children(self) -> dict[TypeAlias, set]: - res: dict[TypeAlias, set] = {} - if self.property: - res[WikidataProperty] = {self.property} - if self.value: - res[WikidataStatementValue] = {self.property} - return res - - @classmethod - def next_generation(cls, siblings: list[Self]) -> dict[TypeAlias, set[Model]]: - agg: defaultdict[TypeAlias, set] = defaultdict(set) - for sib in siblings: - child_dict = sib.children() - for typ, kids in child_dict.items(): - agg[typ].union(kids) - 
return agg - - @classmethod - def recursive_save(cls, siblings: list[Self]): - for typ, kids in cls.next_generation(siblings).items(): - typ.recursive_save(kids) - cls.batch_save(siblings) - class WikidataItem(Model): qid = F.SingleLineTextField("QID") @@ -203,18 +164,6 @@ def from_wiki_match(m: dict, keep_unknowns: bool = False): result.save() return result - def children(self) -> set[WikidataStatement]: - return set(self.statements) - - @classmethod - def next_generation(cls, siblings: list[Self]) -> list: - return list(set(c for sib in siblings for c in sib.children())) - - @classmethod - def recursive_save(cls, siblings: list[Self]): - WikidataStatement.recursive_save(cls.next_generation(siblings)) - cls.batch_save(siblings) - class Listing(Model): name = F.SingleLineTextField("Project name") @@ -227,18 +176,6 @@ class Meta: base_id = base_id table_name = "Listings" - def children(self) -> set[WikidataItem]: - return set(self.wikidata_suggestions) - - @classmethod - def next_generation(cls, siblings: list[Self]) -> list: - return list(set(c for sib in siblings for c in sib.children())) - - @classmethod - def recursive_save(cls, siblings: list[Self]): - WikidataItem.recursive_save(cls.next_generation(siblings)) - cls.batch_save(siblings) - def deploy_fields() -> None: log("Deploying missing fields (as necessary)") From df9d4712b20cdfe2ce6912009538d8793fca5342 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sat, 16 Aug 2025 22:27:15 -0700 Subject: [PATCH 18/24] remove unused import --- src/ctfg.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/ctfg.py b/src/ctfg.py index cc7c924..a7daefb 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -270,9 +270,6 @@ def partition_matched(items: list[Listing]) -> tuple[list[Listing], list[Listing return (list(unmatched), list(matched)) -from itertools import batched - - def upsert_matches(wiki_matches: dict[Listing, list[dict[str, Any]]]): log("Updating CTFG with matching wikibase IDs...") with_wiki_items = { From 
770a68c847d5dc36746a8fb9584e7521c9c403ec Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sun, 17 Aug 2025 21:22:12 -0700 Subject: [PATCH 19/24] skip some unused stuff --- src/ctfg.py | 2 +- src/sync.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ctfg.py b/src/ctfg.py index a7daefb..7eddf5f 100644 --- a/src/ctfg.py +++ b/src/ctfg.py @@ -155,7 +155,7 @@ def from_wiki_match(m: dict, keep_unknowns: bool = False): mappable = {k: m[v] for k, v in keyMapping.items()} claims = config.wbi.item.get(mappable["qid"]).claims.get_json() - pprint(claims) + # pprint(claims) statements = [ WikidataStatement.from_wiki_statement(s) for p in claims.values() for s in p diff --git a/src/sync.py b/src/sync.py index 57df347..0e608b6 100644 --- a/src/sync.py +++ b/src/sync.py @@ -10,10 +10,10 @@ types = ctfg.summarize_types(items) (unmatched_items, matched_items) = ctfg.partition_matched(items) -wiki_matches = wiki.get_matches(unmatched_items, max_attempts=10) -wiki_match_histogram = wiki.summarize_matches(wiki_matches) +wiki_matches = wiki.get_matches(unmatched_items, max_attempts=5) +# wiki_match_histogram = wiki.summarize_matches(wiki_matches) -matched_wikis = wiki.get_jsons(matched_items) -urls = wiki.get_urls(matched_wikis) +# matched_wikis = wiki.get_jsons(matched_items) +# urls = wiki.get_urls(matched_wikis) match_updates = ctfg.upsert_matches(wiki_matches) From 53700d0ec00d8d4c97b4755ef34c63cc08fb3236 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sun, 17 Aug 2025 21:47:37 -0700 Subject: [PATCH 20/24] pass wikidata creds via github secrets --- .github/workflows/pull-request.yml | 2 ++ .github/workflows/sync.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index 5e3fb10..cb4c813 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -37,3 +37,5 @@ jobs: env: AIRTABLE_API_KEY: ${{ secrets.AIRTABLE_API_KEY }} AIRTABLE_BASE_ID: 
${{ vars.AIRTABLE_BASE_ID }} + WIKIDATA_BOT_PW: $${{ secrets.WIKIDATA_BOT_PW }} + WIKIDATA_BOT_USERNAME: $${{ secrets.WIKIDATA_BOT_USERNAME }} diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml index c775fd8..908eb13 100644 --- a/.github/workflows/sync.yml +++ b/.github/workflows/sync.yml @@ -34,3 +34,5 @@ jobs: env: AIRTABLE_API_KEY: ${{ secrets.AIRTABLE_API_KEY }} AIRTABLE_BASE_ID: ${{ vars.AIRTABLE_BASE_ID }} + WIKIDATA_BOT_PW: $${{ secrets.WIKIDATA_BOT_PW }} + WIKIDATA_BOT_USERNAME: $${{ secrets.WIKIDATA_BOT_USERNAME }} From 051544361d73394c20acead032f2e46f7272ab73 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sun, 17 Aug 2025 22:53:07 -0700 Subject: [PATCH 21/24] print lens of wiki creds to verify --- src/config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/config.py b/src/config.py index 592879a..bbd3e32 100644 --- a/src/config.py +++ b/src/config.py @@ -29,6 +29,9 @@ class airtable: ) if WIKIDATA_BOT_USERNAME and WIKIDATA_BOT_PW: + print(len(WIKIDATA_BOT_USERNAME)) + print(len(WIKIDATA_BOT_PW)) + wbi_login.Login(user=WIKIDATA_BOT_USERNAME, password=WIKIDATA_BOT_PW) wbi = WikibaseIntegrator() From 762b10d0521dbfb4e6b9320c3c81b8d0e48279e4 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sun, 17 Aug 2025 22:56:52 -0700 Subject: [PATCH 22/24] print wiki username to understand how it has an extra character on gha --- src/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config.py b/src/config.py index bbd3e32..b7beaa1 100644 --- a/src/config.py +++ b/src/config.py @@ -29,7 +29,7 @@ class airtable: ) if WIKIDATA_BOT_USERNAME and WIKIDATA_BOT_PW: - print(len(WIKIDATA_BOT_USERNAME)) + print(WIKIDATA_BOT_USERNAME) print(len(WIKIDATA_BOT_PW)) wbi_login.Login(user=WIKIDATA_BOT_USERNAME, password=WIKIDATA_BOT_PW) From 7913249035f8137b1fbd3e28e61997312d6b98b4 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sun, 17 Aug 2025 22:58:20 -0700 Subject: [PATCH 23/24] remove extra dollar sign from front of 
wiki secret calls --- .github/workflows/pull-request.yml | 4 ++-- .github/workflows/sync.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index cb4c813..8a2718f 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -37,5 +37,5 @@ jobs: env: AIRTABLE_API_KEY: ${{ secrets.AIRTABLE_API_KEY }} AIRTABLE_BASE_ID: ${{ vars.AIRTABLE_BASE_ID }} - WIKIDATA_BOT_PW: $${{ secrets.WIKIDATA_BOT_PW }} - WIKIDATA_BOT_USERNAME: $${{ secrets.WIKIDATA_BOT_USERNAME }} + WIKIDATA_BOT_PW: ${{ secrets.WIKIDATA_BOT_PW }} + WIKIDATA_BOT_USERNAME: ${{ secrets.WIKIDATA_BOT_USERNAME }} diff --git a/.github/workflows/sync.yml b/.github/workflows/sync.yml index 908eb13..1b4e514 100644 --- a/.github/workflows/sync.yml +++ b/.github/workflows/sync.yml @@ -34,5 +34,5 @@ jobs: env: AIRTABLE_API_KEY: ${{ secrets.AIRTABLE_API_KEY }} AIRTABLE_BASE_ID: ${{ vars.AIRTABLE_BASE_ID }} - WIKIDATA_BOT_PW: $${{ secrets.WIKIDATA_BOT_PW }} - WIKIDATA_BOT_USERNAME: $${{ secrets.WIKIDATA_BOT_USERNAME }} + WIKIDATA_BOT_PW: ${{ secrets.WIKIDATA_BOT_PW }} + WIKIDATA_BOT_USERNAME: ${{ secrets.WIKIDATA_BOT_USERNAME }} From fbfb6eec7194f111a9812440624d7febf552de09 Mon Sep 17 00:00:00 2001 From: Tim Eccleston Date: Sun, 17 Aug 2025 22:59:28 -0700 Subject: [PATCH 24/24] remove wiki cred lens --- src/config.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/config.py b/src/config.py index b7beaa1..592879a 100644 --- a/src/config.py +++ b/src/config.py @@ -29,9 +29,6 @@ class airtable: ) if WIKIDATA_BOT_USERNAME and WIKIDATA_BOT_PW: - print(WIKIDATA_BOT_USERNAME) - print(len(WIKIDATA_BOT_PW)) - wbi_login.Login(user=WIKIDATA_BOT_USERNAME, password=WIKIDATA_BOT_PW) wbi = WikibaseIntegrator()