Merged

24 commits
e96f3b3
:warning: UNTESTED: move wikibase user agent into config
combinatorist Aug 13, 2025
e5445e4
fix datatype for property id
combinatorist Aug 13, 2025
a622bc7
reorder ctfg classes into strict hierarchy (commenting out backlinks)
combinatorist Aug 13, 2025
77fe0c6
:warning: BROKE: pull linked data (properties and values, but not sav…
combinatorist Aug 15, 2025
621dd3a
add optional bot login
combinatorist Aug 15, 2025
5078735
resume providing an (improved) user agent
combinatorist Aug 15, 2025
9e1b0e9
add import missing from new login (2 commits back)
combinatorist Aug 15, 2025
015e0a0
memoize wikidata property pull
combinatorist Aug 15, 2025
1a01a5d
pretty print value json
combinatorist Aug 15, 2025
13678e3
start recursive batch saving
combinatorist Aug 16, 2025
695a4d2
resolve type warning about Optional field
combinatorist Aug 16, 2025
29e1a66
save directly and immediately (not in batch)
combinatorist Aug 16, 2025
821586a
fix typo in table name
combinatorist Aug 16, 2025
589771d
fix and simplify default string value attribute
combinatorist Aug 16, 2025
5b4dbf2
remove old debugger
combinatorist Aug 16, 2025
d6be886
start passing a concatenated identifier to value attributes
combinatorist Aug 17, 2025
386e75d
remove unused recursive_save logic (for now)
combinatorist Aug 17, 2025
df9d471
remove unused import
combinatorist Aug 17, 2025
770a68c
skip some unused stuff
combinatorist Aug 18, 2025
53700d0
pass wikidata creds via github secrets
combinatorist Aug 18, 2025
0515443
print lens of wiki creds to verify
combinatorist Aug 18, 2025
762b10d
print wiki username to understand how it has an extra character on gha
combinatorist Aug 18, 2025
7913249
remove extra dollar sign from front of wiki secret calls
combinatorist Aug 18, 2025
fbfb6ee
remove wiki cred lens
combinatorist Aug 18, 2025
2 changes: 2 additions & 0 deletions .github/workflows/pull-request.yml
@@ -37,3 +37,5 @@ jobs:
env:
AIRTABLE_API_KEY: ${{ secrets.AIRTABLE_API_KEY }}
AIRTABLE_BASE_ID: ${{ vars.AIRTABLE_BASE_ID }}
WIKIDATA_BOT_PW: ${{ secrets.WIKIDATA_BOT_PW }}
WIKIDATA_BOT_USERNAME: ${{ secrets.WIKIDATA_BOT_USERNAME }}
2 changes: 2 additions & 0 deletions .github/workflows/sync.yml
@@ -34,3 +34,5 @@ jobs:
env:
AIRTABLE_API_KEY: ${{ secrets.AIRTABLE_API_KEY }}
AIRTABLE_BASE_ID: ${{ vars.AIRTABLE_BASE_ID }}
WIKIDATA_BOT_PW: ${{ secrets.WIKIDATA_BOT_PW }}
WIKIDATA_BOT_USERNAME: ${{ secrets.WIKIDATA_BOT_USERNAME }}
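
Both workflow changes only surface the bot credentials to the job environment; src/config.py (below) treats them as optional. For a local run the same variables can come from a .env file and be sanity-checked with a short script. This is a sketch, not part of the PR; it only mirrors the "print lens of wiki creds" debugging commits:

import os
from dotenv import load_dotenv

load_dotenv()  # picks up a local .env, the same way config.py does

for var in ("WIKIDATA_BOT_USERNAME", "WIKIDATA_BOT_PW"):
    value = os.getenv(var)
    # report presence and length only, never the secret itself
    print(var, "set" if value else "missing", len(value or ""))
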
20 changes: 20 additions & 0 deletions src/config.py
@@ -1,6 +1,9 @@
from dataclasses import dataclass
from typing import Optional
from dotenv import load_dotenv
from wikibaseintegrator.wbi_config import config as wbi_config
from wikibaseintegrator import wbi_login
from wikibaseintegrator import WikibaseIntegrator

load_dotenv()

@@ -15,3 +18,20 @@
class airtable:
api_key: str = os.environ["AIRTABLE_API_KEY"]
base_id: str = os.environ["AIRTABLE_BASE_ID"]


WIKIDATA_BOT_USERNAME = os.getenv("WIKIDATA_BOT_USERNAME", None)
WIKIDATA_BOT_PW = os.getenv("WIKIDATA_BOT_PW", None)


wbi_config["USER_AGENT"] = (
"CTFG-Wikidata/0.1 (https://www.wikidata.org/wiki/User:TECCLESTON-TECH, https://github.com/sprblm/CTFG-Wikidata)"
)

if WIKIDATA_BOT_USERNAME and WIKIDATA_BOT_PW:
wbi_login.Login(user=WIKIDATA_BOT_USERNAME, password=WIKIDATA_BOT_PW)

wbi = WikibaseIntegrator()


LANGUAGE_CODE = "en"
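
With the user agent, the optional bot login, and a shared WikibaseIntegrator instance now living in config.py, callers import the module instead of configuring wikibaseintegrator themselves (wiki.py below is updated accordingly). A minimal usage sketch; "Q42" is just an example QID, not anything from this PR:

import config

# anonymous reads still work; wbi_login.Login above only runs when both
# WIKIDATA_BOT_USERNAME and WIKIDATA_BOT_PW are set in the environment
item = config.wbi.item.get("Q42")
print(item.labels.get(config.LANGUAGE_CODE))
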
148 changes: 118 additions & 30 deletions src/ctfg.py
@@ -1,10 +1,13 @@
from json import dumps
from typing import Any
from util import *
from pyairtable import Api
import os
import pickle
from collections import defaultdict
from pprint import pprint
from functools import lru_cache
from collections import defaultdict

api_key = config.airtable.api_key
base_id = config.airtable.base_id
@@ -16,23 +19,123 @@


class WikidataProperty(Model):
pid_num = F.IntegerField("PID")
pid = F.SingleLineTextField("PID")
label = F.SingleLineTextField("Label")
description = F.MultilineTextField("Description")
statements = F.LinkField("Statements", "Wikidata Statements")
# statements = F.LinkField("Statements", "Wikidata Statements")

class Meta:
api_key = api_key
base_id = base_id
table_name = "Wikidata Properties"
memoize = True

@lru_cache(maxsize=None)
@staticmethod
def from_wikidata_id(pid: str):
p = config.wbi.property.get(pid)
converted = WikidataProperty(
pid=p.id,
label=str(p.labels.values.get(config.LANGUAGE_CODE)),
description=str(p.descriptions.get(config.LANGUAGE_CODE)),
)
converted.save()
return converted


class WikidataStatementValueAttribute(Model):
uuid = F.SingleLineTextField("Identifier")
key = F.SingleLineTextField("Value Attribute Key")
value = F.SingleLineTextField("Value Attribute Value")
# statement = F.LinkField("Statement", "Wikidata Statements")

class Meta:
api_key = api_key
base_id = base_id
table_name = "Wikidata Statement Value Attributes"


class WikidataStatementValue(Model):
uuid = F.SingleLineTextField("Identifier")
type = F.SingleLineTextField("Wikidata Type")
json = F.MultilineTextField("Wikidata Value JSON")
attributes = F.LinkField("Attributes", WikidataStatementValueAttribute)

class Meta:
api_key = api_key
base_id = base_id
table_name = "Wikidata Statement Value"

@staticmethod
def parse_value_attributes(
value: dict | str, base_id: str
) -> list[WikidataStatementValueAttribute]:
if isinstance(value, str):
value = {"string": value}

attributes = [
WikidataStatementValueAttribute(
uuid=base_id + str(k), key=str(k), value=str(v)
)
for k, v in value.items()
]
WikidataStatementValueAttribute.batch_save(attributes)
return attributes

@staticmethod
def from_wiki_dict(uuid: str, datavalue: dict):
result = WikidataStatementValue(
uuid=uuid,
type=datavalue["type"],
json=dumps(datavalue["value"], indent=2),
attributes=WikidataStatementValue.parse_value_attributes(
datavalue["value"], uuid
),
)
result.save()
return result


class WikidataStatement(Model):
uuid = F.SingleLineTextField("Identifier")
property = F.SingleLinkField("Wikidata Property", WikidataProperty)
datatype = F.SingleLineTextField("Data Type")
value = F.SingleLinkField("Value", WikidataStatementValue)
# item = F.SingleLinkField("Wikidata Item", WikidataItem)

class Meta:
api_key = api_key
base_id = base_id
table_name = "Wikidata Statements"

@staticmethod
def from_wiki_statement(statement: dict):
uuid: str = statement["id"]

# Ignore alternatives, qualifiers, and references
statement = statement["mainsnak"]

property = WikidataProperty.from_wikidata_id(statement["property"])
datavalue = statement.get("datavalue", None)
value = (
WikidataStatementValue.from_wiki_dict(uuid, datavalue)
if datavalue
else None
)

result = WikidataStatement(
uuid=uuid, property=property, datatype=statement["datatype"], value=value
)
result.save()
return result


class WikidataItem(Model):
qid = F.SingleLineTextField("QID")
label = F.SingleLineTextField("Label")
description = F.MultilineTextField("Description")
statements = F.LinkField("Statements", "Wikidata Statements")
listings = F.LinkField("Listing Suggestions", "Listing")
statements = F.LinkField("Statements", WikidataStatement)
# listings = F.LinkField("Listing Suggestions", "Listing")
# listing = F.LinkField("Listing Official", "Listing")
url = F.UrlField("Wikidata Page", readonly=True)

@@ -43,26 +146,23 @@ class Meta:

@staticmethod
def from_wiki_match(m: dict, keep_unknowns: bool = False):

keyMapping = {
"qid": "id",
"label": "label",
"description": "description",
}
mappable = {k: m[v] for k, v in keyMapping.items()}
return WikidataItem(**mappable)

claims = config.wbi.item.get(mappable["qid"]).claims.get_json()
# pprint(claims)

class WikidataStatement(Model):
qid_num = F.IntegerField("QID")
label = F.SingleLineTextField("Label")
description = F.MultilineTextField("Description")
property = F.SingleLinkField("Wikidata Property", WikidataProperty)
item = F.SingleLinkField("Wikidata Item", WikidataItem)

class Meta:
api_key = api_key
base_id = base_id
table_name = "Wikidata Statements"
statements = [
WikidataStatement.from_wiki_statement(s) for p in claims.values() for s in p
]
result = WikidataItem(**mappable, statements=statements)
result.save()
return result


class Listing(Model):
@@ -90,15 +190,6 @@ def deploy_fields() -> None:
WikidataItem.description: {
"field_type": "singleLineText",
},
WikidataItem.listings: {
"field_type": "multipleRecordLinks",
"description": "CTFG listings that are suspected to match this wikidata item",
"options": {
"linkedTableId": Listing.meta.table.id,
# "isReversed": True,
# "prefersSingleRecordLink": True,
},
},
WikidataItem.url: {
"field_type": "singleLineText",
},
@@ -179,9 +270,6 @@ def partition_matched(items: list[Listing]) -> tuple[list[Listing], list[Listing]]:
return (list(unmatched), list(matched))


from itertools import batched


def upsert_matches(wiki_matches: dict[Listing, list[dict[str, Any]]]):
log("Updating CTFG with matching wikibase IDs...")
with_wiki_items = {
@@ -192,8 +280,8 @@ def upsert_matches(wiki_matches: dict[Listing, list[dict[str, Any]]]):
wiki_items = list(set([x for y in with_wiki_items.values() for x in y]))

log("Example item")
pprint(wiki_items[0].to_record())
WikidataItem.batch_save(wiki_items)
# pprint(wiki_items[0].to_record())
# WikidataItem.recursive_save(wiki_items)

for x, matches in with_wiki_items.items():
x.wikidata_suggestions = matches
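
Taken together, the reworked ctfg.py classes turn a Wikidata claims payload into linked Airtable rows: WikidataItem.from_wiki_match pulls the claims, each claim goes through WikidataStatement.from_wiki_statement, which memoizes its property via WikidataProperty.from_wikidata_id and explodes the datavalue into WikidataStatementValue and WikidataStatementValueAttribute records. A rough driver sketch for a single entity — hypothetical code, not part of the diff, and it assumes the Airtable tables named in the Meta classes already exist:

import config
from ctfg import WikidataStatement

# one entity's claims, keyed by property id, e.g. {"P31": [statement, ...]}
claims = config.wbi.item.get("Q42").claims.get_json()

# each from_wiki_statement call saves the statement plus its property,
# value, and value-attribute rows immediately (no batching)
statements = [
    WikidataStatement.from_wiki_statement(s)
    for per_property in claims.values()
    for s in per_property
]
print(f"saved {len(statements)} statements")
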
8 changes: 4 additions & 4 deletions src/sync.py
@@ -10,10 +10,10 @@
types = ctfg.summarize_types(items)
(unmatched_items, matched_items) = ctfg.partition_matched(items)

wiki_matches = wiki.get_matches(unmatched_items, max_attempts=15)
wiki_match_histogram = wiki.summarize_matches(wiki_matches)
wiki_matches = wiki.get_matches(unmatched_items, max_attempts=5)
# wiki_match_histogram = wiki.summarize_matches(wiki_matches)

matched_wikis = wiki.get_jsons(matched_items)
urls = wiki.get_urls(matched_wikis)
# matched_wikis = wiki.get_jsons(matched_items)
# urls = wiki.get_urls(matched_wikis)

match_updates = ctfg.upsert_matches(wiki_matches)
16 changes: 5 additions & 11 deletions src/wiki.py
@@ -7,13 +7,7 @@
from random import sample
import ctfg


from wikibaseintegrator import WikibaseIntegrator
from wikibaseintegrator.wbi_config import config as wbi_config

wbi_config["USER_AGENT"] = (
"AutomationDev/0.1 (https://www.wikidata.org/wiki/User:TECCLESTON-TECH"
)
from wikibaseintegrator.entities.property import PropertyEntity


def get_matches(
@@ -44,9 +38,6 @@ def get_matches(
return wiki_matches


wbi = WikibaseIntegrator()


def summarize_matches(wiki_matches):
count_of_counts = defaultdict(int)
for x in wiki_matches.values():
@@ -72,9 +63,12 @@ def get_jsons(matched_items: list[ctfg.Listing]):

log("Getting wikidata json for confirmed matches...")
matched_wikis = {
x: wbi.item.get(x.wikidata_item.qid).get_json()
x: config.wbi.item.get(x.wikidata_item.qid).get_json()
for x in sample(matched_items, min(50, len(matched_items)))
if x.wikidata_item
}
log("Example wikidata json:")
pprint(list(matched_wikis.values())[0])
return matched_wikis


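
wiki.py no longer builds its own WikibaseIntegrator or sets a user agent; reads now go through the shared client in config.py. A quick standalone check that the shared client is wired up — again a sketch, with "Q42" as a placeholder QID:

import config

entity_json = config.wbi.item.get("Q42").get_json()
print(sorted(entity_json.keys()))  # expect keys such as claims, descriptions, labels
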