Skip to content

Commit

Permalink
[IMPR] add -category option to delinker.py
Browse files Browse the repository at this point in the history
The -category option works as follows:
- retrieve pages from "Pages with missing files" listed on wikibase with
  item Q4989282. Any other category can be given with this option.
- for every page found in this category process their image links
- skip further processing if the FilePage exists locally or in the
  image repository
- also skip further processing if the file was not deleted. In that case
  there is an invalid link found on the source page.
- finally delink the found image link

Bug: T372206
Change-Id: I49d9260f2cbcb7e98f1916da82b191119a0bf127
  • Loading branch information
xqt committed Aug 11, 2024
1 parent a1cf30a commit 9f7f126
Showing 1 changed file with 119 additions and 36 deletions.
155 changes: 119 additions & 36 deletions scripts/delinker.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
The following parameters are supported:
-category: Retrieve pages to delink from "Pages with missing files"
category. Usually the category is found on Q4989282 wikibase
item but can be overridden by giving the category title
with that option. *-since* option is ignored.
-exclude: If the deletion log contains this pattern, the file is not
delinked (default is 'no-delink').
Expand All @@ -27,6 +32,8 @@
.. versionadded:: 7.2
This script is completely rewritten from the compat branch.
.. versionchanged:: 9.4
*-category* option was added.
"""
#
# (C) Pywikibot team, 2006-2024
Expand All @@ -52,14 +59,120 @@

class CommonsDelinker(SingleSiteBot, ConfigParserBot, AutomaticTWSummaryBot):

    """Base Delinker Bot.

    Holds the logic shared by the delinker bots: deciding whether a file
    page has to be skipped, and removing the image links to a file from
    the main-namespace pages that use it.
    """

    summary_key = 'delinker-delink'

    def skip_page(self, page) -> bool:
        """Skip files which still exist locally or on the shared repository.

        A dot is printed (without newline) for every file checked.

        :param page: the file page to check
        :return: True if the file exists or is shared and therefore must
            not be delinked; otherwise whatever the parent class decides
        """
        pywikibot.info('.', newline=False)
        if page.exists() or page.file_is_shared():
            return True
        return super().skip_page(page)

    def treat(self, file_page):
        """Delink *file_page* from every main-namespace page using it.

        Compiles the regex matching image links to the file, stores it as
        ``self.image_regex`` for :meth:`treat_page`, and passes each using
        page on to the parent class' ``treat``.

        :param file_page: the file page whose links are to be removed
        """
        # use image_regex from image.py
        namespace = file_page.site.namespaces[6]
        escaped = case_escape(namespace.case,
                              file_page.title(with_ns=False),
                              underscore=True)
        self.image_regex = re.compile(
            r'\[\[ *(?:{})\s*:\s*{} *(?P<parameters>\|'
            r'(?:[^\[\]]|\[\[[^\]]+\]\]|\[[^\]]+\])*|) *\]\]'
            .format('|'.join(ignore_case(s) for s in namespace), escaped))

        # Print the headline only once, and only if at least one page
        # actually uses the file.
        shown = False
        for page in file_page.using_pages(
                content=True, namespaces=self.site.namespaces.MAIN):
            if not shown:
                pywikibot.info('\n>>> Delinking <<lightgreen>>'
                               f'{file_page.title()}<<default>> <<<')
                shown = True
            super().treat(page)

    def treat_page(self):
        """Delink a single page.

        Removes every match of ``self.image_regex`` from the text of the
        current page and hands the result to ``put_current``.
        """
        new = self.image_regex.sub('', self.current_page.text)
        self.put_current(new)


class DelinkerFromCategory(CommonsDelinker):

    """Bot to delink deleted images from pages found in category."""

    # Wikibase item used to look up the category when no category title
    # was given with the -category option.
    pages_with_missing_files = 'Q4989282'

    update_options = {
        'exclude': 'no-delink',
        'localonly': False,
        'category': True,
    }

    @property
    def generator(self):
        """Retrieve pages with missing files and yield their image links.

        The category is looked up through the wikibase item
        ``pages_with_missing_files`` if the *category* option is ``True``;
        otherwise the option value is used as the category title. A
        warning is shown and nothing is yielded if no category is found.
        """
        if self.opt.category is True:
            cat = self.site.page_from_repository(self.pages_with_missing_files)
        else:
            cat = pywikibot.Category(self.site, self.opt.category)
            if not cat.exists():
                cat = None

        if not cat:
            pywikibot.warning('No valid category given for generator')
            return

        for article in cat.articles(namespaces=self.site.namespaces.MAIN):
            yield from article.imagelinks()

    def init_page(self, item) -> pywikibot.page.FilePage:
        """Upcast an image link given by the generator to a FilePage."""
        return pywikibot.FilePage(item, ignore_extension=True)

    def _deletion_entry(self, page):
        """Return a deletion log entry for *page* or None if there is none.

        The local site's log is queried first; the image repository's log
        is only queried if the local one has no entry.
        """
        params = {
            'logtype': 'delete',
            'reverse': True,
            'page': 'File:' + page.title(underscore=True, with_ns=False),
        }
        entry = next(self.site.logevents(**params), None)
        if entry is None:
            # NOTE(review): image_repository() presumably returns None for
            # sites without a shared repository -- confirm; the guard is
            # harmless otherwise.
            repo = self.site.image_repository()
            if repo is not None:
                entry = next(repo.logevents(**params), None)
        return entry

    def skip_page(self, page) -> bool:
        """Skip pages which aren't deleted on any repository.

        If a deletion log entry is found it is stored as
        ``self.summary_parameters``. Otherwise a warning is shown together
        with the first main-namespace search hit for the file title, if
        there is any, and the page is skipped.

        :param page: the file page to check
        :return: True if the page has to be skipped, False if it is to be
            delinked
        """
        if super().skip_page(page):
            return True

        entry = self._deletion_entry(page)
        if entry is None:
            pywikibot.info()
            pywikibot.warning(
                f'unable to delink missing {page.title(as_link=True)}')
            found = list(self.site.search(
                page.title(),
                namespaces=self.site.namespaces.MAIN,
                total=1
            ))
            if found:
                pywikibot.info('probably <<lightblue>>'
                               f'{found[0].title(as_link=True)}'
                               '<<default>> is meant')
            return True

        self.summary_parameters = dict(entry)
        return False


class DelinkerFromLog(CommonsDelinker):

"""Bot to delink deleted images from deletion log."""

update_options = {
'exclude': 'no-delink',
'localonly': False,
'since': '',
}
summary_key = 'delinker-delink'

@property
def generator(self):
Expand Down Expand Up @@ -90,38 +203,6 @@ def init_page(self, item) -> pywikibot.page.FilePage:
self.summary_parameters = dict(item)
return pywikibot.FilePage(item.page(), ignore_extension=True)

def skip_page(self, page) -> bool:
    """Skip files which still exist locally or on the shared repository.

    A dot is printed (without newline) for every file checked.

    :param page: the file page to check
    :return: True if the file exists or is shared; otherwise whatever
        the parent class decides
    """
    pywikibot.info('.', newline=False)
    if page.exists() or page.file_is_shared():
        return True
    return super().skip_page(page)

def treat(self, file_page):
    """Delink *file_page* from every page of namespace 0 using it.

    The compiled link pattern is kept as ``self.image_regex`` so that
    ``treat_page`` can use it; each using page is passed on to the parent
    class' ``treat``.
    """
    # use image_regex from image.py
    file_ns = file_page.site.namespaces[6]
    title_pattern = case_escape(file_ns.case,
                                file_page.title(with_ns=False),
                                underscore=True)
    ns_pattern = '|'.join(ignore_case(alias) for alias in file_ns)
    self.image_regex = re.compile(
        r'\[\[ *(?:{})\s*:\s*{} *(?P<parameters>\|'
        r'(?:[^\[\]]|\[\[[^\]]+\]\]|\[[^\]]+\])*|) *\]\]'
        .format(ns_pattern, title_pattern))

    using = file_page.using_pages(content=True, namespaces=0)
    for index, page in enumerate(using):
        # Print the headline once, before the first using page is treated.
        if index == 0:
            pywikibot.info('\n>>> <<lightgreen>>Delinking '
                           f'{file_page.title()}<<default>> <<<')
        super().treat(page)

def treat_page(self):
    """Strip all matches of ``self.image_regex`` from the current page.

    The cleaned text is handed to ``put_current``.
    """
    original_text = self.current_page.text
    delinked_text = re.sub(self.image_regex, '', original_text)
    self.put_current(delinked_text)

def teardown(self):
"""Save the last used logevent timestamp."""
if not hasattr(self, 'last_ts'):
Expand Down Expand Up @@ -153,11 +234,13 @@ def main(*args: str) -> None:
opt = removeprefix(opt, '-')
if opt == 'localonly':
options[opt] = True
elif opt == 'category':
options[opt] = value or True
else:
options[opt] = value

bot = CommonsDelinker(site=pywikibot.Site(), **options)
bot.run()
bot = DelinkerFromCategory if options.get('category') else DelinkerFromLog
bot(**options).run()


if __name__ == '__main__':
Expand Down

0 comments on commit 9f7f126

Please sign in to comment.