Commit 74722bd
entry_dedupe: fix flip-flopping between duplicate new entries. #340
1 parent 3c6d101 commit 74722bd

3 files changed: +71 -13 lines changed

CHANGES.rst

Lines changed: 6 additions & 1 deletion
@@ -15,6 +15,11 @@ Unreleased
   :class:`EntryCounts` and :class:`EntrySearchCounts`.
   Thanks to `chenthur`_ for the pull request.
   (:issue:`283`)
+* Fix bug in :mod:`~reader.plugins.entry_dedupe` causing entries to flip-flop
+  if there were multiple *new* duplicates of the same entry
+  (on the first update, one entry remains; on the second update, the other);
+  related to the bug fixed in `version 3.2 <Version 3.2_>`_.
+  (:issue:`340`)
 
 .. _chenthur: https://github.com/chenthur
 
@@ -354,7 +359,7 @@ Released 2022-09-14
   became optional.
   (:issue:`96`)
 * Fix bug in :mod:`~reader.plugins.entry_dedupe` causing updates to fail
-  if there were multiple *new* duplicates of the same issue.
+  if there were multiple *new* duplicates of the same entry.
   (:issue:`292`)
 * Fix bug in :mod:`~reader.plugins.readtime`
   and :mod:`~reader.plugins.mark_as_read` causing updates to fail

src/reader/plugins/entry_dedupe.py

Lines changed: 37 additions & 7 deletions
@@ -113,14 +113,15 @@
 from collections import Counter
 from collections import defaultdict
 from collections import deque
+from datetime import datetime
+from datetime import timezone
 from itertools import groupby
 from typing import NamedTuple
 
 from reader._utils import BetterStrPartial as partial
 from reader.exceptions import EntryNotFoundError
 from reader.exceptions import TagNotFoundError
 from reader.types import EntryUpdateStatus
-from reader.types import Feed
 
 
 log = logging.getLogger('reader.plugins.entry_dedupe')

@@ -260,11 +261,43 @@ def _after_entry_update(reader, entry, status, *, dry_run=False):
         if _is_duplicate_full(entry, e)
     ]
     if not duplicates:
+        log.debug("entry_dedupe: no duplicates for %s", entry.resource_id)
         return
 
+    try:
+        entry = reader.get_entry(entry)
+    except EntryNotFoundError:
+        log.info("entry_dedupe: entry %r was deleted, aborting", entry.resource_id)
+        return
+
+    def group_key(e):
+        # unlike _get_entry_groups, we cannot rely on e.last_updated,
+        # because for duplicates in the feed, we'd end up flip-flopping
+        # (on the first update, entry 1 is deleted and entry 2 remains;
+        # on the second update, entry 1 remains because it's new,
+        # and entry 2 is deleted because it's not modified,
+        # has lower last_updated, and no update hook runs for it; repeat).
+        #
+        # it would be more correct to sort by (is in new feed, last_retrieved),
+        # but as of 3.14, we don't know about existing but not modified entries
+        # (the hook isn't called), and entries don't have last_retrieved.
+        #
+        # also see test_duplicates_in_feed / #340.
+        #
+        return e.updated or e.published or DEFAULT_UPDATED, e.id
+
+
+
+    group = [entry] + duplicates
+    group.sort(key=group_key, reverse=True)
+    entry, *duplicates = group
+
     _dedupe_entries(reader, entry, duplicates, dry_run=dry_run)
 
 
+DEFAULT_UPDATED = datetime(1970, 1, 1, tzinfo=timezone.utc)
+
+
 def _get_same_group_entries(reader, entry):
     # to make this better, we could do something like
     # reader.search_entries(f'title: {fts5_escape(entry.title)}'),

@@ -347,6 +380,7 @@ def by_title(e):
             continue
 
         while group:
+            # keep the latest entry, consider the rest duplicates
             group.sort(key=lambda e: e.last_updated, reverse=True)
             entry, *rest = group
 

@@ -496,10 +530,6 @@ def _dedupe_entries(reader, entry, duplicates, *, dry_run):
         [e.id for e in duplicates],
     )
 
-    # in case entry is EntryData, not Entry
-    if hasattr(entry, 'as_entry'):
-        entry = entry.as_entry(feed=Feed(entry.feed_url))
-
     # don't do anything until we know all actions were generated successfully
     actions = list(_make_actions(reader, entry, duplicates))
     # FIXME: what if this fails with EntryNotFoundError?

@@ -510,8 +540,8 @@ def _dedupe_entries(reader, entry, duplicates, *, dry_run):
         for action in actions:
             action()
             log.info("entry_dedupe: %s", action)
-    except EntryNotFoundError as e:
-        if entry.resource_id != e.resource_id:  # pragma: no cover
+    except EntryNotFoundError as e:  # pragma: no cover
+        if entry.resource_id != e.resource_id:
             raise
         log.info("entry_dedupe: entry %r was deleted, aborting", entry.resource_id)
 
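
For illustration (not part of the patch itself): the group_key added above makes the winner among duplicates deterministic, because it depends only on feed-provided data (updated, published, id) and never on last_updated, so it picks the same entry no matter which duplicate the hook happens to run for or in what order the group is built. A minimal standalone sketch, using a namedtuple stand-in instead of reader's actual Entry class:

    # Standalone illustration of the group_key added above.
    # FakeEntry is a stand-in for reader's Entry class, for demonstration only.
    from collections import namedtuple
    from datetime import datetime, timezone

    DEFAULT_UPDATED = datetime(1970, 1, 1, tzinfo=timezone.utc)

    FakeEntry = namedtuple('FakeEntry', 'id updated published')

    def group_key(e):
        # deterministic: depends only on feed-provided data, not on last_updated
        return e.updated or e.published or DEFAULT_UPDATED, e.id

    one = FakeEntry('1', None, None)
    two = FakeEntry('2', datetime(2010, 1, 2, tzinfo=timezone.utc), None)
    three = FakeEntry('3', None, datetime(2010, 1, 1, tzinfo=timezone.utc))

    for group in ([one, two, three], [three, one, two]):
        group.sort(key=group_key, reverse=True)
        entry, *duplicates = group
        print(entry.id, [d.id for d in duplicates])  # '2' wins for both input orders

Because the same entry wins on every run, repeated feed updates no longer alternate which duplicate survives.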

tests/test_plugins_entry_dedupe.py

Lines changed: 28 additions & 5 deletions
@@ -755,12 +755,13 @@ def test_recent_sort_copying(make_reader, db_path):
 
 
 @pytest.mark.parametrize('update_after_one', [False, True])
-def test_duplicates_in_new_feed(make_reader, update_after_one):
+@pytest.mark.parametrize('with_dates, expected_id', [(False, '3'), (True, '2')])
+def test_duplicates_in_feed(make_reader, update_after_one, with_dates, expected_id):
     reader = make_reader(':memory:', plugins=['reader.entry_dedupe'])
     reader._parser = parser = Parser()
 
     reader.add_feed(parser.feed(1))
-    one = parser.entry(1, 1, title='title', summary='summary')
+    one = parser.entry(1, '1', title='title', summary='summary')
 
     if update_after_one:
         reader.update_feeds()

@@ -769,10 +770,32 @@ def test_duplicates_in_new_feed(make_reader, update_after_one):
         reader.mark_entry_as_important(one)
         reader.set_tag(one, 'key', 'value')
 
-    parser.entry(1, 2, title='title', summary='summary')
-    parser.entry(1, 3, title='title', summary='summary')
+    parser.entry(
+        1,
+        '2',
+        title='title',
+        summary='summary',
+        updated=datetime(2010, 1, 2) if with_dates else None,
+    )
+    parser.entry(
+        1,
+        '3',
+        title='title',
+        summary='summary',
+        published=datetime(2010, 1, 1) if with_dates else None,
+    )
 
     # shouldn't fail
     reader.update_feeds()
+    assert {e.id for e in reader.get_entries()} == {expected_id}
 
-    assert [eval(e.id)[1] for e in reader.get_entries()] == [3]
+    # shouldn't flip flop
+    # https://github.com/lemon24/reader/issues/340
+    reader.update_feeds()
+    assert {e.id for e in reader.get_entries()} == {expected_id}
+
+    if update_after_one:
+        entry = reader.get_entry(('1', expected_id))
+        assert entry.read
+        assert entry.important
+        assert dict(reader.get_tags(entry)) == {'key': 'value'}