from collections import Counter
from collections import defaultdict
from collections import deque
+ from datetime import datetime
+ from datetime import timezone
from itertools import groupby
from typing import NamedTuple

from reader._utils import BetterStrPartial as partial
from reader.exceptions import EntryNotFoundError
from reader.exceptions import TagNotFoundError
from reader.types import EntryUpdateStatus
- from reader.types import Feed


log = logging.getLogger('reader.plugins.entry_dedupe')
@@ -260,11 +261,43 @@ def _after_entry_update(reader, entry, status, *, dry_run=False):
        if _is_duplicate_full(entry, e)
    ]
    if not duplicates:
+         log.debug("entry_dedupe: no duplicates for %s", entry.resource_id)
        return

+     try:
+         entry = reader.get_entry(entry)
+     except EntryNotFoundError:
+         log.info("entry_dedupe: entry %r was deleted, aborting", entry.resource_id)
+         return
+
+     def group_key(e):
+         # unlike _get_entry_groups, we cannot rely on e.last_updated,
+         # because for duplicates in the feed, we'd end up flip-flopping
+         # (on the first update, entry 1 is deleted and entry 2 remains;
+         # on the second update, entry 1 remains because it's new,
+         # and entry 2 is deleted because it's not modified,
+         # has lower last_updated, and no update hook runs for it; repeat).
+         #
+         # it would be more correct to sort by (is in new feed, last_retrieved),
+         # but as of 3.14, we don't know about existing but not modified entries
+         # (the hook isn't called), and entries don't have last_retrieved.
+         #
+         # also see test_duplicates_in_feed / #340.
+         #
+         return e.updated or e.published or DEFAULT_UPDATED, e.id
+
+
+
+     group = [entry] + duplicates
+     group.sort(key=group_key, reverse=True)
+     entry, *duplicates = group
+
    _dedupe_entries(reader, entry, duplicates, dry_run=dry_run)


+ DEFAULT_UPDATED = datetime(1970, 1, 1, tzinfo=timezone.utc)
+
+
def _get_same_group_entries(reader, entry):
    # to make this better, we could do something like
    # reader.search_entries(f'title: {fts5_escape(entry.title)}'),
@@ -347,6 +380,7 @@ def by_title(e):
            continue

        while group:
+             # keep the latest entry, consider the rest duplicates
            group.sort(key=lambda e: e.last_updated, reverse=True)
            entry, *rest = group

@@ -496,10 +530,6 @@ def _dedupe_entries(reader, entry, duplicates, *, dry_run):
        [e.id for e in duplicates],
    )

-     # in case entry is EntryData, not Entry
-     if hasattr(entry, 'as_entry'):
-         entry = entry.as_entry(feed=Feed(entry.feed_url))
-
    # don't do anything until we know all actions were generated successfully
    actions = list(_make_actions(reader, entry, duplicates))
    # FIXME: what if this fails with EntryNotFoundError?
@@ -510,8 +540,8 @@ def _dedupe_entries(reader, entry, duplicates, *, dry_run):
        for action in actions:
            action()
            log.info("entry_dedupe: %s", action)
-     except EntryNotFoundError as e:
-         if entry.resource_id != e.resource_id:  # pragma: no cover
+     except EntryNotFoundError as e:  # pragma: no cover
+         if entry.resource_id != e.resource_id:
            raise
        log.info("entry_dedupe: entry %r was deleted, aborting", entry.resource_id)
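(Aside, not part of the diff: a rough sketch of the generate-then-apply pattern this function relies on — all actions are built before any run, and a not-found error during apply only aborts quietly when it concerns the entry being kept. The names below are hypothetical stand-ins, not reader's API.)

class NotFoundError(Exception):
    def __init__(self, resource_id):
        super().__init__(resource_id)
        self.resource_id = resource_id

def apply_actions(keep_id, make_actions, log=print):
    # build every action up front, so a failure during generation
    # leaves the data untouched
    actions = list(make_actions())
    try:
        for action in actions:
            action()
    except NotFoundError as e:
        if keep_id != e.resource_id:
            raise  # something other than the kept entry vanished; unexpected
        log(f"entry {keep_id!r} was deleted, aborting")

def _raise():
    raise NotFoundError(('feed', 'entry-1'))

# hypothetical usage: the kept entry disappears mid-apply, so we just log and stop
apply_actions(('feed', 'entry-1'), lambda: [lambda: None, _raise])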