from collections import Counter
from collections import defaultdict
from collections import deque
+ from datetime import datetime
+ from datetime import timezone
from itertools import groupby
from typing import NamedTuple

from reader._utils import BetterStrPartial as partial
from reader.exceptions import EntryNotFoundError
from reader.exceptions import TagNotFoundError
from reader.types import EntryUpdateStatus
- from reader.types import Feed


log = logging.getLogger('reader.plugins.entry_dedupe')
@@ -260,11 +261,43 @@ def _after_entry_update(reader, entry, status, *, dry_run=False):
        if _is_duplicate_full(entry, e)
    ]
    if not duplicates:
+         log.debug("entry_dedupe: no duplicates for %s", entry.resource_id)
        return

+     try:
+         entry = reader.get_entry(entry)
+     except EntryNotFoundError:
+         log.info("entry_dedupe: entry %r was deleted, aborting", entry.resource_id)
+         return
+
+     def group_key(e):
+         # unlike _get_entry_groups, we cannot rely on e.last_updated,
+         # because for duplicates in the feed, we'd end up flip-flopping
+         # (on the first update, entry 1 is deleted and entry 2 remains;
+         # on the second update, entry 1 remains because it's new,
+         # and entry 2 is deleted because it's not modified,
+         # has lower last_updated, and no update hook runs for it; repeat).
+         #
+         # it would be more correct to sort by (is in new feed, last_retrieved),
+         # but as of 3.14, we don't know about existing but not modified entries
+         # (the hook isn't called), and entries don't have last_retrieved.
+         #
+         # also see test_duplicates_in_feed / #340.
+         #
+         return e.updated or e.published or DEFAULT_UPDATED, e.id
+
+
+
+     group = [entry] + duplicates
+     group.sort(key=group_key, reverse=True)
+     entry, *duplicates = group
+
    _dedupe_entries(reader, entry, duplicates, dry_run=dry_run)


+ DEFAULT_UPDATED = datetime(1970, 1, 1, tzinfo=timezone.utc)
+
+
def _get_same_group_entries(reader, entry):
    # to make this better, we could do something like
    # reader.search_entries(f'title: {fts5_escape(entry.title)}'),
@@ -347,6 +380,7 @@ def by_title(e):
            continue

        while group:
+             # keep the latest entry, consider the rest duplicates
            group.sort(key=lambda e: e.last_updated, reverse=True)
            entry, *rest = group

@@ -496,10 +530,6 @@ def _dedupe_entries(reader, entry, duplicates, *, dry_run):
        [e.id for e in duplicates],
    )

-     # in case entry is EntryData, not Entry
-     if hasattr(entry, 'as_entry'):
-         entry = entry.as_entry(feed=Feed(entry.feed_url))
-
    # don't do anything until we know all actions were generated successfully
    actions = list(_make_actions(reader, entry, duplicates))
    # FIXME: what if this fails with EntryNotFoundError?
@@ -510,8 +540,8 @@ def _dedupe_entries(reader, entry, duplicates, *, dry_run):
        for action in actions:
            action()
            log.info("entry_dedupe: %s", action)
-     except EntryNotFoundError as e:
-         if entry.resource_id != e.resource_id:  # pragma: no cover
+     except EntryNotFoundError as e:  # pragma: no cover
+         if entry.resource_id != e.resource_id:
            raise
        log.info("entry_dedupe: entry %r was deleted, aborting", entry.resource_id)
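(Aside, not part of the diff: a rough sketch of the generate-then-apply pattern this function relies on — all actions are built before any run, and a not-found error during apply only aborts quietly when it concerns the entry being kept. The names below are hypothetical stand-ins, not reader's API.)

class NotFoundError(Exception):
    def __init__(self, resource_id):
        super().__init__(resource_id)
        self.resource_id = resource_id

def apply_actions(keep_id, make_actions, log=print):
    # build every action up front, so a failure during generation
    # leaves the data untouched
    actions = list(make_actions())
    try:
        for action in actions:
            action()
    except NotFoundError as e:
        if keep_id != e.resource_id:
            raise  # something other than the kept entry vanished; unexpected
        log(f"entry {keep_id!r} was deleted, aborting")

def _raise():
    raise NotFoundError(('feed', 'entry-1'))

# hypothetical usage: the kept entry disappears mid-apply, so we just log and stop
apply_actions(('feed', 'entry-1'), lambda: [lambda: None, _raise])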