
Commit

my.youtube.takeout: deduplicate watched videos and sort out a few minor errors
karlicoss committed Sep 22, 2024
1 parent 3166109 commit e1ac02d
Showing 3 changed files with 87 additions and 30 deletions.
7 changes: 6 additions & 1 deletion my/core/compat.py
@@ -22,12 +22,17 @@ def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwa
source.backup(dest, **kwargs)


# can remove after python3.9 (although need to keep the method itself for bwd compat)
## can remove after python3.9 (although need to keep the method itself for bwd compat)
def removeprefix(text: str, prefix: str) -> str:
    if text.startswith(prefix):
        return text[len(prefix) :]
    return text

def removesuffix(text: str, suffix: str) -> str:
    if text.endswith(suffix):
        return text[:-len(suffix)]
    return text
##

## used to have compat function before 3.8 for these, keeping for runtime back compatibility
if not TYPE_CHECKING:
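For context (not part of the diff): these compat helpers mirror str.removeprefix / str.removesuffix, which are built in since Python 3.9, so a rough usage sketch looks like this:

from my.core.compat import removeprefix, removesuffix

# affix present -> stripped; affix absent -> string returned unchanged
assert removeprefix('Watched Some video', 'Watched ') == 'Some video'
assert removesuffix('Some video - YouTube', ' - YouTube') == 'Some video'
assert removesuffix('Some video', ' - YouTube') == 'Some video'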
11 changes: 8 additions & 3 deletions my/media/youtube.py
@@ -1,5 +1,10 @@
from ..core.warnings import high
from my.core import __NOT_HPI_MODULE__

from typing import TYPE_CHECKING

from my.core.warnings import high

high("DEPRECATED! Please use my.youtube.takeout instead.")
from ..core.util import __NOT_HPI_MODULE__

from ..youtube.takeout import *
if not TYPE_CHECKING:
    from my.youtube.takeout import *
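A brief sketch of what this shim means for callers (hypothetical usage, assuming a configured HPI setup): the legacy import path keeps working at runtime through the star re-export but emits the deprecation warning, while type checkers skip the re-export and should target my.youtube.takeout directly.

import my.media.youtube  # warns: "DEPRECATED! Please use my.youtube.takeout instead."

# existing call sites keep working via the re-exported names
for w in my.media.youtube.watched():
    print(w.when, w.title)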
99 changes: 73 additions & 26 deletions my/youtube/takeout.py
@@ -1,13 +1,16 @@
from typing import NamedTuple, List, Iterable, TYPE_CHECKING
from __future__ import annotations

from my.core import datetime_aware, make_logger, stat, Res, Stats
from my.core.compat import deprecated, removeprefix
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Iterable, Iterator

from my.core import Res, Stats, datetime_aware, make_logger, stat, warnings
from my.core.compat import deprecated, removeprefix, removesuffix

logger = make_logger(__name__)


class Watched(NamedTuple):
@dataclass
class Watched:
    url: str
    title: str
    when: datetime_aware
@@ -16,19 +19,55 @@ class Watched(NamedTuple):
    def eid(self) -> str:
        return f'{self.url}-{self.when.isoformat()}'

    def is_deleted(self) -> bool:
        return self.title == self.url


# todo define error policy?
# although it has one from google takeout module.. so not sure

def watched() -> Iterable[Res[Watched]]:

def watched() -> Iterator[Res[Watched]]:
    emitted: dict[Any, Watched] = {}
    for w in _watched():
        if isinstance(w, Exception):
            yield w  # TODO also make unique?
            continue

        # older exports (e.g. html) didn't have microseconds
        # whereas newer json ones do have them
        # seconds resolution is enough to distinguish watched videos
        # also we're processing takeouts in HPI in reverse order, so the first seen watch would contain microseconds, resulting in better data
        without_microsecond = w.when.replace(microsecond=0)

        key = w.url, without_microsecond
        prev = emitted.get(key, None)
        if prev is not None:
            if w.title in prev.title:
                # often more stuff added to the title, like 'Official Video'
                # in this case not worth emitting the change
                # also handles the case when titles match
                continue
            # otherwise if the title changed completely, just emit the change... not sure what else we could do?
            # could merge titles in the 'titles' field and update dynamically? but a bit complicated, maybe later..

        # TODO would also be nice to handle is_deleted here somehow...
        # but for that would need to process data in direct order vs reversed..
        # not sure, maybe this could use a special mode or something?

        emitted[key] = w
        yield w


def _watched() -> Iterator[Res[Watched]]:
    try:
        from ..google.takeout.parser import events
        from google_takeout_parser.models import Activity

        from ..google.takeout.parser import events
    except ModuleNotFoundError as ex:
        logger.exception(ex)
        from ..core.warnings import high
        high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
        yield from _watched_legacy()
        warnings.high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
        yield from _watched_legacy()  # type: ignore[name-defined]
        return

    YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='
@@ -43,12 +82,12 @@ def watched() -> Iterable[Res[Watched]]:
            continue

        url = e.titleUrl
        header = e.header
        title = e.title

        if url is None:
            continue

        header = e.header

        if header in {'Image Search', 'Search', 'Chrome'}:
            # sometimes results in youtube links.. but definitely not watch history
            continue
@@ -61,6 +100,8 @@
            pass
            continue

        title = e.title

        if header == 'youtube.com' and title.startswith('Visited '):
            continue

@@ -76,16 +117,22 @@
        # also compatible with legacy titles
        title = removeprefix(title, 'Watched ')

        # watches originating from some activity end with this, remove it for consistency
        title = removesuffix(title, ' - YouTube')

        if YOUTUBE_VIDEO_LINK not in url:
            if e.details == ['From Google Ads']:
                # weird, sometimes results in odd
                continue
            if title == 'Used YouTube' and e.products == ['Android']:
            if title == 'Used YouTube':
                continue

            yield RuntimeError(f'Unexpected url: {e}')
            continue

        # TODO contribute to takeout parser? seems that these still might happen in json data
        title = title.replace("\xa0", " ")

        yield Watched(
            url=url,
            title=title,
@@ -100,24 +147,24 @@ def stats() -> Stats:
### deprecated stuff (keep in my.media.youtube)

if not TYPE_CHECKING:

    @deprecated("use 'watched' instead")
    def get_watched(*args, **kwargs):
        return watched(*args, **kwargs)

def _watched_legacy() -> Iterable[Watched]:
    from ..google.takeout.html import read_html
    from ..google.takeout.paths import get_last_takeout

    def _watched_legacy() -> Iterable[Watched]:
        from ..google.takeout.html import read_html
        from ..google.takeout.paths import get_last_takeout

    # todo looks like this one doesn't have retention? so enough to use the last
    path = 'Takeout/My Activity/YouTube/MyActivity.html'
    last = get_last_takeout(path=path)
    if last is None:
        return []
        # todo looks like this one doesn't have retention? so enough to use the last
        path = 'Takeout/My Activity/YouTube/MyActivity.html'
        last = get_last_takeout(path=path)
        if last is None:
            return []

    watches: List[Watched] = []
    for dt, url, title in read_html(last, path):
        watches.append(Watched(url=url, title=title, when=dt))
        watches: list[Watched] = []
        for dt, url, title in read_html(last, path):
            watches.append(Watched(url=url, title=title, when=dt))

    # todo hmm they already come sorted.. wonder if should just rely on it..
    return sorted(watches, key=lambda e: e.when)
        # todo hmm they already come sorted.. wonder if should just rely on it..
        return sorted(watches, key=lambda e: e.when)
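To illustrate the deduplication introduced above (a hedged sketch with hypothetical values, not part of the commit): watches are keyed on the url plus the timestamp truncated to whole seconds, so the same watch coming from an older html export (no microseconds) and a newer json export (with microseconds) collapses into one entry, and a later title that is merely a substring of the already-emitted one is skipped rather than emitted as a change.

from datetime import datetime, timezone

url = 'https://www.youtube.com/watch?v=XXXXXXXXXXX'  # hypothetical video url

# newer json export: microsecond precision, fuller title (seen first, since takeouts are processed in reverse)
newer = (url, datetime(2024, 9, 1, 12, 0, 5, 123456, tzinfo=timezone.utc), 'Some video (Official Video)')
# older html export: whole-second precision, shorter title
older = (url, datetime(2024, 9, 1, 12, 0, 5, tzinfo=timezone.utc), 'Some video')

key_newer = (newer[0], newer[1].replace(microsecond=0))
key_older = (older[0], older[1].replace(microsecond=0))
assert key_newer == key_older  # same dedup key -> treated as the same watch
assert older[2] in newer[2]    # substring title -> the duplicate is not emitted again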
