
Commit

my.youtube.takeout: deduplicate watched videos and sort out a few minor errors
karlicoss committed Sep 22, 2024
1 parent 3166109 commit e1ac02d
Showing 3 changed files with 87 additions and 30 deletions.
7 changes: 6 additions & 1 deletion my/core/compat.py
@@ -22,12 +22,17 @@ def sqlite_backup(*, source: sqlite3.Connection, dest: sqlite3.Connection, **kwa
source.backup(dest, **kwargs)


# can remove after python3.9 (although need to keep the method itself for bwd compat)
## can remove after python3.9 (although need to keep the method itself for bwd compat)
def removeprefix(text: str, prefix: str) -> str:
    if text.startswith(prefix):
        return text[len(prefix) :]
    return text

def removesuffix(text: str, suffix: str) -> str:
    if text.endswith(suffix):
        return text[:-len(suffix)]
    return text
##

## used to have compat function before 3.8 for these, keeping for runtime back compatibility
if not TYPE_CHECKING:
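For context (not part of the diff): these compat helpers mirror str.removeprefix / str.removesuffix, which are built in since Python 3.9, so a rough usage sketch looks like this:

from my.core.compat import removeprefix, removesuffix

# affix present -> stripped; affix absent -> string returned unchanged
assert removeprefix('Watched Some video', 'Watched ') == 'Some video'
assert removesuffix('Some video - YouTube', ' - YouTube') == 'Some video'
assert removesuffix('Some video', ' - YouTube') == 'Some video'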
11 changes: 8 additions & 3 deletions my/media/youtube.py
@@ -1,5 +1,10 @@
from ..core.warnings import high
from my.core import __NOT_HPI_MODULE__

from typing import TYPE_CHECKING

from my.core.warnings import high

high("DEPRECATED! Please use my.youtube.takeout instead.")
from ..core.util import __NOT_HPI_MODULE__

from ..youtube.takeout import *
if not TYPE_CHECKING:
    from my.youtube.takeout import *
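A brief sketch of what this shim means for callers (hypothetical usage, assuming a configured HPI setup): the legacy import path keeps working at runtime through the star re-export but emits the deprecation warning, while type checkers skip the re-export and should target my.youtube.takeout directly.

import my.media.youtube  # warns: "DEPRECATED! Please use my.youtube.takeout instead."

# existing call sites keep working via the re-exported names
for w in my.media.youtube.watched():
    print(w.when, w.title)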
99 changes: 73 additions & 26 deletions my/youtube/takeout.py
@@ -1,13 +1,16 @@
from typing import NamedTuple, List, Iterable, TYPE_CHECKING
from __future__ import annotations

from my.core import datetime_aware, make_logger, stat, Res, Stats
from my.core.compat import deprecated, removeprefix
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Iterable, Iterator

from my.core import Res, Stats, datetime_aware, make_logger, stat, warnings
from my.core.compat import deprecated, removeprefix, removesuffix

logger = make_logger(__name__)


class Watched(NamedTuple):
@dataclass
class Watched:
    url: str
    title: str
    when: datetime_aware
@@ -16,19 +19,55 @@ class Watched(NamedTuple):
    def eid(self) -> str:
        return f'{self.url}-{self.when.isoformat()}'

    def is_deleted(self) -> bool:
        return self.title == self.url


# todo define error policy?
# although it has one from google takeout module.. so not sure

def watched() -> Iterable[Res[Watched]]:

def watched() -> Iterator[Res[Watched]]:
    emitted: dict[Any, Watched] = {}
    for w in _watched():
        if isinstance(w, Exception):
            yield w  # TODO also make unique?
            continue

        # older exports (e.g. html) didn't have microseconds
        # whereas newer json ones do have them
        # seconds resolution is enough to distinguish watched videos
        # also we're processing takeouts in HPI in reverse order, so the first seen watch would contain microseconds, resulting in better data
        without_microsecond = w.when.replace(microsecond=0)

        key = w.url, without_microsecond
        prev = emitted.get(key, None)
        if prev is not None:
            if w.title in prev.title:
                # often more stuff added to the title, like 'Official Video'
                # in this case not worth emitting the change
                # also handles the case when titles match
                continue
            # otherwise if the title changed completely, just emit the change... not sure what else we could do?
            # could merge titles in the 'titles' field and update dynamically? but a bit complicated, maybe later..

        # TODO would also be nice to handle is_deleted here somehow...
        # but for that would need to process data in direct order vs reversed..
        # not sure, maybe this could use a special mode or something?

        emitted[key] = w
        yield w


def _watched() -> Iterator[Res[Watched]]:
    try:
        from ..google.takeout.parser import events
        from google_takeout_parser.models import Activity

        from ..google.takeout.parser import events
    except ModuleNotFoundError as ex:
        logger.exception(ex)
        from ..core.warnings import high
        high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
        yield from _watched_legacy()
        warnings.high("Please set up my.google.takeout.parser module for better youtube support. Falling back to legacy implementation.")
        yield from _watched_legacy()  # type: ignore[name-defined]
        return

    YOUTUBE_VIDEO_LINK = '://www.youtube.com/watch?v='
@@ -43,12 +82,12 @@ def watched() -> Iterable[Res[Watched]]:
            continue

        url = e.titleUrl
        header = e.header
        title = e.title

        if url is None:
            continue

        header = e.header

        if header in {'Image Search', 'Search', 'Chrome'}:
            # sometimes results in youtube links.. but definitely not watch history
            continue
@@ -61,6 +100,8 @@
            pass
            continue

        title = e.title

        if header == 'youtube.com' and title.startswith('Visited '):
            continue

@@ -76,16 +117,22 @@
        # also compatible with legacy titles
        title = removeprefix(title, 'Watched ')

        # watches originating from some activity end with this, remove it for consistency
        title = removesuffix(title, ' - YouTube')

        if YOUTUBE_VIDEO_LINK not in url:
            if e.details == ['From Google Ads']:
                # weird, sometimes results in odd
                continue
            if title == 'Used YouTube' and e.products == ['Android']:
            if title == 'Used YouTube':
                continue

            yield RuntimeError(f'Unexpected url: {e}')
            continue

        # TODO contribute to takeout parser? seems that these still might happen in json data
        title = title.replace("\xa0", " ")

        yield Watched(
            url=url,
            title=title,
@@ -100,24 +147,24 @@ def stats() -> Stats:
### deprecated stuff (keep in my.media.youtube)

if not TYPE_CHECKING:

    @deprecated("use 'watched' instead")
    def get_watched(*args, **kwargs):
        return watched(*args, **kwargs)

def _watched_legacy() -> Iterable[Watched]:
    from ..google.takeout.html import read_html
    from ..google.takeout.paths import get_last_takeout

    def _watched_legacy() -> Iterable[Watched]:
        from ..google.takeout.html import read_html
        from ..google.takeout.paths import get_last_takeout

    # todo looks like this one doesn't have retention? so enough to use the last
    path = 'Takeout/My Activity/YouTube/MyActivity.html'
    last = get_last_takeout(path=path)
    if last is None:
        return []
        # todo looks like this one doesn't have retention? so enough to use the last
        path = 'Takeout/My Activity/YouTube/MyActivity.html'
        last = get_last_takeout(path=path)
        if last is None:
            return []

    watches: List[Watched] = []
    for dt, url, title in read_html(last, path):
        watches.append(Watched(url=url, title=title, when=dt))
        watches: list[Watched] = []
        for dt, url, title in read_html(last, path):
            watches.append(Watched(url=url, title=title, when=dt))

    # todo hmm they already come sorted.. wonder if should just rely on it..
    return sorted(watches, key=lambda e: e.when)
        # todo hmm they already come sorted.. wonder if should just rely on it..
        return sorted(watches, key=lambda e: e.when)
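To illustrate the deduplication introduced above (a hedged sketch with hypothetical values, not part of the commit): watches are keyed on the url plus the timestamp truncated to whole seconds, so the same watch coming from an older html export (no microseconds) and a newer json export (with microseconds) collapses into one entry, and a later title that is merely a substring of the already-emitted one is skipped rather than emitted as a change.

from datetime import datetime, timezone

url = 'https://www.youtube.com/watch?v=XXXXXXXXXXX'  # hypothetical video url

# newer json export: microsecond precision, fuller title (seen first, since takeouts are processed in reverse)
newer = (url, datetime(2024, 9, 1, 12, 0, 5, 123456, tzinfo=timezone.utc), 'Some video (Official Video)')
# older html export: whole-second precision, shorter title
older = (url, datetime(2024, 9, 1, 12, 0, 5, tzinfo=timezone.utc), 'Some video')

key_newer = (newer[0], newer[1].replace(microsecond=0))
key_older = (older[0], older[1].replace(microsecond=0))
assert key_newer == key_older  # same dedup key -> treated as the same watch
assert older[2] in newer[2]    # substring title -> the duplicate is not emitted again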
