From 811abcd28b3b023da4a66ba07196b0d993ee18d3 Mon Sep 17 00:00:00 2001
From: Thomas J Faughnan Jr <thomas@faughnan.net>
Date: Sat, 13 Jul 2024 20:55:49 -0400
Subject: [PATCH] title: simplify youtube parsing

In `youtube_title` only use regex to extract the embedded JSON. Then use
the stdlib's actual JSON parser from there. This avoids some unescaping
quirks and generally feels less brittle. We now disregard `<title>`
entirely since we already were relying on JSON's presence for the
channel name. So we might as well use it up front for the video name
too.
---
 src/bobbit/modules/title.py | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)
diff --git a/src/bobbit/modules/title.py b/src/bobbit/modules/title.py
index 5385139..933b41f 100644
--- a/src/bobbit/modules/title.py
+++ b/src/bobbit/modules/title.py
@@ -2,6 +2,7 @@
 
 import logging
 import html
+import json
 import re
 
 from bobbit.utils import strip_html
@@ -92,31 +93,21 @@ async def youtube_title(bot, url, text):
         return None
 
     try:
-        # finding channel name can't use regex search for HTML since YouTube sends a pile of JSON that's assembled client side
-        # current regex into the JSON may break if YouTube allows " characters in channel names
-        try:
-            channel_name = re.findall(r',"author":"([^"]*)",', text)[0]
-        except IndexError:
-            # XXX: 2024-06-08 - Alternative means of extracting channel name (Google sending back different JSON)
-            channel_name = re.findall(r'Unsubscribe from.*?"text":"([^"]+)"', text)[1]
-
-        try:
-            video_name   = re.findall(r'<title[^>]*>([^<]+) - YouTube[\s]*</title>', text)[0] # get title, removing "- YouTube" from the end
-        except IndexError:
-            # XXX: 2024-06-08 - Alternative means of extracting video name (Google sending back different JSON)
-            video_name   = re.findall(r'videoPrimaryInfoRenderer.*?"text":"(.*?)"}', text)[0]
-
-        # XXX: Escape backslashed strings, https://stackoverflow.com/a/57192592
-        video_name = video_name.encode('latin-1', 'backslashreplace')\
-                               .decode('unicode-escape')
+        m = re.search(r'<script[^>]*>var\s+ytInitialData\s*=\s*(?P<data>\{.+\});</script>', text)
+        if not m:
+            raise re.error('No regex match')
+        data = json.loads(m.group('data'))
+        details = data['playerOverlays']['playerOverlayRenderer']['videoDetails']['playerOverlayVideoDetailsRenderer']
+        video_name = details['title']['simpleText']
+        channel_name = details['subtitle']['runs'][0]['text']
 
         return bot.client.format_text(
             '{color}{green}Video{color}: {bold}{video_name}{bold} {color}{green}Channel{color}: {bold}{channel_name}{bold}',
-            video_name   = html.unescape(video_name.strip()),
+            video_name   = video_name.strip(),
             channel_name = channel_name.strip()
         )
-    except IndexError:
-        logging.warning('Unable to find channel or video name for %s, YouTube formatting may have changed', url)
+    except (re.error, json.JSONDecodeError, IndexError, KeyError) as e:
+        logging.warning('Unable to find channel or video name for %s, YouTube formatting may have changed: %s', url, e)
         return None
 
 # Reddit Command