From bf529967401dbac12e32ab852723a59e673dadab Mon Sep 17 00:00:00 2001 From: geauxlo <66712139+geauxlo@users.noreply.github.com> Date: Wed, 10 Jun 2020 04:00:01 +0000 Subject: [PATCH 01/31] Update api call to v5 spec in TwitchPlaylistBaseIE --- youtube_dl/extractor/twitch.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index e211cd4c84c..3f0f7e277ea 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -380,11 +380,13 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): _PLAYLIST_PATH = 'kraken/channels/%s/videos/?offset=%d&limit=%d' _PAGE_LIMIT = 100 - def _extract_playlist(self, channel_id): + def _extract_playlist(self, channel_name): info = self._call_api( - 'kraken/channels/%s' % channel_id, - channel_id, 'Downloading channel info JSON') - channel_name = info.get('display_name') or info.get('name') + 'kraken/users?login=%s' % channel_name, + channel_name, 'Downloading channel info JSON') + info = info['users'][0] + channel_id = info['_id'] + channel_name = info.get('display_name') or info.get('name') or channel_name entries = [] offset = 0 limit = self._PAGE_LIMIT @@ -444,7 +446,7 @@ class TwitchProfileIE(TwitchPlaylistBaseIE): _TESTS = [{ 'url': 'http://www.twitch.tv/vanillatv/profile', 'info_dict': { - 'id': 'vanillatv', + 'id': '22744919', 'title': 'VanillaTV', }, 'playlist_mincount': 412, @@ -468,7 +470,7 @@ class TwitchAllVideosIE(TwitchVideosBaseIE): _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/all', 'info_dict': { - 'id': 'spamfish', + 'id': '497952', 'title': 'Spamfish', }, 'playlist_mincount': 869, @@ -487,7 +489,7 @@ class TwitchUploadsIE(TwitchVideosBaseIE): _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/uploads', 'info_dict': { - 'id': 'spamfish', + 'id': '497952', 'title': 'Spamfish', }, 'playlist_mincount': 0, @@ -506,7 +508,7 @@ class TwitchPastBroadcastsIE(TwitchVideosBaseIE): _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/past-broadcasts', 'info_dict': { - 'id': 'spamfish', + 'id': '497952', 'title': 'Spamfish', }, 'playlist_mincount': 0, @@ -525,7 +527,7 @@ class TwitchHighlightsIE(TwitchVideosBaseIE): _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/highlights', 'info_dict': { - 'id': 'spamfish', + 'id': '497952', 'title': 'Spamfish', }, 'playlist_mincount': 805, From 3951a7faae1c3916827d01b1e242bb12a9cdadec Mon Sep 17 00:00:00 2001 From: geauxlo <66712139+geauxlo@users.noreply.github.com> Date: Wed, 10 Jun 2020 06:38:32 +0000 Subject: [PATCH 02/31] Prefer API to scraping HTML when possible Also changed instances of `var is None` to `var == None`, and replaced `var.replace('http%3A', 'http:')` with a regex --- youtube_dl/extractor/screencast.py | 52 +++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index 69a0d01f39c..d52d46cc3dc 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_parse_qs, @@ -13,6 +15,8 @@ class ScreencastIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P[a-zA-Z0-9]+)' + _API_URL = 'https://www.screencast.com/api/external/oembed?url=%s&format=json' + _TESTS = [{ 'url': 'http://www.screencast.com/t/3ZEjQXlT', 'md5': '917df1c13798a3e96211dd1561fded83', @@ 
-60,13 +64,32 @@ class ScreencastIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + + # The info JSON given by the API has a thumbnail URL, + # but it's inferior to the webpage's thumbnail. + # It also has no video description, so we + # definitely still need to get the webpage. + + info = self._download_json( + self._API_URL % url, video_id, + 'Downloading video info JSON') + + video_url = info.get('url') + if video_url != None: + video_url_raw = compat_urllib_request.quote(video_url) + video_url = re.sub(r'^(?Phttps|http)%3A', + lambda match: '%s:' % match.group('proto'), + video_url_raw) + + title = info.get('title') webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'http|https)%3A', + lambda match: '%s:' % match.group('proto'), + video_url_raw) - if video_url is None: + if video_url == None: video_meta = self._html_search_meta( 'og:video', webpage, default=None) if video_meta: @@ -90,28 +115,31 @@ def _real_extract(self, url): r'src=(.*?)(?:$|&)', video_meta, 'meta tag video URL', default=None) - if video_url is None: + if video_url == None: video_url = self._html_search_regex( r'MediaContentUrl["\']\s*:(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video url', default=None, group='url') - if video_url is None: + if video_url == None: video_url = self._html_search_meta( 'og:video', webpage, default=None) - if video_url is None: + if video_url == None: raise ExtractorError('Cannot find video') - title = self._og_search_title(webpage, default=None) - if title is None: + if title == None: + title = self._og_search_title(webpage, default=None) + + if title == None: title = self._html_search_regex( [r'Title: ([^<]+)', r'class="tabSeperator">>(.+?)<', r'([^<]+)'], webpage, 'title') + thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage, default=None) - if description is None: + if description == None: description = self._html_search_meta('description', webpage) return { From 66c498e5586e0b90b0579e6e0754cfc03775a3a8 Mon Sep 17 00:00:00 2001 From: geauxlo <66712139+geauxlo@users.noreply.github.com> Date: Wed, 10 Jun 2020 06:39:35 +0000 Subject: [PATCH 03/31] Recognize more valid URLs URLs like `https://www.screencast.com/users/cindyhailes/folders/Jing/media/c9be177c-5808-4c4f-af56-eadceb3a7c82` weren't being accepted before --- youtube_dl/extractor/screencast.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index d52d46cc3dc..d23a53706f1 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -14,7 +14,7 @@ class ScreencastIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?screencast\.com/(?:t|users/[^/]+/folders/[^/]+/media)/(?P[a-zA-Z0-9\-]+)' _API_URL = 'https://www.screencast.com/api/external/oembed?url=%s&format=json' _TESTS = [{ @@ -60,12 +60,22 @@ class ScreencastIE(InfoExtractor): }, { 'url': 'http://screencast.com/t/aAB3iowa', 'only_matching': True, + }, { + 'url': 'https://www.screencast.com/users/cindyhailes/folders/Jing/media/c9be177c-5808-4c4f-af56-eadceb3a7c82', + 'md5': '589d37a28d2add53c8bf16b9126d9dc2', + 'info_dict': { + 'id': 'c9be177c-5808-4c4f-af56-eadceb3a7c82', + 'ext': 'swf', + 'title': '2020-05-31_1737', + 'description': 'Shared from Screencast.com', + 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', + } }] def _real_extract(self, url): video_id = 
self._match_id(url) - # The info JSON given by the API has a thumbnail URL, + # The JSON given by the API has a thumbnail URL, # but it's inferior to the webpage's thumbnail. # It also has no video description, so we # definitely still need to get the webpage. From 33afd662d96f82f7a5af4ac89513666f2a078a84 Mon Sep 17 00:00:00 2001 From: geauxlo <66712139+geauxlo@users.noreply.github.com> Date: Wed, 10 Jun 2020 06:42:44 +0000 Subject: [PATCH 04/31] UNDO --- youtube_dl/extractor/screencast.py | 64 ++++++------------------------ 1 file changed, 13 insertions(+), 51 deletions(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index d23a53706f1..69a0d01f39c 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( compat_parse_qs, @@ -14,9 +12,7 @@ class ScreencastIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?screencast\.com/(?:t|users/[^/]+/folders/[^/]+/media)/(?P[a-zA-Z0-9\-]+)' - _API_URL = 'https://www.screencast.com/api/external/oembed?url=%s&format=json' - + _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://www.screencast.com/t/3ZEjQXlT', 'md5': '917df1c13798a3e96211dd1561fded83', @@ -60,46 +56,17 @@ class ScreencastIE(InfoExtractor): }, { 'url': 'http://screencast.com/t/aAB3iowa', 'only_matching': True, - }, { - 'url': 'https://www.screencast.com/users/cindyhailes/folders/Jing/media/c9be177c-5808-4c4f-af56-eadceb3a7c82', - 'md5': '589d37a28d2add53c8bf16b9126d9dc2', - 'info_dict': { - 'id': 'c9be177c-5808-4c4f-af56-eadceb3a7c82', - 'ext': 'swf', - 'title': '2020-05-31_1737', - 'description': 'Shared from Screencast.com', - 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', - } }] def _real_extract(self, url): video_id = self._match_id(url) - - # The JSON given by the API has a thumbnail URL, - # but it's inferior to the webpage's thumbnail. - # It also has no video description, so we - # definitely still need to get the webpage. 
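As an aside, the scheme un-escaping that patch 02 introduces (and that this revert removes again) boils down to restoring a percent-encoded "http%3A"/"https%3A" prefix after quoting the flashVars value. A minimal standalone sketch, with an invented sample URL and a helper name used only for illustration, is:

    import re

    def unescape_proto(url):
        # Turn a percent-encoded scheme ("http%3A//...") back into "http://...".
        return re.sub(r'^(?P<proto>https|http)%3A',
                      lambda m: '%s:' % m.group('proto'), url)

    print(unescape_proto('http%3A//media.screencast.com/example.mp4'))
    # -> http://media.screencast.com/example.mp4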
- - info = self._download_json( - self._API_URL % url, video_id, - 'Downloading video info JSON') - - video_url = info.get('url') - if video_url != None: - video_url_raw = compat_urllib_request.quote(video_url) - video_url = re.sub(r'^(?Phttps|http)%3A', - lambda match: '%s:' % match.group('proto'), - video_url_raw) - - title = info.get('title') webpage = self._download_webpage(url, video_id) - if video_url == None: - video_url = self._html_search_regex( - r'http|https)%3A', - lambda match: '%s:' % match.group('proto'), - video_url_raw) + video_url = video_url_raw.replace('http%3A', 'http:') - if video_url == None: + if video_url is None: video_meta = self._html_search_meta( 'og:video', webpage, default=None) if video_meta: @@ -125,31 +90,28 @@ def _real_extract(self, url): r'src=(.*?)(?:$|&)', video_meta, 'meta tag video URL', default=None) - if video_url == None: + if video_url is None: video_url = self._html_search_regex( r'MediaContentUrl["\']\s*:(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video url', default=None, group='url') - if video_url == None: + if video_url is None: video_url = self._html_search_meta( 'og:video', webpage, default=None) - if video_url == None: + if video_url is None: raise ExtractorError('Cannot find video') - if title == None: - title = self._og_search_title(webpage, default=None) - - if title == None: + title = self._og_search_title(webpage, default=None) + if title is None: title = self._html_search_regex( [r'Title: ([^<]+)', r'class="tabSeperator">>(.+?)<', r'([^<]+)'], webpage, 'title') - thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage, default=None) - if description == None: + if description is None: description = self._html_search_meta('description', webpage) return { From 777d5a45be81fb1f274c8c558ba1eb24855c66fc Mon Sep 17 00:00:00 2001 From: Alex Merkel Date: Thu, 18 Jun 2020 22:36:44 +0200 Subject: [PATCH 05/31] [postprocessor/embedthumbnail] Add conversion for non JPG/PNG images --- youtube_dl/postprocessor/embedthumbnail.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 56be914b8f1..a5939a7d318 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -41,6 +41,16 @@ def run(self, info): 'Skipping embedding the thumbnail because the file is missing.') return [], info + if not os.path.splitext(encodeFilename(thumbnail_filename))[1].lower() in ['.jpg', '.png']: + jpg_thumbnail_filename = thumbnail_filename + ".jpg" + + self._downloader.to_screen('[ffmpeg] Converting thumbnail "%s" to JPEG' % thumbnail_filename) + + self.run_ffmpeg(thumbnail_filename, jpg_thumbnail_filename, ['-bsf:v', 'mjpeg2jpeg']) + + os.remove(thumbnail_filename) + thumbnail_filename = jpg_thumbnail_filename + if info['ext'] == 'mp3': options = [ '-c', 'copy', '-map', '0', '-map', '1', From e987deb504565102bb6dc271b074781434a75e5c Mon Sep 17 00:00:00 2001 From: Alex Merkel Date: Sun, 21 Jun 2020 11:53:22 +0200 Subject: [PATCH 06/31] [postprocessor/embedthumbnail] Add detection for mislabeled WebP files --- youtube_dl/postprocessor/embedthumbnail.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index a5939a7d318..74928be554b 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -41,8 +41,19 @@ def run(self, 
info): 'Skipping embedding the thumbnail because the file is missing.') return [], info - if not os.path.splitext(encodeFilename(thumbnail_filename))[1].lower() in ['.jpg', '.png']: - jpg_thumbnail_filename = thumbnail_filename + ".jpg" + #Check for mislabeled webp file + with open(encodeFilename(thumbnail_filename), "rb") as f: + b = f.read(16) + if b'\x57\x45\x42\x50' in b: #Binary for WEBP + [thumbnail_filename_path, thumbnail_filename_extension] = os.path.splitext(thumbnail_filename) + if not thumbnail_filename_extension == ".webp": + webp_thumbnail_filename = thumbnail_filename_path + ".webp" + os.rename(thumbnail_filename, webp_thumbnail_filename) + thumbnail_filename = webp_thumbnail_filename + + #If not a jpg or png thumbnail, convert it to jpg using ffmpeg + if not os.path.splitext(thumbnail_filename)[1].lower() in ['.jpg', '.png']: + jpg_thumbnail_filename = os.path.splitext(thumbnail_filename)[0] + ".jpg" self._downloader.to_screen('[ffmpeg] Converting thumbnail "%s" to JPEG' % thumbnail_filename) From ac0ad4f91dbf4e82c22a8fd059891f0b5c138f0d Mon Sep 17 00:00:00 2001 From: Alex Merkel Date: Sun, 21 Jun 2020 12:06:01 +0200 Subject: [PATCH 07/31] [postprocessor/embedthumbnail] Close file before possible renaming --- youtube_dl/postprocessor/embedthumbnail.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 74928be554b..7673b4fd118 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -44,12 +44,12 @@ def run(self, info): #Check for mislabeled webp file with open(encodeFilename(thumbnail_filename), "rb") as f: b = f.read(16) - if b'\x57\x45\x42\x50' in b: #Binary for WEBP - [thumbnail_filename_path, thumbnail_filename_extension] = os.path.splitext(thumbnail_filename) - if not thumbnail_filename_extension == ".webp": - webp_thumbnail_filename = thumbnail_filename_path + ".webp" - os.rename(thumbnail_filename, webp_thumbnail_filename) - thumbnail_filename = webp_thumbnail_filename + if b'\x57\x45\x42\x50' in b: #Binary for WEBP + [thumbnail_filename_path, thumbnail_filename_extension] = os.path.splitext(thumbnail_filename) + if not thumbnail_filename_extension == ".webp": + webp_thumbnail_filename = thumbnail_filename_path + ".webp" + os.rename(thumbnail_filename, webp_thumbnail_filename) + thumbnail_filename = webp_thumbnail_filename #If not a jpg or png thumbnail, convert it to jpg using ffmpeg if not os.path.splitext(thumbnail_filename)[1].lower() in ['.jpg', '.png']: From 6011dd9539eae03d78246db5c320f29607871d43 Mon Sep 17 00:00:00 2001 From: Alex Merkel Date: Sun, 21 Jun 2020 12:16:45 +0200 Subject: [PATCH 08/31] [postprocessor/embedthumbnail] Fix comments to make flake8 happy --- youtube_dl/postprocessor/embedthumbnail.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 7673b4fd118..ebf7ea27bed 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -41,17 +41,17 @@ def run(self, info): 'Skipping embedding the thumbnail because the file is missing.') return [], info - #Check for mislabeled webp file + # Check for mislabeled webp file with open(encodeFilename(thumbnail_filename), "rb") as f: b = f.read(16) - if b'\x57\x45\x42\x50' in b: #Binary for WEBP + if b'\x57\x45\x42\x50' in b: # Binary for WEBP [thumbnail_filename_path, 
thumbnail_filename_extension] = os.path.splitext(thumbnail_filename) if not thumbnail_filename_extension == ".webp": webp_thumbnail_filename = thumbnail_filename_path + ".webp" os.rename(thumbnail_filename, webp_thumbnail_filename) thumbnail_filename = webp_thumbnail_filename - #If not a jpg or png thumbnail, convert it to jpg using ffmpeg + # If not a jpg or png thumbnail, convert it to jpg using ffmpeg if not os.path.splitext(thumbnail_filename)[1].lower() in ['.jpg', '.png']: jpg_thumbnail_filename = os.path.splitext(thumbnail_filename)[0] + ".jpg" From f6513e1a9302ab601460133804e0c06f1595279a Mon Sep 17 00:00:00 2001 From: Alex Merkel Date: Tue, 23 Jun 2020 10:25:04 +0200 Subject: [PATCH 09/31] [postprocessor/embedthumbnail] Replace % with _ in ffmpeg image output path --- youtube_dl/postprocessor/embedthumbnail.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index ebf7ea27bed..e2002ab0b1e 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -48,18 +48,19 @@ def run(self, info): [thumbnail_filename_path, thumbnail_filename_extension] = os.path.splitext(thumbnail_filename) if not thumbnail_filename_extension == ".webp": webp_thumbnail_filename = thumbnail_filename_path + ".webp" - os.rename(thumbnail_filename, webp_thumbnail_filename) + os.rename(encodeFilename(thumbnail_filename), encodeFilename(webp_thumbnail_filename)) thumbnail_filename = webp_thumbnail_filename # If not a jpg or png thumbnail, convert it to jpg using ffmpeg if not os.path.splitext(thumbnail_filename)[1].lower() in ['.jpg', '.png']: jpg_thumbnail_filename = os.path.splitext(thumbnail_filename)[0] + ".jpg" + jpg_thumbnail_filename = os.path.join(os.path.dirname(jpg_thumbnail_filename), os.path.basename(jpg_thumbnail_filename).replace('%', '_')) # ffmpeg interprets % as image sequence self._downloader.to_screen('[ffmpeg] Converting thumbnail "%s" to JPEG' % thumbnail_filename) self.run_ffmpeg(thumbnail_filename, jpg_thumbnail_filename, ['-bsf:v', 'mjpeg2jpeg']) - os.remove(thumbnail_filename) + os.remove(encodeFilename(thumbnail_filename)) thumbnail_filename = jpg_thumbnail_filename if info['ext'] == 'mp3': From cae18ef8f2795e5f584fe450b9619de176e72e02 Mon Sep 17 00:00:00 2001 From: random-nick Date: Fri, 24 Jul 2020 00:04:19 +0200 Subject: [PATCH 10/31] [youtube] Fix age gate detection --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c27f2cd9523..685b0cd6415 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1825,7 +1825,7 @@ def extract_player_response(player_response, video_id): # Get video info video_info = {} embed_webpage = None - if re.search(r'player-age-gate-content">', video_webpage) is not None: + if self._html_search_meta('og:restrictions:age', video_webpage, default=None) == "18+": age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube From 98b69821e4d94f8be027c7d3a60db701b5c17792 Mon Sep 17 00:00:00 2001 From: siikamiika Date: Mon, 3 Aug 2020 23:54:52 +0300 Subject: [PATCH 11/31] use dl function for subtitles --- youtube_dl/YoutubeDL.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 
19370f62b0d..f9aa91f3021 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1805,6 +1805,14 @@ def ensure_dir_exists(path): self.report_error('Cannot write annotations file: ' + annofn) return + def dl(name, info): + fd = get_suitable_downloader(info, self.params)(self, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + if self.params.get('verbose'): + self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) + return fd.download(name, info) + subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) @@ -1819,7 +1827,6 @@ def ensure_dir_exists(path): if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) else: - self.to_screen('[info] Writing video subtitles to: ' + sub_filename) if sub_info.get('data') is not None: try: # Use newline='' to prevent conversion of newline characters @@ -1831,10 +1838,9 @@ def ensure_dir_exists(path): return else: try: - sub_data = ie._request_webpage( - sub_info['url'], info_dict['id'], note=False).read() - with io.open(encodeFilename(sub_filename), 'wb') as subfile: - subfile.write(sub_data) + # TODO does this transfer session...? + # TODO exceptions + dl(sub_filename, sub_info) except (ExtractorError, IOError, OSError, ValueError) as err: self.report_warning('Unable to download subtitle for "%s": %s' % (sub_lang, error_to_compat_str(err))) @@ -1856,14 +1862,6 @@ def ensure_dir_exists(path): if not self.params.get('skip_download', False): try: - def dl(name, info): - fd = get_suitable_downloader(info, self.params)(self, self.params) - for ph in self._progress_hooks: - fd.add_progress_hook(ph) - if self.params.get('verbose'): - self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) - return fd.download(name, info) - if info_dict.get('requested_formats') is not None: downloaded = [] success = True From a78e3a57951893a1b885d6c478d09d279101f6a2 Mon Sep 17 00:00:00 2001 From: siikamiika Date: Wed, 5 Aug 2020 01:02:23 +0300 Subject: [PATCH 12/31] support youtube live chat replay --- youtube_dl/downloader/__init__.py | 2 + youtube_dl/downloader/youtube_live_chat.py | 88 ++++++++++++++++++++++ youtube_dl/extractor/youtube.py | 8 ++ 3 files changed, 98 insertions(+) create mode 100644 youtube_dl/downloader/youtube_live_chat.py diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 2e485df9dac..4ae81f516e6 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -8,6 +8,7 @@ from .dash import DashSegmentsFD from .rtsp import RtspFD from .ism import IsmFD +from .youtube_live_chat import YoutubeLiveChatReplayFD from .external import ( get_external_downloader, FFmpegFD, @@ -26,6 +27,7 @@ 'f4m': F4mFD, 'http_dash_segments': DashSegmentsFD, 'ism': IsmFD, + 'youtube_live_chat_replay': YoutubeLiveChatReplayFD, } diff --git a/youtube_dl/downloader/youtube_live_chat.py b/youtube_dl/downloader/youtube_live_chat.py new file mode 100644 index 00000000000..64d1d20b2c9 --- /dev/null +++ b/youtube_dl/downloader/youtube_live_chat.py @@ -0,0 +1,88 @@ +from __future__ import division, unicode_literals + +import re +import json + +from .fragment import FragmentFD + + +class YoutubeLiveChatReplayFD(FragmentFD): + """ Downloads YouTube live chat replays fragment by fragment """ + + FD_NAME = 'youtube_live_chat_replay' + + def real_download(self, filename, info_dict): + 
video_id = info_dict['video_id'] + self.to_screen('[%s] Downloading live chat' % self.FD_NAME) + + test = self.params.get('test', False) + + ctx = { + 'filename': filename, + 'live': True, + 'total_frags': None, + } + + def dl_fragment(url): + headers = info_dict.get('http_headers', {}) + return self._download_fragment(ctx, url, info_dict, headers) + + def parse_yt_initial_data(data): + raw_json = re.search(b'window\["ytInitialData"\]\s*=\s*(.*);', data).group(1) + return json.loads(raw_json) + + self._prepare_and_start_frag_download(ctx) + + success, raw_fragment = dl_fragment( + 'https://www.youtube.com/watch?v={}'.format(video_id)) + if not success: + return False + data = parse_yt_initial_data(raw_fragment) + continuation_id = data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation'] + # no data yet but required to call _append_fragment + self._append_fragment(ctx, b'') + + first = True + offset = None + while continuation_id is not None: + data = None + if first: + url = 'https://www.youtube.com/live_chat_replay?continuation={}'.format(continuation_id) + success, raw_fragment = dl_fragment(url) + if not success: + return False + data = parse_yt_initial_data(raw_fragment) + else: + url = ('https://www.youtube.com/live_chat_replay/get_live_chat_replay' + + '?continuation={}'.format(continuation_id) + + '&playerOffsetMs={}'.format(offset - 5000) + + '&hidden=false' + + '&pbj=1') + success, raw_fragment = dl_fragment(url) + if not success: + return False + data = json.loads(raw_fragment)['response'] + + first = False + continuation_id = None + + live_chat_continuation = data['continuationContents']['liveChatContinuation'] + offset = None + processed_fragment = bytearray() + if 'actions' in live_chat_continuation: + for action in live_chat_continuation['actions']: + if 'replayChatItemAction' in action: + replay_chat_item_action = action['replayChatItemAction'] + offset = int(replay_chat_item_action['videoOffsetTimeMsec']) + processed_fragment.extend( + json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n') + continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation'] + + self._append_fragment(ctx, processed_fragment) + + if test or offset is None: + break + + self._finish_frag_download(ctx) + + return True diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b35bf03aafc..e554702e77c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1462,6 +1462,14 @@ def _get_subtitles(self, video_id, webpage): 'ext': ext, }) sub_lang_list[lang] = sub_formats + # TODO check that live chat replay actually exists + sub_lang_list['live_chat'] = [ + { + 'video_id': video_id, + 'ext': 'json', + 'protocol': 'youtube_live_chat_replay', + }, + ] if not sub_lang_list: self._downloader.report_warning('video doesn\'t have subtitles') return {} From 321bf820c577f34593ff0462775e43875c8d886d Mon Sep 17 00:00:00 2001 From: siikamiika Date: Wed, 5 Aug 2020 03:30:10 +0300 Subject: [PATCH 13/31] check live chat replay existence --- youtube_dl/YoutubeDL.py | 7 +++--- youtube_dl/extractor/youtube.py | 39 ++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f9aa91f3021..1b8a938e5a5 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1838,10 +1838,11 @@ def dl(name, info): return else: try: - # 
TODO does this transfer session...? - # TODO exceptions dl(sub_filename, sub_info) - except (ExtractorError, IOError, OSError, ValueError) as err: + except ( + ExtractorError, IOError, OSError, ValueError, + compat_urllib_error.URLError, + compat_http_client.HTTPException, socket.error) as err: self.report_warning('Unable to download subtitle for "%s": %s' % (sub_lang, error_to_compat_str(err))) continue diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e554702e77c..782aba6ffc8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1435,7 +1435,7 @@ def _decrypt_signature(self, s, video_id, player_url, age_gate=False): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) - def _get_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id, webpage, is_live_content): try: subs_doc = self._download_xml( 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -1462,14 +1462,14 @@ def _get_subtitles(self, video_id, webpage): 'ext': ext, }) sub_lang_list[lang] = sub_formats - # TODO check that live chat replay actually exists - sub_lang_list['live_chat'] = [ - { - 'video_id': video_id, - 'ext': 'json', - 'protocol': 'youtube_live_chat_replay', - }, - ] + if is_live_content: + sub_lang_list['live_chat'] = [ + { + 'video_id': video_id, + 'ext': 'json', + 'protocol': 'youtube_live_chat_replay', + }, + ] if not sub_lang_list: self._downloader.report_warning('video doesn\'t have subtitles') return {} @@ -1493,6 +1493,14 @@ def _get_ytplayer_config(self, video_id, webpage): return self._parse_json( uppercase_escape(config), video_id, fatal=False) + def _get_yt_initial_data(self, video_id, webpage): + config = self._search_regex( + r'window\["ytInitialData"\]\s*=\s*(.*);', + webpage, 'ytInitialData', default=None) + if config: + return self._parse_json( + uppercase_escape(config), video_id, fatal=False) + def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" @@ -1992,6 +2000,16 @@ def feed_entry(name): if is_live is None: is_live = bool_or_none(video_details.get('isLive')) + has_live_chat_replay = False + is_live_content = bool_or_none(video_details.get('isLiveContent')) + if not is_live and is_live_content: + yt_initial_data = self._get_yt_initial_data(video_id, video_webpage) + try: + yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation'] + has_live_chat_replay = True + except (KeyError, IndexError): + pass + # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: raise ExtractorError('"rental" videos not supported. 
See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) @@ -2399,7 +2417,8 @@ def _extract_count(count_name): or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) # subtitles - video_subtitles = self.extract_subtitles(video_id, video_webpage) + video_subtitles = self.extract_subtitles( + video_id, video_webpage, has_live_chat_replay) automatic_captions = self.extract_automatic_captions(video_id, video_webpage) video_duration = try_get( From 7627f548e6de828114e4841385c75a73c0911506 Mon Sep 17 00:00:00 2001 From: siikamiika Date: Wed, 5 Aug 2020 03:38:07 +0300 Subject: [PATCH 14/31] run flake8 --- youtube_dl/YoutubeDL.py | 9 ++++----- youtube_dl/downloader/youtube_live_chat.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 1b8a938e5a5..0dc869d562b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1820,7 +1820,6 @@ def dl(name, info): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['requested_subtitles'] - ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) @@ -1839,10 +1838,10 @@ def dl(name, info): else: try: dl(sub_filename, sub_info) - except ( - ExtractorError, IOError, OSError, ValueError, - compat_urllib_error.URLError, - compat_http_client.HTTPException, socket.error) as err: + except (ExtractorError, IOError, OSError, ValueError, + compat_urllib_error.URLError, + compat_http_client.HTTPException, + socket.error) as err: self.report_warning('Unable to download subtitle for "%s": %s' % (sub_lang, error_to_compat_str(err))) continue diff --git a/youtube_dl/downloader/youtube_live_chat.py b/youtube_dl/downloader/youtube_live_chat.py index 64d1d20b2c9..214a3720313 100644 --- a/youtube_dl/downloader/youtube_live_chat.py +++ b/youtube_dl/downloader/youtube_live_chat.py @@ -28,7 +28,7 @@ def dl_fragment(url): return self._download_fragment(ctx, url, info_dict, headers) def parse_yt_initial_data(data): - raw_json = re.search(b'window\["ytInitialData"\]\s*=\s*(.*);', data).group(1) + raw_json = re.search(rb'window\["ytInitialData"\]\s*=\s*(.*);', data).group(1) return json.loads(raw_json) self._prepare_and_start_frag_download(ctx) From f96f5ddad956bca6481280e293ea221410aac56b Mon Sep 17 00:00:00 2001 From: siikamiika Date: Wed, 5 Aug 2020 04:04:36 +0300 Subject: [PATCH 15/31] rename variable --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 782aba6ffc8..feb80f7f491 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1435,7 +1435,7 @@ def _decrypt_signature(self, s, video_id, player_url, age_gate=False): raise ExtractorError( 'Signature extraction failed: ' + tb, cause=e) - def _get_subtitles(self, video_id, webpage, is_live_content): + def _get_subtitles(self, video_id, webpage, has_live_chat_replay): try: subs_doc = self._download_xml( 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -1462,7 +1462,7 @@ def _get_subtitles(self, video_id, webpage, is_live_content): 'ext': ext, }) sub_lang_list[lang] = sub_formats - if is_live_content: + if has_live_chat_replay: 
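The existence check that patches 12-13 (and their follow-ups) wire through here is just a defensive walk of the nested ytInitialData structure, treating any missing level as "no chat replay available". A minimal sketch, with an invented sample dict and a helper name used only for illustration, looks like:

    def has_live_chat_replay(yt_initial_data):
        # Probe for the continuation token; any missing key/index means no replay.
        try:
            yt_initial_data['contents']['twoColumnWatchNextResults'] \
                ['conversationBar']['liveChatRenderer'] \
                ['continuations'][0]['reloadContinuationData']['continuation']
            return True
        except (KeyError, IndexError, TypeError):
            return False

    print(has_live_chat_replay({'contents': {}}))  # -> False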
sub_lang_list['live_chat'] = [ { 'video_id': video_id, From 7cd9e2a05ff71999eb620618366eb1cc53ac48cd Mon Sep 17 00:00:00 2001 From: siikamiika Date: Wed, 5 Aug 2020 04:14:25 +0300 Subject: [PATCH 16/31] attempt to fix syntax error on older python --- youtube_dl/downloader/youtube_live_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/youtube_live_chat.py b/youtube_dl/downloader/youtube_live_chat.py index 214a3720313..e7eb4bbfe29 100644 --- a/youtube_dl/downloader/youtube_live_chat.py +++ b/youtube_dl/downloader/youtube_live_chat.py @@ -28,7 +28,7 @@ def dl_fragment(url): return self._download_fragment(ctx, url, info_dict, headers) def parse_yt_initial_data(data): - raw_json = re.search(rb'window\["ytInitialData"\]\s*=\s*(.*);', data).group(1) + raw_json = re.search(b'window\\["ytInitialData"\\]\s*=\\s*(.*);', data).group(1) return json.loads(raw_json) self._prepare_and_start_frag_download(ctx) From 88a68db03e616fc8e6d2684ffbfadeb64dd93cfb Mon Sep 17 00:00:00 2001 From: siikamiika Date: Wed, 5 Aug 2020 04:19:44 +0300 Subject: [PATCH 17/31] flake8 --- youtube_dl/downloader/youtube_live_chat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/youtube_live_chat.py b/youtube_dl/downloader/youtube_live_chat.py index e7eb4bbfe29..f7478c3366b 100644 --- a/youtube_dl/downloader/youtube_live_chat.py +++ b/youtube_dl/downloader/youtube_live_chat.py @@ -28,7 +28,7 @@ def dl_fragment(url): return self._download_fragment(ctx, url, info_dict, headers) def parse_yt_initial_data(data): - raw_json = re.search(b'window\\["ytInitialData"\\]\s*=\\s*(.*);', data).group(1) + raw_json = re.search(b'window\\["ytInitialData"\\]\\s*=\\s*(.*);', data).group(1) return json.loads(raw_json) self._prepare_and_start_frag_download(ctx) From f0f76a33dc0e5a3f495a05293b1db4ceab5c3029 Mon Sep 17 00:00:00 2001 From: siikamiika Date: Wed, 5 Aug 2020 23:29:41 +0300 Subject: [PATCH 18/31] fix premiere live chat They have isLiveContent = false so just check if the live chat renderer continuation exists --- youtube_dl/extractor/youtube.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index feb80f7f491..d6c35fab4d0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2001,13 +2001,12 @@ def feed_entry(name): is_live = bool_or_none(video_details.get('isLive')) has_live_chat_replay = False - is_live_content = bool_or_none(video_details.get('isLiveContent')) - if not is_live and is_live_content: + if not is_live: yt_initial_data = self._get_yt_initial_data(video_id, video_webpage) try: yt_initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation'] has_live_chat_replay = True - except (KeyError, IndexError): + except (KeyError, IndexError, TypeError): pass # Check for "rental" videos From eaedbfd97e860214399b0028fc47a487762e8294 Mon Sep 17 00:00:00 2001 From: siikamiika Date: Tue, 11 Aug 2020 00:05:32 +0300 Subject: [PATCH 19/31] fix ytInitialData parsing --- youtube_dl/downloader/youtube_live_chat.py | 10 ++++++++-- youtube_dl/extractor/youtube.py | 3 ++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/youtube_dl/downloader/youtube_live_chat.py b/youtube_dl/downloader/youtube_live_chat.py index f7478c3366b..697e5255009 100644 --- a/youtube_dl/downloader/youtube_live_chat.py +++ b/youtube_dl/downloader/youtube_live_chat.py 
@@ -28,8 +28,14 @@ def dl_fragment(url): return self._download_fragment(ctx, url, info_dict, headers) def parse_yt_initial_data(data): - raw_json = re.search(b'window\\["ytInitialData"\\]\\s*=\\s*(.*);', data).group(1) - return json.loads(raw_json) + window_patt = b'window\\["ytInitialData"\\]\\s*=\\s*(.*?);' + var_patt = b'var\\s+ytInitialData\\s*=\\s*(.*?);' + for patt in window_patt, var_patt: + try: + raw_json = re.search(patt, data).group(1) + return json.loads(raw_json) + except AttributeError: + continue self._prepare_and_start_frag_download(ctx) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d6c35fab4d0..e143bbee7b1 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1495,7 +1495,8 @@ def _get_ytplayer_config(self, video_id, webpage): def _get_yt_initial_data(self, video_id, webpage): config = self._search_regex( - r'window\["ytInitialData"\]\s*=\s*(.*);', + (r'window\["ytInitialData"\]\s*=\s*(.*);', + r'var\s+ytInitialData\s*=\s*(.*?);'), webpage, 'ytInitialData', default=None) if config: return self._parse_json( From 15eae44d74c80cca29cd5b24129585ad2d1e535f Mon Sep 17 00:00:00 2001 From: siikamiika Date: Tue, 11 Aug 2020 00:13:43 +0300 Subject: [PATCH 20/31] harden regex with lookbehind --- youtube_dl/downloader/youtube_live_chat.py | 4 ++-- youtube_dl/extractor/youtube.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/downloader/youtube_live_chat.py b/youtube_dl/downloader/youtube_live_chat.py index 697e5255009..4932dd9c527 100644 --- a/youtube_dl/downloader/youtube_live_chat.py +++ b/youtube_dl/downloader/youtube_live_chat.py @@ -28,8 +28,8 @@ def dl_fragment(url): return self._download_fragment(ctx, url, info_dict, headers) def parse_yt_initial_data(data): - window_patt = b'window\\["ytInitialData"\\]\\s*=\\s*(.*?);' - var_patt = b'var\\s+ytInitialData\\s*=\\s*(.*?);' + window_patt = b'window\\["ytInitialData"\\]\\s*=\\s*(.*?)(?<=});' + var_patt = b'var\\s+ytInitialData\\s*=\\s*(.*?)(?<=});' for patt in window_patt, var_patt: try: raw_json = re.search(patt, data).group(1) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e143bbee7b1..9fff8bdf4c9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1495,8 +1495,8 @@ def _get_ytplayer_config(self, video_id, webpage): def _get_yt_initial_data(self, video_id, webpage): config = self._search_regex( - (r'window\["ytInitialData"\]\s*=\s*(.*);', - r'var\s+ytInitialData\s*=\s*(.*?);'), + (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});', + r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'), webpage, 'ytInitialData', default=None) if config: return self._parse_json( From edd83104b432a971feb86c69c4963dc67cc3dfc0 Mon Sep 17 00:00:00 2001 From: George Schizas Date: Tue, 11 Aug 2020 12:20:47 +0300 Subject: [PATCH 21/31] Use initial data from JS instead to get chapters There are probably a lot more useful data in there. 
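For context, pulling ytInitialData out of the watch page (which this change and the live chat patches above both depend on) amounts to matching one of the two known embedding styles of the JSON blob and decoding the first hit. A minimal sketch along the lines of the hardened patterns from the previous two patches, with an invented sample page and a helper name used only for illustration, is:

    import json
    import re

    def get_yt_initial_data(webpage):
        # The blob appears either as window["ytInitialData"] = {...}; or as
        # var ytInitialData = {...};
        for patt in (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
                     r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'):
            mobj = re.search(patt, webpage)
            if mobj:
                return json.loads(mobj.group(1))
        return None

    sample = 'var ytInitialData = {"playerOverlays": {}};'
    print(get_yt_initial_data(sample))  # -> {'playerOverlays': {}}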
--- youtube_dl/extractor/youtube.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b35bf03aafc..03b726942f2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1661,21 +1661,15 @@ def extract_id(cls, url): def _extract_chapters_from_json(self, webpage, video_id, duration): if not webpage: return - player = self._parse_json( + initial_data = self._parse_json( self._search_regex( - r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage, + r'window\["ytInitialData"\] = (.+);\n', webpage, 'player args', default='{}'), video_id, fatal=False) - if not player or not isinstance(player, dict): - return - watch_next_response = player.get('watch_next_response') - if not isinstance(watch_next_response, compat_str): - return - response = self._parse_json(watch_next_response, video_id, fatal=False) - if not response or not isinstance(response, dict): + if not initial_data or not isinstance(initial_data, dict): return chapters_list = try_get( - response, + initial_data, lambda x: x['playerOverlays'] ['playerOverlayRenderer'] ['decoratedPlayerBarRenderer'] From 34675f9de91bd0e8a98b151b6bff4a795d7f2428 Mon Sep 17 00:00:00 2001 From: Adrian Heine Date: Wed, 12 Aug 2020 00:07:09 +0200 Subject: [PATCH 22/31] [videa] Adapt to updates Closes #26288, closes #25973, closes #25650. --- youtube_dl/extractor/videa.py | 62 +++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index d0e34c81980..a03614cc109 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -2,15 +2,24 @@ from __future__ import unicode_literals import re +import random +import string +import struct from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, mimetype2ext, parse_codecs, xpath_element, xpath_text, ) +from ..compat import ( + compat_b64decode, + compat_ord, + compat_parse_qs, +) class VideaIE(InfoExtractor): @@ -60,15 +69,63 @@ def _extract_urls(webpage): r']+src=(["\'])(?P(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', webpage)] + def rc4(self, ciphertext, key): + res = b'' + + keyLen = len(key) + S = list(range(256)) + + j = 0 + for i in range(256): + j = (j + S[i] + ord(key[i % keyLen])) % 256 + S[i], S[j] = S[j], S[i] + + i = 0 + j = 0 + for m in range(len(ciphertext)): + i = (i + 1) % 256 + j = (j + S[i]) % 256 + S[i], S[j] = S[j], S[i] + k = S[(S[i] + S[j]) % 256] + res += struct.pack("B", k ^ compat_ord(ciphertext[m])) + + return res + def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, fatal=True) + error = self._search_regex(r'

([^<]+)

', webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + video_src_params_raw = self._search_regex(r']+id="videa_player_iframe"[^>]+src="/player\?([^"]+)"', webpage, 'video_src_params') + video_src_params = compat_parse_qs(video_src_params_raw) + player_page = self._download_webpage("https://videa.hu/videojs_player?%s" % video_src_params_raw, video_id, fatal=True) + nonce = self._search_regex(r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') + random_seed = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(8)) + static_secret = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' + l = nonce[:32] + s = nonce[32:] + result = '' + for i in range(0, 32): + result += s[i - (static_secret.index(l[i]) - 31)] - info = self._download_xml( + video_src_params['_s'] = random_seed + video_src_params['_t'] = result[:16] + encryption_key_stem = result[16:] + random_seed + + [b64_info, handle] = self._download_webpage_handle( 'http://videa.hu/videaplayer_get_xml.php', video_id, - query={'v': video_id}) + query=video_src_params, fatal=True) + + encrypted_info = compat_b64decode(b64_info) + key = encryption_key_stem + handle.info()['x-videa-xs'] + info_str = self.rc4(encrypted_info, key).decode('utf8') + info = self._parse_xml(info_str, video_id) video = xpath_element(info, './/video', 'video', fatal=True) sources = xpath_element(info, './/video_sources', 'sources', fatal=True) + hash_values = xpath_element(info, './/hash_values', 'hash_values', fatal=True) title = xpath_text(video, './title', fatal=True) @@ -77,6 +134,7 @@ def _real_extract(self, url): source_url = source.text if not source_url: continue + source_url += '?md5=%s&expires=%s' % (hash_values.find('hash_value_%s' % source.get('name')).text, source.get('exp')) f = parse_codecs(source.get('codecs')) f.update({ 'url': source_url, From 13c30d1d525586c1b7cd1a98f63320302e4ecf1b Mon Sep 17 00:00:00 2001 From: Surkal <> Date: Fri, 14 Aug 2020 13:11:20 +0200 Subject: [PATCH 23/31] [francetv] fix extractor --- youtube_dl/extractor/francetv.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 81b468c7d1e..e340cddba8f 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -316,13 +316,14 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P[^/?#&.]+)' _TESTS = [{ - 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', + 'url': 'https://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-jeudi-22-aout-2019_3561461.html', 'info_dict': { - 'id': '84981923', + 'id': 'd12458ee-5062-48fe-bfdd-a30d6a01b793', 'ext': 'mp4', 'title': 'Soir 3', - 'upload_date': '20130826', - 'timestamp': 1377548400, + 'upload_date': '20190822', + 'timestamp': 1566510900, + 'description': 'md5:72d167097237701d6e8452ff03b83c00', 'subtitles': { 'fr': 'mincount:2', }, @@ -374,7 +375,8 @@ def _real_extract(self, url): video_id = self._search_regex( (r'player\.load[^;]+src:\s*["\']([^"\']+)', r'id-video=([^@]+@[^"]+)', - r']+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + r']+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'data-id="([^"]+)"'), webpage, 'video id') return self._make_url_result(video_id) From b50f352f59ecb1de8261a47fac32c03b9bcbd594 Mon Sep 17 00:00:00 
2001 From: sxvghd Date: Sun, 16 Aug 2020 15:40:59 +0200 Subject: [PATCH 24/31] [doodstream] new extractor --- youtube_dl/extractor/doodstream.py | 68 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 69 insertions(+) create mode 100644 youtube_dl/extractor/doodstream.py diff --git a/youtube_dl/extractor/doodstream.py b/youtube_dl/extractor/doodstream.py new file mode 100644 index 00000000000..38af8bee5c6 --- /dev/null +++ b/youtube_dl/extractor/doodstream.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import string +import random +import time + +from .common import InfoExtractor + + +class DoodStreamIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dood\.(?:to|watch)/[ed]/(?P[a-z0-9]+)' + _TESTS = [{ + 'url': 'http://dood.to/e/5s1wmbdacezb', + 'md5': '4568b83b31e13242b3f1ff96c55f0595', + 'info_dict': { + 'id': '5s1wmbdacezb', + 'ext': 'mp4', + 'title': 'Kat Wonders - Monthly May 2020', + 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com', + 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', + } + }, { + 'url': 'https://dood.to/d/jzrxn12t2s7n', + 'md5': '3207e199426eca7c2aa23c2872e6728a', + 'info_dict': { + 'id': 'jzrxn12t2s7n', + 'ext': 'mp4', + 'title': 'Stacy Cruz Cute ALLWAYSWELL', + 'description': 'Stacy Cruz Cute ALLWAYSWELL | DoodStream.com', + 'thumbnail': 'https://img.doodcdn.com/snaps/8edqd5nppkac3x8u.jpg', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + if '/d/' in url: + url = "https://dood.to" + self._html_search_regex( + r'