Skip to content

Commit

Permalink
Merge branch 'yt-dlp:master' into my
Browse files Browse the repository at this point in the history
  • Loading branch information
lockmatrix authored Aug 26, 2022
2 parents e765ece + a1af516 commit d0b5727
Show file tree
Hide file tree
Showing 24 changed files with 300 additions and 147 deletions.
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,13 @@ You can also fork the project on github and run your fork's [build workflow](.gi
--list-extractors List all supported extractors and exit
--extractor-descriptions Output descriptions of all supported
extractors and exit
--force-generic-extractor Force extraction to use the generic extractor
--use-extractors NAMES Extractor names to use separated by commas.
You can also use regexes, "all", "default"
and "end" (end URL matching); e.g. --ies
"holodex.*,end,youtube". Prefix the name
with a "-" to exclude it, e.g. --ies
default,-generic. Use --list-extractors for
a list of extractor names. (Alias: --ies)
--default-search PREFIX Use this prefix for unqualified URLs. E.g.
"gvsearch2:python" downloads two videos from
google videos for the search term "python".
Expand Down Expand Up @@ -524,8 +530,8 @@ You can also fork the project on github and run your fork's [build workflow](.gi
a file that is in the archive
--break-on-reject Stop the download process when encountering
a file that has been filtered out
--break-per-input Make --break-on-existing, --break-on-reject
and --max-downloads act only on the current
--break-per-input Make --break-on-existing, --break-on-reject,
--max-downloads and autonumber reset per
input URL
--no-break-per-input --break-on-existing and similar options
terminates the entire download queue
Expand Down Expand Up @@ -2058,6 +2064,7 @@ While these options are redundant, they are still expected to be used due to the
#### Not recommended
While these options still work, their use is not recommended since there are other alternatives to achieve the same result

--force-generic-extractor --ies generic,default
--exec-before-download CMD --exec "before_dl:CMD"
--no-exec-before-download --no-exec
--all-formats -f all
Expand Down
11 changes: 7 additions & 4 deletions devscripts/lazy_load_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,17 @@

# These bloat the lazy_extractors, so allow them to passthrough silently
ALLOWED_CLASSMETHODS = {'get_testcases', 'extract_from_webpage'}
_WARNED = False


class LazyLoadMetaClass(type):
def __getattr__(cls, name):
if '_real_class' not in cls.__dict__ and name not in ALLOWED_CLASSMETHODS:
write_string(
'WARNING: Falling back to normal extractor since lazy extractor '
f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n')
global _WARNED
if ('_real_class' not in cls.__dict__
and name not in ALLOWED_CLASSMETHODS and not _WARNED):
_WARNED = True
write_string('WARNING: Falling back to normal extractor since lazy extractor '
f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n')
return getattr(cls.real_class, name)


Expand Down
4 changes: 3 additions & 1 deletion devscripts/make_lazy_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
from devscripts.utils import get_filename_args, read_file, write_file

NO_ATTR = object()
STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_NETRC_MACHINE', 'age_limit']
STATIC_CLASS_PROPERTIES = [
'IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_ENABLED', '_NETRC_MACHINE', 'age_limit'
]
CLASS_METHODS = [
'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', 'is_suitable'
]
Expand Down
41 changes: 23 additions & 18 deletions test/test_execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,41 +11,46 @@
import contextlib
import subprocess

from yt_dlp.utils import encodeArgument
from yt_dlp.utils import Popen

rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
LAZY_EXTRACTORS = 'yt_dlp/extractor/lazy_extractors.py'


try:
_DEV_NULL = subprocess.DEVNULL
except AttributeError:
_DEV_NULL = open(os.devnull, 'wb')
class TestExecution(unittest.TestCase):
def run_yt_dlp(self, exe=(sys.executable, 'yt_dlp/__main__.py'), opts=('--version', )):
stdout, stderr, returncode = Popen.run(
[*exe, '--ignore-config', *opts], cwd=rootDir, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
print(stderr, file=sys.stderr)
self.assertEqual(returncode, 0)
return stdout.strip(), stderr.strip()

def test_main_exec(self):
self.run_yt_dlp()

class TestExecution(unittest.TestCase):
def test_import(self):
subprocess.check_call([sys.executable, '-c', 'import yt_dlp'], cwd=rootDir)
self.run_yt_dlp(exe=(sys.executable, '-c', 'import yt_dlp'))

def test_module_exec(self):
subprocess.check_call([sys.executable, '-m', 'yt_dlp', '--ignore-config', '--version'], cwd=rootDir, stdout=_DEV_NULL)

def test_main_exec(self):
subprocess.check_call([sys.executable, 'yt_dlp/__main__.py', '--ignore-config', '--version'], cwd=rootDir, stdout=_DEV_NULL)
self.run_yt_dlp(exe=(sys.executable, '-m', 'yt_dlp'))

def test_cmdline_umlauts(self):
p = subprocess.Popen(
[sys.executable, 'yt_dlp/__main__.py', '--ignore-config', encodeArgument('ä'), '--version'],
cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE)
_, stderr = p.communicate()
_, stderr = self.run_yt_dlp(opts=('ä', '--version'))
self.assertFalse(stderr)

def test_lazy_extractors(self):
try:
subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'yt_dlp/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL)
subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL)
subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', LAZY_EXTRACTORS],
cwd=rootDir, stdout=subprocess.DEVNULL)
self.assertTrue(os.path.exists(LAZY_EXTRACTORS))

_, stderr = self.run_yt_dlp(opts=('-s', 'test:'))
self.assertFalse(stderr)

subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=subprocess.DEVNULL)
finally:
with contextlib.suppress(OSError):
os.remove('yt_dlp/extractor/lazy_extractors.py')
os.remove(LAZY_EXTRACTORS)


if __name__ == '__main__':
Expand Down
8 changes: 8 additions & 0 deletions test/test_youtube_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,14 @@
'https://www.youtube.com/s/player/1f7d5369/player_ias.vflset/en_US/base.js',
'batNX7sYqIJdkJ', 'IhOkL_zxbkOZBw',
),
(
'https://www.youtube.com/s/player/009f1d77/player_ias.vflset/en_US/base.js',
'5dwFHw8aFWQUQtffRq', 'audescmLUzI3jw',
),
(
'https://www.youtube.com/s/player/dc0c6770/player_ias.vflset/en_US/base.js',
'5EHDMgYLV6HPGk_Mu-kk', 'n9lUJLHbxUI0GQ',
),
]


Expand Down
94 changes: 43 additions & 51 deletions yt_dlp/YoutubeDL.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name
from .downloader.rtmp import rtmpdump_version
from .extractor import gen_extractor_classes, get_info_extractor
from .extractor.common import UnsupportedURLIE
from .extractor.openload import PhantomJSwrapper
from .minicurses import format_text
from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors
Expand All @@ -47,7 +48,7 @@
get_postprocessor,
)
from .postprocessor.ffmpeg import resolve_mapping as resolve_recode_mapping
from .update import detect_variant
from .update import REPOSITORY, current_git_head, detect_variant
from .utils import (
DEFAULT_OUTTMPL,
IDENTITY,
Expand Down Expand Up @@ -115,6 +116,7 @@
network_exceptions,
number_of_digits,
orderedSet,
orderedSet_from_options,
parse_filesize,
preferredencoding,
prepend_extension,
Expand Down Expand Up @@ -236,7 +238,7 @@ class YoutubeDL:
Default is 'only_download' for CLI, but False for API
skip_playlist_after_errors: Number of allowed failures until the rest of
the playlist is skipped
force_generic_extractor: Force downloader to use the generic extractor
allowed_extractors: List of regexes to match against extractor names that are allowed
overwrites: Overwrite all video and metadata files if True,
overwrite only non-video files if None
and don't overwrite any file if False
Expand Down Expand Up @@ -476,6 +478,8 @@ class YoutubeDL:
The following options are deprecated and may be removed in the future:
force_generic_extractor: Force downloader to use the generic extractor
- Use allowed_extractors = ['generic', 'default']
playliststart: - Use playlist_items
Playlist item to start at.
playlistend: - Use playlist_items
Expand Down Expand Up @@ -757,13 +761,6 @@ def add_info_extractor(self, ie):
self._ies_instances[ie_key] = ie
ie.set_downloader(self)

def _get_info_extractor_class(self, ie_key):
ie = self._ies.get(ie_key)
if ie is None:
ie = get_info_extractor(ie_key)
self.add_info_extractor(ie)
return ie

def get_info_extractor(self, ie_key):
"""
Get an instance of an IE with name ie_key, it will try to get one from
Expand All @@ -780,8 +777,19 @@ def add_default_info_extractors(self):
"""
Add the InfoExtractors returned by gen_extractors to the end of the list
"""
for ie in gen_extractor_classes():
self.add_info_extractor(ie)
all_ies = {ie.IE_NAME.lower(): ie for ie in gen_extractor_classes()}
all_ies['end'] = UnsupportedURLIE()
try:
ie_names = orderedSet_from_options(
self.params.get('allowed_extractors', ['default']), {
'all': list(all_ies),
'default': [name for name, ie in all_ies.items() if ie._ENABLED],
}, use_regex=True)
except re.error as e:
raise ValueError(f'Wrong regex for allowed_extractors: {e.pattern}')
for name in ie_names:
self.add_info_extractor(all_ies[name])
self.write_debug(f'Loaded {len(ie_names)} extractors')

def add_post_processor(self, pp, when='post_process'):
"""Add a PostProcessor object to the end of the chain."""
Expand Down Expand Up @@ -1412,11 +1420,11 @@ def extract_info(self, url, download=True, ie_key=None, extra_info=None,
ie_key = 'Generic'

if ie_key:
ies = {ie_key: self._get_info_extractor_class(ie_key)}
ies = {ie_key: self._ies[ie_key]} if ie_key in self._ies else {}
else:
ies = self._ies

for ie_key, ie in ies.items():
for key, ie in ies.items():
if not ie.suitable(url):
continue

Expand All @@ -1425,14 +1433,16 @@ def extract_info(self, url, download=True, ie_key=None, extra_info=None,
'and will probably not work.')

temp_id = ie.get_temp_id(url)
if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive')
if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}):
self.to_screen(f'[{key}] {temp_id}: has already been recorded in the archive')
if self.params.get('break_on_existing', False):
raise ExistingVideoReached()
break
return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process)
return self.__extract_info(url, self.get_info_extractor(key), download, extra_info, process)
else:
self.report_error('no suitable InfoExtractor for URL %s' % url)
extractors_restricted = self.params.get('allowed_extractors') not in (None, ['default'])
self.report_error(f'No suitable extractor{format_field(ie_key, None, " (%s)")} found for URL {url}',
tb=False if extractors_restricted else None)

def _handle_extraction_exceptions(func):
@functools.wraps(func)
Expand Down Expand Up @@ -2737,27 +2747,11 @@ def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
if self.params.get('allsubtitles', False):
requested_langs = all_sub_langs
elif self.params.get('subtitleslangs', False):
# A list is used so that the order of languages will be the same as
# given in subtitleslangs. See https://github.com/yt-dlp/yt-dlp/issues/1041
requested_langs = []
for lang_re in self.params.get('subtitleslangs'):
discard = lang_re[0] == '-'
if discard:
lang_re = lang_re[1:]
if lang_re == 'all':
if discard:
requested_langs = []
else:
requested_langs.extend(all_sub_langs)
continue
current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
if discard:
for lang in current_langs:
while lang in requested_langs:
requested_langs.remove(lang)
else:
requested_langs.extend(current_langs)
requested_langs = orderedSet(requested_langs)
try:
requested_langs = orderedSet_from_options(
self.params.get('subtitleslangs'), {'all': all_sub_langs}, use_regex=True)
except re.error as e:
raise ValueError(f'Wrong regex for subtitlelangs: {e.pattern}')
elif normal_sub_langs:
requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1]
else:
Expand Down Expand Up @@ -3271,6 +3265,7 @@ def wrapper(*args, **kwargs):
self.to_screen(f'[info] {e}')
if not self.params.get('break_per_url'):
raise
self._num_downloads = 0
else:
if self.params.get('dump_single_json', False):
self.post_extract(res)
Expand Down Expand Up @@ -3319,6 +3314,12 @@ def sanitize_info(info_dict, remove_private_keys=False):
return info_dict
info_dict.setdefault('epoch', int(time.time()))
info_dict.setdefault('_type', 'video')
info_dict.setdefault('_version', {
'version': __version__,
'current_git_head': current_git_head(),
'release_git_head': RELEASE_GIT_HEAD,
'repository': REPOSITORY,
})

if remove_private_keys:
reject = lambda k, v: v is None or k.startswith('__') or k in {
Expand Down Expand Up @@ -3683,7 +3684,8 @@ def get_encoding(stream):
if VARIANT not in (None, 'pip'):
source += '*'
write_debug(join_nonempty(
'yt-dlp version', __version__,
f'{"yt-dlp" if REPOSITORY == "yt-dlp/yt-dlp" else REPOSITORY} version',
__version__,
f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '',
'' if source == 'unknown' else f'({source})',
delim=' '))
Expand All @@ -3699,18 +3701,8 @@ def get_encoding(stream):
if self.params['compat_opts']:
write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts']))

if source == 'source':
try:
stdout, _, _ = Popen.run(
['git', 'rev-parse', '--short', 'HEAD'],
text=True, cwd=os.path.dirname(os.path.abspath(__file__)),
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if re.fullmatch('[0-9a-f]+', stdout.strip()):
write_debug(f'Git HEAD: {stdout.strip()}')
except Exception:
with contextlib.suppress(Exception):
sys.exc_clear()

if current_git_head():
write_debug(f'Git HEAD: {current_git_head()}')
write_debug(system_identifier())

exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self)
Expand Down
1 change: 1 addition & 0 deletions yt_dlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,7 @@ def parse_options(argv=None):
'windowsfilenames': opts.windowsfilenames,
'ignoreerrors': opts.ignoreerrors,
'force_generic_extractor': opts.force_generic_extractor,
'allowed_extractors': opts.allowed_extractors or ['default'],
'ratelimit': opts.ratelimit,
'throttledratelimit': opts.throttledratelimit,
'overwrites': opts.overwrites,
Expand Down
5 changes: 3 additions & 2 deletions yt_dlp/extractor/bilibili.py
Original file line number Diff line number Diff line change
Expand Up @@ -858,14 +858,15 @@ def _search_results(self, query):
'keyword': query,
'page': page_num,
'context': '',
'order': 'pubdate',
'duration': 0,
'tids_2': '',
'__refresh__': 'true',
'search_type': 'video',
'tids': 0,
'highlight': 1,
})['data'].get('result') or []
})['data'].get('result')
if not videos:
break
for video in videos:
yield self.url_result(video['arcurl'], 'BiliBili', str(video['aid']))

Expand Down
4 changes: 3 additions & 1 deletion yt_dlp/extractor/bitchute.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,12 @@ def _real_extract(self, url):
error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video')
if error == 'Video Unavailable':
raise GeoRestrictedError(error)
raise ExtractorError(error)
raise ExtractorError(error, expected=True)
formats = entries[0]['formats']

self._check_formats(formats, video_id)
if not formats:
raise self.raise_no_formats('Video is unavailable', expected=True, video_id=video_id)
self._sort_formats(formats)

description = self._html_search_regex(
Expand Down
Loading

0 comments on commit d0b5727

Please sign in to comment.