Simplify scraper by dropping --type CLI argument

openzim · Oct 31, 2024 · 9a26ee3 · 9a26ee3
1 parent 160ec2c
commit 9a26ee3
Show file tree

Hide file tree

Showing 14 changed files with 81 additions and 110 deletions.
diff --git a/.github/workflows/Tests.yml b/.github/workflows/Tests.yml
@@ -100,7 +100,7 @@ jobs:
         env:
           YOUTUBE_API_KEY: ${{ secrets.YOUTUBE_API_KEY }}
           OPTIMIZATION_CACHE_URL: ${{ secrets.OPTIMIZATION_CACHE_URL }}
-        run: docker run -v $PWD/output:/output youtube2zim youtube2zim --api-key "$YOUTUBE_API_KEY" --optimization-cache "$OPTIMIZATION_CACHE_URL" --type channel --id "UC8elThf5TGMpQfQc_VE917Q" --name "tests_en_openzim-testing" --zim-file "openZIM_testing.zim" --tags "tEsTing,x-mark:yes"
+        run: docker run -v $PWD/output:/output youtube2zim youtube2zim --api-key "$YOUTUBE_API_KEY" --optimization-cache "$OPTIMIZATION_CACHE_URL" --id "UC8elThf5TGMpQfQc_VE917Q" --name "tests_en_openzim-testing" --zim-file "openZIM_testing.zim" --tags "tEsTing,x-mark:yes"
 
       - name: Run integration test suite
         run: docker run -v $PWD/scraper/tests-integration/integration.py:/src/scraper/tests-integration/integration.py -v $PWD/output:/output youtube2zim bash -c "pip install pytest; pytest -v /src/scraper/tests-integration/integration.py"
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,9 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Deprecated
+
+- `--type` CLI argument is now deprecated and ignored (it will be removed in the next major release)
+
 ### Changed
 
 - Raise exception if there are no videos in the playlists (#347)
+- Stop using the `--type` CLI argument and guess the collection type from `--id` instead (#361)
 
 ### Fixed
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -66,7 +66,7 @@ docker build -t local-youtube2zim .
 Scrape a channel (here we use the [openZIM_testing](https://www.youtube.com/channel/UC8elThf5TGMpQfQc_VE917Q) channel, but you could use any other one of interest for your UI developments).
 
 ```
-docker run --rm -it -v "$PWD/output":/output local-youtube2zim youtube2zim --api-key <YOUR-API-KEY> --type channel --id "UC8elThf5TGMpQfQc_VE917Q" --name "openZIM_testing" --zim-file "openZIM_testing"
+docker run --rm -it -v "$PWD/output":/output local-youtube2zim youtube2zim --api-key <YOUR-API-KEY> --id "UC8elThf5TGMpQfQc_VE917Q" --name "openZIM_testing" --zim-file "openZIM_testing"
 ```
 
 Extract interesting ZIM content and move it to `public` folder.

diff --git a/README.md b/README.md
@@ -78,18 +78,18 @@ To get an API Key:
 You can then create a ZIM from a single channel / user / handle like `Vsauce`:
 
 ```bash
-youtube2zim --api-key "<your-api-key>" --type channel --id "Vsauce" --name "tests_hi_avanti"
+youtube2zim --api-key "<your-api-key>" --id "Vsauce" --name "tests_hi_avanti"
 ```
 
-When `--type channel` is used, you must pass one single value in `--id` and it can be the channel, user or playlist, or even the corresponding technical ID (see [FAQ/FEE](https://github.com/openzim/youtube/wiki/FAQ---FEE) for more details).
+When scraping a channel, you must pass one single value in `--id` and it can be the handle, user, or even the corresponding technical ID (see [FAQ/FEE](https://github.com/openzim/youtube/wiki/FAQ---FEE) for more details).
 
 Or you can create a ZIM from two playlists like `PL3rEvTTL-Jm8cBdskZoQaDTlDT4t7F6kp` and `PL3rEvTTL-Jm_OuyYpMfxtJW3Mcr9fFS2Z`:
 
 ```bash
-youtube2zim --api-key "<your-api-key>" --type playlist --id "PL3rEvTTL-Jm8cBdskZoQaDTlDT4t7F6kp,PL3rEvTTL-Jm_OuyYpMfxtJW3Mcr9fFS2Z" --name "tests_hi_avanti"
+youtube2zim --api-key "<your-api-key>" --id "PL3rEvTTL-Jm8cBdskZoQaDTlDT4t7F6kp,PL3rEvTTL-Jm_OuyYpMfxtJW3Mcr9fFS2Z" --name "tests_hi_avanti"
 ```
 
-When `--type playlist` is used, you can pass multiple playlist IDs separated by a comma in `--id`.
+When scraping playlists, you can pass multiple playlist IDs separated by a comma in `--id`.
 
 For more details / advanced usage, see the [Manual](https://github.com/openzim/youtube/wiki/Manual).
 
@@ -110,7 +110,7 @@ This script is a wrapper around `youtube2zim` and is bundled with the main packa
 Sample usage:
 
 ```
-youtube2zim-playlists --indiv-playlists --api-key XXX --type channel --id Vsauce --playlists-name="vsauce_en_playlist-{playlist_id}"
+youtube2zim-playlists --indiv-playlists --api-key XXX --id Vsauce --playlists-name="vsauce_en_playlist-{playlist_id}"
 ```
 
 Those are the required arguments for `youtube2zim-playlists` but **you can also pass any regular `youtube2zim` argument**. Those will be forwarded to `youtube2zim` (which will be run independently for each playlist).

diff --git a/scraper/src/youtube2zim/constants.py b/scraper/src/youtube2zim/constants.py
@@ -13,10 +13,6 @@
 
 SCRAPER = f"{NAME} {__version__}"
 
-CHANNEL = "channel"
-PLAYLIST = "playlist"
-USER = "user"
-
 # Youtube uses some non-standard language codes
 YOUTUBE_LANG_MAP = {
     "iw": "he",  # Hebrew

diff --git a/scraper/src/youtube2zim/entrypoint.py b/scraper/src/youtube2zim/entrypoint.py
@@ -6,7 +6,7 @@
 import os
 import sys
 
-from youtube2zim.constants import CHANNEL, NAME, PLAYLIST, SCRAPER, USER, logger
+from youtube2zim.constants import NAME, SCRAPER, logger
 from youtube2zim.scraper import Youtube2Zim
 
 
@@ -16,12 +16,12 @@ def main():
         description="Scraper to create a ZIM file from a Youtube Channel or Playlists",
     )
 
+    # Not used anymore, kept for backward compatibility till next major release.
+    # When removing it, also drop the kwargs filtering trick at lines 211-217.
     parser.add_argument(
         "--type",
         help="Type of collection",
-        choices=[CHANNEL, PLAYLIST, USER],
-        required=True,
-        dest="collection_type",
+        dest="not_used_anymore",
     )
     parser.add_argument(
         "--id", help="Youtube ID of the collection", required=True, dest="youtube_id"
@@ -208,7 +208,13 @@ def main():
     try:
         if args.max_concurrency < 1:
             raise ValueError(f"Invalid concurrency value: {args.max_concurrency}")
-        scraper = Youtube2Zim(**dict(args._get_kwargs()))
+        scraper = Youtube2Zim(
+            **{
+                key: value
+                for key, value in dict(args._get_kwargs()).items()
+                if key != "not_used_anymore"
+            }
+        )
         return scraper.run()
     except Exception as exc:
         logger.error(f"FAILED. An error occurred: {exc}")

diff --git a/scraper/src/youtube2zim/playlists/entrypoint.py b/scraper/src/youtube2zim/playlists/entrypoint.py
@@ -5,7 +5,7 @@
 import logging
 import sys
 
-from youtube2zim.constants import CHANNEL, NAME, PLAYLIST, SCRAPER, USER, logger
+from youtube2zim.constants import NAME, SCRAPER, logger
 from youtube2zim.utils import has_argument
 
 
@@ -19,13 +19,13 @@ def main():
         "{creator_id}, {creator_name}.",
     )
 
+    # Not used anymore, kept for backward compatibility till next major release
     parser.add_argument(
         "--type",
         help="Type of collection",
-        choices=[CHANNEL, PLAYLIST, USER],
-        required=True,
-        dest="collection_type",
+        dest="not_used_anymore",
     )
+
     parser.add_argument(
         "--id", help="Youtube ID of the collection", required=True, dest="youtube_id"
     )

diff --git a/scraper/src/youtube2zim/playlists/scraper.py b/scraper/src/youtube2zim/playlists/scraper.py
@@ -21,7 +21,7 @@
 import requests
 from zimscraperlib.logging import nicer_args_join
 
-from youtube2zim.constants import NAME, PLAYLIST, YOUTUBE, logger
+from youtube2zim.constants import NAME, YOUTUBE, logger
 from youtube2zim.youtube import (
     REQUEST_TIMEOUT,
     credentials_ok,
@@ -40,7 +40,6 @@ def __init__(
         self.debug = options["debug"]
         self.disable_metadata_checks = options["disable_metadata_checks"]
         self.playlists_mode = options["playlists_mode"]
-        self.collection_type = options["collection_type"]
         self.youtube_id = options["youtube_id"]
 
         self.extra_args = extra_args
@@ -76,10 +75,7 @@ def run(self):
             shutil.rmtree(self.build_dir, ignore_errors=True)  # not needed
             return self.handle_single_zim()
 
-        logger.info(
-            f"starting all-playlits {NAME} scraper "
-            f"for {self.collection_type}#{self.youtube_id}"
-        )
+        logger.info(f"starting all-playlists {NAME} scraper for {self.youtube_id}")
 
         # create required sub folders
         for sub_folder in ("cache", "videos", "channels"):
@@ -96,7 +92,8 @@ def run(self):
             playlists,
             main_channel_id,
             uploads_playlist_id,
-        ) = extract_playlists_details_from(self.collection_type, self.youtube_id)
+            is_playlist,
+        ) = extract_playlists_details_from(self.youtube_id)
 
         logger.info(
             ".. {} playlists:\n   {}".format(
@@ -128,8 +125,6 @@ def run_playlist_zim(self, playlist):
         playlist_id = playlist.playlist_id
         args = [
             *self.youtube2zim_exe,
-            "--type",
-            PLAYLIST,
             "--id",
             playlist_id,
             "--api-key",
@@ -180,8 +175,6 @@ def handle_single_zim(self):
 
         args = [
             *self.youtube2zim_exe,
-            "--type",
-            self.collection_type,
             "--id",
             self.youtube_id,
             "--api-key",

diff --git a/scraper/src/youtube2zim/schemas.py b/scraper/src/youtube2zim/schemas.py
@@ -105,7 +105,6 @@ class Channel(CamelModel):
     profile_path: str | None = None
     banner_path: str | None = None
     joined_date: str
-    collection_type: str
     main_playlist: str | None = None
     playlist_count: int
 

diff --git a/scraper/src/youtube2zim/scraper.py b/scraper/src/youtube2zim/scraper.py
@@ -43,11 +43,8 @@
 )
 
 from youtube2zim.constants import (
-    CHANNEL,
-    PLAYLIST,
     ROOT_DIR,
     SCRAPER,
-    USER,
     YOUTUBE,
     YOUTUBE_LANG_MAP,
     logger,
@@ -86,13 +83,10 @@
     skip_outofrange_videos,
 )
 
-MAXIMUM_YOUTUBEID_LENGTH = 24
-
 
 class Youtube2Zim:
     def __init__(
         self,
-        collection_type,
         youtube_id,
         api_key,
         video_format,
@@ -124,13 +118,6 @@ def __init__(
         secondary_color=None,
     ):
         # data-retrieval info
-        self.collection_type = collection_type
-        if self.collection_type == USER:
-            logger.warning(
-                "Collection type 'user' is deprecated. Please use 'channel' type,"
-                " behaviors have been merged. 'user' type is going to be dropped in "
-                " next major release"
-            )
         self.youtube_id = youtube_id
         self.api_key = api_key
         self.dateafter = dateafter
@@ -233,23 +220,9 @@ def profile_path(self):
     def banner_path(self):
         return self.build_dir.joinpath("banner.jpg")
 
-    @property
-    def is_user(self):
-        return self.collection_type == USER
-
-    @property
-    def is_channel(self):
-        return self.collection_type == CHANNEL
-
-    @property
-    def is_playlist(self):
-        return self.collection_type == PLAYLIST
-
     @property
     def is_single_channel(self):
-        if self.is_channel or self.is_user:
-            return True
-        return len(list({pl.creator_id for pl in self.playlists})) == 1
+        return len({pl.creator_id for pl in self.playlists}) == 1
 
     @property
     def sorted_playlists(self):
@@ -282,8 +255,6 @@ def run(self):
             # first report => creates a file with appropriate structure
             self.report_progress()
 
-            self.validate_id()
-
             # validate dateafter input
             self.validate_dateafter_input()
 
@@ -303,9 +274,7 @@ def run(self):
             if not self.build_dir.exists() or not self.build_dir.is_dir():
                 raise OSError(f"Incorrect build_dir: {self.build_dir}")
 
-            logger.info(
-                f"starting youtube scraper for {self.collection_type}#{self.youtube_id}"
-            )
+            logger.info(f"starting youtube scraper for {self.youtube_id}")
             logger.info(f"preparing build folder at {self.build_dir.resolve()}")
             self.prepare_build_folder()
 
@@ -497,17 +466,6 @@ def validate_dateafter_input(self):
             )
             raise ValueError(f"Invalid dateafter input: {exc}") from exc
 
-    def validate_id(self):
-        # space not allowed in youtube-ID
-        self.youtube_id = self.youtube_id.replace(" ", "")
-        if (
-            self.collection_type == "channel"
-            and len(self.youtube_id) > MAXIMUM_YOUTUBEID_LENGTH
-        ):
-            raise ValueError("Invalid ChannelId")
-        if "," in self.youtube_id and self.collection_type != "playlist":
-            raise ValueError("Invalid YoutubeId")
-
     def prepare_build_folder(self):
         """prepare build folder before we start downloading data"""
 
@@ -590,7 +548,8 @@ def extract_playlists(self):
             self.playlists,
             self.main_channel_id,
             self.uploads_playlist_id,
-        ) = extract_playlists_details_from(self.collection_type, self.youtube_id)
+            self.is_playlist,
+        ) = extract_playlists_details_from(self.youtube_id)
 
     def extract_videos_list(self):
         all_videos = load_json(self.cache_dir, "videos")
@@ -1262,7 +1221,6 @@ def get_playlist_slug(playlist) -> str:
                 channel_description=channel_data["snippet"]["description"],
                 profile_path="profile.jpg",
                 banner_path="banner.jpg",
-                collection_type=self.collection_type,
                 main_playlist=main_playlist_slug,
                 playlist_count=len(self.playlists),
                 joined_date=channel_data["snippet"]["publishedAt"],