Shorts, lives and long videos in the UI + fix shorts display #378

Merged · 2 commits · Nov 26, 2024
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -7,8 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

-### Fixed
+### Changed
+
+- Differentiate user uploaded shorts, lives & long videos (#367)
+
+### Fixed

+- Corrected the short video resolution in the UI (#366)
- Check for empty playlists after filtering, and after downloading videos (#375)

## [3.2.1] - 2024-11-01
8 changes: 8 additions & 0 deletions CONTRIBUTING.md
@@ -101,3 +101,11 @@ yarn test:e2e
```

On Linux, you might need to install additional dependencies, see [Linux Prerequisites](https://docs.cypress.io/guides/getting-started/installing-cypress#Linux-Prerequisites) in the Cypress documentation.

## Running integration tests checking ZIM content

We have a bunch of integration tests checking ZIM content. Once you have the test ZIM from the openZIM channel (see instructions above for the Vue.JS ZIM UI), you can run the tests locally as well:

```
ZIM_FILE_PATH="output/openZIM_testing.zim" pytest scraper/tests-integration/integration.py
```
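
For iterating on a single test, pytest's usual selection flags combine with the same environment variable (a sketch; the `-k` expression is hypothetical and depends on the actual test names):

```
ZIM_FILE_PATH="output/openZIM_testing.zim" pytest -x -v scraper/tests-integration/integration.py -k "playlist"
```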
8 changes: 3 additions & 5 deletions scraper/src/youtube2zim/playlists/scraper.py
@@ -91,7 +91,9 @@ def run(self):
        (
            playlists,
            main_channel_id,
-            uploads_playlist_id,
+            user_long_uploads_playlist_id,
+            user_short_uploads_playlist_id,
+            user_lives_playlist_id,
            is_playlist,
        ) = extract_playlists_details_from(self.youtube_id)

@@ -106,10 +108,6 @@
        shutil.rmtree(self.build_dir, ignore_errors=True)

        for playlist in playlists:
-            if playlist.playlist_id == uploads_playlist_id:
-                logger.info(f"Skipping playlist {playlist.playlist_id} (uploads one)")
-                continue
-
            logger.info(f"Executing youtube2zim for playlist {playlist.playlist_id}")
            success, process = self.run_playlist_zim(playlist)
            if success:
5 changes: 4 additions & 1 deletion scraper/src/youtube2zim/schemas.py
@@ -105,7 +105,10 @@
    profile_path: str | None = None
    banner_path: str | None = None
    joined_date: str
-    main_playlist: str | None = None
+    first_playlist: str | None = None
+    user_long_uploads_playlist: str | None = None
+    user_short_uploads_playlist: str | None = None
+    user_lives_playlist: str | None = None
    playlist_count: int
66 changes: 24 additions & 42 deletions scraper/src/youtube2zim/scraper.py
@@ -170,7 +170,9 @@
        # process-related
        self.playlists = []
-        self.uploads_playlist_id = None
+        self.user_long_uploads_playlist_id = None
+        self.user_short_uploads_playlist_id = None
+        self.user_lives_playlist_id = None
        self.videos_ids = []
        self.video_ids_count = 0
        self.videos_processed = 0
@@ -229,194 +231,170 @@
    def is_single_channel(self):
        return len({pl.creator_id for pl in self.playlists}) == 1

-    @property
-    def sorted_playlists(self):
-        """sorted list of playlists (by title) but with Uploads one at first if any"""
-        if len(self.playlists) <= 1:
-            return self.playlists
-
-        sorted_playlists = sorted(self.playlists, key=lambda x: x.title)
-        index = 0
-        # make sure our Uploads, special playlist is first
-        if self.uploads_playlist_id:
-            try:
-                index = [
-                    index
-                    for index, p in enumerate(sorted_playlists)
-                    if p.playlist_id == self.uploads_playlist_id
-                ][-1]
-            except Exception:
-                index = 0
-        return (
-            [sorted_playlists[index]]
-            + sorted_playlists[0:index]
-            + sorted_playlists[index + 1 :]
-        )

    def run(self):
        """execute the scraper step by step"""

        try:
            # first report => creates a file with appropriate structure
            self.report_progress()

            # validate dateafter input
            self.validate_dateafter_input()

            if not self.name:
                raise Exception("name is mandatory")
            period = datetime.date.today().strftime("%Y-%m")
            self.fname = (
                self.fname.format(period=period)
                if self.fname
                else f"{self.name}_{period}.zim"
            )

            # check that we can create a ZIM file in the output directory
            validate_zimfile_creatable(self.output_dir, self.fname)

            # check that build_dir is correct
            if not self.build_dir.exists() or not self.build_dir.is_dir():
                raise OSError(f"Incorrect build_dir: {self.build_dir}")

            logger.info(f"starting youtube scraper for {self.youtube_id}")
            logger.info(f"preparing build folder at {self.build_dir.resolve()}")
            self.prepare_build_folder()

            logger.info("testing Youtube credentials")
            if not credentials_ok():
                raise ValueError(
                    "Unable to connect to Youtube API v3. check `API_KEY`."
                )

            if self.s3_url_with_credentials and not self.s3_credentials_ok():
                raise ValueError(
                    "Unable to connect to Optimization Cache. Check its URL."
                )

            # fail early if supplied branding files are missing
            self.check_branding_values()

            logger.info("compute playlists list to retrieve")
            self.extract_playlists()

            logger.info(
                ".. {} playlists:\n {}".format(
                    len(self.playlists),
                    "\n ".join([p.playlist_id for p in self.playlists]),
                )
            )

            logger.info("compute list of videos")
            self.extract_videos_list()

            self.video_ids_count = len(self.videos_ids)
            nb_videos_msg = f".. {self.video_ids_count} videos"
            if self.dateafter.start.year != 1:
                nb_videos_msg += (
                    f" in date range: {self.dateafter.start} - {datetime.date.today()}"
                )
            logger.info(f"{nb_videos_msg}.")

            # set a timer to report progress only every 10 seconds
            every(10).seconds.do(self.report_progress)

            logger.info("update general metadata")
            self.update_metadata()

            if not self.title:
                raise Exception("title is mandatory")
            if not self.description:
                raise Exception("description is mandatory")
            if not self.creator:
                raise Exception("creator is mandatory")

            # check that illustration is correct
            illustration = "favicon.png"
            illustration_path = self.build_dir / illustration
            if not illustration_path.exists() or not illustration_path.is_file():
                raise OSError(
                    f"Incorrect illustration: {illustration} ({illustration_path})"
                )
            with open(illustration_path, "rb") as fh:
                illustration_data = fh.read()

            logger.info("building ZIM file")
            self.zim_file = Creator(
                filename=self.output_dir / self.fname,
                main_path="index.html",
                ignore_duplicates=True,
                disable_metadata_checks=self.disable_metadata_checks,
            )
            self.zim_file.config_metadata(
                Name=self.name,
                Language=self.language,
                Title=self.title,
                Description=self.description,
                LongDescription=self.long_description,
                Creator=self.creator,
                Publisher=self.publisher,
                Tags=";".join(self.tags) if self.tags else "",
                Scraper=SCRAPER,
                Date=datetime.date.today(),
                Illustration_48x48_at_1=illustration_data,
            )
            self.zim_file.start()

            logger.debug(f"Preparing zimfile at {self.zim_file.filename}")

            logger.info("add main channel branding to ZIM")
            self.add_main_channel_branding_to_zim()

            logger.debug(f"add zimui files from {self.zimui_dist}")
            self.add_zimui()

            # download videos (and recompress)
            logger.info(
                "downloading all videos, subtitles and thumbnails "
                f"(concurrency={self.max_concurrency})"
            )
            logger.info(f" format: {self.video_format}")
            logger.info(f" quality: {self.video_quality}")
            logger.info(f" generated-subtitles: {self.all_subtitles}")
            if self.s3_storage:
                logger.info(
                    f" using cache: {self.s3_storage.url.netloc} "
                    f"with bucket: {self.s3_storage.bucket_name}"
                )
            succeeded, failed = self.download_video_files(
                max_concurrency=self.max_concurrency
            )
            if failed:
                logger.error(f"{len(failed)} video(s) failed to download: {failed}")
                if len(failed) >= len(succeeded):
                    logger.critical("More than half of videos failed. exiting")
                    raise OSError("Too many videos failed to download")

            logger.info("retrieve channel-info for all videos (author details)")
            get_videos_authors_info(succeeded)

            logger.info("download all author's profile pictures")
            self.download_authors_branding()

            logger.info("creating JSON files")
            self.make_json_files(succeeded)
        except KeyboardInterrupt:
            logger.error("KeyboardInterrupt, exiting.")
            return 1
        except Exception as exc:
            logger.error(f"Interrupting process due to error: {exc}")
            logger.exception(exc)
            return 1
        else:
            logger.info("Finishing ZIM file…")
            self.zim_file.finish()
        finally:
            self.report_progress()
            logger.info("removing temp folder")
            shutil.rmtree(self.build_dir, ignore_errors=True)

        logger.info("all done!")

    def add_zimui(self):
        logger.info(f"Adding files in {self.zimui_dist}")
@@ -552,7 +530,9 @@
        (
            self.playlists,
            self.main_channel_id,
-            self.uploads_playlist_id,
+            self.user_long_uploads_playlist_id,
+            self.user_short_uploads_playlist_id,
+            self.user_lives_playlist_id,
            self.is_playlist,
        ) = extract_playlists_details_from(self.youtube_id)

@@ -934,76 +914,76 @@
        if path.exists():
            self.add_file_to_zim(filename, path, callback=(delete_callback, path))

    def update_metadata(self):
        # we use title, description, profile and banner of channel/user
        # or channel of first playlist
        if not self.main_channel_id:
            raise Exception("main_channel_id is mandatory")
        try:
            main_channel_json = get_channel_json(self.main_channel_id)
        except KeyError:
            main_channel_json = {"snippet": {"title": "Unknown", "description": ""}}
        else:
            save_channel_branding(
                self.channels_dir, self.main_channel_id, save_banner=True
            )

        # if a single playlist was requested, use it for names;
        # otherwise, use main_channel's details.
        auto_title = (
            self.playlists[0].title
            if self.is_playlist and len(self.playlists) == 1
            else main_channel_json["snippet"]["title"].strip()
        )
        auto_description = (
            clean_text(self.playlists[0].description)
            if self.is_playlist and len(self.playlists) == 1
            else clean_text(main_channel_json["snippet"]["description"])
        ) or "-"
        self.title = self.title or auto_title or "-"
        self.description, self.long_description = compute_descriptions(
            default_description=auto_description,
            user_description=self.description,
            user_long_description=self.long_description,
        )

        if self.creator is None:
            if self.is_single_channel:
                self.creator = _("Youtube Channel “{title}”").format(
                    title=main_channel_json["snippet"]["title"]
                )
            else:
                self.creator = _("Youtube Channels")

        self.tags = self.tags or ["youtube"]
        if "_videos:yes" not in self.tags:
            self.tags.append("_videos:yes")

        # copy our main_channel branding into /(profile|banner).jpg if not supplied
        if not self.profile_path.exists():
            shutil.copy(
                self.channels_dir.joinpath(self.main_channel_id, "profile.jpg"),
                self.profile_path,
            )

        # set colors from images if not supplied
        if self.main_color is None or self.secondary_color is None:
            profile_main, profile_secondary = get_colors(self.profile_path)
            self.main_color = self.main_color or profile_main
            self.secondary_color = self.secondary_color or profile_secondary

        # convert profile image to png for favicon
        png_profile_path = self.build_dir.joinpath("profile.png")
        convert_image(self.profile_path, png_profile_path)

        resize_image(
            png_profile_path,
            width=48,
            height=48,
            method="thumbnail",
            dst=self.build_dir.joinpath("favicon.png"),
        )
        png_profile_path.unlink()


    def make_json_files(self, actual_videos_ids):
        """Generate JSON files to be consumed by the frontend"""
@@ -1045,6 +1025,7 @@
            author = videos_channels[video_id]
            subtitles_list = get_subtitles(video_id)
            channel_data = get_channel_json(author["channelId"])

            return Video(
                id=video_id,
                title=video["snippet"]["title"],
@@ -1151,10 +1132,13 @@
        )

        # write playlists JSON files
-        playlist_list = []
-        home_playlist_list = []
+        playlist_list: list[PlaylistPreview] = []
+        home_playlist_list: list[Playlist] = []

+        user_long_uploads_playlist_slug = None
+        user_short_uploads_playlist_slug = None
+        user_lives_playlist_slug = None

        main_playlist_slug = None
        empty_playlists = list(
            filter(lambda playlist: len(get_videos_list(playlist)) == 0, self.playlists)
        )
@@ -1167,10 +1151,6 @@
        if len(self.playlists) == 0:
            raise Exception("No playlist succeeded to download")

-        main_playlist_slug = get_playlist_slug(
-            self.playlists[0]
-        )  # set first playlist as main playlist
-
        for playlist in self.playlists:
            playlist_slug = get_playlist_slug(playlist)
            playlist_path = f"playlists/{playlist_slug}.json"
@@ -1195,16 +1175,15 @@
            # modify playlist object for preview on homepage
            playlist_obj.videos = playlist_obj.videos[:12]

-            if playlist.playlist_id == self.uploads_playlist_id:
-                main_playlist_slug = (
-                    playlist_slug  # set uploads playlist as main playlist
-                )
-                # insert uploads playlist at the beginning of the list
-                playlist_list.insert(0, generate_playlist_preview_object(playlist))
-                home_playlist_list.insert(0, playlist_obj)
+            home_playlist_list.append(playlist_obj)

+            if playlist.playlist_id == self.user_long_uploads_playlist_id:
+                user_long_uploads_playlist_slug = playlist_slug
+            elif playlist.playlist_id == self.user_short_uploads_playlist_id:
+                user_short_uploads_playlist_slug = playlist_slug
+            elif playlist.playlist_id == self.user_lives_playlist_id:
+                user_lives_playlist_slug = playlist_slug
            else:
                playlist_list.append(generate_playlist_preview_object(playlist))
-                home_playlist_list.append(playlist_obj)

        # write playlists.json file
        self.zim_file.add_item_for(
@@ -1241,7 +1220,10 @@
                channel_description=channel_data["snippet"]["description"],
                profile_path="profile.jpg",
                banner_path="banner.jpg",
-                main_playlist=main_playlist_slug,
+                first_playlist=home_playlist_list[0].slug,
+                user_long_uploads_playlist=user_long_uploads_playlist_slug,
+                user_short_uploads_playlist=user_short_uploads_playlist_slug,
+                user_lives_playlist=user_lives_playlist_slug,
                playlist_count=len(self.playlists),
                joined_date=channel_data["snippet"]["publishedAt"],
            ).model_dump_json(by_alias=True, indent=2),
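Taken together with the schemas.py change above, channel.json no longer advertises a single `main_playlist`: the UI gets a `first_playlist` plus one optional slug per upload type. A minimal sketch of the resulting payload (a guess at the serialized shape: camelCase field names per `by_alias=True`, slugs and counts invented for illustration):

```python
# Hypothetical channel.json after this PR; an upload type the channel
# does not have (here: lives) serializes as null.
channel_json = {
    "firstPlaylist": "uploads-from-openzim-testing",
    "userLongUploadsPlaylist": "uploads-from-openzim-testing",
    "userShortUploadsPlaylist": "short-uploads-from-openzim-testing",
    "userLivesPlaylist": None,
    "playlistCount": 4,
}
```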
57 changes: 46 additions & 11 deletions scraper/src/youtube2zim/youtube.py
@@ -56,6 +56,10 @@
    @classmethod
    def from_id(cls, playlist_id):
        playlist_json = get_playlist_json(playlist_id)
        if playlist_json is None:
            raise PlaylistNotFoundError(
                f"Invalid playlistId `{playlist_id}`: Not Found"
            )
        return Playlist(
            playlist_id=playlist_id,
            title=playlist_json["snippet"]["title"],
@@ -176,10 +180,13 @@
    req.raise_for_status()
    try:
        playlist_json = req.json()["items"][0]
+        total_results = req.json().get("pageInfo", {}).get("totalResults", 0)
+        if total_results == 0:
+            logger.error(f"Playlist `{playlist_id}`: No Item Available")
+            return None
    except IndexError:
-        raise PlaylistNotFoundError(
-            f"Invalid playlistId `{playlist_id}`: Not Found"
-        ) from None
+        logger.error(f"Invalid playlistId `{playlist_id}`: Not Found")
+        return None
    save_json(YOUTUBE.cache_dir, fname, playlist_json)
    return playlist_json

@@ -336,8 +343,9 @@
def extract_playlists_details_from(youtube_id: str):
    """prepare a list of Playlist from user request"""

-    uploads_playlist_id = None
-    main_channel_id = None
+    main_channel_id = user_long_uploads_playlist_id = user_short_uploads_playlist_id = (
+        user_lives_playlist_id
+    ) = None
    if "," not in youtube_id:
        try:
            # first try to consider passed ID is a channel ID (or username or handle)
@@ -347,11 +355,36 @@
            playlist_ids = [
                p["id"] for p in get_channel_playlists_json(main_channel_id)
            ]
-            # we always include uploads playlist (contains everything)
-            playlist_ids += [
-                channel_json["contentDetails"]["relatedPlaylists"]["uploads"]
-            ]
-            uploads_playlist_id = playlist_ids[-1]

+            # Get special playlists JSON objects
+            user_long_uploads_json = get_playlist_json("UULF" + main_channel_id[2:])
+            user_short_uploads_json = get_playlist_json("UUSH" + main_channel_id[2:])
+            user_lives_json = get_playlist_json("UULV" + main_channel_id[2:])

+            # Extract special playlists IDs if the JSON objects are not None
+            user_long_uploads_playlist_id = (
+                user_long_uploads_json["id"] if user_long_uploads_json else None
+            )
+            user_short_uploads_playlist_id = (
+                user_short_uploads_json["id"] if user_short_uploads_json else None
+            )
+            user_lives_playlist_id = user_lives_json["id"] if user_lives_json else None

+            # Add special playlists if they exist, in proper order
+            playlist_ids = (
+                list(
+                    filter(
+                        None,
+                        [
+                            user_long_uploads_playlist_id,
+                            user_short_uploads_playlist_id,
+                            user_lives_playlist_id,
+                        ],
+                    )
+                )
+                + playlist_ids
+            )

            is_playlist = False
        except ChannelNotFoundError:
            # channel not found, then ID should be a playlist
@@ -370,6 +403,8 @@
        # dict.fromkeys maintains the order of playlist_ids while removing duplicates
        [Playlist.from_id(playlist_id) for playlist_id in dict.fromkeys(playlist_ids)],
        main_channel_id,
-        uploads_playlist_id,
+        user_long_uploads_playlist_id,
+        user_short_uploads_playlist_id,
+        user_lives_playlist_id,
        is_playlist,
    )
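
The `UULF`/`UUSH`/`UULV` IDs used above rely on a YouTube convention: channel IDs start with `UC`, and swapping that prefix selects the channel's auto-generated per-type uploads playlists (plain `UU…` being the combined uploads playlist the old code took from `relatedPlaylists`). A self-contained sketch of the derivation, mirroring the `main_channel_id[2:]` slicing in this diff:

```python
# Derive the per-type uploads playlist IDs from a "UC..." channel ID by
# swapping the prefix; any of the three playlists may not exist for a
# given channel, which is why get_playlist_json now returns None
# instead of raising.
def special_playlist_ids(channel_id: str) -> dict[str, str]:
    if not channel_id.startswith("UC"):
        raise ValueError(f"Not a channel ID: {channel_id}")
    suffix = channel_id[2:]
    return {
        "long": f"UULF{suffix}",  # regular long-form uploads
        "short": f"UUSH{suffix}",  # Shorts
        "lives": f"UULV{suffix}",  # past live streams
    }
```

Callers then drop the `None` entries, exactly as the `filter(None, [...])` above prepends only the special playlists that exist.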