Shorts, lives and long videos in the UI + fix shorts display #378

Merged · 2 commits · Nov 26, 2024
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -7,8 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

-### Fixed
+### Changed
+
+- Differentiate user uploaded shorts, lives & long videos (#367)
+
+### Fixed

+- Corrected the short video resolution in the UI (#366)
- Check for empty playlists after filtering, and after downloading videos (#375)

## [3.2.1] - 2024-11-01
8 changes: 8 additions & 0 deletions CONTRIBUTING.md
@@ -101,3 +101,11 @@ yarn test:e2e
```

On Linux, you might need to install additional dependencies, see [Linux Prerequisites](https://docs.cypress.io/guides/getting-started/installing-cypress#Linux-Prerequisites) in the Cypress documentation.

## Running integration tests checking ZIM content

We have a bunch of integration tests checking ZIM content. Once you have the test ZIM from the openZIM channel (see instructions above for the Vue.JS ZIM UI), you can run the tests locally as well:

```
ZIM_FILE_PATH="output/openZIM_testing.zim" pytest scraper/tests-integration/integration.py
```
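
For iterating on a single test, pytest's usual selection flags combine with the same environment variable (a sketch; the `-k` expression is hypothetical and depends on the actual test names):

```
ZIM_FILE_PATH="output/openZIM_testing.zim" pytest -x -v scraper/tests-integration/integration.py -k "playlist"
```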
8 changes: 3 additions & 5 deletions scraper/src/youtube2zim/playlists/scraper.py
@@ -91,7 +91,9 @@ def run(self):
        (
            playlists,
            main_channel_id,
-            uploads_playlist_id,
+            user_long_uploads_playlist_id,
+            user_short_uploads_playlist_id,
+            user_lives_playlist_id,
            is_playlist,
        ) = extract_playlists_details_from(self.youtube_id)

@@ -106,10 +108,6 @@
        shutil.rmtree(self.build_dir, ignore_errors=True)

        for playlist in playlists:
-            if playlist.playlist_id == uploads_playlist_id:
-                logger.info(f"Skipping playlist {playlist.playlist_id} (uploads one)")
-                continue
-
            logger.info(f"Executing youtube2zim for playlist {playlist.playlist_id}")
            success, process = self.run_playlist_zim(playlist)
            if success:
5 changes: 4 additions & 1 deletion scraper/src/youtube2zim/schemas.py
@@ -105,7 +105,10 @@
    profile_path: str | None = None
    banner_path: str | None = None
    joined_date: str
-    main_playlist: str | None = None
+    first_playlist: str | None = None
+    user_long_uploads_playlist: str | None = None
+    user_short_uploads_playlist: str | None = None
+    user_lives_playlist: str | None = None
    playlist_count: int
66 changes: 24 additions & 42 deletions scraper/src/youtube2zim/scraper.py
@@ -170,7 +170,9 @@
        # process-related
        self.playlists = []
-        self.uploads_playlist_id = None
+        self.user_long_uploads_playlist_id = None
+        self.user_short_uploads_playlist_id = None
+        self.user_lives_playlist_id = None
        self.videos_ids = []
        self.video_ids_count = 0
        self.videos_processed = 0
@@ -229,194 +231,170 @@
    def is_single_channel(self):
        return len({pl.creator_id for pl in self.playlists}) == 1

-    @property
-    def sorted_playlists(self):
-        """sorted list of playlists (by title) but with Uploads one at first if any"""
-        if len(self.playlists) <= 1:
-            return self.playlists
-
-        sorted_playlists = sorted(self.playlists, key=lambda x: x.title)
-        index = 0
-        # make sure our Uploads, special playlist is first
-        if self.uploads_playlist_id:
-            try:
-                index = [
-                    index
-                    for index, p in enumerate(sorted_playlists)
-                    if p.playlist_id == self.uploads_playlist_id
-                ][-1]
-            except Exception:
-                index = 0
-        return (
-            [sorted_playlists[index]]
-            + sorted_playlists[0:index]
-            + sorted_playlists[index + 1 :]
-        )

    def run(self):
        """execute the scraper step by step"""

        try:
            # first report => creates a file with appropriate structure
            self.report_progress()

            # validate dateafter input
            self.validate_dateafter_input()

            if not self.name:
                raise Exception("name is mandatory")
            period = datetime.date.today().strftime("%Y-%m")
            self.fname = (
                self.fname.format(period=period)
                if self.fname
                else f"{self.name}_{period}.zim"
            )

            # check that we can create a ZIM file in the output directory
            validate_zimfile_creatable(self.output_dir, self.fname)

            # check that build_dir is correct
            if not self.build_dir.exists() or not self.build_dir.is_dir():
                raise OSError(f"Incorrect build_dir: {self.build_dir}")

            logger.info(f"starting youtube scraper for {self.youtube_id}")
            logger.info(f"preparing build folder at {self.build_dir.resolve()}")
            self.prepare_build_folder()

            logger.info("testing Youtube credentials")
            if not credentials_ok():
                raise ValueError(
                    "Unable to connect to Youtube API v3. check `API_KEY`."
                )

            if self.s3_url_with_credentials and not self.s3_credentials_ok():
                raise ValueError(
                    "Unable to connect to Optimization Cache. Check its URL."
                )

            # fail early if supplied branding files are missing
            self.check_branding_values()

            logger.info("compute playlists list to retrieve")
            self.extract_playlists()

            logger.info(
                ".. {} playlists:\n {}".format(
                    len(self.playlists),
                    "\n ".join([p.playlist_id for p in self.playlists]),
                )
            )

            logger.info("compute list of videos")
            self.extract_videos_list()

            self.video_ids_count = len(self.videos_ids)
            nb_videos_msg = f".. {self.video_ids_count} videos"
            if self.dateafter.start.year != 1:
                nb_videos_msg += (
                    f" in date range: {self.dateafter.start} - {datetime.date.today()}"
                )
            logger.info(f"{nb_videos_msg}.")

            # set a timer to report progress only every 10 seconds
            every(10).seconds.do(self.report_progress)

            logger.info("update general metadata")
            self.update_metadata()

            if not self.title:
                raise Exception("title is mandatory")
            if not self.description:
                raise Exception("description is mandatory")
            if not self.creator:
                raise Exception("creator is mandatory")

            # check that illustration is correct
            illustration = "favicon.png"
            illustration_path = self.build_dir / illustration
            if not illustration_path.exists() or not illustration_path.is_file():
                raise OSError(
                    f"Incorrect illustration: {illustration} ({illustration_path})"
                )
            with open(illustration_path, "rb") as fh:
                illustration_data = fh.read()

            logger.info("building ZIM file")
            self.zim_file = Creator(
                filename=self.output_dir / self.fname,
                main_path="index.html",
                ignore_duplicates=True,
                disable_metadata_checks=self.disable_metadata_checks,
            )
            self.zim_file.config_metadata(
                Name=self.name,
                Language=self.language,
                Title=self.title,
                Description=self.description,
                LongDescription=self.long_description,
                Creator=self.creator,
                Publisher=self.publisher,
                Tags=";".join(self.tags) if self.tags else "",
                Scraper=SCRAPER,
                Date=datetime.date.today(),
                Illustration_48x48_at_1=illustration_data,
            )
            self.zim_file.start()

            logger.debug(f"Preparing zimfile at {self.zim_file.filename}")

            logger.info("add main channel branding to ZIM")
            self.add_main_channel_branding_to_zim()

            logger.debug(f"add zimui files from {self.zimui_dist}")
            self.add_zimui()

            # download videos (and recompress)
            logger.info(
                "downloading all videos, subtitles and thumbnails "
                f"(concurrency={self.max_concurrency})"
            )
            logger.info(f" format: {self.video_format}")
            logger.info(f" quality: {self.video_quality}")
            logger.info(f" generated-subtitles: {self.all_subtitles}")
            if self.s3_storage:
                logger.info(
                    f" using cache: {self.s3_storage.url.netloc} "
                    f"with bucket: {self.s3_storage.bucket_name}"
                )
            succeeded, failed = self.download_video_files(
                max_concurrency=self.max_concurrency
            )
            if failed:
                logger.error(f"{len(failed)} video(s) failed to download: {failed}")
                if len(failed) >= len(succeeded):
                    logger.critical("More than half of videos failed. exiting")
                    raise OSError("Too many videos failed to download")

            logger.info("retrieve channel-info for all videos (author details)")
            get_videos_authors_info(succeeded)

            logger.info("download all author's profile pictures")
            self.download_authors_branding()

            logger.info("creating JSON files")
            self.make_json_files(succeeded)
        except KeyboardInterrupt:
            logger.error("KeyboardInterrupt, exiting.")
            return 1
        except Exception as exc:
            logger.error(f"Interrupting process due to error: {exc}")
            logger.exception(exc)
            return 1
        else:
            logger.info("Finishing ZIM file…")
            self.zim_file.finish()
        finally:
            self.report_progress()
            logger.info("removing temp folder")
            shutil.rmtree(self.build_dir, ignore_errors=True)

        logger.info("all done!")

    def add_zimui(self):
        logger.info(f"Adding files in {self.zimui_dist}")
@@ -552,7 +530,9 @@
        (
            self.playlists,
            self.main_channel_id,
-            self.uploads_playlist_id,
+            self.user_long_uploads_playlist_id,
+            self.user_short_uploads_playlist_id,
+            self.user_lives_playlist_id,
            self.is_playlist,
        ) = extract_playlists_details_from(self.youtube_id)

@@ -934,76 +914,76 @@
        if path.exists():
            self.add_file_to_zim(filename, path, callback=(delete_callback, path))

    def update_metadata(self):
        # we use title, description, profile and banner of channel/user
        # or channel of first playlist
        if not self.main_channel_id:
            raise Exception("main_channel_id is mandatory")
        try:
            main_channel_json = get_channel_json(self.main_channel_id)
        except KeyError:
            main_channel_json = {"snippet": {"title": "Unknown", "description": ""}}
        else:
            save_channel_branding(
                self.channels_dir, self.main_channel_id, save_banner=True
            )

        # if a single playlist was requested, use it for names;
        # otherwise, use main_channel's details.
        auto_title = (
            self.playlists[0].title
            if self.is_playlist and len(self.playlists) == 1
            else main_channel_json["snippet"]["title"].strip()
        )
        auto_description = (
            clean_text(self.playlists[0].description)
            if self.is_playlist and len(self.playlists) == 1
            else clean_text(main_channel_json["snippet"]["description"])
        ) or "-"
        self.title = self.title or auto_title or "-"
        self.description, self.long_description = compute_descriptions(
            default_description=auto_description,
            user_description=self.description,
            user_long_description=self.long_description,
        )

        if self.creator is None:
            if self.is_single_channel:
                self.creator = _("Youtube Channel “{title}”").format(
                    title=main_channel_json["snippet"]["title"]
                )
            else:
                self.creator = _("Youtube Channels")

        self.tags = self.tags or ["youtube"]
        if "_videos:yes" not in self.tags:
            self.tags.append("_videos:yes")

        # copy our main_channel branding into /(profile|banner).jpg if not supplied
        if not self.profile_path.exists():
            shutil.copy(
                self.channels_dir.joinpath(self.main_channel_id, "profile.jpg"),
                self.profile_path,
            )

        # set colors from images if not supplied
        if self.main_color is None or self.secondary_color is None:
            profile_main, profile_secondary = get_colors(self.profile_path)
            self.main_color = self.main_color or profile_main
            self.secondary_color = self.secondary_color or profile_secondary

        # convert profile image to png for favicon
        png_profile_path = self.build_dir.joinpath("profile.png")
        convert_image(self.profile_path, png_profile_path)

        resize_image(
            png_profile_path,
            width=48,
            height=48,
            method="thumbnail",
            dst=self.build_dir.joinpath("favicon.png"),
        )
        png_profile_path.unlink()


    def make_json_files(self, actual_videos_ids):
        """Generate JSON files to be consumed by the frontend"""
@@ -1045,6 +1025,7 @@
            author = videos_channels[video_id]
            subtitles_list = get_subtitles(video_id)
            channel_data = get_channel_json(author["channelId"])

            return Video(
                id=video_id,
                title=video["snippet"]["title"],
@@ -1151,10 +1132,13 @@
        )

        # write playlists JSON files
-        playlist_list = []
-        home_playlist_list = []
+        playlist_list: list[PlaylistPreview] = []
+        home_playlist_list: list[Playlist] = []

+        user_long_uploads_playlist_slug = None
+        user_short_uploads_playlist_slug = None
+        user_lives_playlist_slug = None

        main_playlist_slug = None
        empty_playlists = list(
            filter(lambda playlist: len(get_videos_list(playlist)) == 0, self.playlists)
        )
@@ -1167,10 +1151,6 @@
        if len(self.playlists) == 0:
            raise Exception("No playlist succeeded to download")

-        main_playlist_slug = get_playlist_slug(
-            self.playlists[0]
-        )  # set first playlist as main playlist
-
        for playlist in self.playlists:
            playlist_slug = get_playlist_slug(playlist)
            playlist_path = f"playlists/{playlist_slug}.json"
@@ -1195,16 +1175,15 @@
            # modify playlist object for preview on homepage
            playlist_obj.videos = playlist_obj.videos[:12]

-            if playlist.playlist_id == self.uploads_playlist_id:
-                main_playlist_slug = (
-                    playlist_slug  # set uploads playlist as main playlist
-                )
-                # insert uploads playlist at the beginning of the list
-                playlist_list.insert(0, generate_playlist_preview_object(playlist))
-                home_playlist_list.insert(0, playlist_obj)
+            home_playlist_list.append(playlist_obj)

+            if playlist.playlist_id == self.user_long_uploads_playlist_id:
+                user_long_uploads_playlist_slug = playlist_slug
+            elif playlist.playlist_id == self.user_short_uploads_playlist_id:
+                user_short_uploads_playlist_slug = playlist_slug
+            elif playlist.playlist_id == self.user_lives_playlist_id:
+                user_lives_playlist_slug = playlist_slug
            else:
                playlist_list.append(generate_playlist_preview_object(playlist))
-                home_playlist_list.append(playlist_obj)

        # write playlists.json file
        self.zim_file.add_item_for(
@@ -1241,7 +1220,10 @@
                channel_description=channel_data["snippet"]["description"],
                profile_path="profile.jpg",
                banner_path="banner.jpg",
-                main_playlist=main_playlist_slug,
+                first_playlist=home_playlist_list[0].slug,
+                user_long_uploads_playlist=user_long_uploads_playlist_slug,
+                user_short_uploads_playlist=user_short_uploads_playlist_slug,
+                user_lives_playlist=user_lives_playlist_slug,
                playlist_count=len(self.playlists),
                joined_date=channel_data["snippet"]["publishedAt"],
            ).model_dump_json(by_alias=True, indent=2),
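Taken together with the schemas.py change above, channel.json no longer advertises a single `main_playlist`: the UI gets a `first_playlist` plus one optional slug per upload type. A minimal sketch of the resulting payload (a guess at the serialized shape: camelCase field names per `by_alias=True`, slugs and counts invented for illustration):

```python
# Hypothetical channel.json after this PR; an upload type the channel
# does not have (here: lives) serializes as null.
channel_json = {
    "firstPlaylist": "uploads-from-openzim-testing",
    "userLongUploadsPlaylist": "uploads-from-openzim-testing",
    "userShortUploadsPlaylist": "short-uploads-from-openzim-testing",
    "userLivesPlaylist": None,
    "playlistCount": 4,
}
```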
57 changes: 46 additions & 11 deletions scraper/src/youtube2zim/youtube.py
@@ -56,6 +56,10 @@
    @classmethod
    def from_id(cls, playlist_id):
        playlist_json = get_playlist_json(playlist_id)
        if playlist_json is None:
            raise PlaylistNotFoundError(
                f"Invalid playlistId `{playlist_id}`: Not Found"
            )
        return Playlist(
            playlist_id=playlist_id,
            title=playlist_json["snippet"]["title"],
@@ -176,10 +180,13 @@
    req.raise_for_status()
    try:
        playlist_json = req.json()["items"][0]
+        total_results = req.json().get("pageInfo", {}).get("totalResults", 0)
+        if total_results == 0:
+            logger.error(f"Playlist `{playlist_id}`: No Item Available")
+            return None
    except IndexError:
-        raise PlaylistNotFoundError(
-            f"Invalid playlistId `{playlist_id}`: Not Found"
-        ) from None
+        logger.error(f"Invalid playlistId `{playlist_id}`: Not Found")
+        return None
    save_json(YOUTUBE.cache_dir, fname, playlist_json)
    return playlist_json

@@ -336,8 +343,9 @@
def extract_playlists_details_from(youtube_id: str):
    """prepare a list of Playlist from user request"""

-    uploads_playlist_id = None
-    main_channel_id = None
+    main_channel_id = user_long_uploads_playlist_id = user_short_uploads_playlist_id = (
+        user_lives_playlist_id
+    ) = None
    if "," not in youtube_id:
        try:
            # first try to consider passed ID is a channel ID (or username or handle)
@@ -347,11 +355,36 @@
            playlist_ids = [
                p["id"] for p in get_channel_playlists_json(main_channel_id)
            ]
-            # we always include uploads playlist (contains everything)
-            playlist_ids += [
-                channel_json["contentDetails"]["relatedPlaylists"]["uploads"]
-            ]
-            uploads_playlist_id = playlist_ids[-1]

+            # Get special playlists JSON objects
+            user_long_uploads_json = get_playlist_json("UULF" + main_channel_id[2:])
+            user_short_uploads_json = get_playlist_json("UUSH" + main_channel_id[2:])
+            user_lives_json = get_playlist_json("UULV" + main_channel_id[2:])

+            # Extract special playlists IDs if the JSON objects are not None
+            user_long_uploads_playlist_id = (
+                user_long_uploads_json["id"] if user_long_uploads_json else None
+            )
+            user_short_uploads_playlist_id = (
+                user_short_uploads_json["id"] if user_short_uploads_json else None
+            )
+            user_lives_playlist_id = user_lives_json["id"] if user_lives_json else None

+            # Add special playlists if they exist, in proper order
+            playlist_ids = (
+                list(
+                    filter(
+                        None,
+                        [
+                            user_long_uploads_playlist_id,
+                            user_short_uploads_playlist_id,
+                            user_lives_playlist_id,
+                        ],
+                    )
+                )
+                + playlist_ids
+            )

            is_playlist = False
        except ChannelNotFoundError:
            # channel not found, then ID should be a playlist
@@ -370,6 +403,8 @@
        # dict.fromkeys maintains the order of playlist_ids while removing duplicates
        [Playlist.from_id(playlist_id) for playlist_id in dict.fromkeys(playlist_ids)],
        main_channel_id,
-        uploads_playlist_id,
+        user_long_uploads_playlist_id,
+        user_short_uploads_playlist_id,
+        user_lives_playlist_id,
        is_playlist,
    )
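
The `UULF`/`UUSH`/`UULV` IDs used above rely on a YouTube convention: channel IDs start with `UC`, and swapping that prefix selects the channel's auto-generated per-type uploads playlists (plain `UU…` being the combined uploads playlist the old code took from `relatedPlaylists`). A self-contained sketch of the derivation, mirroring the `main_channel_id[2:]` slicing in this diff:

```python
# Derive the per-type uploads playlist IDs from a "UC..." channel ID by
# swapping the prefix; any of the three playlists may not exist for a
# given channel, which is why get_playlist_json now returns None
# instead of raising.
def special_playlist_ids(channel_id: str) -> dict[str, str]:
    if not channel_id.startswith("UC"):
        raise ValueError(f"Not a channel ID: {channel_id}")
    suffix = channel_id[2:]
    return {
        "long": f"UULF{suffix}",  # regular long-form uploads
        "short": f"UUSH{suffix}",  # Shorts
        "lives": f"UULV{suffix}",  # past live streams
    }
```

Callers then drop the `None` entries, exactly as the `filter(None, [...])` above prepends only the special playlists that exist.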