From 12b8812cf098887606ed5210a799c6f41aa0f238 Mon Sep 17 00:00:00 2001 From: Gudsfile Date: Sat, 12 Oct 2024 10:33:40 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Add=20new=20metrics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 5 +-- elastic/README_ELASTIC.md | 65 +++++++++++++++++------------------ sploty/app.py | 14 ++++++-- sploty/audio_features.py | 11 ++++-- sploty/metrics.py | 71 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 124 insertions(+), 42 deletions(-) create mode 100644 sploty/metrics.py diff --git a/README.md b/README.md index e032287..a5e9dba 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,8 @@ The app will : 4. Enrich spotify audio features with `sploty/audio_features.py` - The Spotify API is used at this stage, don't forget to [configure it](#spotify) - A `json database` ([TinyDB](https://github.com/msiemens/tinydb)) is used at this stage to reduce Spotify API calls by storing tracks data -5. Index their to elastic with `sploty/to_elastic.py` +5. Add additional metrics with `sploty/metrics.py` +6. Index their to elastic with `sploty/to_elastic.py` - Elasticsearch is used at this stage, don't forget to [configure it](#elasticsearch) #### How to display the help message? @@ -116,7 +117,7 @@ poetry run python sploty/app.py … --previous-enriched-streaming-history-path y Use the `-no-` options ```shell -poetry run python sploty/app.py … --no-concat --no-filter --no-enrich --no-feature --no-elastic +poetry run python sploty/app.py … --no-concat --no-filter --no-enrich --no-feature --no-metric --no-elastic ``` #### How to increase or reduce the number of lines processed at once? 
diff --git a/elastic/README_ELASTIC.md b/elastic/README_ELASTIC.md index 6ee1c23..2e6f34e 100644 --- a/elastic/README_ELASTIC.md +++ b/elastic/README_ELASTIC.md @@ -17,14 +17,19 @@ PUT _ingest/pipeline/spotify-stream-pipeline {"rename": { "field": "track_duration_ms", "target_field": "track.duration_ms", "ignore_missing": true }}, {"rename": { "field": "track_popularity", "target_field": "track.popularity", "ignore_missing": true }}, {"rename": { "field": "track_is_in_library", "target_field": "track.is_in_library", "ignore_missing": true }}, - {"rename": { "field": "track_is_unplayable", "target_field": "track.is_unplayable", "ignore_missing": true }}, + {"rename": { "field": "track_is_explicit", "target_field": "track.is_explicit", "ignore_missing": true }}, + {"rename": { "field": "track_is_local", "target_field": "track.is_local", "ignore_missing": true }}, + {"rename": { "field": "track_is_playable", "target_field": "track.is_playable", "ignore_missing": true }}, {"rename": { "field": "album_uri", "target_field": "album.uri", "ignore_missing": true }}, + {"rename": { "field": "album_name", "target_field": "album.name", "ignore_missing": true }}, + {"rename": { "field": "album_type", "target_field": "album.type", "ignore_missing": true }}, + {"rename": { "field": "album_release_date", "target_field": "album.release_date", "ignore_missing": true }}, {"rename": { "field": "stream_username", "target_field": "stream_context.username", "ignore_missing": true }}, {"rename": { "field": "stream_platform", "target_field": "stream_context.platform", "ignore_missing": true }}, + {"rename": { "field": "stream_normalized_platform", "target_field": "stream_context.normalized_platform", "ignore_missing": true }}, {"rename": { "field": "stream_conn_country", "target_field": "stream_context.conn_country", "ignore_missing": true }}, {"rename": { "field": "stream_ip_addr_decrypted", "target_field": "stream_context.ip_addr_decrypted", "ignore_missing": true }}, {"rename": { 
"field": "stream_user_agent_decrypted", "target_field": "stream_context.user_agent_decrypted", "ignore_missing": true }}, - {"rename": { "field": "stream_album_name", "target_field": "album.name", "ignore_missing": true }}, {"rename": { "field": "stream_reason_start", "target_field": "stream_context.reason_start", "ignore_missing": true }}, {"rename": { "field": "stream_reason_end", "target_field": "stream_context.reason_end", "ignore_missing": true }}, {"rename": { "field": "stream_shuffle", "target_field": "stream_context.shuffle", "ignore_missing": true }}, @@ -44,36 +49,18 @@ PUT _ingest/pipeline/spotify-stream-pipeline {"rename": { "field": "track_audio_feature_valence", "target_field": "audio_features.valence", "ignore_missing": true }}, {"rename": { "field": "track_audio_feature_tempo", "target_field": "audio_features.tempo", "ignore_missing": true }}, {"rename": { "field": "track_audio_feature_time_signature", "target_field": "audio_features.time_signature", "ignore_missing": true }}, - {"remove": { "field": ["track_src_id", "location"], "ignore_missing": true }}, {"user_agent": { "field": "stream_context.user_agent_decrypted", "ignore_missing": true }}, - { - "script": { - "source": """ - String platform = ctx['stream_context']['platform']; - String lcp = platform.toLowerCase(); - if (lcp.startsWith('ios') || lcp.startsWith('partner ios_sdk')) { - platform = 'iOS'; - } else if (lcp.startsWith('os x') || lcp.startsWith('osx')) { - platform = 'OS X'; - } else if (lcp.startsWith('partner sonos_')) { - platform = 'Sonos'; - } else if (lcp.startsWith('partner google cast_tv') || lcp.startsWith('partner google cast')) { - platform = 'Chromecast'; - } else if (lcp.startsWith('partner android_tv')) { - platform = 'Android TV'; - } else if (lcp.startsWith('android os') || lcp.startsWith('android [arm 0]')|| lcp.startsWith('android-tablet os')) { - platform = 'Android OS'; - } else if (lcp.startsWith('webplayer') || lcp.startsWith('web_player') || 
lcp.startsWith('partner spotify web_player')) { - platform = 'WebPlayer'; - } else if (lcp.startsWith('partner webos_tv') || lcp.startsWith('WebOs TV')) { - platform = 'WebOs TV'; - } else if (lcp.startsWith('windows')) { - platform = 'Windows'; - } - ctx['stream_context']['normalized_platform'] = platform - """ - } - } + {"convert": { "field": "stream_context.offline", "type": "boolean", "ignore_missing": true }}, + {"convert": { "field": "stream_context.shuffle", "type": "boolean", "ignore_missing": true }}, + {"convert": { "field": "stream_context.incognito_mode", "type": "boolean", "ignore_missing": true }}, + {"convert": { "field": "stream_context.skipped", "type": "boolean", "ignore_missing": true }}, + {"convert": { "field": "track.is_in_library", "type": "boolean", "ignore_missing": true }}, + {"convert": { "field": "track.is_explicit", "type": "boolean", "ignore_missing": true }}, + {"convert": { "field": "track.is_local", "type": "boolean", "ignore_missing": true }}, + {"convert": { "field": "track.is_playable", "type": "boolean", "ignore_missing": true }}, + {"convert": { "field": "is_new_track", "type": "boolean", "ignore_missing": true }}, + {"convert": { "field": "is_new_artist", "type": "boolean", "ignore_missing": true }}, + {"convert": { "field": "is_new_album", "type": "boolean", "ignore_missing": true }} ] } ``` @@ -101,7 +88,8 @@ PUT _component_template/spotify-stream-mapping "danceability": {"type": "integer"}, "key": {"type": "integer"}, "speechiness": {"type": "integer"}, - "energy": {"type": "integer"} + "energy": {"type": "integer"}, + "time_signature": {"type": "integer"} } }, "stream_context": { @@ -132,7 +120,9 @@ PUT _component_template/spotify-stream-mapping "album": { "properties": { "name": {"type": "keyword"}, - "uri": {"type": "keyword" } + "uri": {"type": "keyword"}, + "type": {"type": "keyword"}, + "release_date": {"format": "year||year_month||year_month_day", "type": "date"} } }, "track": { @@ -142,7 +132,9 @@ PUT 
_component_template/spotify-stream-mapping "name": {"type": "keyword"}, "uri": {"type": "keyword"}, "is_in_library": {"type": "boolean"}, - "is_unplayable": {"type": "boolean" } + "is_explicit": {"type": "boolean"}, + "is_local": {"type": "boolean"}, + "is_playable": {"type": "boolean"} } }, "end_time": {"format": "yyyy-MM-dd'T'HH:mm:ss'Z'", "type": "date"}, @@ -150,7 +142,10 @@ PUT _component_template/spotify-stream-mapping "day_name": {"type": "keyword"}, "month_name": {"type": "keyword"}, "ms_played": {"type": "long"}, - "min_played": {"type": "long"} + "min_played": {"type": "long"}, + "is_new_track": {"type": "boolean"}, + "is_new_artist": {"type": "boolean"}, + "is_new_album": {"type": "boolean"} } } } diff --git a/sploty/app.py b/sploty/app.py index 5e11b78..ed1ef70 100644 --- a/sploty/app.py +++ b/sploty/app.py @@ -6,7 +6,7 @@ from pydantic import Field, HttpUrl, v1 from pydantic_settings import BaseSettings -from sploty import audio_features, concat, enrich, filter, to_elastic +from sploty import audio_features, concat, enrich, filter, metrics, to_elastic from sploty.settings import logger @@ -23,6 +23,7 @@ class Arguments(v1.BaseModel): filter: bool = v1.Field(default=True) enrich: bool = v1.Field(default=True) feature: bool = v1.Field(default=True) + metric: bool = v1.Field(default=True) elastic: bool = v1.Field(default=True) @@ -55,7 +56,8 @@ def main() -> None: concated_streaming_history_path = Path(f"{resources_path}/sploty_concated_history.csv") to_enrich_streaming_history_path = Path(f"{resources_path}/sploty_filtered_history.csv") enriched_streaming_history_path = Path(f"{resources_path}/sploty_enriched_history.csv") - featured_streaming_history_path = Path(f"{resources_path}/sploty_featured_history.csv") # noqa: F841 + featured_streaming_history_path = Path(f"{resources_path}/sploty_featured_history.csv") + metrics_streaming_history_path = Path(f"{resources_path}/sploty_metrics_history.csv") db_path = args.db_path audio_features_db_path = 
Path(f"{db_path}/tracks.json") @@ -117,12 +119,18 @@ def main() -> None: audio_features.main( to_enrich_streaming_history_path, enriched_streaming_history_path, + featured_streaming_history_path, args.chunk_size, spotify_api_params, db, ) else: logger.info("skip") + logger.info("============== METRICS =============") + if args.metric: + metrics.main(featured_streaming_history_path, metrics_streaming_history_path) + else: + logger.info("skip") logger.info("============== ELASTIC =============") if args.elastic: elastic = to_elastic.get_elastic( @@ -131,7 +139,7 @@ def main() -> None: env.elastic_pass, args.elastic_timeout, ) - to_elastic.main(enriched_streaming_history_path, args.index_name, elastic) + to_elastic.main(metrics_streaming_history_path, args.index_name, elastic) else: logger.info("skip") diff --git a/sploty/audio_features.py b/sploty/audio_features.py index 35b3242..94dc764 100644 --- a/sploty/audio_features.py +++ b/sploty/audio_features.py @@ -89,7 +89,14 @@ def completes_streams_with_audio_features(df_left, left_key, df_right, right_key # len(db.all())#noqa: ERA001 -def main(to_enrich_path: list, enriched_path: str, chunk_size: int, spotify_api_params: SpotifyApiParams, db: TinyDB): +def main( # noqa: PLR0913 + to_enrich_path: list, + enriched_path: str, + featured_path: str, + chunk_size: int, + spotify_api_params: SpotifyApiParams, + db: TinyDB, +): # get the audio features of tracks saved it in the TinyDb df_stream = pd.read_csv(to_enrich_path) logger.info("%i streams", len(df_stream)) @@ -122,7 +129,7 @@ def main(to_enrich_path: list, enriched_path: str, chunk_size: int, spotify_api_ df_audio_features, "id", ) - df_completed_streams.to_csv(enriched_path, mode="w", index=False) + df_completed_streams.to_csv(featured_path, mode="w", index=False) logger.info( "%i rows are re-saved at %s with audio features completed", len(df_completed_streams), diff --git a/sploty/metrics.py b/sploty/metrics.py new file mode 100644 index 0000000..5102795 --- 
"""Derive additional per-stream metrics from the featured streaming history."""

import logging
from pathlib import Path

import pandas as pd
from pandas import DatetimeIndex

logger = logging.getLogger(__name__)

# Lower-case platform prefixes mapped to the label used for reporting.
# Insertion order matters: when several prefixes match, the first one wins.
_PLATFORM_PREFIXES = {
    "android os": "Android OS",
    "android [arm 0]": "Android OS",
    "android-tablet os": "Android OS",
    "partner android_tv": "Android TV",
    "partner google cast": "Chromecast",
    "ios": "iOS",
    "partner ios": "iOS",
    "osx": "MacOS",
    "os x": "MacOS",
    "sonos_": "Sonos",
    "partner sonos": "Sonos",
    "webos tv": "WebOS TV",
    # Same label as "webos tv" so both spellings land in one bucket.
    "partner webos_tv": "WebOS TV",
    "webplayer": "WebPlayer",
    "web_player": "WebPlayer",
    "partner spotify web_player": "WebPlayer",
    "windows": "Windows",
    "not_applicable": "not_applicable",
}


def normalize_platform(platform: str) -> str:
    """Return a short, canonical label for a raw platform string.

    The raw value is compared case-insensitively against the known prefixes.

    Args:
        platform: Raw platform value of a stream.

    Returns:
        The label of the first matching prefix. If nothing matches, the raw
        value is returned unchanged and a warning is logged. Non-string
        values (for example the NaN pandas uses for an empty CSV cell) are
        returned unchanged instead of raising.
    """
    if not isinstance(platform, str):
        # Missing values cannot be matched; pass them through untouched.
        return platform

    lowered = platform.lower()
    matches = [label for prefix, label in _PLATFORM_PREFIXES.items() if lowered.startswith(prefix)]

    if not matches:
        logger.warning("There is no match for the `%s` platform", platform)
        return platform
    if len(matches) > 1:
        logger.warning(
            "There are several matches for the `%s` platform: %s (the first one is taken)",
            platform,
            matches,
        )
    return matches[0]


def main(enriched_path: Path, metrics_path: Path) -> None:
    """Add derived metric columns to a streaming history CSV.

    Reads the CSV at ``enriched_path``, which must contain the columns
    ``end_time``, ``ms_played``, ``track_duration_ms``, ``track_uri``,
    ``artist_uri``, ``album_uri``, ``platform`` and ``skipped``. It adds
    zero-padded date parts, ``min_played``, ``percentage_played``, the
    ``is_new_*`` flags and ``normalized_platform``, coerces ``skipped`` to
    bool, and writes the result to ``metrics_path``, overwriting it.

    Args:
        enriched_path: CSV file to read.
        metrics_path: CSV file to write.
    """
    df_stream = pd.read_csv(enriched_path)

    # Parse the timestamps once instead of once per derived column.
    end_time = DatetimeIndex(df_stream["end_time"])

    # The explicit "0>N" fill/align form is kept from the original code, whose
    # author noted that the ":04" writing is only fixed in Python 3.10+:
    # https://stackoverflow.com/a/36044788
    df_stream["year"] = end_time.year.map(lambda x: f"{x:0>4}")
    df_stream["month"] = end_time.month.map(lambda x: f"{x:0>2}")
    df_stream["month_name"] = end_time.month_name()
    df_stream["day"] = end_time.day.map(lambda x: f"{x:0>2}")
    df_stream["day_of_week"] = end_time.day_of_week.map(lambda x: f"{x:0>2}")
    df_stream["day_name"] = end_time.day_name()
    df_stream["hour"] = end_time.hour.map(lambda x: f"{x:0>2}")
    df_stream["minute"] = end_time.minute.map(lambda x: f"{x:0>2}")

    df_stream["min_played"] = df_stream["ms_played"] / 1000 / 60

    # Clipped to [0, 100] because ms_played can exceed the track duration.
    played_ratio = df_stream["ms_played"] / df_stream["track_duration_ms"]
    df_stream["percentage_played"] = (played_ratio * 100).round(2).clip(0, 100)

    # The first row carrying a given URI is flagged as "new".
    # NOTE(review): this relies on the rows being in chronological order;
    # confirm that the upstream steps guarantee it.
    df_stream["is_new_track"] = ~df_stream["track_uri"].duplicated(keep="first")
    df_stream["is_new_artist"] = ~df_stream["artist_uri"].duplicated(keep="first")
    df_stream["is_new_album"] = ~df_stream["album_uri"].duplicated(keep="first")

    df_stream["normalized_platform"] = df_stream["platform"].apply(normalize_platform)

    # A bare astype(bool) turns missing values (NaN) into True; mask them so
    # that an unknown skip state is stored as False.
    skipped = df_stream["skipped"]
    df_stream["skipped"] = skipped.notna() & skipped.astype(bool)

    df_stream.to_csv(metrics_path, mode="w", index=False)