✨ Add new metrics

Gudsfile · Gudsfile · commit 06c4a705f4b8 · 2024-10-12T11:51:55.000+02:00
diff --git a/README.md b/README.md
@@ -92,7 +92,8 @@ The app will :
 4. Enrich spotify audio features with  `sploty/audio_features.py`
    - The Spotify API is used at this stage, don't forget to [configure it](#spotify)
    - A `json database` ([TinyDB](https://github.com/msiemens/tinydb)) is used at this stage to reduce Spotify API calls by storing tracks data
-5. Index their to elastic with `sploty/to_elastic.py`
+5. Add additional metrics with `sploty/metrics.py`
+6. Index them into Elasticsearch with `sploty/to_elastic.py`
    - Elasticsearch is used at this stage, don't forget to [configure it](#elasticsearch)
 
 #### How to display the help message?
@@ -116,7 +117,7 @@ poetry run python sploty/app.py … --previous-enriched-streaming-history-path y
 Use the `-no-<the part>` options
 
 ```shell
-poetry run python sploty/app.py … --no-concat --no-filter --no-enrich --no-feature --no-elastic
+poetry run python sploty/app.py … --no-concat --no-filter --no-enrich --no-feature --no-metric --no-elastic
 ```
 
 #### How to increase or reduce the number of lines processed at once?
diff --git a/elastic/README_ELASTIC.md b/elastic/README_ELASTIC.md
@@ -17,14 +17,19 @@ PUT _ingest/pipeline/spotify-stream-pipeline
   {"rename": { "field": "track_duration_ms", "target_field": "track.duration_ms", "ignore_missing": true }},
   {"rename": { "field": "track_popularity", "target_field": "track.popularity", "ignore_missing": true }},
   {"rename": { "field": "track_is_in_library", "target_field": "track.is_in_library", "ignore_missing": true }},
-  {"rename": { "field": "track_is_unplayable", "target_field": "track.is_unplayable", "ignore_missing": true }},
+  {"rename": { "field": "track_is_explicit", "target_field": "track.is_explicit", "ignore_missing": true }},
+  {"rename": { "field": "track_is_local", "target_field": "track.is_local", "ignore_missing": true }},
+  {"rename": { "field": "track_is_playable", "target_field": "track.is_playable", "ignore_missing": true }},
   {"rename": { "field": "album_uri", "target_field": "album.uri", "ignore_missing": true }},
+  {"rename": { "field": "album_name", "target_field": "album.name", "ignore_missing": true }},
+  {"rename": { "field": "album_type", "target_field": "album.type", "ignore_missing": true }},
+  {"rename": { "field": "album_release_date", "target_field": "album.release_date", "ignore_missing": true }},
   {"rename": { "field": "stream_username", "target_field": "stream_context.username", "ignore_missing": true }},
   {"rename": { "field": "stream_platform", "target_field": "stream_context.platform", "ignore_missing": true }},
+  {"rename": { "field": "stream_normalized_platform", "target_field": "stream_context.normalized_platform", "ignore_missing": true }},
   {"rename": { "field": "stream_conn_country", "target_field": "stream_context.conn_country", "ignore_missing": true }},
   {"rename": { "field": "stream_ip_addr_decrypted", "target_field": "stream_context.ip_addr_decrypted", "ignore_missing": true }},
   {"rename": { "field": "stream_user_agent_decrypted", "target_field": "stream_context.user_agent_decrypted", "ignore_missing": true }},
-  {"rename": { "field": "stream_album_name", "target_field": "album.name", "ignore_missing": true }},
   {"rename": { "field": "stream_reason_start", "target_field": "stream_context.reason_start", "ignore_missing": true }},
   {"rename": { "field": "stream_reason_end", "target_field": "stream_context.reason_end", "ignore_missing": true }},
   {"rename": { "field": "stream_shuffle", "target_field": "stream_context.shuffle", "ignore_missing": true }},
@@ -44,36 +49,18 @@ PUT _ingest/pipeline/spotify-stream-pipeline
   {"rename": { "field": "track_audio_feature_valence", "target_field": "audio_features.valence", "ignore_missing": true }},
   {"rename": { "field": "track_audio_feature_tempo", "target_field": "audio_features.tempo", "ignore_missing": true }},
   {"rename": { "field": "track_audio_feature_time_signature", "target_field": "audio_features.time_signature", "ignore_missing": true }},
-  {"remove": { "field": ["track_src_id", "location"], "ignore_missing": true }},
   {"user_agent": { "field": "stream_context.user_agent_decrypted", "ignore_missing": true }},
-  {
-    "script": {
-      "source": """
-      String platform = ctx['stream_context']['platform'];
-      String lcp = platform.toLowerCase();
-      if (lcp.startsWith('ios') || lcp.startsWith('partner ios_sdk')) {
-          platform = 'iOS';
-      } else if (lcp.startsWith('os x') || lcp.startsWith('osx')) {
-          platform = 'OS X';
-      } else if (lcp.startsWith('partner sonos_')) {
-          platform = 'Sonos';
-      } else if (lcp.startsWith('partner google cast_tv') || lcp.startsWith('partner google cast')) {
-          platform = 'Chromecast';
-      } else if (lcp.startsWith('partner android_tv')) {
-          platform = 'Android TV';
-      } else if (lcp.startsWith('android os') || lcp.startsWith('android [arm 0]')|| lcp.startsWith('android-tablet os')) {
-          platform = 'Android OS';
-      } else if (lcp.startsWith('webplayer') || lcp.startsWith('web_player') || lcp.startsWith('partner spotify web_player')) {
-          platform = 'WebPlayer';
-      } else if (lcp.startsWith('partner webos_tv') || lcp.startsWith('WebOs TV')) {
-          platform = 'WebOs TV';
-      } else if (lcp.startsWith('windows')) {
-          platform = 'Windows';
-      }
-      ctx['stream_context']['normalized_platform'] = platform
-      """
-    }
-  }
+  {"convert": { "field": "stream_context.offline", "type": "boolean", "ignore_missing": true }},
+  {"convert": { "field": "stream_context.shuffle", "type": "boolean", "ignore_missing": true }},
+  {"convert": { "field": "stream_context.incognito_mode", "type": "boolean", "ignore_missing": true }},
+  {"convert": { "field": "stream_context.skipped", "type": "boolean", "ignore_missing": true }},
+  {"convert": { "field": "track.is_in_library", "type": "boolean", "ignore_missing": true }},
+  {"convert": { "field": "track.is_explicit", "type": "boolean", "ignore_missing": true }},
+  {"convert": { "field": "track.is_local", "type": "boolean", "ignore_missing": true }},
+  {"convert": { "field": "track.is_playable", "type": "boolean", "ignore_missing": true }},
+  {"convert": { "field": "is_new_track", "type": "boolean", "ignore_missing": true }},
+  {"convert": { "field": "is_new_artist", "type": "boolean", "ignore_missing": true }},
+  {"convert": { "field": "is_new_album", "type": "boolean", "ignore_missing": true }}
   ]
 }
 ```
@@ -101,7 +88,8 @@ PUT _component_template/spotify-stream-mapping
             "danceability": {"type": "integer"},
             "key": {"type": "integer"},
             "speechiness": {"type": "integer"},
-            "energy": {"type": "integer"}
+            "energy": {"type": "integer"},
+            "time_signature": {"type": "integer"}
           }
         },
         "stream_context": {
@@ -132,7 +120,9 @@ PUT _component_template/spotify-stream-mapping
         "album": {
           "properties": {
             "name": {"type": "keyword"},
-            "uri": {"type": "keyword" }
+            "uri": {"type": "keyword"},
+            "type": {"type": "keyword"},
+            "release_date": {"format": "year||year_month||year_month_day", "type": "date"}
           }
         },
         "track": {
@@ -142,15 +132,20 @@ PUT _component_template/spotify-stream-mapping
             "name": {"type": "keyword"},
             "uri": {"type": "keyword"},
             "is_in_library": {"type": "boolean"},
-            "is_unplayable": {"type": "boolean" }
+            "is_explicit": {"type": "boolean"},
+            "is_local": {"type": "boolean"},
+            "is_playable": {"type": "boolean"}
           }
         },
         "end_time": {"format": "yyyy-MM-dd'T'HH:mm:ss'Z'", "type": "date"},
         "percentage_played": {"type": "long"},
         "day_name": {"type": "keyword"},
         "month_name": {"type": "keyword"},
         "ms_played": {"type": "long"},
-        "min_played": {"type": "long"}
+        "min_played": {"type": "long"},
+        "is_new_track": {"type": "boolean"},
+        "is_new_artist": {"type": "boolean"},
+        "is_new_album": {"type": "boolean"}
       }
     }
   }
diff --git a/sploty/app.py b/sploty/app.py
@@ -6,7 +6,7 @@
 from pydantic import Field, HttpUrl, v1
 from pydantic_settings import BaseSettings
 
-from sploty import audio_features, concat, enrich, filter, to_elastic
+from sploty import audio_features, concat, enrich, filter, metrics, to_elastic
 from sploty.settings import logger
 
 
@@ -23,6 +23,7 @@ class Arguments(v1.BaseModel):
     filter: bool = v1.Field(default=True)
     enrich: bool = v1.Field(default=True)
     feature: bool = v1.Field(default=True)
+    metric: bool = v1.Field(default=True)
     elastic: bool = v1.Field(default=True)
 
 
@@ -55,7 +56,8 @@ def main() -> None:
     concated_streaming_history_path = Path(f"{resources_path}/sploty_concated_history.csv")
     to_enrich_streaming_history_path = Path(f"{resources_path}/sploty_filtered_history.csv")
     enriched_streaming_history_path = Path(f"{resources_path}/sploty_enriched_history.csv")
-    featured_streaming_history_path = Path(f"{resources_path}/sploty_featured_history.csv")  # noqa: F841
+    featured_streaming_history_path = Path(f"{resources_path}/sploty_featured_history.csv")
+    metrics_streaming_history_path = Path(f"{resources_path}/sploty_metrics_history.csv")
 
     db_path = args.db_path
     audio_features_db_path = Path(f"{db_path}/tracks.json")
@@ -117,12 +119,18 @@ def main() -> None:
         audio_features.main(
             to_enrich_streaming_history_path,
             enriched_streaming_history_path,
+            featured_streaming_history_path,
             args.chunk_size,
             spotify_api_params,
             db,
         )
     else:
         logger.info("skip")
+    logger.info("============== METRICS =============")
+    if args.metric:
+        metrics.main(featured_streaming_history_path, metrics_streaming_history_path)
+    else:
+        logger.info("skip")
     logger.info("============== ELASTIC =============")
     if args.elastic:
         elastic = to_elastic.get_elastic(
@@ -131,7 +139,7 @@ def main() -> None:
             env.elastic_pass,
             args.elastic_timeout,
         )
-        to_elastic.main(enriched_streaming_history_path, args.index_name, elastic)
+        to_elastic.main(metrics_streaming_history_path, args.index_name, elastic)
     else:
         logger.info("skip")
 
diff --git a/sploty/audio_features.py b/sploty/audio_features.py
@@ -89,7 +89,14 @@ def completes_streams_with_audio_features(df_left, left_key, df_right, right_key
 # len(db.all())#noqa: ERA001
 
 
-def main(to_enrich_path: list, enriched_path: str, chunk_size: int, spotify_api_params: SpotifyApiParams, db: TinyDB):
+def main(  # noqa: PLR0913
+    to_enrich_path: list,
+    enriched_path: str,
+    featured_path: str,
+    chunk_size: int,
+    spotify_api_params: SpotifyApiParams,
+    db: TinyDB,
+):
     # get the audio features of tracks saved it in the TinyDb
     df_stream = pd.read_csv(to_enrich_path)
     logger.info("%i streams", len(df_stream))
@@ -122,7 +129,7 @@ def main(to_enrich_path: list, enriched_path: str, chunk_size: int, spotify_api_
         df_audio_features,
         "id",
     )
-    df_completed_streams.to_csv(enriched_path, mode="w", index=False)
+    df_completed_streams.to_csv(featured_path, mode="w", index=False)
     logger.info(
         "%i rows are re-saved at %s with audio features completed",
         len(df_completed_streams),
diff --git a/sploty/metrics.py b/sploty/metrics.py
@@ -0,0 +1,75 @@
+import logging
+from pathlib import Path
+
+import pandas as pd
+from pandas import DatetimeIndex
+
+logger = logging.getLogger(__name__)
+
+
+# Lower-cased platform prefixes mapped to the normalized platform name.
+# Insertion order matters: when several prefixes match, the first one wins.
+_PLATFORM_PREFIXES = {
+    "android os": "Android OS",
+    "android [arm 0]": "Android OS",
+    "android-tablet os": "Android OS",
+    "partner android_tv": "Android TV",
+    "partner google cast": "Chromecast",
+    "ios": "iOS",
+    "partner ios": "iOS",
+    "osx": "MacOS",
+    "os x": "MacOS",
+    "sonos_": "Sonos",
+    "partner sonos": "Sonos",
+    "webos tv": "WebOS TV",
+    # Same spelling as the "webos tv" entry so both land in a single bucket.
+    "partner webos_tv": "WebOS TV",
+    "webplayer": "WebPlayer",
+    "web_player": "WebPlayer",
+    "partner spotify web_player": "WebPlayer",
+    "windows": "Windows",
+    "not_applicable": "not_applicable",
+}
+
+
+def normalize_platform(platform: str):
+    """Map a raw platform string to a short, normalized platform name.
+
+    The raw value is compared case-insensitively against a table of known
+    prefixes.
+
+    Args:
+        platform: raw platform value. Non-string values (for instance the NaN
+            that pandas yields for an empty CSV cell) are returned unchanged
+            instead of raising.
+
+    Returns:
+        The normalized name of the first matching prefix, or ``platform``
+        itself when no prefix matches (a warning is logged in that case).
+    """
+    if not isinstance(platform, str):
+        # Nothing to normalize: `.lower()` would raise on NaN / None.
+        return platform
+
+    lowered = platform.lower()
+    normalized_matches = [value for key, value in _PLATFORM_PREFIXES.items() if lowered.startswith(key)]
+    if not normalized_matches:
+        logger.warning("There is no match for the `%s` platform", platform)
+        return platform
+    if len(normalized_matches) > 1:
+        logger.warning(
+            "There are several matches for the `%s` platform: %s (the first one is taken)",
+            platform,
+            normalized_matches,
+        )
+    return normalized_matches[0]
+
+
+def main(enriched_path: Path, metrics_path: Path):
+    """Add derived metric columns to the streaming history and save it as CSV.
+
+    Reads the CSV at ``enriched_path``, adds calendar columns derived from
+    ``end_time``, play-time ratios, first-occurrence flags, a normalized
+    platform name and a boolean ``skipped`` column, then writes the result to
+    ``metrics_path``.
+
+    Args:
+        enriched_path: CSV to read; it must provide the ``end_time``,
+            ``ms_played``, ``track_duration_ms``, ``track_uri``,
+            ``artist_uri``, ``album_uri``, ``platform`` and ``skipped`` columns.
+        metrics_path: destination CSV, overwritten if it already exists.
+    """
+    df_stream = pd.read_csv(enriched_path)
+
+    # Parse the timestamps once instead of once per derived column.
+    end_time = DatetimeIndex(df_stream.end_time)
+    df_stream["year"] = end_time.year.map(lambda x: f"{x:0>4}")
+    df_stream["month"] = end_time.month.map(lambda x: f"{x:0>2}")
+    df_stream["month_name"] = end_time.month_name()
+    df_stream["day"] = end_time.day.map(lambda x: f"{x:0>2}")
+    df_stream["day_of_week"] = end_time.day_of_week.map(lambda x: f"{x:0>2}")
+    df_stream["day_name"] = end_time.day_name()
+    df_stream["hour"] = end_time.hour.map(lambda x: f"{x:0>2}")
+    df_stream["minute"] = end_time.minute.map(lambda x: f"{x:0>2}")
+    # ":04" writing is fixed in Python 3.10+ : https://stackoverflow.com/a/36044788
+
+    df_stream["min_played"] = df_stream.ms_played / 1000 / 60
+
+    # Percentage of the track that was played, bounded to [0, 100].
+    df_stream["percentage_played"] = round((df_stream.ms_played / df_stream.track_duration_ms) * 100, 2)
+    df_stream["percentage_played"] = df_stream["percentage_played"].clip(0, 100)
+
+    # A row is "new" when its URI has not appeared in an earlier row.
+    # NOTE(review): this relies on the rows being in chronological order - confirm upstream.
+    df_stream["is_new_track"] = ~df_stream["track_uri"].duplicated(keep="first")
+    df_stream["is_new_artist"] = ~df_stream["artist_uri"].duplicated(keep="first")
+    df_stream["is_new_album"] = ~df_stream["album_uri"].duplicated(keep="first")
+
+    df_stream["normalized_platform"] = df_stream["platform"].apply(normalize_platform)
+
+    # `astype(bool)` alone turns missing values (NaN) into True, which would
+    # flag streams without skip information as skipped; treat them as False.
+    df_stream["skipped"] = (
+        df_stream["skipped"].map(lambda value: bool(value) if pd.notna(value) else False).astype(bool)
+    )
+
+    # TODO: metrics listed by the original author but not computed yet:
+    # track_is_in_library
+    # artist_genres
+    # artist_popularity
+
+    df_stream.to_csv(metrics_path, mode="w", index=False)