Skip to content

Commit

Permalink
♻️ Remove computed cols
Browse files Browse the repository at this point in the history
  • Loading branch information
Gudsfile committed Oct 12, 2024
1 parent 8cf0245 commit abc2a39
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 92 deletions.
5 changes: 4 additions & 1 deletion sploty/audio_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,10 @@ def main(to_enrich_path: list, enriched_path: str, chunk_size: int, spotify_api_

# add audio feature to rows that do not have it
df_enriched_streams = pd.read_csv(enriched_path)
df_audio_features = pd.DataFrame(db.all())
df_audio_features = pd.DataFrame(db.all()).drop(
["type", "uri", "track_href", "analysis_url", "duration_ms"],
axis=1,
)

df_completed_streams = completes_streams_with_audio_features(
df_enriched_streams,
Expand Down
24 changes: 1 addition & 23 deletions sploty/concat.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,17 @@
import logging

import pandas as pd
from pandas import DatetimeIndex

logger = logging.getLogger(__name__)


def header_converter(df):
    """Return *df* with Spotify-export camelCase headers renamed to snake_case.

    The input frame is not modified; a renamed copy is returned. Column
    names absent from the mapping pass through unchanged.
    """
    # Keys must match the raw streaming-history export headers exactly.
    snake_case_columns = {
        "endTime": "end_time",
        "msPlayed": "ms_played",
        "artistName": "artist_name",
        "trackName": "track_name",
    }
    return df.rename(columns=snake_case_columns)


def main(input_paths: list, concated_path: str):
"""
input_paths: files where read streaming history
concated_path: file to write concated streaming history
"""
# Read streaming files
df_stream = header_converter(pd.concat(map(pd.read_json, input_paths)))
df_stream = pd.concat(map(pd.read_json, input_paths))
logger.info("%i rows in %s", len(df_stream), input_paths)

df_stream = df_stream.drop_duplicates()
Expand All @@ -43,16 +31,6 @@ def main(input_paths: list, concated_path: str):
axis=1,
)

df_stream["track_src_id"] = df_stream.artist_name + ":" + df_stream.track_name
df_stream["year"] = DatetimeIndex(df_stream.end_time).year.map(lambda x: f"{x:0>4}")
df_stream["month"] = (DatetimeIndex(df_stream.end_time).month).map(lambda x: f"{x:0>2}")
df_stream["month_name"] = DatetimeIndex(df_stream.end_time).month_name()
df_stream["day"] = DatetimeIndex(df_stream.end_time).day.map(lambda x: f"{x:0>2}")
df_stream["hour"] = DatetimeIndex(df_stream.end_time).hour.map(lambda x: f"{x:0>2}")
df_stream["minute"] = DatetimeIndex(df_stream.end_time).minute.map(lambda x: f"{x:0>2}")
# ":04" writting is fixed in Python 3.10+ : https://stackoverflow.com/a/36044788

df_stream["min_played"] = df_stream.ms_played / 1000 / 60
df_stream["id"] = df_stream.end_time + ":" + df_stream.track_uri

df_stream["date"] = pd.to_datetime(df_stream.end_time)
Expand Down
59 changes: 15 additions & 44 deletions sploty/enrich.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import requests
from pydantic import BaseModel, HttpUrl
from requests.exceptions import HTTPError
from settings import BoldColor

from sploty.settings import BoldColor

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -50,8 +51,8 @@ def another_get(spotify_api_params: SpotifyApiParams, track_uris):


def merger(df1, df5):
df1["is_done"] = df1.track_src_id.isin(df5.track_src_id)
df = df1.merge(df5, on="track_src_id", how="left")
df1["is_done"] = df1.track_uri.isin(df5.track_uri)
df = df1.merge(df5, on="track_uri", how="left")

for c_x in df.columns:
if c_x.endswith("_x"):
Expand All @@ -62,48 +63,13 @@ def merger(df1, df5):


def saver(df_tableau, complete_data, enriched_path):
sorted_cols = [
"id",
"end_time",
"artist_name",
"track_name",
"ms_played",
"min_played",
"track_duration_ms",
"percentage_played",
"track_popularity",
# 'in_library',
"track_src_id",
"artist_uri",
"track_uri",
"year",
"month",
"month_name",
"day",
"hour",
"minute",
"username",
"platform",
"conn_country",
"ip_addr_decrypted",
"user_agent_decrypted",
"album_name",
"reason_start",
"reason_end",
"shuffle",
"skipped",
"offline",
"offline_timestamp",
"incognito_mode",
]

complete_data = pd.DataFrame.from_dict(complete_data, orient="index")

if len(complete_data) == 0:
return df_tableau

streams = merger(df_tableau, complete_data)
to_write = streams[streams["is_done"] == True][sorted_cols] # noqa: E712
to_write = streams[streams["is_done"] == True] # noqa: E712
to_keep = streams[streams["is_done"] == False] # noqa: E712
# == to prevent "KeyError: False"

Expand All @@ -121,8 +87,8 @@ def saver(df_tableau, complete_data, enriched_path):
def better_enrich(df_tableau, chunk_size, enriched_path, spotify_api_params):
logger.info("enrich track data for %i tracks", len(df_tableau))

df = df_tableau[["track_uri", "track_name", "artist_name", "track_src_id", "ms_played"]].drop_duplicates(
"track_src_id",
df = df_tableau[["track_uri", "track_name", "artist_name", "ms_played"]].drop_duplicates(
"track_uri",
)
logger.info("reduce enrich for only %i tracks", len(df))

Expand All @@ -147,17 +113,22 @@ def better_enrich(df_tableau, chunk_size, enriched_path, spotify_api_params):

track = response["tracks"][i]
artist = track["artists"][0] # only one artist :(
# album = track["album"] #noqa: ERA001
album = track["album"]
track_uri = track["uri"]

logger.debug("enrich track uri n°%i (%s)", index, track_uri)

stream["artist_uri"] = artist["uri"].split(":")[2]
stream["artist_uri"] = artist["id"]
stream["track_duration_ms"] = (
track.get("duration_ms", np.nan) if track.get("duration_ms", None) != 0.0 else np.nan
)
stream["track_popularity"] = track.get("popularity", None)
stream["percentage_played"] = round((stream.ms_played / stream.track_duration_ms) * 100, 2)
stream["track_is_explicit"] = track["explicit"]
stream["track_is_local"] = track["is_local"]
stream["track_is_playable"] = track["is_playable"]
stream["album_uri"] = album["id"]
stream["album_type"] = album["album_type"]
stream["album_release_date"] = album["release_date"]
dict_all[index] = stream
checkpoint += chunk_size
df_tableau = saver(df_tableau, dict_all, enriched_path)
Expand Down
25 changes: 1 addition & 24 deletions sploty/to_elastic.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,38 +25,19 @@ def main(enriched_path: str, index_name: str, elastic):
# Rename columns
df_stream = df_stream.rename(
columns={
"end_time": "end_time",
"ms_played": "ms_played",
"min_played": "min_played",
"percentage_played": "percentage_played",
"track_uri": "track_uri",
"track_name": "track_name",
"track_duration_ms": "track_duration_ms",
"track_popularity": "track_popularity",
"track_is_in_library": "track_is_in_library",
"track_is_unplayable": "track_is_unplayable",
"artist_uri": "artist_uri",
"artist_name": "artist_name",
"artist_genres": "artist_genres",
"artist_popularity": "artist_popularity",
"album_uri": "album_uri",
"audio_features": "audio_features",
"stream_context": "stream_context",
"username": "stream_username",
"platform": "stream_platform",
"normalized_platform": "stream_normalized_platform",
"conn_country": "stream_conn_country",
"ip_addr_decrypted": "stream_ip_addr_decrypted",
"user_agent_decrypted": "stream_user_agent_decrypted",
"album_name": "stream_album_name",
"reason_start": "stream_reason_start",
"reason_end": "stream_reason_end",
"shuffle": "stream_shuffle",
"skipped": "stream_skipped",
"offline": "stream_offline",
"offline_timestamp": "stream_offline_timestamp",
"incognito_mode": "stream_incognito_mode",
"month_name": "month_name",
"id": "id",
"danceability": "track_audio_feature_danceability",
"energy": "track_audio_feature_energy",
"key": "track_audio_feature_key",
Expand All @@ -72,10 +53,6 @@ def main(enriched_path: str, index_name: str, elastic):
},
)

df_stream = df_stream.drop(["track_src_id"], axis=1)
df_stream = df_stream.drop(["minute", "hour", "day", "month", "year"], axis=1)
df_stream = df_stream.drop(["stream_skipped"], axis=1) # TODO fix error

# Index streams
logger.info("indexing %i tracks to %s", len(df_stream), index_name)
json_tmp = json.loads(df_stream.to_json(orient="records"))
Expand Down

0 comments on commit abc2a39

Please sign in to comment.