diff --git a/sploty/audio_features.py b/sploty/audio_features.py index ef79c9f..35b3242 100644 --- a/sploty/audio_features.py +++ b/sploty/audio_features.py @@ -111,7 +111,10 @@ def main(to_enrich_path: list, enriched_path: str, chunk_size: int, spotify_api_ # add audio feature to rows that do not have it df_enriched_streams = pd.read_csv(enriched_path) - df_audio_features = pd.DataFrame(db.all()) + df_audio_features = pd.DataFrame(db.all()).drop( + ["type", "uri", "track_href", "analysis_url", "duration_ms"], + axis=1, + ) df_completed_streams = completes_streams_with_audio_features( df_enriched_streams, diff --git a/sploty/concat.py b/sploty/concat.py index 0ede9da..a50b20f 100644 --- a/sploty/concat.py +++ b/sploty/concat.py @@ -1,29 +1,17 @@ import logging import pandas as pd -from pandas import DatetimeIndex logger = logging.getLogger(__name__) -def header_converter(df): - return df.rename( - columns={ - "endTime": "end_time", - "msPlayed": "ms_played", - "artistName": "artist_name", - "trackName": "track_name", - }, - ) - - def main(input_paths: list, concated_path: str): """ input_paths: files where read streaming history concated_path: file to write concated streaming history """ # Read streaming files - df_stream = header_converter(pd.concat(map(pd.read_json, input_paths))) + df_stream = pd.concat(map(pd.read_json, input_paths)) logger.info("%i rows in %s", len(df_stream), input_paths) df_stream = df_stream.drop_duplicates() @@ -43,16 +31,6 @@ def main(input_paths: list, concated_path: str): axis=1, ) - df_stream["track_src_id"] = df_stream.artist_name + ":" + df_stream.track_name - df_stream["year"] = DatetimeIndex(df_stream.end_time).year.map(lambda x: f"{x:0>4}") - df_stream["month"] = (DatetimeIndex(df_stream.end_time).month).map(lambda x: f"{x:0>2}") - df_stream["month_name"] = DatetimeIndex(df_stream.end_time).month_name() - df_stream["day"] = DatetimeIndex(df_stream.end_time).day.map(lambda x: f"{x:0>2}") - df_stream["hour"] = DatetimeIndex(df_stream.end_time).hour.map(lambda x: f"{x:0>2}") - df_stream["minute"] = DatetimeIndex(df_stream.end_time).minute.map(lambda x: f"{x:0>2}") - # ":04" writting is fixed in Python 3.10+ : https://stackoverflow.com/a/36044788 - - df_stream["min_played"] = df_stream.ms_played / 1000 / 60 df_stream["id"] = df_stream.end_time + ":" + df_stream.track_uri df_stream["date"] = pd.to_datetime(df_stream.end_time) diff --git a/sploty/enrich.py b/sploty/enrich.py index 8e0802b..f8e1b1a 100644 --- a/sploty/enrich.py +++ b/sploty/enrich.py @@ -9,7 +9,8 @@ import requests from pydantic import BaseModel, HttpUrl from requests.exceptions import HTTPError -from settings import BoldColor + +from sploty.settings import BoldColor logger = logging.getLogger(__name__) @@ -50,8 +51,8 @@ def another_get(spotify_api_params: SpotifyApiParams, track_uris): def merger(df1, df5): - df1["is_done"] = df1.track_src_id.isin(df5.track_src_id) - df = df1.merge(df5, on="track_src_id", how="left") + df1["is_done"] = df1.track_uri.isin(df5.track_uri) + df = df1.merge(df5, on="track_uri", how="left") for c_x in df.columns: if c_x.endswith("_x"): @@ -62,48 +63,13 @@ def merger(df1, df5): def saver(df_tableau, complete_data, enriched_path): - sorted_cols = [ - "id", - "end_time", - "artist_name", - "track_name", - "ms_played", - "min_played", - "track_duration_ms", - "percentage_played", - "track_popularity", - # 'in_library', - "track_src_id", - "artist_uri", - "track_uri", - "year", - "month", - "month_name", - "day", - "hour", - "minute", - "username", - "platform", - "conn_country", - "ip_addr_decrypted", - "user_agent_decrypted", - "album_name", - "reason_start", - "reason_end", - "shuffle", - "skipped", - "offline", - "offline_timestamp", - "incognito_mode", - ] - complete_data = pd.DataFrame.from_dict(complete_data, orient="index") if len(complete_data) == 0: return df_tableau streams = merger(df_tableau, complete_data) - to_write = streams[streams["is_done"] == True][sorted_cols] # noqa: E712 + to_write = streams[streams["is_done"] == True] # noqa: E712 to_keep = streams[streams["is_done"] == False] # noqa: E712 # == to prevent "KeyError: False" @@ -121,8 +87,8 @@ def saver(df_tableau, complete_data, enriched_path): def better_enrich(df_tableau, chunk_size, enriched_path, spotify_api_params): logger.info("enrich track data for %i tracks", len(df_tableau)) - df = df_tableau[["track_uri", "track_name", "artist_name", "track_src_id", "ms_played"]].drop_duplicates( - "track_src_id", + df = df_tableau[["track_uri", "track_name", "artist_name", "ms_played"]].drop_duplicates( + "track_uri", ) logger.info("reduce enrich for only %i tracks", len(df)) @@ -147,17 +113,22 @@ def better_enrich(df_tableau, chunk_size, enriched_path, spotify_api_params): track = response["tracks"][i] artist = track["artists"][0] # only one artist :( - # album = track["album"] #noqa: ERA001 + album = track["album"] track_uri = track["uri"] logger.debug("enrich track uri n°%i (%s)", index, track_uri) - stream["artist_uri"] = artist["uri"].split(":")[2] + stream["artist_uri"] = artist["id"] stream["track_duration_ms"] = ( track.get("duration_ms", np.nan) if track.get("duration_ms", None) != 0.0 else np.nan ) stream["track_popularity"] = track.get("popularity", None) - stream["percentage_played"] = round((stream.ms_played / stream.track_duration_ms) * 100, 2) + stream["track_is_explicit"] = track["explicit"] + stream["track_is_local"] = track["is_local"] + stream["track_is_playable"] = track["is_playable"] + stream["album_uri"] = album["id"] + stream["album_type"] = album["album_type"] + stream["album_release_date"] = album["release_date"] dict_all[index] = stream checkpoint += chunk_size df_tableau = saver(df_tableau, dict_all, enriched_path) diff --git a/sploty/to_elastic.py b/sploty/to_elastic.py index a1c0e9b..d29eacc 100644 --- a/sploty/to_elastic.py +++ b/sploty/to_elastic.py @@ -25,29 +25,12 @@ def main(enriched_path: str, index_name: str, elastic): # Rename columns df_stream = df_stream.rename( columns={ - "end_time": "end_time", - "ms_played": "ms_played", - "min_played": "min_played", - "percentage_played": "percentage_played", - "track_uri": "track_uri", - "track_name": "track_name", - "track_duration_ms": "track_duration_ms", - "track_popularity": "track_popularity", - "track_is_in_library": "track_is_in_library", - "track_is_unplayable": "track_is_unplayable", - "artist_uri": "artist_uri", - "artist_name": "artist_name", - "artist_genres": "artist_genres", - "artist_popularity": "artist_popularity", - "album_uri": "album_uri", - "audio_features": "audio_features", - "stream_context": "stream_context", "username": "stream_username", "platform": "stream_platform", + "normalized_platform": "stream_normalized_platform", "conn_country": "stream_conn_country", "ip_addr_decrypted": "stream_ip_addr_decrypted", "user_agent_decrypted": "stream_user_agent_decrypted", - "album_name": "stream_album_name", "reason_start": "stream_reason_start", "reason_end": "stream_reason_end", "shuffle": "stream_shuffle", @@ -55,8 +38,6 @@ def main(enriched_path: str, index_name: str, elastic): "offline": "stream_offline", "offline_timestamp": "stream_offline_timestamp", "incognito_mode": "stream_incognito_mode", - "month_name": "month_name", - "id": "id", "danceability": "track_audio_feature_danceability", "energy": "track_audio_feature_energy", "key": "track_audio_feature_key", @@ -72,10 +53,6 @@ def main(enriched_path: str, index_name: str, elastic): }, ) - df_stream = df_stream.drop(["track_src_id"], axis=1) - df_stream = df_stream.drop(["minute", "hour", "day", "month", "year"], axis=1) - df_stream = df_stream.drop(["stream_skipped"], axis=1) # TODO fix error - # Index streams logger.info("indexing %i tracks to %s", len(df_stream), index_name) json_tmp = json.loads(df_stream.to_json(orient="records"))