Skip to content

Commit

Permalink
♻️ Remove computed cols
Browse files Browse the repository at this point in the history
  • Loading branch information
Gudsfile committed Oct 12, 2024
1 parent 8cf0245 commit abc2a39
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 92 deletions.
5 changes: 4 additions & 1 deletion sploty/audio_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,10 @@ def main(to_enrich_path: list, enriched_path: str, chunk_size: int, spotify_api_

# add audio feature to rows that do not have it
df_enriched_streams = pd.read_csv(enriched_path)
df_audio_features = pd.DataFrame(db.all())
df_audio_features = pd.DataFrame(db.all()).drop(
["type", "uri", "track_href", "analysis_url", "duration_ms"],
axis=1,
)

df_completed_streams = completes_streams_with_audio_features(
df_enriched_streams,
Expand Down
24 changes: 1 addition & 23 deletions sploty/concat.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,17 @@
import logging

import pandas as pd
from pandas import DatetimeIndex

logger = logging.getLogger(__name__)


def header_converter(df):
    """Return *df* with Spotify-export camelCase headers renamed to snake_case.

    The input frame is not modified; a renamed copy is returned. Column
    names absent from the mapping pass through unchanged.
    """
    # Keys must match the raw streaming-history export headers exactly.
    snake_case_columns = {
        "endTime": "end_time",
        "msPlayed": "ms_played",
        "artistName": "artist_name",
        "trackName": "track_name",
    }
    return df.rename(columns=snake_case_columns)


def main(input_paths: list, concated_path: str):
"""
input_paths: files where read streaming history
concated_path: file to write concated streaming history
"""
# Read streaming files
df_stream = header_converter(pd.concat(map(pd.read_json, input_paths)))
df_stream = pd.concat(map(pd.read_json, input_paths))
logger.info("%i rows in %s", len(df_stream), input_paths)

df_stream = df_stream.drop_duplicates()
Expand All @@ -43,16 +31,6 @@ def main(input_paths: list, concated_path: str):
axis=1,
)

df_stream["track_src_id"] = df_stream.artist_name + ":" + df_stream.track_name
df_stream["year"] = DatetimeIndex(df_stream.end_time).year.map(lambda x: f"{x:0>4}")
df_stream["month"] = (DatetimeIndex(df_stream.end_time).month).map(lambda x: f"{x:0>2}")
df_stream["month_name"] = DatetimeIndex(df_stream.end_time).month_name()
df_stream["day"] = DatetimeIndex(df_stream.end_time).day.map(lambda x: f"{x:0>2}")
df_stream["hour"] = DatetimeIndex(df_stream.end_time).hour.map(lambda x: f"{x:0>2}")
df_stream["minute"] = DatetimeIndex(df_stream.end_time).minute.map(lambda x: f"{x:0>2}")
# ":04" writting is fixed in Python 3.10+ : https://stackoverflow.com/a/36044788

df_stream["min_played"] = df_stream.ms_played / 1000 / 60
df_stream["id"] = df_stream.end_time + ":" + df_stream.track_uri

df_stream["date"] = pd.to_datetime(df_stream.end_time)
Expand Down
59 changes: 15 additions & 44 deletions sploty/enrich.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import requests
from pydantic import BaseModel, HttpUrl
from requests.exceptions import HTTPError
from settings import BoldColor

from sploty.settings import BoldColor

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -50,8 +51,8 @@ def another_get(spotify_api_params: SpotifyApiParams, track_uris):


def merger(df1, df5):
df1["is_done"] = df1.track_src_id.isin(df5.track_src_id)
df = df1.merge(df5, on="track_src_id", how="left")
df1["is_done"] = df1.track_uri.isin(df5.track_uri)
df = df1.merge(df5, on="track_uri", how="left")

for c_x in df.columns:
if c_x.endswith("_x"):
Expand All @@ -62,48 +63,13 @@ def merger(df1, df5):


def saver(df_tableau, complete_data, enriched_path):
sorted_cols = [
"id",
"end_time",
"artist_name",
"track_name",
"ms_played",
"min_played",
"track_duration_ms",
"percentage_played",
"track_popularity",
# 'in_library',
"track_src_id",
"artist_uri",
"track_uri",
"year",
"month",
"month_name",
"day",
"hour",
"minute",
"username",
"platform",
"conn_country",
"ip_addr_decrypted",
"user_agent_decrypted",
"album_name",
"reason_start",
"reason_end",
"shuffle",
"skipped",
"offline",
"offline_timestamp",
"incognito_mode",
]

complete_data = pd.DataFrame.from_dict(complete_data, orient="index")

if len(complete_data) == 0:
return df_tableau

streams = merger(df_tableau, complete_data)
to_write = streams[streams["is_done"] == True][sorted_cols] # noqa: E712
to_write = streams[streams["is_done"] == True] # noqa: E712
to_keep = streams[streams["is_done"] == False] # noqa: E712
# == to prevent "KeyError: False"

Expand All @@ -121,8 +87,8 @@ def saver(df_tableau, complete_data, enriched_path):
def better_enrich(df_tableau, chunk_size, enriched_path, spotify_api_params):
logger.info("enrich track data for %i tracks", len(df_tableau))

df = df_tableau[["track_uri", "track_name", "artist_name", "track_src_id", "ms_played"]].drop_duplicates(
"track_src_id",
df = df_tableau[["track_uri", "track_name", "artist_name", "ms_played"]].drop_duplicates(
"track_uri",
)
logger.info("reduce enrich for only %i tracks", len(df))

Expand All @@ -147,17 +113,22 @@ def better_enrich(df_tableau, chunk_size, enriched_path, spotify_api_params):

track = response["tracks"][i]
artist = track["artists"][0] # only one artist :(
# album = track["album"] #noqa: ERA001
album = track["album"]
track_uri = track["uri"]

logger.debug("enrich track uri n°%i (%s)", index, track_uri)

stream["artist_uri"] = artist["uri"].split(":")[2]
stream["artist_uri"] = artist["id"]
stream["track_duration_ms"] = (
track.get("duration_ms", np.nan) if track.get("duration_ms", None) != 0.0 else np.nan
)
stream["track_popularity"] = track.get("popularity", None)
stream["percentage_played"] = round((stream.ms_played / stream.track_duration_ms) * 100, 2)
stream["track_is_explicit"] = track["explicit"]
stream["track_is_local"] = track["is_local"]
stream["track_is_playable"] = track["is_playable"]
stream["album_uri"] = album["id"]
stream["album_type"] = album["album_type"]
stream["album_release_date"] = album["release_date"]
dict_all[index] = stream
checkpoint += chunk_size
df_tableau = saver(df_tableau, dict_all, enriched_path)
Expand Down
25 changes: 1 addition & 24 deletions sploty/to_elastic.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,38 +25,19 @@ def main(enriched_path: str, index_name: str, elastic):
# Rename columns
df_stream = df_stream.rename(
columns={
"end_time": "end_time",
"ms_played": "ms_played",
"min_played": "min_played",
"percentage_played": "percentage_played",
"track_uri": "track_uri",
"track_name": "track_name",
"track_duration_ms": "track_duration_ms",
"track_popularity": "track_popularity",
"track_is_in_library": "track_is_in_library",
"track_is_unplayable": "track_is_unplayable",
"artist_uri": "artist_uri",
"artist_name": "artist_name",
"artist_genres": "artist_genres",
"artist_popularity": "artist_popularity",
"album_uri": "album_uri",
"audio_features": "audio_features",
"stream_context": "stream_context",
"username": "stream_username",
"platform": "stream_platform",
"normalized_platform": "stream_normalized_platform",
"conn_country": "stream_conn_country",
"ip_addr_decrypted": "stream_ip_addr_decrypted",
"user_agent_decrypted": "stream_user_agent_decrypted",
"album_name": "stream_album_name",
"reason_start": "stream_reason_start",
"reason_end": "stream_reason_end",
"shuffle": "stream_shuffle",
"skipped": "stream_skipped",
"offline": "stream_offline",
"offline_timestamp": "stream_offline_timestamp",
"incognito_mode": "stream_incognito_mode",
"month_name": "month_name",
"id": "id",
"danceability": "track_audio_feature_danceability",
"energy": "track_audio_feature_energy",
"key": "track_audio_feature_key",
Expand All @@ -72,10 +53,6 @@ def main(enriched_path: str, index_name: str, elastic):
},
)

df_stream = df_stream.drop(["track_src_id"], axis=1)
df_stream = df_stream.drop(["minute", "hour", "day", "month", "year"], axis=1)
df_stream = df_stream.drop(["stream_skipped"], axis=1) # TODO fix error

# Index streams
logger.info("indexing %i tracks to %s", len(df_stream), index_name)
json_tmp = json.loads(df_stream.to_json(orient="records"))
Expand Down

0 comments on commit abc2a39

Please sign in to comment.