Skip to content

Commit

Permalink
✨ Add new metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
Gudsfile committed Oct 12, 2024
1 parent abc2a39 commit 12b8812
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 42 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ The app will :
4. Enrich spotify audio features with `sploty/audio_features.py`
- The Spotify API is used at this stage, don't forget to [configure it](#spotify)
- A `json database` ([TinyDB](https://github.com/msiemens/tinydb)) is used at this stage to reduce Spotify API calls by storing tracks data
5. Index them into Elasticsearch with `sploty/to_elastic.py`
5. Add additional metrics with `sploty/metrics.py`
6. Index them into Elasticsearch with `sploty/to_elastic.py`
- Elasticsearch is used at this stage, don't forget to [configure it](#elasticsearch)

#### How to display the help message?
Expand All @@ -116,7 +117,7 @@ poetry run python sploty/app.py … --previous-enriched-streaming-history-path y
Use the `--no-<the part>` options

```shell
poetry run python sploty/app.py … --no-concat --no-filter --no-enrich --no-feature --no-elastic
poetry run python sploty/app.py … --no-concat --no-filter --no-enrich --no-feature --no-metric --no-elastic
```

#### How to increase or reduce the number of lines processed at once?
Expand Down
65 changes: 30 additions & 35 deletions elastic/README_ELASTIC.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,19 @@ PUT _ingest/pipeline/spotify-stream-pipeline
{"rename": { "field": "track_duration_ms", "target_field": "track.duration_ms", "ignore_missing": true }},
{"rename": { "field": "track_popularity", "target_field": "track.popularity", "ignore_missing": true }},
{"rename": { "field": "track_is_in_library", "target_field": "track.is_in_library", "ignore_missing": true }},
{"rename": { "field": "track_is_unplayable", "target_field": "track.is_unplayable", "ignore_missing": true }},
{"rename": { "field": "track_is_explicit", "target_field": "track.is_explicit", "ignore_missing": true }},
{"rename": { "field": "track_is_local", "target_field": "track.is_local", "ignore_missing": true }},
{"rename": { "field": "track_is_playable", "target_field": "track.is_playable", "ignore_missing": true }},
{"rename": { "field": "album_uri", "target_field": "album.uri", "ignore_missing": true }},
{"rename": { "field": "album_name", "target_field": "album.name", "ignore_missing": true }},
{"rename": { "field": "album_type", "target_field": "album.type", "ignore_missing": true }},
{"rename": { "field": "album_release_date", "target_field": "album.release_date", "ignore_missing": true }},
{"rename": { "field": "stream_username", "target_field": "stream_context.username", "ignore_missing": true }},
{"rename": { "field": "stream_platform", "target_field": "stream_context.platform", "ignore_missing": true }},
{"rename": { "field": "stream_normalized_platform", "target_field": "stream_context.normalized_platform", "ignore_missing": true }},
{"rename": { "field": "stream_conn_country", "target_field": "stream_context.conn_country", "ignore_missing": true }},
{"rename": { "field": "stream_ip_addr_decrypted", "target_field": "stream_context.ip_addr_decrypted", "ignore_missing": true }},
{"rename": { "field": "stream_user_agent_decrypted", "target_field": "stream_context.user_agent_decrypted", "ignore_missing": true }},
{"rename": { "field": "stream_album_name", "target_field": "album.name", "ignore_missing": true }},
{"rename": { "field": "stream_reason_start", "target_field": "stream_context.reason_start", "ignore_missing": true }},
{"rename": { "field": "stream_reason_end", "target_field": "stream_context.reason_end", "ignore_missing": true }},
{"rename": { "field": "stream_shuffle", "target_field": "stream_context.shuffle", "ignore_missing": true }},
Expand All @@ -44,36 +49,18 @@ PUT _ingest/pipeline/spotify-stream-pipeline
{"rename": { "field": "track_audio_feature_valence", "target_field": "audio_features.valence", "ignore_missing": true }},
{"rename": { "field": "track_audio_feature_tempo", "target_field": "audio_features.tempo", "ignore_missing": true }},
{"rename": { "field": "track_audio_feature_time_signature", "target_field": "audio_features.time_signature", "ignore_missing": true }},
{"remove": { "field": ["track_src_id", "location"], "ignore_missing": true }},
{"user_agent": { "field": "stream_context.user_agent_decrypted", "ignore_missing": true }},
{
"script": {
"source": """
String platform = ctx['stream_context']['platform'];
String lcp = platform.toLowerCase();
if (lcp.startsWith('ios') || lcp.startsWith('partner ios_sdk')) {
platform = 'iOS';
} else if (lcp.startsWith('os x') || lcp.startsWith('osx')) {
platform = 'OS X';
} else if (lcp.startsWith('partner sonos_')) {
platform = 'Sonos';
} else if (lcp.startsWith('partner google cast_tv') || lcp.startsWith('partner google cast')) {
platform = 'Chromecast';
} else if (lcp.startsWith('partner android_tv')) {
platform = 'Android TV';
} else if (lcp.startsWith('android os') || lcp.startsWith('android [arm 0]')|| lcp.startsWith('android-tablet os')) {
platform = 'Android OS';
} else if (lcp.startsWith('webplayer') || lcp.startsWith('web_player') || lcp.startsWith('partner spotify web_player')) {
platform = 'WebPlayer';
} else if (lcp.startsWith('partner webos_tv') || lcp.startsWith('WebOs TV')) {
platform = 'WebOs TV';
} else if (lcp.startsWith('windows')) {
platform = 'Windows';
}
ctx['stream_context']['normalized_platform'] = platform
"""
}
}
{"convert": { "field": "stream_context.offline", "type": "boolean", "ignore_missing": true }},
{"convert": { "field": "stream_context.shuffle", "type": "boolean", "ignore_missing": true }},
{"convert": { "field": "stream_context.incognito_mode", "type": "boolean", "ignore_missing": true }},
{"convert": { "field": "stream_context.skipped", "type": "boolean", "ignore_missing": true }},
{"convert": { "field": "track.is_in_library", "type": "boolean", "ignore_missing": true }},
{"convert": { "field": "track.is_explicit", "type": "boolean", "ignore_missing": true }},
{"convert": { "field": "track.is_local", "type": "boolean", "ignore_missing": true }},
{"convert": { "field": "track.is_playable", "type": "boolean", "ignore_missing": true }},
{"convert": { "field": "is_new_track", "type": "boolean", "ignore_missing": true }},
{"convert": { "field": "is_new_artist", "type": "boolean", "ignore_missing": true }},
{"convert": { "field": "is_new_album", "type": "boolean", "ignore_missing": true }}
]
}
```
Expand Down Expand Up @@ -101,7 +88,8 @@ PUT _component_template/spotify-stream-mapping
"danceability": {"type": "integer"},
"key": {"type": "integer"},
"speechiness": {"type": "integer"},
"energy": {"type": "integer"}
"energy": {"type": "integer"},
"time_signature": {"type": "integer"}
}
},
"stream_context": {
Expand Down Expand Up @@ -132,7 +120,9 @@ PUT _component_template/spotify-stream-mapping
"album": {
"properties": {
"name": {"type": "keyword"},
"uri": {"type": "keyword" }
"uri": {"type": "keyword"},
"type": {"type": "keyword"},
"release_date": {"format": "year||year_month||year_month_day", "type": "date"}
}
},
"track": {
Expand All @@ -142,15 +132,20 @@ PUT _component_template/spotify-stream-mapping
"name": {"type": "keyword"},
"uri": {"type": "keyword"},
"is_in_library": {"type": "boolean"},
"is_unplayable": {"type": "boolean" }
"is_explicit": {"type": "boolean"},
"is_local": {"type": "boolean"},
"is_playable": {"type": "boolean"}
}
},
"end_time": {"format": "yyyy-MM-dd'T'HH:mm:ss'Z'", "type": "date"},
"percentage_played": {"type": "long"},
"day_name": {"type": "keyword"},
"month_name": {"type": "keyword"},
"ms_played": {"type": "long"},
"min_played": {"type": "long"}
"min_played": {"type": "long"},
"is_new_track": {"type": "boolean"},
"is_new_artist": {"type": "boolean"},
"is_new_album": {"type": "boolean"}
}
}
}
Expand Down
14 changes: 11 additions & 3 deletions sploty/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pydantic import Field, HttpUrl, v1
from pydantic_settings import BaseSettings

from sploty import audio_features, concat, enrich, filter, to_elastic
from sploty import audio_features, concat, enrich, filter, metrics, to_elastic
from sploty.settings import logger


Expand All @@ -23,6 +23,7 @@ class Arguments(v1.BaseModel):
filter: bool = v1.Field(default=True)
enrich: bool = v1.Field(default=True)
feature: bool = v1.Field(default=True)
metric: bool = v1.Field(default=True)
elastic: bool = v1.Field(default=True)


Expand Down Expand Up @@ -55,7 +56,8 @@ def main() -> None:
concated_streaming_history_path = Path(f"{resources_path}/sploty_concated_history.csv")
to_enrich_streaming_history_path = Path(f"{resources_path}/sploty_filtered_history.csv")
enriched_streaming_history_path = Path(f"{resources_path}/sploty_enriched_history.csv")
featured_streaming_history_path = Path(f"{resources_path}/sploty_featured_history.csv") # noqa: F841
featured_streaming_history_path = Path(f"{resources_path}/sploty_featured_history.csv")
metrics_streaming_history_path = Path(f"{resources_path}/sploty_metrics_history.csv")

db_path = args.db_path
audio_features_db_path = Path(f"{db_path}/tracks.json")
Expand Down Expand Up @@ -117,12 +119,18 @@ def main() -> None:
audio_features.main(
to_enrich_streaming_history_path,
enriched_streaming_history_path,
featured_streaming_history_path,
args.chunk_size,
spotify_api_params,
db,
)
else:
logger.info("skip")
logger.info("============== METRICS =============")
if args.metric:
metrics.main(featured_streaming_history_path, metrics_streaming_history_path)
else:
logger.info("skip")
logger.info("============== ELASTIC =============")
if args.elastic:
elastic = to_elastic.get_elastic(
Expand All @@ -131,7 +139,7 @@ def main() -> None:
env.elastic_pass,
args.elastic_timeout,
)
to_elastic.main(enriched_streaming_history_path, args.index_name, elastic)
to_elastic.main(metrics_streaming_history_path, args.index_name, elastic)
else:
logger.info("skip")

Expand Down
11 changes: 9 additions & 2 deletions sploty/audio_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,14 @@ def completes_streams_with_audio_features(df_left, left_key, df_right, right_key
# len(db.all())#noqa: ERA001


def main(to_enrich_path: list, enriched_path: str, chunk_size: int, spotify_api_params: SpotifyApiParams, db: TinyDB):
def main( # noqa: PLR0913
to_enrich_path: list,
enriched_path: str,
featured_path: str,
chunk_size: int,
spotify_api_params: SpotifyApiParams,
db: TinyDB,
):
# get the audio features of tracks saved it in the TinyDb
df_stream = pd.read_csv(to_enrich_path)
logger.info("%i streams", len(df_stream))
Expand Down Expand Up @@ -122,7 +129,7 @@ def main(to_enrich_path: list, enriched_path: str, chunk_size: int, spotify_api_
df_audio_features,
"id",
)
df_completed_streams.to_csv(enriched_path, mode="w", index=False)
df_completed_streams.to_csv(featured_path, mode="w", index=False)
logger.info(
"%i rows are re-saved at %s with audio features completed",
len(df_completed_streams),
Expand Down
71 changes: 71 additions & 0 deletions sploty/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import logging
from pathlib import Path

import pandas as pd
from pandas import DatetimeIndex

logger = logging.getLogger(__name__)


# Lower-cased platform prefixes mapped to the display name they normalize to.
# Built once at import time because `normalize_platform` is applied per row.
_PLATFORM_PREFIXES = {
    "android os": "Android OS",
    "android [arm 0]": "Android OS",
    "android-tablet os": "Android OS",
    "partner android_tv": "Android TV",
    "partner google cast": "Chromecast",
    "ios": "iOS",
    "partner ios": "iOS",
    "osx": "MacOS",
    "os x": "MacOS",
    "sonos_": "Sonos",
    "partner sonos": "Sonos",
    "webos tv": "WebOS TV",
    "partner webos_tv": "WebOS TV",
    "webplayer": "WebPlayer",
    "web_player": "WebPlayer",
    "partner spotify web_player": "WebPlayer",
    "windows": "Windows",
    "not_applicable": "not_applicable",
}


def normalize_platform(platform: str) -> str:
    """Map a raw platform string to a short, normalized platform name.

    The match is a case-insensitive prefix match against the keys of
    ``_PLATFORM_PREFIXES``.

    Args:
        platform: Raw platform value of a stream.

    Returns:
        The normalized name of the first matching prefix. When nothing
        matches, a warning is logged and ``platform`` is returned unchanged.
        Non-string values (e.g. the NaN pandas uses for an empty CSV cell)
        are returned unchanged without logging.
    """
    # Missing values reach this function as float NaN / None, which have no
    # `.lower()`; pass them through instead of raising AttributeError.
    if not isinstance(platform, str):
        return platform

    lowered = platform.lower()
    matches = [name for prefix, name in _PLATFORM_PREFIXES.items() if lowered.startswith(prefix)]

    if not matches:
        logger.warning("There is no match for the `%s` platform", platform)
        return platform

    if len(matches) > 1:
        logger.warning(
            "There are several matches for the `%s` platform: %s (the first one is taken)",
            platform,
            matches,
        )
    return matches[0]


def main(enriched_path: Path, metrics_path: Path) -> None:
    """Add derived metric columns to the streaming history and save it as CSV.

    Reads the CSV at ``enriched_path``, adds date parts, play-time metrics,
    first-occurrence flags and a normalized platform, then overwrites
    ``metrics_path`` with the result.

    The input must contain the columns ``end_time``, ``ms_played``,
    ``track_duration_ms``, ``track_uri``, ``artist_uri``, ``album_uri``,
    ``platform`` and ``skipped``.

    Args:
        enriched_path: CSV file to read the streams from.
        metrics_path: CSV file to write (overwritten if it exists).
    """
    df_stream = pd.read_csv(enriched_path)

    # Parse the timestamps once and reuse the index for every date part.
    end_time = DatetimeIndex(df_stream.end_time)
    df_stream["year"] = end_time.year.map(lambda x: f"{x:0>4}")
    df_stream["month"] = end_time.month.map(lambda x: f"{x:0>2}")
    df_stream["month_name"] = end_time.month_name()
    df_stream["day"] = end_time.day.map(lambda x: f"{x:0>2}")
    df_stream["day_of_week"] = end_time.day_of_week.map(lambda x: f"{x:0>2}")
    df_stream["day_name"] = end_time.day_name()
    df_stream["hour"] = end_time.hour.map(lambda x: f"{x:0>2}")
    df_stream["minute"] = end_time.minute.map(lambda x: f"{x:0>2}")
    # The "0>4" / "0>2" spelling is used rather than ":04";
    # the ":04" writing is fixed in Python 3.10+ : https://stackoverflow.com/a/36044788

    df_stream["min_played"] = df_stream.ms_played / 1000 / 60

    # Share of the track that was played, rounded to 2 decimals and bounded to [0, 100].
    df_stream["percentage_played"] = round((df_stream.ms_played / df_stream.track_duration_ms) * 100, 2)
    df_stream["percentage_played"] = df_stream["percentage_played"].clip(0, 100)

    # True on the first row (in file order) where each URI appears.
    # NOTE(review): presumably the rows are in chronological order — confirm upstream.
    df_stream["is_new_track"] = ~df_stream["track_uri"].duplicated(keep="first")
    df_stream["is_new_artist"] = ~df_stream["artist_uri"].duplicated(keep="first")
    df_stream["is_new_album"] = ~df_stream["album_uri"].duplicated(keep="first")

    df_stream["normalized_platform"] = df_stream["platform"].apply(normalize_platform)

    # A bare `astype(bool)` turns missing values into True (NaN is truthy),
    # which would count streams with an unknown status as skipped.
    # Missing values are treated as "not skipped" instead.
    skipped = df_stream["skipped"]
    df_stream["skipped"] = skipped.notna() & skipped.astype(bool)

    df_stream.to_csv(metrics_path, mode="w", index=False)

0 comments on commit 12b8812

Please sign in to comment.