Skip to content

Commit 06c4a70

Browse files
committed
✨ Add new metrics
1 parent 89ba00a commit 06c4a70

File tree

5 files changed

+128
-42
lines changed

5 files changed

+128
-42
lines changed

README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ The app will :
9292
4. Enrich spotify audio features with `sploty/audio_features.py`
9393
- The Spotify API is used at this stage, don't forget to [configure it](#spotify)
9494
- A `json database` ([TinyDB](https://github.com/msiemens/tinydb)) is used at this stage to reduce Spotify API calls by storing tracks data
95-
5. Index their to elastic with `sploty/to_elastic.py`
95+
5. Add additional metrics with `sploty/metrics.py`
96+
6. Index them into Elasticsearch with `sploty/to_elastic.py`
9697
- Elasticsearch is used at this stage, don't forget to [configure it](#elasticsearch)
9798

9899
#### How to display the help message?
@@ -116,7 +117,7 @@ poetry run python sploty/app.py … --previous-enriched-streaming-history-path y
116117
Use the `--no-<part>` options
117118

118119
```shell
119-
poetry run python sploty/app.py … --no-concat --no-filter --no-enrich --no-feature --no-elastic
120+
poetry run python sploty/app.py … --no-concat --no-filter --no-enrich --no-feature --no-metric --no-elastic
120121
```
121122

122123
#### How to increase or reduce the number of lines processed at once?

elastic/README_ELASTIC.md

Lines changed: 30 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,19 @@ PUT _ingest/pipeline/spotify-stream-pipeline
1717
{"rename": { "field": "track_duration_ms", "target_field": "track.duration_ms", "ignore_missing": true }},
1818
{"rename": { "field": "track_popularity", "target_field": "track.popularity", "ignore_missing": true }},
1919
{"rename": { "field": "track_is_in_library", "target_field": "track.is_in_library", "ignore_missing": true }},
20-
{"rename": { "field": "track_is_unplayable", "target_field": "track.is_unplayable", "ignore_missing": true }},
20+
{"rename": { "field": "track_is_explicit", "target_field": "track.is_explicit", "ignore_missing": true }},
21+
{"rename": { "field": "track_is_local", "target_field": "track.is_local", "ignore_missing": true }},
22+
{"rename": { "field": "track_is_playable", "target_field": "track.is_playable", "ignore_missing": true }},
2123
{"rename": { "field": "album_uri", "target_field": "album.uri", "ignore_missing": true }},
24+
{"rename": { "field": "album_name", "target_field": "album.name", "ignore_missing": true }},
25+
{"rename": { "field": "album_type", "target_field": "album.type", "ignore_missing": true }},
26+
{"rename": { "field": "album_release_date", "target_field": "album.release_date", "ignore_missing": true }},
2227
{"rename": { "field": "stream_username", "target_field": "stream_context.username", "ignore_missing": true }},
2328
{"rename": { "field": "stream_platform", "target_field": "stream_context.platform", "ignore_missing": true }},
29+
{"rename": { "field": "stream_normalized_platform", "target_field": "stream_context.normalized_platform", "ignore_missing": true }},
2430
{"rename": { "field": "stream_conn_country", "target_field": "stream_context.conn_country", "ignore_missing": true }},
2531
{"rename": { "field": "stream_ip_addr_decrypted", "target_field": "stream_context.ip_addr_decrypted", "ignore_missing": true }},
2632
{"rename": { "field": "stream_user_agent_decrypted", "target_field": "stream_context.user_agent_decrypted", "ignore_missing": true }},
27-
{"rename": { "field": "stream_album_name", "target_field": "album.name", "ignore_missing": true }},
2833
{"rename": { "field": "stream_reason_start", "target_field": "stream_context.reason_start", "ignore_missing": true }},
2934
{"rename": { "field": "stream_reason_end", "target_field": "stream_context.reason_end", "ignore_missing": true }},
3035
{"rename": { "field": "stream_shuffle", "target_field": "stream_context.shuffle", "ignore_missing": true }},
@@ -44,36 +49,18 @@ PUT _ingest/pipeline/spotify-stream-pipeline
4449
{"rename": { "field": "track_audio_feature_valence", "target_field": "audio_features.valence", "ignore_missing": true }},
4550
{"rename": { "field": "track_audio_feature_tempo", "target_field": "audio_features.tempo", "ignore_missing": true }},
4651
{"rename": { "field": "track_audio_feature_time_signature", "target_field": "audio_features.time_signature", "ignore_missing": true }},
47-
{"remove": { "field": ["track_src_id", "location"], "ignore_missing": true }},
4852
{"user_agent": { "field": "stream_context.user_agent_decrypted", "ignore_missing": true }},
49-
{
50-
"script": {
51-
"source": """
52-
String platform = ctx['stream_context']['platform'];
53-
String lcp = platform.toLowerCase();
54-
if (lcp.startsWith('ios') || lcp.startsWith('partner ios_sdk')) {
55-
platform = 'iOS';
56-
} else if (lcp.startsWith('os x') || lcp.startsWith('osx')) {
57-
platform = 'OS X';
58-
} else if (lcp.startsWith('partner sonos_')) {
59-
platform = 'Sonos';
60-
} else if (lcp.startsWith('partner google cast_tv') || lcp.startsWith('partner google cast')) {
61-
platform = 'Chromecast';
62-
} else if (lcp.startsWith('partner android_tv')) {
63-
platform = 'Android TV';
64-
} else if (lcp.startsWith('android os') || lcp.startsWith('android [arm 0]')|| lcp.startsWith('android-tablet os')) {
65-
platform = 'Android OS';
66-
} else if (lcp.startsWith('webplayer') || lcp.startsWith('web_player') || lcp.startsWith('partner spotify web_player')) {
67-
platform = 'WebPlayer';
68-
} else if (lcp.startsWith('partner webos_tv') || lcp.startsWith('WebOs TV')) {
69-
platform = 'WebOs TV';
70-
} else if (lcp.startsWith('windows')) {
71-
platform = 'Windows';
72-
}
73-
ctx['stream_context']['normalized_platform'] = platform
74-
"""
75-
}
76-
}
53+
{"convert": { "field": "stream_context.offline", "type": "boolean", "ignore_missing": true }},
54+
{"convert": { "field": "stream_context.shuffle", "type": "boolean", "ignore_missing": true }},
55+
{"convert": { "field": "stream_context.incognito_mode", "type": "boolean", "ignore_missing": true }},
56+
{"convert": { "field": "stream_context.skipped", "type": "boolean", "ignore_missing": true }},
57+
{"convert": { "field": "track.is_in_library", "type": "boolean", "ignore_missing": true }},
58+
{"convert": { "field": "track.is_explicit", "type": "boolean", "ignore_missing": true }},
59+
{"convert": { "field": "track.is_local", "type": "boolean", "ignore_missing": true }},
60+
{"convert": { "field": "track.is_playable", "type": "boolean", "ignore_missing": true }},
61+
{"convert": { "field": "is_new_track", "type": "boolean", "ignore_missing": true }},
62+
{"convert": { "field": "is_new_artist", "type": "boolean", "ignore_missing": true }},
63+
{"convert": { "field": "is_new_album", "type": "boolean", "ignore_missing": true }}
7764
]
7865
}
7966
```
@@ -101,7 +88,8 @@ PUT _component_template/spotify-stream-mapping
10188
"danceability": {"type": "integer"},
10289
"key": {"type": "integer"},
10390
"speechiness": {"type": "integer"},
104-
"energy": {"type": "integer"}
91+
"energy": {"type": "integer"},
92+
"time_signature": {"type": "integer"}
10593
}
10694
},
10795
"stream_context": {
@@ -132,7 +120,9 @@ PUT _component_template/spotify-stream-mapping
132120
"album": {
133121
"properties": {
134122
"name": {"type": "keyword"},
135-
"uri": {"type": "keyword" }
123+
"uri": {"type": "keyword"},
124+
"type": {"type": "keyword"},
125+
"release_date": {"format": "year||year_month||year_month_day", "type": "date"}
136126
}
137127
},
138128
"track": {
@@ -142,15 +132,20 @@ PUT _component_template/spotify-stream-mapping
142132
"name": {"type": "keyword"},
143133
"uri": {"type": "keyword"},
144134
"is_in_library": {"type": "boolean"},
145-
"is_unplayable": {"type": "boolean" }
135+
"is_explicit": {"type": "boolean"},
136+
"is_local": {"type": "boolean"},
137+
"is_playable": {"type": "boolean"}
146138
}
147139
},
148140
"end_time": {"format": "yyyy-MM-dd'T'HH:mm:ss'Z'", "type": "date"},
149141
"percentage_played": {"type": "long"},
150142
"day_name": {"type": "keyword"},
151143
"month_name": {"type": "keyword"},
152144
"ms_played": {"type": "long"},
153-
"min_played": {"type": "long"}
145+
"min_played": {"type": "long"},
146+
"is_new_track": {"type": "boolean"},
147+
"is_new_artist": {"type": "boolean"},
148+
"is_new_album": {"type": "boolean"}
154149
}
155150
}
156151
}

sploty/app.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pydantic import Field, HttpUrl, v1
77
from pydantic_settings import BaseSettings
88

9-
from sploty import audio_features, concat, enrich, filter, to_elastic
9+
from sploty import audio_features, concat, enrich, filter, metrics, to_elastic
1010
from sploty.settings import logger
1111

1212

@@ -23,6 +23,7 @@ class Arguments(v1.BaseModel):
2323
filter: bool = v1.Field(default=True)
2424
enrich: bool = v1.Field(default=True)
2525
feature: bool = v1.Field(default=True)
26+
metric: bool = v1.Field(default=True)
2627
elastic: bool = v1.Field(default=True)
2728

2829

@@ -55,7 +56,8 @@ def main() -> None:
5556
concated_streaming_history_path = Path(f"{resources_path}/sploty_concated_history.csv")
5657
to_enrich_streaming_history_path = Path(f"{resources_path}/sploty_filtered_history.csv")
5758
enriched_streaming_history_path = Path(f"{resources_path}/sploty_enriched_history.csv")
58-
featured_streaming_history_path = Path(f"{resources_path}/sploty_featured_history.csv") # noqa: F841
59+
featured_streaming_history_path = Path(f"{resources_path}/sploty_featured_history.csv")
60+
metrics_streaming_history_path = Path(f"{resources_path}/sploty_metrics_history.csv")
5961

6062
db_path = args.db_path
6163
audio_features_db_path = Path(f"{db_path}/tracks.json")
@@ -117,12 +119,18 @@ def main() -> None:
117119
audio_features.main(
118120
to_enrich_streaming_history_path,
119121
enriched_streaming_history_path,
122+
featured_streaming_history_path,
120123
args.chunk_size,
121124
spotify_api_params,
122125
db,
123126
)
124127
else:
125128
logger.info("skip")
129+
logger.info("============== METRICS =============")
130+
if args.metric:
131+
metrics.main(featured_streaming_history_path, metrics_streaming_history_path)
132+
else:
133+
logger.info("skip")
126134
logger.info("============== ELASTIC =============")
127135
if args.elastic:
128136
elastic = to_elastic.get_elastic(
@@ -131,7 +139,7 @@ def main() -> None:
131139
env.elastic_pass,
132140
args.elastic_timeout,
133141
)
134-
to_elastic.main(enriched_streaming_history_path, args.index_name, elastic)
142+
to_elastic.main(metrics_streaming_history_path, args.index_name, elastic)
135143
else:
136144
logger.info("skip")
137145

sploty/audio_features.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,14 @@ def completes_streams_with_audio_features(df_left, left_key, df_right, right_key
8989
# len(db.all())#noqa: ERA001
9090

9191

92-
def main(to_enrich_path: list, enriched_path: str, chunk_size: int, spotify_api_params: SpotifyApiParams, db: TinyDB):
92+
def main( # noqa: PLR0913
93+
to_enrich_path: list,
94+
enriched_path: str,
95+
featured_path: str,
96+
chunk_size: int,
97+
spotify_api_params: SpotifyApiParams,
98+
db: TinyDB,
99+
):
93100
# get the audio features of tracks saved it in the TinyDb
94101
df_stream = pd.read_csv(to_enrich_path)
95102
logger.info("%i streams", len(df_stream))
@@ -122,7 +129,7 @@ def main(to_enrich_path: list, enriched_path: str, chunk_size: int, spotify_api_
122129
df_audio_features,
123130
"id",
124131
)
125-
df_completed_streams.to_csv(enriched_path, mode="w", index=False)
132+
df_completed_streams.to_csv(featured_path, mode="w", index=False)
126133
logger.info(
127134
"%i rows are re-saved at %s with audio features completed",
128135
len(df_completed_streams),

sploty/metrics.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import logging
2+
from pathlib import Path
3+
4+
import pandas as pd
5+
from pandas import DatetimeIndex
6+
7+
logger = logging.getLogger(__name__)
8+
9+
10+
# Lower-case platform prefixes mapped to the label reported for them.
# Built once at import time because `normalize_platform` is applied per row.
_PLATFORM_PREFIXES = {
    "android os": "Android OS",
    "android [arm 0]": "Android OS",
    "android-tablet os": "Android OS",
    "partner android_tv": "Android TV",
    "partner google cast": "Chromecast",
    "ios": "iOS",
    "partner ios": "iOS",
    "osx": "MacOS",
    "os x": "MacOS",
    "sonos_": "Sonos",
    "partner sonos": "Sonos",
    "webos tv": "WebOS TV",
    "partner webos_tv": "WebOS TV",
    "webplayer": "WebPlayer",
    "web_player": "WebPlayer",
    "partner spotify web_player": "WebPlayer",
    "windows": "Windows",
    "not_applicable": "not_applicable",
}


def normalize_platform(platform: str):
    """Map a raw platform string to a short, canonical platform label.

    The comparison is case-insensitive and prefix based: the label of the
    first known prefix that ``platform`` starts with is returned.

    Args:
        platform: Raw platform description of a stream.

    Returns:
        The canonical label when a known prefix matches. ``platform`` itself
        is returned unchanged when nothing matches (a warning is logged) or
        when it is not a string (e.g. the NaN pandas uses for a missing cell).
    """
    if not isinstance(platform, str):
        # Missing values read from a CSV arrive as float NaN, which has no `.lower()`.
        return platform

    lowered = platform.lower()
    matches = [label for prefix, label in _PLATFORM_PREFIXES.items() if lowered.startswith(prefix)]

    if not matches:
        logger.warning("There is no match for the `%s` platform", platform)
        return platform
    if len(matches) > 1:
        logger.warning(
            "There are several matches for the `%s` platform: %s (the first one is taken)",
            platform,
            matches,
        )
    return matches[0]
43+
44+
45+
def main(enriched_path: Path, metrics_path: Path) -> None:
    """Add derived metric columns to the streaming history and save the result.

    Reads the CSV at ``enriched_path``, then adds or overwrites these columns:

    * ``year``, ``month``, ``day``, ``day_of_week``, ``hour``, ``minute``:
      zero-padded strings derived from ``end_time``;
    * ``month_name``, ``day_name``: names derived from ``end_time``;
    * ``min_played``: ``ms_played`` converted to minutes;
    * ``percentage_played``: share of ``track_duration_ms`` that was played,
      rounded to two decimals and clipped to the 0-100 range;
    * ``is_new_track``, ``is_new_artist``, ``is_new_album``: ``True`` on the
      first row in which the corresponding URI appears;
    * ``normalized_platform``: ``platform`` passed through ``normalize_platform``;
    * ``skipped``: cast to boolean, a missing value counting as ``False``.

    Args:
        enriched_path: CSV file to read.
        metrics_path: CSV file to write (overwritten, without the index).
    """
    df_stream = pd.read_csv(enriched_path)

    # Parse `end_time` once and reuse it for every calendar column.
    end_times = DatetimeIndex(df_stream.end_time)
    df_stream["year"] = end_times.year.map(lambda x: f"{x:0>4}")
    df_stream["month"] = end_times.month.map(lambda x: f"{x:0>2}")
    df_stream["month_name"] = end_times.month_name()
    df_stream["day"] = end_times.day.map(lambda x: f"{x:0>2}")
    df_stream["day_of_week"] = end_times.day_of_week.map(lambda x: f"{x:0>2}")
    df_stream["day_name"] = end_times.day_name()
    df_stream["hour"] = end_times.hour.map(lambda x: f"{x:0>2}")
    df_stream["minute"] = end_times.minute.map(lambda x: f"{x:0>2}")
    # ":04" writing is fixed in Python 3.10+ : https://stackoverflow.com/a/36044788

    df_stream["min_played"] = df_stream.ms_played / 1000 / 60

    df_stream["percentage_played"] = round((df_stream.ms_played / df_stream.track_duration_ms) * 100, 2)
    df_stream["percentage_played"] = df_stream["percentage_played"].clip(0, 100)

    # "New" means first occurrence in file order.
    # NOTE(review): presumably the rows are sorted chronologically - confirm against the upstream steps.
    df_stream["is_new_track"] = ~df_stream["track_uri"].duplicated(keep="first")
    df_stream["is_new_artist"] = ~df_stream["artist_uri"].duplicated(keep="first")
    df_stream["is_new_album"] = ~df_stream["album_uri"].duplicated(keep="first")

    # NOTE(review): the ingest pipeline in elastic/README_ELASTIC.md renames
    # `stream_normalized_platform`; confirm the column names used here match it.
    df_stream["normalized_platform"] = df_stream["platform"].apply(normalize_platform)

    # A bare `astype(bool)` turns NaN into True, which would flag every stream
    # with an unknown `skipped` value as skipped; treat missing values as False.
    skipped = df_stream["skipped"]
    df_stream["skipped"] = skipped.notna() & skipped.astype(bool)

    # track_is_in_library
    # artist_genres
    # artist_popularity

    df_stream.to_csv(metrics_path, mode="w", index=False)

0 commit comments

Comments
 (0)