import logging
from core.configuration import config
logging.basicConfig(level=config.VERBOSITY)

import glob
import traceback
import time
from datetime import datetime
from os import path
from json import load, dump
from typing import Optional, Dict, List, Any, Tuple, Union

from mutagen import File, FileType, MutagenError
from openpyxl import Workbook
from fuzzywuzzy.fuzz import partial_ratio, UQRatio
from fuzzywuzzy.process import extractOne
from youtubesearchpython import SearchVideos

from core.library import LibraryFile
from core.scrobble import ExtendedScrobble, TrackSourceType, RawScrobble
from core.utilities import youtube_length_to_sec, TimedContext, generate_random_filename_safe_text
from core.musicbrainz import ReleaseTrack
from core.genres import fetch_genre_by_metadata
from core.prevent_sleep import inhibit, uninhibit
from core.state import LibraryCacheState, SearchCacheState, StatisticsState, AnalysisState

log = logging.getLogger(__name__)

TypeRawLibraryCache = Dict[
    str,
    Union[
        Dict[str, List[LibraryFile]],
        Dict[str, LibraryFile]
    ]
]
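
# A minimal sketch of the shape TypeRawLibraryCache describes (hypothetical
# values, shown for illustration only):
#
#     {
#         "cache_by_album": {"Some Album": [<LibraryFile>, ...]},
#         "cache_by_artist": {"Some Artist": [<LibraryFile>, ...]},
#         "cache_by_track_title": {"Some Track": [<LibraryFile>, ...]},
#         "cache_by_track_mbid": {"7f2b0b2e-some-mbid": <LibraryFile>},
#     }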

##
# Cache the music library
##
def find_music_library_files(root_dir: str) -> List[str]:
    """
    Given a root directory, build a list of audio file paths.

    Args:
        root_dir:
            Root directory to start in. Includes subdirectories.

    Returns:
        A list of strings containing audio file paths.
    """
    globs: List[str] = [
        path.join(root_dir, f"**/*.{ext_glob}")
        for ext_glob in ("mp3", "ogg", "wav", "flac", "m4a")
    ]

    files: List[str] = []
    for g in globs:
        files.extend(glob.glob(g, recursive=True))

    files.sort()
    return files
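
# Usage sketch (hypothetical path; performs real filesystem globbing):
#
#     file_list = find_music_library_files("/home/user/Music")
#     # -> ["/home/user/Music/Album/01 Track.flac", ...], sorted alphabetically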

def build_library_metadata_cache(file_list: List[str]) -> TypeRawLibraryCache:
    """
    Given a list of audio file paths, load each file's metadata and index it
    by album, artist, track title and track MBID.
    """
    by_album: Dict[str, List[LibraryFile]] = {}
    by_artist: Dict[str, List[LibraryFile]] = {}
    by_track_title: Dict[str, List[LibraryFile]] = {}
    by_track_mbid: Dict[str, LibraryFile] = {}

    files_successful = 0
    files_failed = 0

    counter = 0
    for audio_file in file_list:
        # Load file metadata
        try:
            # TODO this currently supports mutagen easy tags, so pretty much only MP3 and MP4 are guaranteed
            #   Look into support for non-easy mutagen tags
            #   (seems like we might only need to map each type's tags with something like a dict?)
            mutagen_file: Optional[FileType] = File(audio_file, easy=True)
        except MutagenError as e:
            # Failed to load the file, skip it
            log.warning(f"Failed to load audio file ({e}): \"{audio_file}\"")
            traceback.print_exc()
            files_failed += 1
            continue

        if mutagen_file is None:
            log.warning(f"Audio file type could not be determined: \"{audio_file}\"")
            files_failed += 1
            continue

        # Only count the file as successful once its type has been determined
        files_successful += 1

        lib_file: LibraryFile = LibraryFile.from_mutagen(mutagen_file)

        if lib_file.album_name is not None:
            if lib_file.album_name not in by_album:
                by_album[lib_file.album_name] = [lib_file]
            else:
                by_album[lib_file.album_name].append(lib_file)
        if lib_file.artist_name is not None:
            if lib_file.artist_name not in by_artist:
                by_artist[lib_file.artist_name] = [lib_file]
            else:
                by_artist[lib_file.artist_name].append(lib_file)
        if lib_file.track_title is not None:
            if lib_file.track_title not in by_track_title:
                by_track_title[lib_file.track_title] = [lib_file]
            else:
                by_track_title[lib_file.track_title].append(lib_file)
        if lib_file.track_mbid is not None:
            by_track_mbid[lib_file.track_mbid] = lib_file

        # Log progress
        counter += 1
        if counter % config.CACHE_LOG_INTERVAL == 0:
            log.info(f"Caching progress: {counter} files")

    log.info(f"Processed {files_successful} audio files ({files_failed} failed).")

    return {
        "cache_by_album": by_album,
        "cache_by_artist": by_artist,
        "cache_by_track_title": by_track_title,
        "cache_by_track_mbid": by_track_mbid,
    }

def load_library_metadata() -> TypeRawLibraryCache:
    """
    Loads the cached version of the music library. The cache is saved in the configurable cache directory.
    Recreates LibraryFile instances from the data.

    Returns:
        A TypeRawLibraryCache dictionary with the LibraryFile instances reconstructed.
    """
    with open(config.LIBRARY_CACHE_FILE, "r", encoding="utf8") as lib_file:
        raw = load(lib_file)

    # Convert back into LibraryFile instances
    def instance_libraryfiles_from_list(full: Dict[str, List[Any]]) -> Dict[str, List[LibraryFile]]:
        return {
            k: [LibraryFile(**lib_f) for lib_f in v] for k, v in full.items()
        }

    def instance_libraryfiles_from_single(full: Dict[str, dict]) -> Dict[str, LibraryFile]:
        return {
            k: LibraryFile(**v) for k, v in full.items()
        }

    return {
        "cache_by_album": instance_libraryfiles_from_list(raw["cache_by_album"]),
        "cache_by_artist": instance_libraryfiles_from_list(raw["cache_by_artist"]),
        "cache_by_track_title": instance_libraryfiles_from_list(raw["cache_by_track_title"]),
        "cache_by_track_mbid": instance_libraryfiles_from_single(raw["cache_by_track_mbid"]),
    }

def save_library_metadata(raw_cache: TypeRawLibraryCache) -> None:
    """
    Serializes the library cache and saves it as JSON to the configured cache file.
    """
    def serialize_libraryfiles_list(full: Dict[str, List[LibraryFile]]) -> Dict[str, List[str]]:
        return {
            k: [lib_f.dump() for lib_f in v] for k, v in full.items()
        }

    def serialize_libraryfiles_single(full: Dict[str, LibraryFile]) -> Dict[str, str]:
        return {
            k: v.dump() for k, v in full.items()
        }

    dumped = {
        "cache_by_album": serialize_libraryfiles_list(raw_cache["cache_by_album"]),
        "cache_by_artist": serialize_libraryfiles_list(raw_cache["cache_by_artist"]),
        "cache_by_track_title": serialize_libraryfiles_list(raw_cache["cache_by_track_title"]),
        "cache_by_track_mbid": serialize_libraryfiles_single(raw_cache["cache_by_track_mbid"]),
    }

    with open(config.LIBRARY_CACHE_FILE, "w", encoding="utf8") as lib_file:
        dump(
            dumped,
            lib_file,
            ensure_ascii=False,
        )
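
# Round-trip sketch (assumes LibraryFile.dump() emits a JSON-serializable dict
# that LibraryFile(**data) accepts back, which the loaders above rely on):
#
#     save_library_metadata(raw_cache)
#     restored = load_library_metadata()  # equivalent cache, re-read from disk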

def ensure_library_cache(state: AnalysisState) -> None:
    """
    Makes sure the music library cache exists.
    Generates one if needed, otherwise loads it from file.
    Updates the passed state with the music library cache.

    Args:
        state:
            AnalysisState instance into which to save the new LibraryCacheState instance.
            (AnalysisState's library_cache attribute is updated)
    """
    # Build the audio metadata cache if needed (otherwise just load the JSON cache)
    # TODO detect changes in the music library!
    # TODO switch to ignore cache? (deleting the cache file also works for now)
    # If a cache already exists, load it
    if path.isfile(config.LIBRARY_CACHE_FILE):
        log.info("Local music library cache found, loading.")
        raw_cache = load_library_metadata()
        log.info("Local music library cache loaded.")
    elif config.LIBRARY_CACHE_FILE not in (None, ""):
        log.info("No cache found, generating.")

        log.info("Collecting audio files...")
        file_list: List[str] = find_music_library_files(config.MUSIC_LIBRARY_ROOT)
        log.info(f"Collected {len(file_list)} audio files.")

        log.info("Building local music library cache...")
        raw_cache = build_library_metadata_cache(file_list)
        save_library_metadata(raw_cache)
    else:
        # Empty dicts (not lists), matching TypeRawLibraryCache
        raw_cache = {"cache_by_album": {}, "cache_by_artist": {}, "cache_by_track_title": {}, "cache_by_track_mbid": {}}
        log.info("Local music library search is disabled.")

    # At this point, raw_cache has all the stuff we need,
    # so we separate it into smaller chunks and save each separately into our state.
    # This is done with LibraryCacheState.set_from_raw_cache.
    library_state: LibraryCacheState = LibraryCacheState()
    library_state.set_from_raw_cache(raw_cache)

    # Update the AnalysisState's library_cache ref and we're done here!
    state.library_cache = library_state

##
# Scrobbles
##
def load_scrobbles(state: AnalysisState) -> None:
    """
    Load the scrobbles file into JSON and flatten it.
    Updates state with the loaded scrobble data.

    Args:
        state:
            AnalysisState instance to save the scrobbles into.
            (AnalysisState's raw_scrobbles attribute is updated)
    """
    def load_and_flatten(json_file_path: str) -> List:
        with open(json_file_path, "r", encoding="utf8") as scrobbles_file:
            scrobbles_raw = load(scrobbles_file)

        # Flatten scrobble pages into a big list
        flattened = [item for sublist in scrobbles_raw for item in sublist]
        return flattened

    # TODO option to filter scrobbles by date (from, to)

    # Flatten and save into state
    scrobbles = load_and_flatten(config.SCROBBLES_JSON_PATH)
    state.raw_scrobbles = scrobbles

    log.info(f"{len(scrobbles)} scrobbles loaded.")

##
# Extended data
##

# Define search functions
def find_by_mbid(library_cache: LibraryCacheState, raw_scrobble: RawScrobble) -> Optional[ExtendedScrobble]:
    """
    Try to find an exact MusicBrainz track ID match in our local music library.

    Args:
        library_cache:
            LibraryCacheState instance.
        raw_scrobble:
            RawScrobble instance.

    Returns:
        If found, an ExtendedScrobble instance formed from the matched library track. Otherwise None.
    """
    library_track = library_cache.cache_by_track_mbid.get(raw_scrobble.track_mbid)

    if library_track is None:
        return None
    else:
        return ExtendedScrobble.from_library_track(raw_scrobble, library_track, TrackSourceType.LOCAL_LIBRARY_MBID)

def find_by_metadata_full_match(
        library_cache: LibraryCacheState, raw_scrobble: RawScrobble
) -> Optional[ExtendedScrobble]:
    """
    Try to find an exact metadata match in our local music library.

    Args:
        library_cache:
            LibraryCacheState instance.
        raw_scrobble:
            RawScrobble instance.

    Returns:
        If found, an ExtendedScrobble instance formed from the matched library track. Otherwise None.
    """
    if raw_scrobble.track_title in library_cache.cache_by_track_title:
        # TODO add mixed exact and partial match?
        #   (e.g. exact title and artist match, then partial album)
        #   Maybe separate the track, album and artist stages?

        # First, match by track title, then filter by artist and album if possible
        track_matches: List[LibraryFile] = library_cache.cache_by_track_title[raw_scrobble.track_title]
        if len(track_matches) < 1:
            return None
        elif len(track_matches) == 1:
            return ExtendedScrobble.from_library_track(
                raw_scrobble, track_matches[0], TrackSourceType.LOCAL_LIBRARY_METADATA_EXACT
            )

        # Then: by artist
        track_matches = [m for m in track_matches if m.artist_name == raw_scrobble.artist_name]
        if len(track_matches) < 1:
            return None
        elif len(track_matches) == 1:
            return ExtendedScrobble.from_library_track(
                raw_scrobble, track_matches[0], TrackSourceType.LOCAL_LIBRARY_METADATA_EXACT
            )

        # Lastly: by album
        track_matches = [m for m in track_matches if m.album_name == raw_scrobble.album_title]
        if len(track_matches) < 1:
            return None
        elif len(track_matches) == 1:
            return ExtendedScrobble.from_library_track(
                raw_scrobble, track_matches[0], TrackSourceType.LOCAL_LIBRARY_METADATA_EXACT
            )
        else:
            # Still multiple matches
            # TODO should this even return None?
            #   Multiple matches would indicate duplicate files, so I'm not sure this should even return None,
            #   maybe just return the first match?
            log.warning(f"Multiple matches when trying full metadata match, returning None. "
                        f"(\"{raw_scrobble}\" fully matches these tracks: {track_matches})")
            return None

    return None
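
# Narrowing sketch for the exact-match cascade above (hypothetical data): the
# candidate set shrinks stage by stage and we stop as soon as exactly one remains:
#
#     title "Intro"      -> 3 candidate files
#     artist "Artist A"  -> 2 candidate files
#     album "Album X"    -> 1 candidate file -> ExtendedScrobble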

def find_by_metadata_partial_match(
        search_cache: SearchCacheState,
        library_cache: LibraryCacheState,
        raw_scrobble: RawScrobble
) -> Optional[ExtendedScrobble]:
    """
    Try to find a partial metadata match in our local music library.

    Args:
        search_cache:
            SearchCacheState instance.
        library_cache:
            LibraryCacheState instance.
        raw_scrobble:
            RawScrobble instance.

    Returns:
        If found, an ExtendedScrobble instance formed from the matched library track. Otherwise None.
    """
    # Use the cached result if possible
    caching_tuple = (raw_scrobble.track_title, raw_scrobble.album_title, raw_scrobble.artist_name)
    if caching_tuple in search_cache.local_by_partial_metadata:
        log.debug("find_by_metadata_partial_match: cache hit")
        track: Optional[LibraryFile] = search_cache.local_by_partial_metadata[caching_tuple]

        if track is None:
            return None
        else:
            return ExtendedScrobble.from_library_track(
                raw_scrobble, track, TrackSourceType.LOCAL_LIBRARY_METADATA_PARTIAL
            )

    log.debug("find_by_metadata_partial_match: cache miss")

    # Start by filtering to the closest artist name match
    best_artist_match: Optional[Tuple[str, int]] = extractOne(
        raw_scrobble.artist_name,
        library_cache.cache_list_of_artists,
        scorer=UQRatio,
        score_cutoff=config.FUZZY_MIN_ARTIST
    )

    # Edge case: if no match can be found, we should stop
    if best_artist_match is None:
        search_cache.local_by_partial_metadata[caching_tuple] = None
        return None

    # Otherwise, build a list of LibraryFiles for further filtering
    current_cache_list: List[LibraryFile] = library_cache.cache_by_artist[best_artist_match[0]]

    # Now filter by album if possible
    if raw_scrobble.album_title not in (None, ""):
        albums: List[str] = list(set([str(a.album_name) for a in current_cache_list]))
        best_album_match: Optional[Tuple[str, int]] = extractOne(
            raw_scrobble.album_title,
            albums,
            scorer=UQRatio,
            score_cutoff=config.FUZZY_MIN_ALBUM
        )

        # If a match is found, filter the list by this album
        if best_album_match is not None:
            current_cache_list = [a for a in current_cache_list if a.album_name == best_album_match[0]]

    # Finally, choose the best track by title
    c_to_track_titles = list(set([str(a.track_title) for a in current_cache_list]))
    best_track_match: Optional[Tuple[str, int]] = extractOne(
        raw_scrobble.track_title,
        c_to_track_titles,
        scorer=UQRatio,
        score_cutoff=config.FUZZY_MIN_TITLE
    )

    # Edge case: no title match, exit here
    if best_track_match is None:
        search_cache.local_by_partial_metadata[caching_tuple] = None
        return None

    # Otherwise build an ExtendedScrobble with this information.
    # c_to_track_titles is deduplicated (built from a set), so its indexes don't
    # line up with current_cache_list; look the matched title back up instead.
    final_track = next(
        t for t in current_cache_list if str(t.track_title) == best_track_match[0]
    )

    # Cache the successful match as well (the cache-hit branch above reads it back)
    search_cache.local_by_partial_metadata[caching_tuple] = final_track

    return ExtendedScrobble.from_library_track(
        raw_scrobble, final_track, TrackSourceType.LOCAL_LIBRARY_METADATA_PARTIAL
    )
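
# fuzzywuzzy sketch (hypothetical values): process.extractOne returns the best
# (choice, score) pair whose score clears score_cutoff, or None otherwise:
#
#     extractOne("Radiohed", ["Radiohead", "R.E.M."], scorer=UQRatio, score_cutoff=75)
#     # -> ("Radiohead", 94)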

def find_on_musicbrainz(raw_scrobble: RawScrobble) -> Optional[ExtendedScrobble]:
    """
    Try to find a track MBID match on MusicBrainz.

    Args:
        raw_scrobble:
            RawScrobble instance.

    Returns:
        If found, an ExtendedScrobble instance formed with the help of MusicBrainz' data. Otherwise None.
    """
    release_track = ReleaseTrack.from_track_mbid(raw_scrobble.track_mbid)
    if release_track is None:
        return None

    log.debug("find_on_musicbrainz: got release track")
    return ExtendedScrobble.from_musicbrainz_track(raw_scrobble, release_track)

def find_on_youtube(
        search_cache: SearchCacheState,
        raw_scrobble: RawScrobble
) -> Optional[ExtendedScrobble]:
    """
    Try to find a track by metadata on YouTube. Works simply by searching for the string
    "artist album track" and trying to find the best match out of the first 8 results.

    Args:
        search_cache:
            SearchCacheState instance.
        raw_scrobble:
            RawScrobble instance.

    Returns:
        If found, an ExtendedScrobble instance with the length from the matched YouTube video.
    """
    # Search YouTube for the closest "artist album track" match
    query = f"{raw_scrobble.artist_name} {raw_scrobble.album_title} {raw_scrobble.track_title}"

    if query in search_cache.youtube_by_query:
        log.debug("find_on_youtube: cache hit")
        duration_sec = search_cache.youtube_by_query[query]
    else:
        log.debug("find_on_youtube: cache miss")
        search = SearchVideos(query, mode="list", max_results=8)

        # Find the closest match
        closest_match = extractOne(
            query,
            search.titles,
            scorer=partial_ratio,
            score_cutoff=config.FUZZY_YOUTUBE_MIN_TITLE
        )
        if closest_match is None:
            log.debug("find_on_youtube: no good match")
            return None
        else:
            log.debug(f"find_on_youtube: got a good match - \"{closest_match[0]}\"")

        # Parse the closest one into a proper ExtendedScrobble
        index = search.titles.index(closest_match[0])
        duration_human = search.durations[index]
        duration_sec = youtube_length_to_sec(duration_human)

        # Store the video length in cache to speed up repeated listens
        search_cache.youtube_by_query[query] = duration_sec

    return ExtendedScrobble.from_youtube(raw_scrobble, duration_sec)
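
# Duration sketch: youtube_length_to_sec presumably converts the human-readable
# "H:MM:SS"-style duration returned by the search into seconds (hypothetical values):
#
#     youtube_length_to_sec("3:45")     # -> 225
#     youtube_length_to_sec("1:02:03")  # -> 3723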

def process_single_scrobble(
        state: AnalysisState,
        raw_data: Dict[Any, Any]
) -> ExtendedScrobble:
    """
    Given raw data about a scrobble event, process it and attempt to find more data about it.
    Queries the local music library, YouTube, MusicBrainz and Last.fm if needed.

    Args:
        state:
            AnalysisState instance.
        raw_data:
            A dictionary with the raw scrobble data.

    Returns:
        An ExtendedScrobble instance. Contains as much data as could be extracted.

    Modes of search:
        1) Use track MBID (local library)
        2) Use track metadata (local library) - try exact match first, then partial
        3) Use track MBID (search on MusicBrainz)
        4) Use track metadata (YouTube search)
    """
    ####
    # Load raw scrobble data into a RawScrobble instance
    ####
    rs: RawScrobble = RawScrobble.from_raw_data(raw_data)

    # Because lazy string interpolation is hard,
    # we enclose the bigger debug logs in an isEnabledFor check
    if log.isEnabledFor(logging.DEBUG):
        log.debug(f"Processing scrobble: {str(rs)} (raw_data=\"{raw_data}\")")

    library_cache: LibraryCacheState = state.library_cache
    search_cache: SearchCacheState = state.search_cache
    stats: StatisticsState = state.statistics

    #########
    # Find a source with the track length and more accurate metadata
    #########
    # Multiple modes of search, first has highest priority:
    # 1) Use track MBID (local library)
    # 2) Use track metadata (local library) - try exact match first, then partial
    # 3) Use track MBID (search on MusicBrainz)
    # 4) Use track metadata (YouTube search)
    scrobble: Optional[ExtendedScrobble] = None

    # Try exact MBID search (local library)
    if rs.track_mbid is not None:
        # Look up the track in cache via MBID
        scrobble = find_by_mbid(library_cache, rs)
        if scrobble is not None:
            log.debug(f"Match by MBID (local library): {rs}")
            stats.local_mbid_hits += 1

    # Try exact metadata match (local library)
    if scrobble is None and rs.track_title is not None:
        scrobble = find_by_metadata_full_match(library_cache, rs)
        if scrobble is not None:
            log.debug(f"Match by exact metadata (local library): {rs}")
            stats.local_metadata_exact_hits += 1

    # Try partial metadata match (local library)
    if scrobble is None and rs.track_title is not None:
        scrobble = find_by_metadata_partial_match(search_cache, library_cache, rs)
        if scrobble is not None:
            log.debug(f"Match by partial metadata (local library): {rs}")
            stats.local_metadata_partial_hits += 1

    # Try MusicBrainz
    if scrobble is None and rs.track_mbid is not None:
        scrobble = find_on_musicbrainz(rs)
        if scrobble is not None:
            log.debug(f"Match by MBID (MusicBrainz): {rs}")
            stats.musicbrainz_hits += 1

    # Try a YouTube search
    if scrobble is None:
        scrobble = find_on_youtube(search_cache, rs)
        if scrobble is not None:
            log.debug(f"Match by metadata (YouTube): {rs}")
            stats.youtube_hits += 1

    # If absolutely no match can be found, create a fallback scrobble with just the basic data
    if scrobble is None:
        log.debug("No match, using basic scrobble data.")
        scrobble = ExtendedScrobble.from_basic_data(rs)
        stats.basic_info_hits += 1

    #########
    # Find the genre if missing
    #########
    if scrobble.genre_list is None:
        log.debug("Fetching Last.fm genres.")
        genres: List[str] = fetch_genre_by_metadata(
            scrobble.track_title,
            scrobble.album_name,
            scrobble.artist_name,
        )
        scrobble.genre_list = genres

    return scrobble

def generate_extended_data(state: AnalysisState):
    """
    Generate extended scrobble data from the available scrobbles.
    Saves the data into a spreadsheet (location determined by the configuration file).

    Args:
        state:
            AnalysisState instance to read scrobbles and cache from.
            Expects state.raw_scrobbles to be already loaded.
            Updates the state with
              - statistics (sets the statistics attribute to an instance of StatisticsState) and
              - search cache (YouTube video length / local metadata match cache / ...)
    """
    log.info("Generating extended scrobble data...")
    scrobbles_len = len(state.raw_scrobbles)

    # Create an openpyxl workbook and select the proper sheet
    xl_workbook = Workbook()
    sheet = xl_workbook.active
    sheet.title = "Data"

    # Set up the search cache
    # This really pays off if the tracks repeat (over a longer period of time for example)
    # TODO implement a better cache than this
    #   cache should probably carry over restarts, but we need a TTL
    search_cache: SearchCacheState = SearchCacheState()

    # Set up counters for different match types
    # (for statistics at the end)
    stats_state: StatisticsState = StatisticsState()

    # Update the main AnalysisState with the reference to our new stats and cache
    state.statistics = stats_state
    state.search_cache = search_cache

    # Append the spreadsheet header
    sheet.append(ExtendedScrobble.spreadsheet_header())

    counter = 0
    # Go through every scrobble and append a row for each entry
    for scrobble_raw_data in state.raw_scrobbles:
        try:
            extended_scrobble: ExtendedScrobble = process_single_scrobble(state, scrobble_raw_data)
        except Exception as e:
            # In case of failure, just log and skip the scrobble
            log.warning(f"Failed to process scrobble, skipping ({e}): \"{scrobble_raw_data}\"")
            traceback.print_exc()
        else:
            sheet.append(extended_scrobble.to_spreadsheet_list())

        # Log progress as configured (every PARSE_LOG_INTERVAL scrobbles)
        counter += 1
        if counter % config.PARSE_LOG_INTERVAL == 0:
            log.info(f"Parsing progress: {counter} scrobbles "
                     f"({round(counter / scrobbles_len * 100, 1)}%)")

    # Save the workbook to the configured path
    # Exponential backoff, starting at 2s
    retries_current_wait = 2
    written = False

    human_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    workbook_output_path = config.XLSX_OUTPUT_PATH.replace(
        "{DATETIME}", human_datetime
    )

    # If the file already exists (which is unlikely, but possible),
    # append a random suffix to the file name
    if path.isfile(workbook_output_path):
        random_suffix: str = generate_random_filename_safe_text(4)
        log.warning(f"Configured spreadsheet output path is \"{workbook_output_path}\", but that file already exists. "
                    f"Appending \"_{random_suffix}\" to the filename.")

        # Tuple[path without extension, extension]
        split_output: Tuple[str, str] = path.splitext(workbook_output_path)
        workbook_output_path = f"{split_output[0]}_{random_suffix}{split_output[1]}"

    # Up to 7 attempts (the wait doubles from 2s to 128s)
    while retries_current_wait <= 128:
        try:
            xl_workbook.save(filename=workbook_output_path)
            written = True
            break
        except PermissionError:
            log.warning(f"PermissionError while trying to save the spreadsheet file, "
                        f"retrying in {retries_current_wait} seconds.")
            time.sleep(retries_current_wait)
            retries_current_wait *= 2

    if written is False:
        log.critical(f"Failed to write spreadsheet file to \"{config.XLSX_OUTPUT_PATH}\".")
        raise SystemExit(1)
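
# Backoff sketch for the save loop above: the save is attempted at most 7 times,
# sleeping 2, 4, 8, 16, 32, 64 and 128 seconds after each failure (a
# PermissionError here usually means the file is still open in a spreadsheet app).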

def print_end_stats(state: AnalysisState):
    """
    Log the analysis stats, based on the current state.

    Args:
        state:
            AnalysisState instance to use.
            Expects the raw_scrobbles and statistics attributes to be accurate.
    """
    scrobbles_len = len(state.raw_scrobbles)
    stats = state.statistics

    c_local_mbid = stats.local_mbid_hits
    c_local_metadata_exact = stats.local_metadata_exact_hits
    c_local_metadata_partial = stats.local_metadata_partial_hits
    c_musicbrainz = stats.musicbrainz_hits
    c_youtube = stats.youtube_hits
    c_basic_info = stats.basic_info_hits

    perc_local_mbid = round(c_local_mbid / scrobbles_len * 100, 1)
    perc_local_metadata_exact = round(c_local_metadata_exact / scrobbles_len * 100, 1)
    perc_local_metadata_partial = round(c_local_metadata_partial / scrobbles_len * 100, 1)
    perc_musicbrainz = round(c_musicbrainz / scrobbles_len * 100, 1)
    perc_youtube = round(c_youtube / scrobbles_len * 100, 1)
    perc_basic_info = round(c_basic_info / scrobbles_len * 100, 1)

    log.info(f"Source statistics:\n"
             f"  Local library (MBID): {c_local_mbid} ({perc_local_mbid}%)\n"
             f"  Local library (exact metadata): {c_local_metadata_exact} ({perc_local_metadata_exact}%)\n"
             f"  Local library (partial metadata): {c_local_metadata_partial} ({perc_local_metadata_partial}%)\n"
             f"  MusicBrainz: {c_musicbrainz} ({perc_musicbrainz}%)\n"
             f"  YouTube: {c_youtube} ({perc_youtube}%)\n"
             f"  No matches, just basic data: {c_basic_info} ({perc_basic_info}%)")

def main():
    """
    Main entry point for this script.

    Steps:
        1) If on Windows, makes sure the system won't go to sleep mid-processing
        2) Builds or loads the local music library cache
        3) Loads the scrobbles from file
        4) Generates the extended data and outputs it to a spreadsheet
        5) Shows some quick stats about the quality of lookups
    """
    # Inhibit Windows system sleep, and uninhibit at the end of the script.
    # Silently fails on anything but Windows.
    # To make sure this is working on Windows, you can run "powercfg /requests" and look under SYSTEM.
    inhibit()

    # Our main state
    state: AnalysisState = AnalysisState()

    # TODO move logging from specific functions into main?

    ##
    # 1) Load/generate the library cache and scrobbles
    ##
    # Will fill the state in-place, as will functions down the line
    with TimedContext("Local library cache took {time}s", callback=log.info):
        log.info("Making sure the local music library is cached...")
        ensure_library_cache(state)

    # Scrobbles
    with TimedContext("Scrobbles file read and parsed in {time}s", callback=log.info):
        log.info("Loading scrobbles...")
        load_scrobbles(state)

    ##
    # 2) Generate and save the extended data
    ##
    with TimedContext("Spreadsheet generated and saved in {time}s", callback=log.info):
        generate_extended_data(state)

    log.info(f"Spreadsheet location: \"{config.XLSX_OUTPUT_PATH}\"")

    # Print statistics
    print_end_stats(state)

    # Uninhibit Windows system sleep
    uninhibit()


if __name__ == '__main__':
    main()