uta-net.py
#!/usr/bin/env python3
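"""Fetch lyrics from uta-net.com and embed them into local audio files.

Songs are located via the artist's uta-net page (auto-detected from the
files' artist tags or supplied with --url) and, as a fallback, via a
per-title search; matched lyrics are written to each file's lyrics tag.
"""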
import os
import re
import sys
from typing import Dict, List, Optional
import requests
from bs4 import BeautifulSoup
import mutagen
from mutagen.id3 import ID3, USLT, ID3NoHeaderError
import difflib
import unicodedata
import argparse
from urllib.parse import quote
try:
import argcomplete
ARGCOMPLETE_AVAILABLE = True
except ImportError:
ARGCOMPLETE_AVAILABLE = False
class LyricsTagger:
def __init__(
self,
directory: Optional[str] = None,
artist_url: Optional[str] = None,
per_file_search: bool = False,
single_file: Optional[str] = None,
):
self.directory: str = self.get_directory_path(directory)
self.single_file: Optional[str] = single_file
self.audio_files: List[str] = self.get_audio_files()
self.per_file_search = per_file_search
self.artist_url: Optional[str] = (
None if per_file_search else self.get_artist_url(artist_url)
)
self.song_entries: Dict[str, str] = (
{}
if per_file_search or self.artist_url is None
else self.collect_song_entries()
)
def get_directory_path(self, directory: Optional[str] = None) -> str:
"""Get directory path from argument or use current directory."""
if not directory:
directory = os.getcwd()
print(f"Using current directory: {directory}")
if not os.path.isdir(directory):
print("The provided path is not a valid directory.")
sys.exit(1)
return directory
def get_audio_files(self) -> List[str]:
"""Get the list of audio files in the directory."""
audio_extensions = (".mp3", ".flac", ".m4a", ".ogg", ".aac")
if self.single_file:
if not os.path.isfile(os.path.join(self.directory, self.single_file)):
print(f"The specified file '{self.single_file}' does not exist.")
sys.exit(1)
if not self.single_file.lower().endswith(audio_extensions):
print(
f"The specified file '{self.single_file}' is not a supported audio file."
)
sys.exit(1)
return [self.single_file]
audio_files = sorted(
[
f
for f in os.listdir(self.directory)
if f.lower().endswith(audio_extensions)
]
)
if not audio_files:
print("No audio files found in directory.")
sys.exit(1)
return audio_files
@staticmethod
def find_best_match(
search_term: str, candidates: List[str], threshold: float = 0.8
) -> Optional[tuple[str, float]]:
"""Find the best matching string from a list of candidates using fuzzy matching.
Args:
search_term: The string to search for
candidates: List of strings to search through
threshold: Minimum similarity ratio to consider a match (0-1)
Returns:
Tuple of (best matching string, similarity ratio) or None if no match found
"""
best_match = None
highest_ratio = 0
substring_match = None
for candidate in candidates:
# Calculate similarity ratio
ratio = difflib.SequenceMatcher(None, search_term, candidate).ratio()
if ratio > highest_ratio and ratio >= threshold:
highest_ratio = ratio
best_match = candidate
# Check for substring match if we haven't found a good match yet
if not best_match and candidate in search_term:
# If we find multiple substring matches, use the longest one
if not substring_match or len(candidate) > len(substring_match):
substring_match = candidate
# Use substring match if no better match was found
if not best_match and substring_match:
substring_ratio = difflib.SequenceMatcher(
None, search_term, substring_match
).ratio()
return substring_match, substring_ratio
elif best_match:
return best_match, highest_ratio
return None
def search_artist_url(self, artist_name: str) -> Optional[tuple[str, str, str]]:
"""Search for an artist on uta-net and return their URL and details."""
# Get potential search terms
search_terms = self.extract_search_terms(artist_name)
best_match = None
highest_ratio = 0
# Try each search term until we find a confident match
for search_term in search_terms:
print(f"Trying artist search term: '{search_term}'")
entries = self.get_artist_search_results(search_term)
if not entries:
continue
artist_names = [entry[0] for entry in entries]
entry_mapping = {entry[0]: entry for entry in entries}
match_result = self.find_best_match(
artist_name, artist_names, threshold=0.3
)
if match_result:
matched_name, ratio = match_result
if ratio > highest_ratio:
highest_ratio = ratio
matched_entry = entry_mapping[matched_name]
best_match = (matched_entry[1], matched_entry[0], matched_entry[2])
if highest_ratio >= 0.3:
break
if best_match:
artist_url, artist_name_found, song_count = best_match
print(
f"Best artist match (similarity: {highest_ratio:.2f}): {artist_name_found} ({song_count})"
)
return artist_url, artist_name_found, song_count
return None
def get_artist_url(self, url: Optional[str] = None) -> Optional[str]:
"""Get uta-net artist page URL from argument or auto-detect from audio files.
Returns None if no artist URL found but title search might be possible."""
if url and re.match(r"https?://www\.uta-net\.com/artist/\d+/?", url):
return url
if not self.audio_files:
print("No audio files found in directory.")
sys.exit(1)
file_path = os.path.join(self.directory, self.audio_files[0])
audio = mutagen.File(file_path, easy=True)
if not audio or not audio.get("artist"):
print(f"Could not read artist from {self.audio_files[0]}")
return None
artist_name = str(audio["artist"][0])
print(f"Detected artist: {artist_name}")
result = self.search_artist_url(artist_name)
if not result:
print(f"No results found for artist: {artist_name}")
return None
artist_url, artist_name_found, song_count = result
print(f"Artist URL: {artist_url}")
return artist_url
def get_total_pages(self, soup: BeautifulSoup) -> int:
"""Extract the total number of pages from the artist page."""
page_info = soup.find(
"div", class_="col-7 col-lg-3 text-start text-lg-end d-none d-lg-block"
)
if page_info:
match = re.search(r"全(\d+)ページ中", page_info.text)
if match:
return int(match.group(1))
return 1
def collect_song_entries(self) -> Dict[str, str]:
"""Collect song titles and their corresponding URLs from the artist's page."""
song_entries = {}
response = requests.get(self.artist_url)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
total_pages = self.get_total_pages(soup)
for page_num in range(1, total_pages + 1):
if page_num > 1:
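                # Pages after the first follow the /artist/<id>/0/<page>/ URL pattern.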
page_url = f"{self.artist_url.rstrip('/')}/0/{page_num}/"
response = requests.get(page_url)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
rows = soup.select("tbody.songlist-table-body tr")
for row in rows:
title_span = row.select_one("td.sp-w-100 a span.songlist-title")
a_tag = row.select_one("td.sp-w-100 a")
if not title_span or not a_tag:
continue
song_title = title_span.get_text(strip=True)
song_link = a_tag["href"]
full_song_url = f"https://www.uta-net.com{song_link}"
song_entries[song_title] = full_song_url
if not song_entries:
print("No songs found for the given artist.")
sys.exit(1)
return song_entries
@staticmethod
def normalize_text(title: str) -> str:
"""Clean title by removing emojis, symbols, and punctuation"""
title = unicodedata.normalize("NFKC", title)
cleaned = ""
for char in title:
if unicodedata.category(char).startswith(("So", "Sm", "Sk", "Sc")):
continue
if unicodedata.category(char).startswith("P") and char not in "'.:()()":
continue
cleaned += char
return cleaned.lower().strip()
def match_song_title(
self, file_title: str, song_titles: List[str]
) -> Optional[str]:
"""Match the audio file's title with the collected song titles using fuzzy matching."""
cleaned_file_title = self.normalize_text(file_title)
cleaned_song_titles = [self.normalize_text(title) for title in song_titles]
# Create mapping of cleaned titles to original titles
title_mapping = dict(zip(cleaned_song_titles, song_titles))
match_result = self.find_best_match(cleaned_file_title, cleaned_song_titles)
if match_result:
matched_clean_title, ratio = match_result
original_title = title_mapping[matched_clean_title]
print(
f"Matched '{file_title}' to '{original_title}' (similarity: {ratio:.2f})"
)
return original_title
return None
def fetch_lyrics(self, song_url: str) -> str:
"""Fetch the lyrics from the song's page."""
response = requests.get(song_url)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
lyrics_div = soup.find("div", id="kashi_area")
if lyrics_div:
# Replace <br> tags with newlines
for br in lyrics_div.find_all("br"):
br.replace_with("\n")
lyrics = lyrics_div.get_text()
# Replace multiple consecutive newlines with two newlines
lyrics = re.sub(r"\n{2,}", "\n\n", lyrics)
lyrics = lyrics.strip()
return lyrics
return ""
def write_lyrics_to_file(self, file_path: str, lyrics: str) -> None:
"""Write the lyrics to the audio file's lyrics tag."""
file_ext = os.path.splitext(file_path)[1].lower()
try:
if file_ext == ".mp3":
try:
tags = ID3(file_path)
except ID3NoHeaderError:
tags = ID3()
tags.save(file_path)
# Remove existing USLT frames to avoid duplication
tags.delall("USLT")
ulyrics = USLT(
encoding=3, lang="jpn", desc="Unsynced lyrics", text=lyrics
)
tags.add(ulyrics)
tags.save(file_path)
elif file_ext == ".flac":
from mutagen.flac import FLAC
audio = FLAC(file_path)
audio["UNSYNCEDLYRICS"] = lyrics
audio.save(file_path)
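            # Hedged addition: ".ogg" is accepted by get_audio_files() but had no
            # branch here. Minimal sketch that stores the text in a Vorbis comment;
            # the "LYRICS" field name is a common convention, not something the
            # original script specified.
            elif file_ext == ".ogg":
                from mutagen.oggvorbis import OggVorbis
                audio = OggVorbis(file_path)
                audio["LYRICS"] = lyrics
                audio.save(file_path)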
elif file_ext == ".m4a" or file_ext == ".aac":
# Handle M4A, ALAC, AAC (MP4 containers)
from mutagen.mp4 import MP4
audio = MP4(file_path)
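                # "\xa9lyr" is the iTunes lyrics atom used for MP4 metadata.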
audio["\xa9lyr"] = lyrics
audio.save(file_path)
else:
print(f"Unsupported file extension: {file_ext}")
return
print(f"Lyrics added to {os.path.basename(file_path)}")
except Exception as e:
print(f"Error adding lyrics to {os.path.basename(file_path)}: {str(e)}")
def get_title_search_results(
self, search_term: str
) -> Optional[List[tuple[str, str, str]]]:
"""Perform search and return list of (title, url, artist) tuples."""
search_url = f"https://www.uta-net.com/search/?Keyword={quote(search_term)}&Aselect=2&Bselect=3"
try:
response = requests.get(search_url)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
song_entries = []
total_pages = self.get_total_pages(soup)
for page_num in range(1, total_pages + 1):
if page_num > 1:
page_url = f"{search_url}&page={page_num}"
response = requests.get(page_url)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
rows = soup.select("tbody.songlist-table-body tr")
for row in rows:
title_link = row.select_one("td.sp-w-100 a")
artist_link = row.select_one("td.sp-none a")
if not title_link or not artist_link:
continue
song_title = title_link.select_one(
"span.songlist-title"
).text.strip()
song_artist = artist_link.text.strip()
song_url = f"https://www.uta-net.com{title_link['href']}"
song_entries.append((song_title, song_url, song_artist))
return song_entries
except requests.RequestException:
return None
def extract_search_terms(self, text: str, max_terms: int = 5) -> List[str]:
"""Extract multiple potential search terms from text, ordered by likelihood of success."""
# First try: original text
search_terms = [text]
# Second try: full normalized text
normalized = self.normalize_text(text)
if not normalized:
return [text.strip()[:10]]
search_terms.append(normalized)
# Third try: kanji sequences
kanji_matches = list(re.finditer(r"[\u4e00-\u9fff]+", normalized))
kanji_terms = sorted(
[match.group() for match in kanji_matches], key=len, reverse=True
)
search_terms.extend(kanji_terms)
# Fourth try: word-based substrings
words = normalized.split()
substrings = []
for i in range(len(words)):
for j in range(i + 1, len(words) + 1):
substring = " ".join(words[i:j])
if substring not in search_terms:
substrings.append(substring)
# Sort substrings by length in descending order
substrings.sort(key=len, reverse=True)
search_terms.extend(substrings)
# Return unique terms, limited to max_terms
return list(dict.fromkeys(search_terms))[:max_terms]
def get_lyrics_by_title_search(
self, filename: str, file_path: str
) -> Optional[tuple[bool, str]]:
"""Try to find and fetch lyrics by searching the song title directly."""
audio = mutagen.File(file_path, easy=True)
if not audio or not audio.get("title"):
return None
title = str(audio["title"][0])
artist = str(audio.get("artist", [""])[0])
print(f"\nSearching for '{title}' by '{artist}'...")
# Get potential search terms
search_terms = self.extract_search_terms(title)
best_match = None
highest_ratio = 0
# Try each search term until we find a confident match
for search_term in search_terms:
print(f"Trying search term: '{search_term}'")
entries = self.get_title_search_results(search_term)
if not entries:
continue
# Find best match among results
for song_title, song_url, song_artist in entries:
title_ratio = difflib.SequenceMatcher(
None, self.normalize_text(title), self.normalize_text(song_title)
).ratio()
artist_ratio = difflib.SequenceMatcher(
None, self.normalize_text(artist), self.normalize_text(song_artist)
).ratio()
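                # Rank candidates by a weighted mix that favors the artist
                # similarity (0.7) over the title similarity (0.3).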
combined_ratio = (0.3 * title_ratio) + (0.7 * artist_ratio)
# print(f" - '{song_title}' by '{song_artist}': {combined_ratio:.2f}")
if combined_ratio > highest_ratio:
highest_ratio = combined_ratio
best_match = (song_title, song_url, song_artist)
# If we found a confident match, stop searching
if highest_ratio >= 0.3:
break
if not best_match or highest_ratio < 0.3:
return (False, f"No confident matches found for '{title}' by '{artist}'")
matched_title, song_url, matched_artist = best_match
print(
f"\nBest match: '{matched_title}' by '{matched_artist}' (similarity: {highest_ratio:.2f})"
)
lyrics = self.fetch_lyrics(song_url)
if not lyrics:
return (False, f"No lyrics found for '{matched_title}'")
print("-" * 20)
print(lyrics)
print("-" * 20)
self.write_lyrics_to_file(file_path, lyrics)
print(f"Successfully added lyrics to {filename} via title search")
return (True, "Success")
def process_audio_files(self, search_by_title_pass: bool = True) -> None:
"""Process each audio file in the directory to add lyrics."""
failed_files = []
for filename in self.audio_files:
file_path = os.path.join(self.directory, filename)
audio = mutagen.File(file_path, easy=True)
if not audio:
print(f"Could not open {filename}. Skipping.")
failed_files.append((filename, "Could not open file"))
continue
# Handle per-file artist search if enabled
if self.per_file_search:
artist = audio.get("artist")
if not artist:
print(f"No artist found for {filename}. Skipping.")
failed_files.append((filename, "No artist found"))
continue
print(f"\nProcessing {filename} - Artist: {artist[0]}")
result = self.search_artist_url(str(artist[0]))
if not result:
print(f"No artist found for {artist[0]}. Skipping.")
failed_files.append((filename, f"No artist found for {artist[0]}"))
continue
self.artist_url, artist_name_found, song_count = result
print(f"Found artist: {artist_name_found} ({song_count})")
self.song_entries = self.collect_song_entries()
title = audio.get("title")
if not title:
print(f"No title found for {filename}. Skipping.")
failed_files.append((filename, "No title found"))
continue
file_title = str(title[0])
matched_title = self.match_song_title(
file_title, list(self.song_entries.keys())
)
if not matched_title:
print(f"No matching song found for '{file_title}'.")
failed_files.append(
(filename, f"No matching song found for '{file_title}'")
)
continue
song_url = self.song_entries[matched_title]
lyrics = self.fetch_lyrics(song_url)
if not lyrics:
print(f"No lyrics found for '{matched_title}'.")
failed_files.append(
(filename, f"No lyrics found for '{matched_title}'")
)
continue
print(f"Writing lyrics to '{filename}'")
print("-" * 20)
print(lyrics)
print("-" * 20)
try:
self.write_lyrics_to_file(file_path, lyrics)
except Exception as e:
failed_files.append((filename, f"Error writing lyrics: {str(e)}"))
# Try to process failed files by title search if enabled
if search_by_title_pass and failed_files:
print("\nAttempting to find lyrics by title search for failed files...")
still_failed = []
for filename, reason in failed_files:
file_path = os.path.join(self.directory, filename)
result = self.get_lyrics_by_title_search(filename, file_path)
if result is None:
still_failed.append((filename, reason))
elif not result[0]: # If search failed
still_failed.append((filename, result[1]))
# If search succeeded, file is not added to still_failed
failed_files = still_failed
# Print summary at the end
if failed_files:
print("\nSummary of files that failed:")
print("-" * 50)
for filename, reason in failed_files:
print(f"• {filename}: {reason}")
print(
f"\nTotal: {len(failed_files)} file(s) failed out of {len(self.audio_files)}"
)
else:
print(f"\nAll {len(self.audio_files)} files processed successfully!")
def get_artist_search_results(
self, search_term: str
) -> Optional[List[tuple[str, str, str]]]:
"""Perform artist search and return list of (artist_name, url, song_count) tuples."""
search_url = "https://www.uta-net.com/search/"
params = {
"target": "art",
"type": "in",
"keyword": search_term,
}
try:
response = requests.get(search_url, params=params)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
artist_entries = []
for row in soup.select("tbody.songlist-table-body tr"):
artist_link = row.select_one("a")
if not artist_link:
continue
artist_name = artist_link.select_one("span.fw-bold").text.strip()
song_count = artist_link.select_one("span.song-count").text.strip()
artist_url = f"https://www.uta-net.com{artist_link['href']}"
artist_entries.append((artist_name, artist_url, song_count))
return artist_entries
except requests.RequestException:
return None
def path_completer(prefix, **kwargs):
"""Custom path completer for argcomplete"""
if not ARGCOMPLETE_AVAILABLE:
return []
audio_extensions = (".mp3", ".flac", ".m4a", ".ogg", ".aac")
directory = os.getcwd()
files = []
for f in os.listdir(directory):
if os.path.isfile(f) and f.lower().endswith(audio_extensions):
if not prefix or f.startswith(prefix):
files.append(f)
return sorted(files)
def main() -> None:
parser = argparse.ArgumentParser(
description="Add lyrics from uta-net.com to audio files."
)
parser.add_argument(
"-d",
"--directory",
help="Directory containing audio files (default: current directory)",
)
parser.add_argument(
"-u", "--url", help="uta-net.com artist page URL (default: auto-detect)"
)
parser.add_argument(
"--per-file",
action="store_true",
help="Search for artist URL for each file individually",
)
parser.add_argument(
"--no-title-search",
action="store_true",
help="Disable searching by title for failed files",
)
parser.add_argument(
"file",
nargs="?",
help="Process a single audio file (optionally supports tab completion)",
).completer = path_completer
if ARGCOMPLETE_AVAILABLE:
argcomplete.autocomplete(parser)
args = parser.parse_args()
tagger = LyricsTagger(
directory=args.directory,
artist_url=args.url,
per_file_search=args.per_file,
single_file=args.file,
)
tagger.process_audio_files(search_by_title_pass=not args.no_title_search)
if __name__ == "__main__":
main()