-
Notifications
You must be signed in to change notification settings - Fork 0
/
fuzzy_search.py
91 lines (68 loc) · 3.37 KB
/
fuzzy_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
from fuzzywuzzy import fuzz
from multiprocessing import Pool, cpu_count
import re
import getter as UG
# print(song_df.info())
def fuzzy_match(row, track_name, artist_name, threshold):
    """Score one song row against the searched track and artist names.

    Both comparisons use ``fuzz.token_sort_ratio``. The row is accepted when
    at least one of the two scores reaches ``threshold``.

    Args:
        row: Mapping-like record (for example a row yielded by
            ``DataFrame.iterrows``) exposing the keys ``'id'``, ``'uri'``,
            ``'track_name'`` and ``'artist_name'``.
        track_name: Track title being searched for.
        artist_name: Artist name being searched for.
        threshold: Minimum score that either comparison must reach.

    Returns:
        A dict with the row's ``id``, ``uri``, ``track_name`` and
        ``artist_name`` plus ``track_score``, ``artist_score`` and their mean
        as ``average_score``; ``None`` when both scores are below
        ``threshold``.
    """
    candidate_track = row['track_name']
    candidate_artist = row['artist_name']

    title_similarity = fuzz.token_sort_ratio(track_name, candidate_track)
    performer_similarity = fuzz.token_sort_ratio(artist_name, candidate_artist)

    # Reject only when neither field is close enough.
    if title_similarity < threshold and performer_similarity < threshold:
        return None

    match_record = {
        'id': row['id'],
        'uri': row['uri'],
        'track_name': candidate_track,
        'artist_name': candidate_artist,
        'track_score': title_similarity,
        'artist_score': performer_similarity,
    }
    match_record['average_score'] = (title_similarity + performer_similarity) / 2
    return match_record
def fuzzy_search_song(track_name, artist_name, song_df, threshold=70):
    """Find songs in ``song_df`` that fuzzily match a track and artist name.

    The frame is first narrowed with a cheap, case-insensitive substring test:
    a row is kept when its ``track_name`` contains the first word of the
    searched track name, or its ``artist_name`` contains the first word of the
    searched artist name. When that keeps no rows at all, every row is scored
    instead. Each remaining row is then scored with ``fuzzy_match``.

    Args:
        track_name: Track title to search for.
        artist_name: Artist name to search for.
        song_df: DataFrame with ``track_name`` and ``artist_name`` string
            columns, plus whatever else ``fuzzy_match`` reads from each row.
        threshold: Minimum score forwarded to ``fuzzy_match``.

    Returns:
        A list of the truthy results returned by ``fuzzy_match``, ordered by
        their ``'average_score'`` from highest to lowest. The list is empty
        when no row reaches the threshold.
    """
    # Each column is pre-filtered by the first word of ITS OWN query string.
    column_queries = (
        ('track_name', track_name),
        ('artist_name', artist_name),
    )

    candidate_mask = None
    for column, query in column_queries:
        # split() with no argument ignores leading/repeated whitespace, so the
        # first token is never the empty string (which would match every row).
        tokens = query.split()
        if not tokens:
            continue
        # regex=False: the token is user text, not a pattern. Characters such
        # as "(" or "+" in a title must be matched literally.
        hits = song_df[column].str.contains(
            tokens[0], case=False, na=False, regex=False
        )
        candidate_mask = hits if candidate_mask is None else (candidate_mask | hits)

    if candidate_mask is None or not candidate_mask.any():
        # Nothing survived the pre-filter: fall back to scoring every row.
        candidates = song_df
    else:
        candidates = song_df[candidate_mask]

    scored = (
        fuzzy_match(row, track_name, artist_name, threshold)
        for _, row in candidates.iterrows()
    )
    # fuzzy_match yields None for rows below the threshold; drop those.
    results = [result for result in scored if result]

    # Best overall match first.
    results.sort(key=lambda result: result['average_score'], reverse=True)
    return results
def iter_fuzzy_search_song(search_track_name, search_artist_name, song_df, threshold=70):
    """Search with a progressively lower threshold until something matches.

    ``fuzzy_search_song`` is called with ``threshold``; while it returns
    nothing, the threshold is lowered by 30 and the search is repeated, for as
    long as the threshold stays above zero.

    Args:
        search_track_name: Track title to search for.
        search_artist_name: Artist name to search for.
        song_df: Song table handed to ``fuzzy_search_song``.
        threshold: Starting threshold for the first attempt.

    Returns:
        The first non-empty result list from ``fuzzy_search_song``. If every
        attempt comes back empty (or ``threshold`` is not positive to begin
        with), an error line is printed and an empty list is returned.
    """
    found = []
    current_threshold = threshold

    while current_threshold > 0:
        found = fuzzy_search_song(
            search_track_name,
            search_artist_name,
            song_df,
            threshold=current_threshold,
        )
        if len(found) > 0:
            return found
        # No hit at this strictness; relax and try again.
        current_threshold -= 30

    print("ERROR: iter_fuzzy_search_song failed. No matches found")
    return found
if __name__ == "__main__":
    # Example usage: look up one song in the track-features CSV and print
    # every match that the threshold back-off search returns.

    # Sample song database (kept for reference, not used below):
    # test_data = {
    #     'uri': ['1', '2', '3', '4', '5'],
    #     'track_name': ['All I Want for Xmas', 'All I Want for Christmas Is You', 'Last Christmas', 'Happy Xmas (War Is Over)', 'Santa Tell Me'],
    #     'artist_name': ['Mariah', 'Mariah Carey', 'Wham!', 'John Lennon & Yoko Ono', 'Ariana Grande']
    # }

    # Query settings.
    query_track = 'All I Want for Christmas'
    query_artist = 'Justin Bieber and Mariah Carey'
    initial_threshold = 70

    # Load the song table from disk.
    csv_location = "/nfs/guille/eecs_research/soundbendor/datasets/playlist_completion/spotify_track_features/songs.csv"
    catalog = pd.read_csv(csv_location)

    found_songs = iter_fuzzy_search_song(
        query_track,
        query_artist,
        catalog,
        threshold=initial_threshold,
    )

    # One line per match, in the order returned by the search.
    for match in found_songs:
        print(f"Track ID: {match['uri']}, Track Name: {match['track_name']}, Artist Name: {match['artist_name']}, Track Score: {match['track_score']}, Artist Score: {match['artist_score']}, Average Score: {match['average_score']}")