-
Notifications
You must be signed in to change notification settings - Fork 0
/
diversity.py
executable file
·96 lines (83 loc) · 2.51 KB
/
diversity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
import numpy as np
import pickle
from sklearn.cluster import KMeans
# Number of k-means clusters the word-embedding space is partitioned into.
num_clusters = 20

# Module-level score lists. They start empty; nothing in the visible part
# of this script appends to or reads from them.
originality_scores = list()
diversity_scores = list()
# Load the GloVe word vectors into a {word: vector} lookup table.
# Each non-empty line is split on whitespace: the first token is the word,
# the remaining tokens are the vector components.
embeddings = {}
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as glove_file:
    for line in glove_file:
        parts = line.split()
        if not parts:
            # Blank line: nothing to index.
            continue
        # Store real numbers, not the raw text tokens, so the vectors can
        # be handed to numpy / scikit-learn without a string-typed array.
        embeddings[parts[0]] = [float(value) for value in parts[1:]]
# Cluster word embeddings.
# k-means is fitted only on the first `num_fit_words` vectors, taken in the
# insertion order of `embeddings`.
num_fit_words = 4000
alg = KMeans(n_clusters=num_clusters, random_state=0)

# Slice the list *before* building the array so that only the vectors that
# are actually used get converted, and force a float dtype so the estimator
# receives numeric data even if the stored components are text tokens.
arr_embs = np.array(list(embeddings.values())[:num_fit_words], dtype=float)
print(arr_embs.shape)

# `clusters` holds the cluster index assigned to each fitted vector;
# `centroids` holds one row per cluster centre.
clusters = alg.fit_predict(arr_embs)
centroids = alg.cluster_centers_
num_centroids = len(centroids)
print(centroids.shape)
# Load the song corpus. The pickle unpacks into five objects that the rest
# of the script iterates in lockstep (one entry per song).
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only run this against a data.pkl that comes from a trusted source.
with open('data.pkl', 'rb') as data_file:
    names, years, artists, genres, lyrics = pickle.load(data_file)
print('Total songs:', len(names))
# Per-category accumulators, one pair for each of genre / year / artist:
#   div_*   -> starts at 0 for every distinct key
#   count_* -> starts at 0 for every distinct key
# dict.fromkeys keeps the keys in first-seen order and is safe here because
# the shared initial value (0) is immutable.
div_genres = dict.fromkeys(genres, 0)
count_genres = dict.fromkeys(genres, 0)

div_years = dict.fromkeys(years, 0)
count_years = dict.fromkeys(years, 0)

div_artists = dict.fromkeys(artists, 0)
count_artists = dict.fromkeys(artists, 0)
# Calculate diversity for each song's lyrics.
# A song's diversity is the fraction of k-means clusters that are hit by at
# least one of its words that has an embedding.
word_to_cluster = {}  # memo: word -> cluster index, so each distinct word is predicted once
for i, (name, year, artist, genre, lyric) in enumerate(
        zip(names, years, artists, genres, lyrics)):
    if i % 100 == 0:
        print(i)  # progress indicator
    if len(lyric) == 0:
        # Songs without lyrics contribute to neither the sums nor the counts.
        continue

    # A set of plain ints: O(1) membership and no reliance on the truth
    # value of the 1-element arrays that predict() returns.
    clusters_used = set()
    for word in lyric:  # presumably `lyric` is a sequence of word tokens -- confirm against data.pkl
        if word not in embeddings:
            continue
        if word not in word_to_cluster:
            # Coerce to floats so prediction works whether the stored
            # vector components are numbers or text tokens.
            vec = [float(value) for value in embeddings[word]]
            word_to_cluster[word] = int(alg.predict([vec])[0])
        clusters_used.add(word_to_cluster[word])

    diversity = len(clusters_used) / num_centroids

    # Accumulate the score under each of the song's categories.
    div_genres[genre] += diversity
    count_genres[genre] += 1
    div_years[year] += diversity
    count_years[year] += 1
    div_artists[artist] += diversity
    count_artists[artist] += 1
# Report the mean diversity per genre, skipping genres with no scored songs.
print('Genre diversity:')
for genre in div_genres:
    genre_songs = count_genres[genre]
    if genre_songs <= 0:
        continue
    print(genre, div_genres[genre] / genre_songs)

# Report the mean diversity per year in ascending year order, together with
# the number of songs that went into each mean.
print('\nYear diversity:')
for year in sorted(div_years):
    year_songs = count_years[year]
    if year_songs <= 0:
        continue
    print(year, div_years[year] / year_songs, year_songs)
# Report the mean diversity for the most prolific artists only.
# The cut-off is the song count found at index `artist_rank_cutoff` of the
# descending count list, minus one; artists whose count is strictly above
# it are printed (so ties with that entry are included).
artist_rank_cutoff = 100
counts_desc = sorted(count_artists.values(), reverse=True)
if len(counts_desc) > artist_rank_cutoff:
    # Clamp at 0 so an artist with no scored songs can never pass the
    # filter and trigger a division by zero below.
    threshold = max(counts_desc[artist_rank_cutoff] - 1, 0)
else:
    # Too few artists to apply the rank cut-off: report every artist that
    # has at least one scored song instead of raising IndexError.
    threshold = 0

print('\nArtist diversity:')
for artist, div in div_artists.items():
    if count_artists[artist] > threshold:
        print(artist, div / count_artists[artist])