forked from jasonppy/word-discovery
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_kmeans.py
126 lines (109 loc) · 4.93 KB
/
run_kmeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import argparse
import os
import os.path as osp
import time
import numpy as np
import faiss
import pickle
from collections import namedtuple
# Announce which process / host picked up the job (useful in cluster logs).
print(f"I am process {os.getpid()}, running on {os.uname()[1]}: starting ({time.asctime()})")

# ---------------------------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--dataset", type=str, default='spokencoco')
parser.add_argument("--exp_dir", type=str, default="/data2/scratch/pyp/discovery/word_unit_discovery/disc-16", help="directory to dump experiments")
parser.add_argument("--batch_size", type=int, default=40000)
parser.add_argument("--resume", action="store_true", default=False)
parser.add_argument("--max_iter", type=int, default=100)
parser.add_argument("--percentage", type=int, default=None, help="if None, the feats_type is the original name, otherwise, it's feats_type_percentage")
parser.add_argument("--threshold", type=float, default=0.90)
parser.add_argument("--reduce_method", type=str, default="mean", choices=['mean', 'max', 'median', 'weightedmean'])
parser.add_argument("--tgt_layer_for_attn", type=int, default=7, help="where attn weights are coming from, as for features, if feats_type==preFeats, and feature comes from previous layer of tgt_layer_for_attn, otherwise, feature comes from the same layer")
parser.add_argument("--segment_method", type=str, choices=['clsAttn', 'forceAlign'], default=None, help="if use cls attn segmentation or use force alignment segmentation. If use, need model_args.use_audio_cls_token to be True")
# NOTE: the default 'l2' contains no CLUSx component, so parse_faiss_specs
# rejects it; in practice an explicit spec such as "CLUS4096" must be passed.
parser.add_argument('--faiss-specs', '-f', type=str,
                    help='faiss index specs; separated by space '
                         'format is: PCAx_NORM_CLUSx_SPHERICAL -> '
                         'PCAx if exists first apply PCA '
                         'NORM if exists, normalize the vector by L2 norm '
                         'CLUSx must exist, cluster to x clusters '
                         'SPHERICAL if exists, apply spherical kmeans',
                    default='l2')
parser.add_argument("--seed", type=int, default=1, help="random seed for clustering")
args = parser.parse_args()

# The segmentation method is part of the experiment directory name, so it
# cannot be left unset: fail with a usage message instead of a TypeError from
# concatenating None into the name below.
if args.segment_method is None:
    parser.error("--segment_method is required (choose from 'clsAttn', 'forceAlign')")

# ---------------------------------------------------------------------------
# Resolve experiment directories
# ---------------------------------------------------------------------------
# Sub-directory name: <dataset>_<reduce>_<threshold>_<layer>_<segment>[_<percentage>]
feats_type = "_".join([
    args.dataset,
    args.reduce_method,
    str(args.threshold),
    str(args.tgt_layer_for_attn),
    args.segment_method,
])
if args.percentage is not None:
    feats_type = feats_type + "_" + str(args.percentage)

# The feature directory must already exist; this script only reads from it
# and adds a "kmeans_models" sub-directory for its own outputs.
exp_dir = osp.join(args.exp_dir, feats_type)
if not osp.isdir(exp_dir):
    raise RuntimeError(f"{exp_dir} does not exist!!")
km_exp_dir = osp.join(exp_dir, 'kmeans_models')
os.makedirs(km_exp_dir, exist_ok=True)
# Parsed form of a single spec token such as "PCA512_NORM_CLUS100_SPHERICAL".
#   pca      - PCA output dimension, 0 when no PCAx component is present
#   norm     - True when the NORM component is present
#   n_clus   - number of clusters taken from CLUSx (always > 0 once parsed)
#   sphere   - True when the SPHERICAL component is present
#   spec_str - the original, unparsed token
faiss_spec = namedtuple("faiss_spec", ["pca", "norm", "n_clus", "sphere", "spec_str"])


def parse_faiss_specs(specs_str):
    """Parse a whitespace-separated list of faiss spec tokens.

    Each token is a "_"-joined set of components: ``PCAx``, ``NORM``,
    ``CLUSx`` and ``SPHERICAL``. Components may appear in any order and
    unrecognised components are ignored.

    Args:
        specs_str: string holding zero or more spec tokens separated by
            whitespace.

    Returns:
        A list of ``faiss_spec`` tuples, one per token, in input order.

    Raises:
        ValueError: if a token has no ``CLUSx`` component with x > 0, or if
            the numeric suffix of ``PCAx`` / ``CLUSx`` is not an integer.
    """
    specs = []
    for ss in specs_str.split():
        pca = 0
        norm = False
        n_clus = 0
        sphere = False
        for c in ss.split("_"):
            if c.startswith("PCA"):
                pca = int(c[3:])
            elif c == "NORM":
                norm = True
            elif c.startswith("CLUS"):
                n_clus = int(c[4:])
            elif c == "SPHERICAL":
                sphere = True
        # Explicit check instead of `assert`: assertions are stripped under
        # `python -O`, which would let an invalid cluster count through.
        if n_clus <= 0:
            raise ValueError(
                f"faiss spec {ss!r} must contain a CLUSx component with x > 0"
            )
        specs.append(
            faiss_spec(pca=pca, norm=norm, n_clus=n_clus, sphere=sphere, spec_str=ss)
        )
    return specs
faiss_specs = parse_faiss_specs(args.faiss_specs)
print("Faiss Specs:", faiss_specs)

# Train and save one k-means model per requested spec.
for spec in faiss_specs:
    print("Processing spec", spec)

    # Features are re-read for every spec. Without PCA, `x` below is the same
    # array as `feats`, and faiss.normalize_L2 is called on it in place, so a
    # NORM spec would otherwise hand already-normalized data to the next spec
    # (presumably why the reload sits inside the loop).
    # NOTE: pickle.load can execute arbitrary code; only point exp_dir at
    # data_dict.pkl files produced by a trusted pipeline.
    start_time = time.time()
    with open(osp.join(exp_dir, "data_dict.pkl"), "rb") as f:
        feats_dict = pickle.load(f)
    # Stack the per-entry 'seg_feats' into one training matrix.
    feats = np.concatenate(
        [entry['seg_feats'].numpy() for entry in feats_dict.values()]
    )
    # The concatenated copy is all that is needed from here on.
    del feats_dict
    print("feature reading time: ", time.time() - start_time)
    print("FAISS KMeans training data shape: ", feats.shape)

    # Outputs for this spec go to <exp_dir>/kmeans_models/<spec_str>/.
    save_path = osp.join(km_exp_dir, spec.spec_str)
    os.makedirs(save_path, exist_ok=True)

    d = feats.shape[-1]
    x = feats

    if spec.pca > 0:
        print("Computing PCA")
        pca = faiss.PCAMatrix(d, spec.pca)
        pca.train(x)
        d = spec.pca
        # Save the learned projection (A stored transposed, plus bias b).
        b = faiss.vector_to_array(pca.b)
        A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)
        np.save(osp.join(save_path, "pca_A"), A.T)
        np.save(osp.join(save_path, "pca_b"), b)
        print("Applying PCA")
        x = pca.apply_py(x)

    if spec.norm:
        print("Normalizing")
        faiss.normalize_L2(x)

    print("Computing kmeans")
    kmeans = faiss.Kmeans(
        d,
        spec.n_clus,
        niter=args.max_iter,  # honour --max_iter (defaults to 100)
        verbose=True,
        spherical=spec.sphere,
        # Set to the full training-set size so no points are subsampled.
        max_points_per_centroid=feats.shape[0],
        gpu=True,
        nredo=5,
        seed=args.seed,
    )
    kmeans.train(x)
    np.save(osp.join(save_path, "centroids"), kmeans.centroids)