Skip to content

Commit

Permalink
discard faulty embeddings
Browse files Browse the repository at this point in the history
  • Loading branch information
zaidalyafeai authored Dec 21, 2024
1 parent 9be41eb commit e9672de
Showing 1 changed file with 8 additions and 3 deletions.
11 changes: 8 additions & 3 deletions utils/clusters_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,15 @@ def compute_clusters(embeddings: List[List[float]]) -> List[int]:

def compute_reduced_embeddings(
    embeddings: List[List[float]], expected_dim: int = 384
) -> List[List[float]]:
    """Project embeddings to 2-D with t-SNE and shift them to be non-negative.

    Embeddings whose length differs from ``expected_dim`` are treated as
    faulty and replaced by an all-zero vector of the expected length. They are
    replaced rather than dropped so that the i-th output row still corresponds
    to the i-th input embedding.

    Args:
        embeddings: One vector per item.
        expected_dim: Length every valid embedding must have. Defaults to 384,
            the value that was previously hard-coded.

    Returns:
        One ``[x, y]`` pair per input embedding. The global minimum of the
        t-SNE output is subtracted from every coordinate, so the smallest
        value in the result is 0.
    """
    # ``len`` rather than truthiness so an ndarray input does not raise here.
    if len(embeddings) == 0:
        # t-SNE cannot be fitted on zero samples; nothing to project.
        return []

    placeholder = [0.0] * expected_dim
    cleaned = [
        list(emb) if len(emb) == expected_dim else placeholder
        for emb in embeddings
    ]

    # Every row now has the same length, so this is a regular 2-D float array.
    matrix = np.asarray(cleaned, dtype=np.float64)

    tsne_model = TSNE(n_components=2, random_state=42)
    # Fit exactly once: t-SNE is expensive, and a second fit only repeats work.
    tsne_data = tsne_model.fit_transform(matrix)

    return (tsne_data - tsne_data.min()).tolist()

0 comments on commit e9672de

Please sign in to comment.