SimCSE_lsz.py
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer
# Load the tokenizer and model; Transformers will download the weights automatically on first use
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
# Tokenize input texts
texts = [
"The content of this book is well written. But the paper quality is poor.",
"The content of this book is not well written.",
"the paper quality is poor."
]
# Alternative Chinese inputs (note: the model above is English-only):
# texts = [
#     "这本书真好看",    # "This book is really good"
#     "这本书写的真差",  # "This book is really badly written"
#     "你好保罗"         # "Hello, Paul"
# ]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
# Get the embeddings
with torch.no_grad():
    embeddings = model(**inputs, output_hidden_states=True, return_dict=True).pooler_output
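# pooler_output is the [CLS] representation passed through the model's pooler
# head (Linear + tanh); the released supervised SimCSE checkpoints use it as
# the sentence embedding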
# Calculate cosine similarities
# Cosine similarity lies in [-1, 1]; higher means more similar
cosine_sim_0_1 = 1 - cosine(embeddings[0], embeddings[1])
cosine_sim_0_2 = 1 - cosine(embeddings[0], embeddings[2])
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[1], cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[2], cosine_sim_0_2))