-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfindtop50.py
73 lines (57 loc) · 2.56 KB
/
findtop50.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# -*- coding: utf-8 -*-
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import ollama
import numpy as np
from numpy.linalg import norm
# Function to normalize values between 0 and 1
def normalize_0_1(values):
    """Min-max scale *values* into the range [0, 1].

    The smallest element maps to 0 and the largest to 1.

    Args:
        values: Array-like of numbers (NumPy array or plain sequence).

    Returns:
        A NumPy array with the same shape as *values*. Empty input is
        returned as an empty array. When every element is equal there is no
        spread to scale by, so an array of zeros is returned instead of the
        NaNs that a 0/0 division would produce.
    """
    values = np.asarray(values)
    if values.size == 0:
        # np.min / np.max raise on empty input; nothing to scale anyway.
        return values
    min_val = np.min(values)
    span = np.max(values) - min_val
    if span == 0:
        # Constant input: avoid dividing by zero.
        return np.zeros_like(values, dtype=float)
    return (values - min_val) / span
# ---- Inputs ----
# Prediction (query) embeddings; iterated one row at a time further down.
prediction = np.load('./resources/prediction.npy')

# Corpus embeddings and the matching t-SNE-reduced array.
embeddings_array = np.load('./resources/240820-3.1.npy')
reduced_array = np.load('./resources/240820-3.1-tsne.npy')

# NOTE(review): never filled or read anywhere in this script.
top50indices = []

# Spreadsheet holding the paper titles (Column1) and abstracts (Column2).
file_path_1 = './resources/dataset_240820.xlsx'
df_dataset = pd.read_excel(file_path_1)
dataset_papername = df_dataset['Column1'].tolist()
dataset_paperabs = df_dataset['Column2'].tolist()

# ---- Output columns ----
# Parallel lists: entry k of each list describes the same result row.
papername, papernumber, papersim = [], [], []
paperabs, tsnex, tsney = [], [], []
# Rank the whole corpus against each prediction embedding and record the
# best matches in the parallel result lists.
for query_idx, query_vec in enumerate(prediction):
    # cosine_similarity expects 2-D input, so wrap the query in a list and
    # take row 0 of the result: one score per corpus embedding.
    raw_scores = cosine_similarity([query_vec], embeddings_array)[0]

    # Rescale this query's scores with normalize_0_1.
    scores = normalize_0_1(raw_scores)

    # argsort is ascending, so the last 100 positions reversed give the
    # 100 highest-scoring corpus indices, best first.
    # NOTE(review): the script and output file are named "top50" but 100
    # rows are kept per prediction - confirm which is intended.
    top_hits = np.argsort(scores)[-100:][::-1]

    papername.extend(dataset_papername[k] for k in top_hits)  # Paper title
    paperabs.extend(dataset_paperabs[k] for k in top_hits)  # Paper abstract
    papernumber.extend([query_idx] * len(top_hits))  # Prediction number
    papersim.extend(scores[top_hits])  # Normalized similarity score

    # NOTE(review): t-SNE rows are read at corpus index + 10; presumably the
    # reduced array has 10 extra leading rows - confirm against how the
    # t-SNE file was generated.
    tsne_rows = top_hits + 10
    tsnex.extend(reduced_array[tsne_rows, 0])  # t-SNE x-coordinate
    tsney.extend(reduced_array[tsne_rows, 1])  # t-SNE y-coordinate
# Assemble the collected columns into a table, one row per
# (prediction, similar paper) pair. Key order fixes the column order.
result_columns = {
    'papername': papername,        # Paper title
    'papernumber': papernumber,    # Prediction number
    'paper_similarity': papersim,  # Similarity score
    'tsnex': tsnex,                # t-SNE x-coordinate
    'tsney': tsney,                # t-SNE y-coordinate
    'paper_abstract': paperabs,    # Paper abstract
}
df = pd.DataFrame(result_columns)

# Write the table to an Excel workbook without the index column.
df.to_excel('./pred_top50indices_cos_3.1.xlsx', index=False)