Skip to content

Commit 690e0fb

Browse files
committed
Add embedding scripts
Signed-off-by: Evangelos Lamprou <vagos@lamprou.xyz>
1 parent fa0d9db commit 690e0fb

File tree

2 files changed

+161
-40
lines changed

2 files changed

+161
-40
lines changed

infrastructure/do_embedding.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import json
2+
import glob
3+
import os
4+
from openai import OpenAI
5+
import pandas as pd
6+
import dotenv
7+
8+
# Populate the process environment from a .env file so that OPENAI_API_KEY
# does not have to be hard-coded in this script.
dotenv.load_dotenv()

# Shared OpenAI client used by the embedding helper in this module; the API key
# is looked up in the environment (None if the variable is unset).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
14+
15+
def read_json_and_generate_embeddings(json_file):
    """
    Build one OpenAI embedding per benchmark from the scripts listed in a JSON manifest.

    The manifest maps each benchmark name to an object whose optional "scripts"
    entry is a list of glob patterns. Each pattern is expanded as "../<pattern>"
    (i.e. relative to the parent of the current working directory), the matching
    files are concatenated with a trailing newline after each one, and the
    combined text is sent to the embeddings endpoint as a single input.

    Parameters:
    json_file (str): Path to the JSON manifest.

    Returns:
    pd.DataFrame: One row per successfully embedded benchmark, with the columns
    "benchmark" and "embedding". Benchmarks that have no script content, or whose
    API request fails, are reported on stdout and left out of the result.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Collect plain dicts and build the frame once at the end; growing a
    # DataFrame row by row copies it on every iteration.
    rows = []

    for benchmark, details in data.items():
        print(f"Processing benchmark: {benchmark}")

        # Gather the contents of every script matched by the benchmark's globs.
        parts = []
        for script_glob in details.get("scripts", []):
            # glob returns matches in filesystem order; sort them so the same
            # set of files always yields the same text (and the same embedding).
            for script_file in sorted(glob.glob(f"../{script_glob}")):
                with open(script_file, 'r', encoding='utf-8') as f:
                    parts.append(f.read() + "\n")
        combined_script = "".join(parts)

        print(f"Combined script for {benchmark}: {combined_script}")

        # Nothing matched (or no globs were listed): there is no text to embed,
        # so skip the request instead of sending an empty input.
        if not combined_script:
            print(f"Error generating embedding for {benchmark}: no script content found")
            continue

        # Best effort per benchmark: one failed request must not abort the run.
        try:
            response = client.embeddings.create(
                model="text-embedding-ada-002",
                input=combined_script,
            )
            embedding = response.data[0].embedding
        except Exception as e:
            print(f"Error generating embedding for {benchmark}: {e}")
            continue

        rows.append({"benchmark": benchmark, "embedding": embedding})

    # Passing columns explicitly keeps the schema stable even when rows is empty.
    return pd.DataFrame(rows, columns=["benchmark", "embedding"])
53+
54+
# Example usage
55+
if __name__ == "__main__":
    # Script entry point: embed every benchmark listed in the manifest.
    json_file = "./data/script-globs.json"
    embeddings_df = read_json_and_generate_embeddings(json_file=json_file)

    # Echo the table for a quick visual check, then persist it next to the manifest.
    # NOTE(review): each "embedding" cell is a Python list, so the CSV stores its
    # text form; a consumer has to parse it back into numbers - confirm the reader does.
    print(embeddings_df)
    embeddings_df.to_csv(path_or_buf="./data/embeddings.csv", index=False)

infrastructure/do_pca.py

Lines changed: 100 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -3,50 +3,110 @@
33
from sklearn.decomposition import PCA
44
from sklearn.preprocessing import StandardScaler
55
import matplotlib.pyplot as plt
6+
from adjustText import adjust_text
67

7-
def perform_pca_and_plot(dataframe1, dataframe2, name='row_analysis'):
    """
    Run a 4-component PCA on each of two dataframes and plot the results in a 2x2 grid.

    Each dataframe occupies one row of the grid: components 1 & 2 on the left,
    components 3 & 4 on the right. Every point is labelled (with the value of the
    'benchmark' column when present, otherwise with the row index) and adjust_text
    is used to move labels apart. The figure is written to
    'pca-row-plot-<name>.pdf' in the current working directory and then closed.

    Parameters:
    dataframe1 (pd.DataFrame): First input dataframe (titled "collected metrics").
    dataframe2 (pd.DataFrame): Second input dataframe (titled "language model embeddings").
    name (str): Suffix used in the output file name.

    Returns:
    tuple: Two dataframes (columns PC1..PC4), one per input, holding the principal
    components of the rows that had no NaN in any numeric column.

    Raises:
    ValueError: If an input dataframe has no numeric columns.
    NOTE(review): scikit-learn is expected to reject inputs with fewer than four
    usable rows or numeric columns - confirm against the PCA documentation.
    """
    component_names = ['PC1', 'PC2', 'PC3', 'PC4']

    def prepare_pca(dataframe):
        # Only numeric columns can be standardised and fed to PCA.
        numeric_cols = dataframe.select_dtypes(include=[np.number]).columns
        if numeric_cols.empty:
            raise ValueError("No numeric columns available in the dataframe for PCA.")
        print(f"Numeric columns selected for PCA: {numeric_cols}")

        # Drop incomplete rows first and take the labels from the SAME filtered
        # frame, so labels stay aligned with the PCA output rows.
        dataframe_clean = dataframe.dropna(subset=numeric_cols)
        if 'benchmark' in dataframe_clean.columns:
            labels = dataframe_clean['benchmark'].values
        else:
            # The benchmark column is optional; fall back to the row index.
            labels = dataframe_clean.index.astype(str).values

        # Standardise so that no single metric dominates the components.
        data_scaled = StandardScaler().fit_transform(dataframe_clean[numeric_cols])

        pca = PCA(n_components=len(component_names))
        principal_components = pca.fit_transform(data_scaled)

        pca_df = pd.DataFrame(data=principal_components, columns=component_names)
        return pca_df, labels

    def plot_with_labels(ax, pca_df, labels, first_component):
        # Scatter components (first_component, first_component + 1) and label each point.
        xs = pca_df[f'PC{first_component}']
        ys = pca_df[f'PC{first_component + 1}']

        ax.scatter(xs, ys, c='black', alpha=0.7)
        ax.set_xlabel(f'Component {first_component}', fontsize=14)
        ax.set_ylabel(f'Component {first_component + 1}', fontsize=14)
        ax.grid(color='lightgray', linestyle='--', linewidth=0.5)

        texts = [
            ax.text(px, py, label, fontsize=14, ha='center', va='center')
            for px, py, label in zip(xs, ys, labels)
        ]
        adjust_text(texts, ax=ax, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))

    # Perform PCA on both dataframes before any figure is created, so a bad
    # input fails without leaving an open figure behind.
    pca_df1, benchmarks1 = prepare_pca(dataframe1)
    pca_df2, benchmarks2 = prepare_pca(dataframe2)

    fig, axes = plt.subplots(2, 2, figsize=(12, 12), constrained_layout=True)
    try:
        # One left-aligned title per dataset row.
        axes[0, 0].set_title('PCA from collected metrics', fontsize=14, loc='left')
        axes[1, 0].set_title('PCA from language model embeddings', fontsize=14, loc='left')

        datasets = ((pca_df1, benchmarks1), (pca_df2, benchmarks2))
        for row, (pca_df, labels) in enumerate(datasets):
            plot_with_labels(axes[row, 0], pca_df, labels, 1)
            plot_with_labels(axes[row, 1], pca_df, labels, 3)

        # Save through the figure object rather than pyplot's implicit current figure.
        fig.savefig(f'pca-row-plot-{name}.pdf', format='pdf')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)

    return pca_df1, pca_df2

0 commit comments

Comments
 (0)