Skip to content

Commit 52ac055

Browse files
author
Jonathan Ellis
committed
plot PQ in the grid
1 parent b20cf30 commit 52ac055

File tree

1 file changed

+49
-19
lines changed

1 file changed

+49
-19
lines changed

plot_output.py

Lines changed: 49 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,42 +2,72 @@
22
import re
33
import matplotlib.pyplot as plt
44

5-
# Function to parse data and extract relevant information
5+
import re
6+
from dataclasses import dataclass, astuple
7+
8+
9+
@dataclass
class Point:
    """One benchmark measurement parsed from a single "Query PQ=" log line."""

    # Number captured from the most recent "PQ@<n>" line.
    pq: int
    # Value captured from "recall <float>" on the query line.
    recall: float
    # query_vector_count * 10 / query time in seconds (see parse_data).
    throughput: float
    # "M=<n>" and "ef=<n>" captured from the most recent "Build M=" line.
    M: int
    ef: int
    # True when the query line reported "PQ=true".
    disk: bool
    # Number captured from "top 100/<n> " on the query line.
    overquery: int


def parse_data(description, data):
    """
    Parses a given set of data lines to extract relevant information.

    Lines are processed in order and are stateful: a "PQ@<n>" line sets the
    current PQ value, a "Build M=" line sets the current M/ef, and every
    "Query PQ=" line produces one Point using the most recent PQ and build
    settings. Lines matching none of these markers are ignored.

    Parameters:
    - description (str): Metadata description of the dataset.
    - data (list of str): List of data lines to parse.

    Returns:
    - dict: 'name', 'base_vector_count', 'dimensions' taken from the
      description, and 'data', a list of Point objects in input order.

    Raises:
    - ValueError: if a "Query PQ=" line appears before both a "PQ@" line and
      a "Build M=" line have been seen.
    - AttributeError: if a recognised line (or the description) lacks one of
      the expected fields, because the regex search returns None.
    """
    base_vector_count = int(re.search(r'(\d+) base', description).group(1))
    query_vector_count = int(re.search(r'(\d+) query', description).group(1))
    dimensions = int(re.search(r'dimensions (\d+)', description).group(1))
    dataset_name = re.search(r'hdf5/(\S+).hdf5', description).group(1)

    parsed_data = []
    current_pq = None
    M = None
    ef = None
    for line in data:
        if "PQ@" in line:
            # Convert to int so the stored value matches Point.pq's annotation.
            current_pq = int(re.search(r'PQ@(\d+)', line).group(1))
        elif "Build M=" in line:
            M = int(re.search(r'M=(\d+)', line).group(1))
            ef = int(re.search(r'ef=(\d+)', line).group(1))
        elif "Query PQ=" in line:
            # Explicit check instead of assert: asserts vanish under -O, which
            # would let an unset PQ/M/ef slip into the results.
            if current_pq is None or M is None or ef is None:
                raise ValueError(
                    f"query line seen before a 'PQ@' and a 'Build M=' line: {line!r}"
                )

            recall = float(re.search(r'recall (\d+\.\d+)', line).group(1))
            query_time = float(re.search(r'in (\d+\.\d+)s', line).group(1))
            pq_true = re.search(r'PQ=(\w+)', line).group(1) == 'true'
            overquery = int(re.search(r'top 100/(\d+) ', line).group(1))

            # NOTE(review): the factor 10 presumably means each query set is
            # executed 10 times per timing -- confirm against the benchmark.
            throughput = query_vector_count * 10 / query_time

            parsed_data.append(
                Point(current_pq, recall, throughput, M, ef, pq_true, overquery)
            )

    return {
        'name': dataset_name,
        'base_vector_count': base_vector_count,
        'dimensions': dimensions,
        'data': parsed_data
    }
3362

63+
3464
def is_pareto_optimal(candidate, others):
    """Determine if a candidate point is Pareto-optimal.

    A point in ``others`` dominates ``candidate`` when it is at least as good
    on both ``recall`` and ``throughput`` and strictly better on at least one.
    Points equal to the candidate on both axes (including the candidate
    itself, if present in ``others``) do not dominate it.

    Parameters:
    - candidate: object exposing ``recall`` and ``throughput`` attributes.
    - others: iterable of objects exposing the same two attributes.

    Returns:
    - bool: True if no point in ``others`` dominates ``candidate``.
    """
    for point in others:
        no_worse = (
            point.recall >= candidate.recall
            and point.throughput >= candidate.throughput
        )
        strictly_better = (
            point.recall > candidate.recall
            or point.throughput > candidate.throughput
        )
        if no_worse and strictly_better:
            return False
    return True
4373

@@ -54,9 +84,9 @@ def plot_dataset(dataset, output_dir="."):
5484

5585
# Create plot
5686
plt.figure(figsize=(15, 20))
57-
for recall, throughput, M, ef, overquery in data:
58-
plt.scatter(recall, throughput, label=f'M={M}, ef={ef}, oq={overquery}')
59-
plt.annotate(f'M={M}, ef={ef}, oq={overquery}', (recall, throughput))
87+
for pq, recall, throughput, M, ef, disk, overquery in (astuple(p) for p in data):
88+
plt.scatter(recall, throughput, label=f'pq={pq}, M={M}, ef={ef}, disk={disk}, oq={overquery}')
89+
plt.annotate(f'pq={pq}, M={M}, ef={ef}, disk={disk}, oq={overquery}', (recall, throughput))
6090

6191
# Set title and labels
6292
plt.title(f"Dataset: {name}\\nBase Vector Count: {base_vector_count}\\nDimensions: {dimensions}")

0 commit comments

Comments
 (0)