plot PQ in the grid

Jonathan Ellis · Jonathan Ellis · commit 52ac055d06a7 · 2023-10-13T12:39:55.000-05:00
diff --git a/plot_output.py b/plot_output.py
@@ -2,42 +2,72 @@
 import re
 import matplotlib.pyplot as plt
 
-# Function to parse data and extract relevant information
+import re
+from dataclasses import dataclass, astuple
+
+
+@dataclass
+class Point:  # one parsed "Query PQ=" line plus the PQ@/Build settings in effect when it was read
+    pq: int  # digits captured from the most recent "PQ@<n>" line
+    recall: float  # number following "recall " on the query line
+    throughput: float  # query_vector_count * 10 / query time (seconds), computed in parse_data
+    M: int  # "M=<n>" from the most recent "Build M=" line
+    ef: int  # "ef=<n>" from that same "Build M=" line
+    disk: bool  # True when the query line says "PQ=true"; NOTE(review): filled from the PQ flag - confirm the name
+    overquery: int  # the <n> in "top 100/<n> " on the query line
+
 def parse_data(description, data):
-    # Extract base vector count and query vector count from description
+    """
+    Parse log lines into Point records (one Point per "Query PQ=" line).
+
+    Parameters:
+    - description (str): Text containing "<n> base", "<n> query", "dimensions <n>", "hdf5/<name>.hdf5".
+    - data (list of str): Lines to scan; only "PQ@", "Build M=" and "Query PQ=" lines are used.
+
+    Returns:
+    - dict: Keys 'name', 'base_vector_count', 'dimensions' and 'data' (list of Point).
+    """
     base_vector_count = int(re.search(r'(\d+) base', description).group(1))
     query_vector_count = int(re.search(r'(\d+) query', description).group(1))
     dimensions = int(re.search(r'dimensions (\d+)', description).group(1))
     dataset_name = re.search(r'hdf5/(\S+).hdf5', description).group(1)
 
     parsed_data = []
+    current_pq = None
+    M = None
     for line in data:
-        # Extract recall, query time, M and ef values
-        recall = float(re.search(r'recall (\d+\.\d+)', line).group(1))
-        query_time = float(re.search(r'query (\d+\.\d+)s', line).group(1))
-        M = int(re.search(r'M=(\d+)', line).group(1))
-        ef = int(re.search(r'ef=(\d+)', line).group(1))
-        overquery = int(re.search(r'top 100/(\d+) ', line).group(1))
-        
-        # Calculate throughput
-        throughput = query_vector_count * 10 / query_time
-        
-        parsed_data.append((recall, throughput, M, ef, overquery))
-    
+        if "PQ@" in line:
+            current_pq = int(re.search(r'PQ@(\d+)', line).group(1))  # Point.pq is declared int, so convert here
+        elif "Build M=" in line:
+            M = int(re.search(r'M=(\d+)', line).group(1))
+            ef = int(re.search(r'ef=(\d+)', line).group(1))
+        elif "Query PQ=" in line:
+            recall = float(re.search(r'recall (\d+\.\d+)', line).group(1))
+            query_time = float(re.search(r'in (\d+\.\d+)s', line).group(1))
+            pq_true = re.search(r'PQ=(\w+)', line).group(1) == 'true'
+            overquery = int(re.search(r'top 100/(\d+) ', line).group(1))
+
+            throughput = query_vector_count * 10 / query_time
+
+            assert current_pq is not None
+            assert M is not None
+            parsed_data.append(Point(current_pq, recall, throughput, M, ef, pq_true, overquery))
+
     return {
         'name': dataset_name,
         'base_vector_count': base_vector_count,
         'dimensions': dimensions,
         'data': parsed_data
     }
 
+
 def is_pareto_optimal(candidate, others):
     """Determine if a candidate point is Pareto-optimal."""
     for point in others:
         # Check if another point has higher or equal recall and throughput
-        if point[0] >= candidate[0] and point[1] > candidate[1]:
+        if point.recall >= candidate.recall and point.throughput > candidate.throughput:  # no worse recall, strictly faster
             return False
-        if point[0] > candidate[0] and point[1] >= candidate[1]:
+        if point.recall > candidate.recall and point.throughput >= candidate.throughput:  # strictly better recall, no slower
             return False
     return True
 
@@ -54,9 +84,9 @@ def plot_dataset(dataset, output_dir="."):
     
     # Create plot
     plt.figure(figsize=(15, 20))
-    for recall, throughput, M, ef, overquery in data:
-        plt.scatter(recall, throughput, label=f'M={M}, ef={ef}, oq={overquery}')
-        plt.annotate(f'M={M}, ef={ef}, oq={overquery}', (recall, throughput))
+    for pq, recall, throughput, M, ef, disk, overquery in (astuple(p) for p in data):
+        plt.scatter(recall, throughput, label=f'pq={pq}, M={M}, ef={ef}, disk={disk}, oq={overquery}')
+        plt.annotate(f'pq={pq}, M={M}, ef={ef}, disk={disk}, oq={overquery}', (recall, throughput))
     
     # Set title and labels
     plt.title(f"Dataset: {name}\\nBase Vector Count: {base_vector_count}\\nDimensions: {dimensions}")