2
2
import re
3
3
import matplotlib .pyplot as plt
4
4
5
- # Function to parse data and extract relevant information
5
+ import re
6
+ from dataclasses import dataclass , astuple
7
+
8
+
9
@dataclass
class Point:
    """One measurement row collected by ``parse_data``.

    The field order is significant: ``plot_dataset`` unpacks
    ``astuple(point)`` positionally as
    ``pq, recall, throughput, M, ef, disk, overquery``.
    """

    pq: int            # number captured from the most recent "PQ@<n>" line
    recall: float      # value following "recall " on a "Query PQ=" line
    throughput: float  # query_vector_count * 10 / query time, as computed by parse_data
    M: int             # "M=<n>" from the most recent "Build M=" line
    ef: int            # "ef=<n>" from the most recent "Build M=" line
    disk: bool         # True when the query line carried "PQ=true"
    overquery: int     # "<n>" from "top 100/<n> " on the query line
18
+
6
19
def _search_group(pattern, text, what):
    """Return group 1 of ``pattern`` in ``text``; raise ValueError naming ``what`` if absent."""
    match = re.search(pattern, text)
    if match is None:
        raise ValueError(f"could not find {what} in {text!r}")
    return match.group(1)


def parse_data(description, data):
    """
    Parses a given set of data lines to extract relevant information.

    The lines are processed in order and carry state forward:
    a line containing "PQ@" sets the current PQ value, a line containing
    "Build M=" sets the current M and ef, and each line containing
    "Query PQ=" produces one ``Point`` using the most recent PQ/M/ef.

    Parameters:
    - description (str): Metadata description of the dataset. Must contain
      "<n> base", "<n> query", "dimensions <n>" and "hdf5/<name>.hdf5".
    - data (list of str): List of data lines to parse.

    Returns:
    - dict: A dictionary with keys 'name', 'base_vector_count',
      'dimensions' and 'data' (a list of ``Point``).

    Raises:
    - ValueError: if an expected field is missing from the description or
      from a recognised line, or if a "Query PQ=" line appears before a
      "PQ@" line or a "Build M=" line has been seen.
    """
    base_vector_count = int(_search_group(r'(\d+) base', description, 'base vector count'))
    query_vector_count = int(_search_group(r'(\d+) query', description, 'query vector count'))
    dimensions = int(_search_group(r'dimensions (\d+)', description, 'dimensions'))
    # The dot before the extension is escaped so only a literal ".hdf5" matches.
    dataset_name = _search_group(r'hdf5/(\S+)\.hdf5', description, 'dataset name')

    parsed_data = []
    current_pq = None
    M = None
    ef = None
    for line in data:
        if "PQ@" in line:
            # Stored as int so it matches the ``Point.pq`` annotation.
            current_pq = int(_search_group(r'PQ@(\d+)', line, 'PQ value'))
        elif "Build M=" in line:
            M = int(_search_group(r'M=(\d+)', line, 'M'))
            ef = int(_search_group(r'ef=(\d+)', line, 'ef'))
        elif "Query PQ=" in line:
            # Explicit checks instead of ``assert`` so they survive ``python -O``.
            if current_pq is None:
                raise ValueError(f"query line seen before any 'PQ@' line: {line!r}")
            if M is None:
                raise ValueError(f"query line seen before any 'Build M=' line: {line!r}")

            recall = float(_search_group(r'recall (\d+\.\d+)', line, 'recall'))
            query_time = float(_search_group(r'in (\d+\.\d+)s', line, 'query time'))
            pq_true = _search_group(r'PQ=(\w+)', line, 'PQ flag') == 'true'
            overquery = int(_search_group(r'top 100/(\d+) ', line, 'overquery'))

            # NOTE(review): the factor 10 comes from the original code;
            # presumably the query set is run 10 times per measurement - confirm.
            throughput = query_vector_count * 10 / query_time

            parsed_data.append(Point(current_pq, recall, throughput, M, ef, pq_true, overquery))

    return {
        'name': dataset_name,
        'base_vector_count': base_vector_count,
        'dimensions': dimensions,
        'data': parsed_data
    }
33
62
63
+
34
64
def is_pareto_optimal(candidate, others):
    """Determine if a candidate point is Pareto-optimal.

    A candidate is rejected when some point in ``others`` is at least as
    good on both ``recall`` and ``throughput`` and strictly better on at
    least one of them. A point equal to the candidate on both attributes
    does not reject it, so ``others`` may contain the candidate itself.

    Parameters:
    - candidate: object exposing ``recall`` and ``throughput`` attributes.
    - others: iterable of objects exposing the same two attributes.

    Returns:
    - bool: True if no point in ``others`` dominates ``candidate``.
    """
    for point in others:
        no_worse = (point.recall >= candidate.recall
                    and point.throughput >= candidate.throughput)
        strictly_better = (point.recall > candidate.recall
                           or point.throughput > candidate.throughput)
        if no_worse and strictly_better:
            return False
    return True
43
73
@@ -54,9 +84,9 @@ def plot_dataset(dataset, output_dir="."):
54
84
55
85
# Create plot
56
86
plt .figure (figsize = (15 , 20 ))
57
- for recall , throughput , M , ef , overquery in data :
58
- plt .scatter (recall , throughput , label = f'M={ M } , ef={ ef } , oq={ overquery } ' )
59
- plt .annotate (f'M={ M } , ef={ ef } , oq={ overquery } ' , (recall , throughput ))
87
+ for pq , recall , throughput , M , ef , disk , overquery in ( astuple ( p ) for p in data ) :
88
+ plt .scatter (recall , throughput , label = f'pq= { pq } , M={ M } , ef={ ef } , disk= { disk } , oq={ overquery } ' )
89
+ plt .annotate (f'pq= { pq } , M={ M } , ef={ ef } , disk= { disk } , oq={ overquery } ' , (recall , throughput ))
60
90
61
91
# Set title and labels
62
92
plt .title (f"Dataset: { name } \\ nBase Vector Count: { base_vector_count } \\ nDimensions: { dimensions } " )
0 commit comments