jina-ai · davidbp · Dec 20, 2021 · Dec 16, 2021 · Dec 16, 2021 · Dec 16, 2021
diff --git a/README.md b/README.md
@@ -161,6 +161,34 @@ One can run `executor/benchmark.py` to get a quick performance overview.
 |500000 | 467.936 | 0.046 | 0.356 | 2.823|
 |1000000 | 1025.506 | 0.091 | 0.695 | 5.778|
 
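+Results with filtering, obtained by running `examples/benchmark_with_filtering.py`. The `% same filter` column is the fraction (between 0 and 1) of stored documents whose `category` tag matches the query filter.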
+
+| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64|
+|-----|-----|-----|-----|-----|-----|
+| 10000.000 | 0.050 | 2.869 | 0.004 | 0.030 | 0.270 |
+| 10000.000 | 0.150 | 2.869 | 0.004 | 0.035 | 0.294 |
+| 10000.000 | 0.200 | 3.506 | 0.005 | 0.038 | 0.287 |
+| 10000.000 | 0.300 | 3.506 | 0.005 | 0.044 | 0.356 |
+| 10000.000 | 0.500 | 3.506 | 0.008 | 0.064 | 0.484 |
+| 10000.000 | 0.800 | 2.869 | 0.013 | 0.098 | 0.910 |
+| 100000.000 | 0.050 | 75.960 | 0.018 | 0.134 | 1.092 |
+| 100000.000 | 0.150 | 75.960 | 0.026 | 0.211 | 1.736 |
+| 100000.000 | 0.200 | 78.475 | 0.034 | 0.265 | 2.097 |
+| 100000.000 | 0.300 | 78.475 | 0.044 | 0.357 | 2.887 |
+| 100000.000 | 0.500 | 78.475 | 0.068 | 0.565 | 4.383 |
+| 100000.000 | 0.800 | 75.960 | 0.111 | 0.878 | 6.815 |
+| 500000.000 | 0.050 | 497.744 | 0.069 | 0.561 | 4.439 |
+| 500000.000 | 0.150 | 497.744 | 0.134 | 1.064 | 8.469 |
+| 500000.000 | 0.200 | 440.108 | 0.152 | 1.199 | 9.472 |
+| 500000.000 | 0.300 | 440.108 | 0.212 | 1.650 | 13.267 |
+| 500000.000 | 0.500 | 440.108 | 0.328 | 2.637 | 21.961 |
+| 500000.000 | 0.800 | 497.744 | 0.580 | 4.602 | 36.986 |
+| 1000000.000 | 0.050 | 1052.388 | 0.131 | 1.031 | 8.212 |
+| 1000000.000 | 0.150 | 1052.388 | 0.263 | 2.191 | 16.643 |
+| 1000000.000 | 0.200 | 980.598 | 0.351 | 2.659 | 21.193 |
+| 1000000.000 | 0.300 | 980.598 | 0.461 | 3.713 | 29.794 |
+| 1000000.000 | 0.500 | 980.598 | 0.732 | 5.975 | 47.356 |
+| 1000000.000 | 0.800 | 1052.388 | 1.151 | 9.255 | 73.552 |
 
 ## Research foundations of PQLite
 

diff --git a/examples/benchmark_with_filtering.py b/examples/benchmark_with_filtering.py
@@ -0,0 +1,101 @@
+
+from jina import DocumentArray, Document
+from jina.logging.profile import TimeContext
+from pqlite import PQLite
+
+import os
+import shutil
+import numpy as np
+
+# Number of documents stored in the index for each benchmark run.
+n_index = [10_000, 100_000, 500_000, 1_000_000]
+
+# Number of query documents sent in a single search call.
+n_query = [1, 8, 64]
+
+# Embedding dimensionality (used for stored documents and for queries).
+D = 768
+
+# Number of times every search is repeated when it is timed.
+R = 5
+
+# Number of documents passed to each index() call.
+B = 5000
+
+# Forwarded unchanged as the `n_cells` argument of PQLite.
+n_cells = 1
+
+# Each row gives, per entry of `categories`, the fraction of the stored
+# documents that receives that category tag.
+probs = [
+    [0.20, 0.30, 0.50],
+    [0.05, 0.15, 0.80],
+]
+
+# Values written to the 'category' tag; matched by position with a row of
+# `probs`.
+categories = ['comic', 'movie', 'audiobook']
+
+def clean_workspace():
+    """Delete the ``./data`` and ``./workspace`` directory trees if present.
+
+    Paths are relative to the current working directory. A directory that
+    does not exist is skipped silently.
+    """
+    for leftover in ('./data', './workspace'):
+        if os.path.exists(leftover):
+            shutil.rmtree(leftover)
+
+
+def docs_with_tags(N, D, probs, categories):
+    """Create randomly embedded documents tagged with a category.
+
+    For every position ``k`` in ``probs``, ``int(N * probs[k])`` documents
+    are generated. Each one gets a random ``float32`` embedding of length
+    ``D`` and the tag ``{'category': categories[k]}``. Because the count is
+    truncated with ``int``, the total can be smaller than ``N``.
+
+    :param N: target total number of documents.
+    :param D: length of each embedding vector.
+    :param probs: fraction of ``N`` to generate for each category.
+    :param categories: category names, matched with ``probs`` by position.
+    :return: a ``DocumentArray`` holding all documents, grouped by category
+        in the order of ``probs``.
+    """
+    tagged = []
+
+    for cat_idx, share in enumerate(probs):
+        count = int(N * share)
+        embeddings = np.random.random((count, D)).astype(np.float32)
+
+        for vector in embeddings:
+            tagged.append(
+                Document(
+                    embedding=vector,
+                    tags={'category': categories[cat_idx]},
+                )
+            )
+
+    return DocumentArray(tagged)
+
+
+# results[i] holds the rows measured for n_index[i]; every row is
+# [n_i, prob, indexing duration, one mean search duration per n_query entry].
+results = []
+for n_i in n_index:
+    block_rows = []
+
+    for current_probs in probs:
+        # Remove ./data and ./workspace before building each index.
+        clean_workspace()
+        index = PQLite(
+            dim=D,
+            initial_size=n_i,
+            n_cells=n_cells,
+            metas={'workspace': './workspace'},
+            columns=[('category', str)],
+        )
+        corpus = docs_with_tags(n_i, D, current_probs, categories)
+
+        # Only the batched index() calls are timed; document generation
+        # above is outside the timed region.
+        with TimeContext(f'indexing {n_i} docs') as t_i:
+            for chunk in corpus.batch(batch_size=B):
+                index.index(chunk)
+
+        # One result row per category: search restricted to documents whose
+        # 'category' tag equals `cat`; `prob` is that category's fraction.
+        for cat, prob in zip(categories, current_probs):
+            category_filter = {'category': {'$eq': cat}}
+
+            query_times = []
+            for n_q in n_query:
+                queries = DocumentArray.empty(n_q)
+                queries.embeddings = np.random.random([n_q, D]).astype(np.float32)
+
+                durations = []
+                for _ in range(R):
+                    with TimeContext(f'searching {n_q} docs') as t_q:
+                        index.search(queries, filter=category_filter)
+                    durations.append(t_q.duration)
+
+                # The first repetition is excluded from the average
+                # (presumably treated as a warm-up run).
+                query_times.append(np.mean(durations[1:]))
+
+            print(f'\n\nprob={prob}, current_probs={current_probs}, n_i={n_i}\n\n')
+            block_rows.append([n_i, prob, t_i.duration, *query_times])
+
+    results.append(block_rows)
+
+
+# Print the collected measurements as a Markdown table. Within each
+# index-size block the rows are ordered by ascending filter fraction
+# (second field of a row).
+title = '| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64|'
+print(title)
+print('|-----' * 6 + '|')
+for block in results:
+    fractions = [row[1] for row in block]
+    for row_idx in np.argsort(fractions):
+        cells = ''.join(f'| {value:.3f} ' for value in block[row_idx])
+        print(cells + '|')