From 7bd0fd6d67398111b348557a6a3e15379877e559 Mon Sep 17 00:00:00 2001
From: David Buchaca Prats <davidbuchaca@gmail.com>
Date: Thu, 16 Dec 2021 13:04:13 +0100
Subject: [PATCH 1/5] feat: add benchmark with filtering

---
 examples/benchmark_with_filtering.py | 85 ++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 examples/benchmark_with_filtering.py

diff --git a/examples/benchmark_with_filtering.py b/examples/benchmark_with_filtering.py
new file mode 100644
index 00000000..e0d9cfcc
--- /dev/null
+++ b/examples/benchmark_with_filtering.py
@@ -0,0 +1,85 @@
+
+import numpy as np
+from jina import DocumentArray, Document
+from jina.logging.profile import TimeContext
+from executor import PQLiteIndexer
+from pqlite.filter import Filter
+
+n_index = [10_000, 100_000, 500_000, 1_000_000]
+n_index = [5000, 10_000]
+
+n_query = [1, 8, 64]
+D = 768
+R = 5
+B = 4096
+n_cells = 1
+probs =[[0.05, 0.95/3, 0.95/3, 0.95/3],
+        [0.10, 0.90/3, 0.90/3, 0.90/3],
+        [0.30, 0.70/3, 0.70/3, 0.70/3],
+        [0.50, 0.50/3, 0.50/3, 0.50/4],
+        [0.80, 0.20/3, 0.20/3, 0.20/3]]
+
+times = {}
+
+def docs_with_tags(N, D, probs):
+    categories = ['comic', 'movie', 'audiobook', 'shoes']
+    X = np.random.random((N, D)).astype(np.float32)
+    docs = [
+        Document(
+            id=f'{i}',
+            embedding=X[i],
+            tags={
+                'category': np.random.choice(categories, p=probs),
+            },
+        )
+        for i in range(N)
+    ]
+    da = DocumentArray(docs)
+
+    return da
+
+for n_i in n_index:
+
+
+    columns = [ ('category', 'str')]
+    idxer = PQLiteIndexer(
+        dim=D,
+        initial_size=n_i,
+        n_cells=n_cells,
+        metas={'workspace': './workspace'},
+        columns=columns
+    )
+    f = {'category': {'$eq': 'comic'}}
+
+    for current_probs in probs:
+
+        da = docs_with_tags(n_i, D, current_probs)
+        with TimeContext(f'indexing {n_i} docs') as t_i:
+            for _batch in da.batch(batch_size=B):
+                idxer.index(_batch)
+
+        times[n_i] = {}
+        times[n_i][current_probs[0]]
+        times[n_i][current_probs[0]]['index'] = t_i.duration
+
+        for n_q in n_query:
+            q_embs = np.random.random([n_q, D]).astype(np.float32)
+            qa = DocumentArray.empty(n_q)
+            qa.embeddings = q_embs
+
+            t_qs = []
+
+            for _ in range(R):
+                with TimeContext(f'searching {n_q} docs') as t_q:
+                    idxer.search(qa, filter=f)
+                t_qs.append(t_q.duration)
+            times[n_i][current_probs[0]][f'query_{n_q}'] = np.mean(t_qs[1:])  # remove warm-up
+
+    idxer.clear()
+    idxer.close()
+
+print('|Stored data| Indexing time | Query size=1 | Query size=8 | Query size=64|')
+print('|---' * (len(list(times.values())[0]) + 1) + '|')
+for k, v in times.items():
+    s = ' | '.join(f'{v[vv]:.3f}' for vv in ['index', 'query_1', 'query_8', 'query_64'])
+    print(f'|{k} | {s}|')

From 976a0c49db85f0e8495c6940c160dec3f52f7066 Mon Sep 17 00:00:00 2001
From: David Buchaca Prats <davidbuchaca@gmail.com>
Date: Thu, 16 Dec 2021 13:18:02 +0100
Subject: [PATCH 2/5] fix: update indexer class in example

---
 examples/benchmark_with_filtering.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/examples/benchmark_with_filtering.py b/examples/benchmark_with_filtering.py
index e0d9cfcc..c701952e 100644
--- a/examples/benchmark_with_filtering.py
+++ b/examples/benchmark_with_filtering.py
@@ -2,8 +2,8 @@
 import numpy as np
 from jina import DocumentArray, Document
 from jina.logging.profile import TimeContext
-from executor import PQLiteIndexer
-from pqlite.filter import Filter
+from pqlite import PQLite
+
 
 n_index = [10_000, 100_000, 500_000, 1_000_000]
 n_index = [5000, 10_000]
@@ -41,8 +41,8 @@ def docs_with_tags(N, D, probs):
 for n_i in n_index:
 
 
-    columns = [ ('category', 'str')]
-    idxer = PQLiteIndexer(
+    columns = [ ('category', str)]
+    idxer = PQLite(
         dim=D,
         initial_size=n_i,
         n_cells=n_cells,
@@ -52,21 +52,19 @@ def docs_with_tags(N, D, probs):
     f = {'category': {'$eq': 'comic'}}
 
     for current_probs in probs:
-
         da = docs_with_tags(n_i, D, current_probs)
         with TimeContext(f'indexing {n_i} docs') as t_i:
             for _batch in da.batch(batch_size=B):
                 idxer.index(_batch)
 
         times[n_i] = {}
-        times[n_i][current_probs[0]]
+        times[n_i][current_probs[0]] ={}
         times[n_i][current_probs[0]]['index'] = t_i.duration
 
         for n_q in n_query:
             q_embs = np.random.random([n_q, D]).astype(np.float32)
             qa = DocumentArray.empty(n_q)
             qa.embeddings = q_embs
-
             t_qs = []
 
             for _ in range(R):

From 5c8856464d46580fbd35a0872b498e381880f072 Mon Sep 17 00:00:00 2001
From: David Buchaca Prats <davidbuchaca@gmail.com>
Date: Thu, 16 Dec 2021 17:35:50 +0100
Subject: [PATCH 3/5] feat: add results per percentage

---
 examples/benchmark_with_filtering.py | 80 ++++++++++++++++++----------
 1 file changed, 51 insertions(+), 29 deletions(-)

diff --git a/examples/benchmark_with_filtering.py b/examples/benchmark_with_filtering.py
index c701952e..64ec0dcd 100644
--- a/examples/benchmark_with_filtering.py
+++ b/examples/benchmark_with_filtering.py
@@ -3,27 +3,35 @@
 from jina import DocumentArray, Document
 from jina.logging.profile import TimeContext
 from pqlite import PQLite
-
+import os
+import shutil
 
 n_index = [10_000, 100_000, 500_000, 1_000_000]
-n_index = [5000, 10_000]
 
 n_query = [1, 8, 64]
 D = 768
 R = 5
-B = 4096
+B = 5000
 n_cells = 1
-probs =[[0.05, 0.95/3, 0.95/3, 0.95/3],
-        [0.10, 0.90/3, 0.90/3, 0.90/3],
+probs =[[0.10, 0.90/3, 0.90/3, 0.90/3],
         [0.30, 0.70/3, 0.70/3, 0.70/3],
-        [0.50, 0.50/3, 0.50/3, 0.50/4],
+        [0.50, 0.50/3, 0.50/3, 0.50/3],
         [0.80, 0.20/3, 0.20/3, 0.20/3]]
 
 times = {}
 
+def clean_workspace():
+    if os.path.exists('./data'):
+        shutil.rmtree('./data')
+
+    if os.path.exists('./workspace'):
+        shutil.rmtree('./workspace')
+
+
 def docs_with_tags(N, D, probs):
     categories = ['comic', 'movie', 'audiobook', 'shoes']
     X = np.random.random((N, D)).astype(np.float32)
+    np.random.seed(123)
     docs = [
         Document(
             id=f'{i}',
@@ -38,32 +46,39 @@ def docs_with_tags(N, D, probs):
 
     return da
 
-for n_i in n_index:
 
+results = {}
 
-    columns = [ ('category', str)]
-    idxer = PQLite(
-        dim=D,
-        initial_size=n_i,
-        n_cells=n_cells,
-        metas={'workspace': './workspace'},
-        columns=columns
-    )
-    f = {'category': {'$eq': 'comic'}}
+for n_i in n_index:
 
+    results[n_i] = {}
     for current_probs in probs:
+        results[n_i][current_probs[0]] = {}
+
+        times = {}
+        clean_workspace()
+        columns = [('category', str)]
+        idxer = PQLite(
+            dim=D,
+            initial_size=n_i,
+            n_cells=n_cells,
+            metas={'workspace': './workspace'},
+            columns=columns
+        )
+        f = {'category': {'$eq': 'comic'}}
+
         da = docs_with_tags(n_i, D, current_probs)
+
         with TimeContext(f'indexing {n_i} docs') as t_i:
-            for _batch in da.batch(batch_size=B):
+            for i, _batch in enumerate(da.batch(batch_size=B)):
                 idxer.index(_batch)
 
-        times[n_i] = {}
-        times[n_i][current_probs[0]] ={}
-        times[n_i][current_probs[0]]['index'] = t_i.duration
+        times[current_probs[0]] = {}
+        times[current_probs[0]]['index'] = t_i.duration
 
         for n_q in n_query:
-            q_embs = np.random.random([n_q, D]).astype(np.float32)
             qa = DocumentArray.empty(n_q)
+            q_embs = np.random.random([n_q, D]).astype(np.float32)
             qa.embeddings = q_embs
             t_qs = []
 
@@ -71,13 +86,20 @@ def docs_with_tags(N, D, probs):
                 with TimeContext(f'searching {n_q} docs') as t_q:
                     idxer.search(qa, filter=f)
                 t_qs.append(t_q.duration)
-            times[n_i][current_probs[0]][f'query_{n_q}'] = np.mean(t_qs[1:])  # remove warm-up
+            times[current_probs[0]][f'query_{n_q}'] = np.mean(t_qs[1:])  # remove warm-up
+
+        results[n_i][current_probs[0]] = times
 
-    idxer.clear()
-    idxer.close()
 
-print('|Stored data| Indexing time | Query size=1 | Query size=8 | Query size=64|')
-print('|---' * (len(list(times.values())[0]) + 1) + '|')
-for k, v in times.items():
-    s = ' | '.join(f'{v[vv]:.3f}' for vv in ['index', 'query_1', 'query_8', 'query_64'])
-    print(f'|{k} | {s}|')
+title = '| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64|'
+print(title)
+print('|-----' * 6 + '|')
+
+for n_i in n_index:
+    times = results[n_i]
+
+    for current_probs in probs:
+        prob = current_probs[0]
+        for k, v in times[prob].items():
+            s = ' | '.join(f'{v[vv]:.3f}' for vv in ['index', 'query_1', 'query_8', 'query_64'])
+            print(f'| {n_i} | {k} | {s} |')

From 4652b21a34473c58c8e8805b7cd46ee7de144d20 Mon Sep 17 00:00:00 2001
From: David Buchaca Prats <davidbuchaca@gmail.com>
Date: Thu, 16 Dec 2021 18:59:18 +0100
Subject: [PATCH 4/5] fix: do not repeat indexing

---
 examples/benchmark_with_filtering.py | 98 +++++++++++++---------------
 1 file changed, 47 insertions(+), 51 deletions(-)

diff --git a/examples/benchmark_with_filtering.py b/examples/benchmark_with_filtering.py
index 64ec0dcd..fc243226 100644
--- a/examples/benchmark_with_filtering.py
+++ b/examples/benchmark_with_filtering.py
@@ -1,10 +1,11 @@
 
-import numpy as np
 from jina import DocumentArray, Document
 from jina.logging.profile import TimeContext
 from pqlite import PQLite
+
 import os
 import shutil
+import numpy as np
 
 n_index = [10_000, 100_000, 500_000, 1_000_000]
 
@@ -13,12 +14,9 @@
 R = 5
 B = 5000
 n_cells = 1
-probs =[[0.10, 0.90/3, 0.90/3, 0.90/3],
-        [0.30, 0.70/3, 0.70/3, 0.70/3],
-        [0.50, 0.50/3, 0.50/3, 0.50/3],
-        [0.80, 0.20/3, 0.20/3, 0.20/3]]
-
-times = {}
+probs =[[0.20, 0.30, 0.50],
+        [0.05, 0.15, 0.80]]
+categories = ['comic', 'movie', 'audiobook']
 
 def clean_workspace():
     if os.path.exists('./data'):
@@ -28,34 +26,33 @@ def clean_workspace():
         shutil.rmtree('./workspace')
 
 
-def docs_with_tags(N, D, probs):
-    categories = ['comic', 'movie', 'audiobook', 'shoes']
-    X = np.random.random((N, D)).astype(np.float32)
-    np.random.seed(123)
-    docs = [
-        Document(
-            id=f'{i}',
-            embedding=X[i],
-            tags={
-                'category': np.random.choice(categories, p=probs),
-            },
-        )
-        for i in range(N)
-    ]
-    da = DocumentArray(docs)
+def docs_with_tags(N, D, probs, categories):
+
+    all_docs = []
+    for k,prob in enumerate(probs):
+        n_current = int(N*prob)
+        X = np.random.random((n_current, D)).astype(np.float32)
 
-    return da
+        docs = [
+            Document(
+                embedding=X[i],
+                tags={
+                    'category': categories[k],
+                },
+            )
+            for i in range(n_current)
+        ]
+        all_docs.extend(docs)
 
+    return DocumentArray(all_docs)
 
-results = {}
 
+results = []
 for n_i in n_index:
 
-    results[n_i] = {}
+    results_ni = []
     for current_probs in probs:
-        results[n_i][current_probs[0]] = {}
 
-        times = {}
         clean_workspace()
         columns = [('category', str)]
         idxer = PQLite(
@@ -65,41 +62,40 @@ def docs_with_tags(N, D, probs):
             metas={'workspace': './workspace'},
             columns=columns
         )
-        f = {'category': {'$eq': 'comic'}}
 
-        da = docs_with_tags(n_i, D, current_probs)
+        da = docs_with_tags(n_i, D, current_probs, categories)
 
         with TimeContext(f'indexing {n_i} docs') as t_i:
             for i, _batch in enumerate(da.batch(batch_size=B)):
                 idxer.index(_batch)
 
-        times[current_probs[0]] = {}
-        times[current_probs[0]]['index'] = t_i.duration
+        for cat,prob in zip(categories, current_probs):
+            f = {'category': {'$eq': cat}}
 
-        for n_q in n_query:
-            qa = DocumentArray.empty(n_q)
-            q_embs = np.random.random([n_q, D]).astype(np.float32)
-            qa.embeddings = q_embs
-            t_qs = []
+            query_times = []
+            for n_q in n_query:
+                qa = DocumentArray.empty(n_q)
+                q_embs = np.random.random([n_q, D]).astype(np.float32)
+                qa.embeddings = q_embs
+                t_qs = []
 
-            for _ in range(R):
-                with TimeContext(f'searching {n_q} docs') as t_q:
-                    idxer.search(qa, filter=f)
-                t_qs.append(t_q.duration)
-            times[current_probs[0]][f'query_{n_q}'] = np.mean(t_qs[1:])  # remove warm-up
+                for _ in range(R):
+                    with TimeContext(f'searching {n_q} docs') as t_q:
+                        idxer.search(qa, filter=f)
+                    t_qs.append(t_q.duration)
+                query_times.append(np.mean(t_qs[1:]))
 
-        results[n_i][current_probs[0]] = times
+            print(f'\n\nprob={prob}, current_probs={current_probs}, n_i={n_i}\n\n')
+            results_ni.append([n_i, prob, t_i.duration] + query_times)
+
+    results.append(results_ni)
 
 
 title = '| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64|'
 print(title)
 print('|-----' * 6 + '|')
-
-for n_i in n_index:
-    times = results[n_i]
-
-    for current_probs in probs:
-        prob = current_probs[0]
-        for k, v in times[prob].items():
-            s = ' | '.join(f'{v[vv]:.3f}' for vv in ['index', 'query_1', 'query_8', 'query_64'])
-            print(f'| {n_i} | {k} | {s} |')
+for block in results:
+    sorted_elements_in_block = np.argsort([b[1] for b in block])
+    for pos in sorted_elements_in_block:
+        res = block[pos]
+        print(''.join([f'| {x:.3f} ' for x in res] + ['|']))

From a345d8ac949133290bf1765fd4ab73911b28b7cd Mon Sep 17 00:00:00 2001
From: David Buchaca Prats <davidbuchaca@gmail.com>
Date: Thu, 16 Dec 2021 22:03:28 +0100
Subject: [PATCH 5/5] docs: update readme with results

---
 README.md | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/README.md b/README.md
index ce508618..8b136933 100644
--- a/README.md
+++ b/README.md
@@ -161,6 +161,34 @@ One can run `executor/benchmark.py` to get a quick performance overview.
 |500000 | 467.936 | 0.046 | 0.356 | 2.823|
 |1000000 | 1025.506 | 0.091 | 0.695 | 5.778|
 
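+Results with filtering, obtained by running `examples/benchmark_with_filtering.py`. The second column is the fraction (between 0 and 1) of stored documents whose `category` tag matches the query filter; rows that share the same indexing time were measured against the same index build: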
+
+| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64|
+|-----|-----|-----|-----|-----|-----|
+| 10000.000 | 0.050 | 2.869 | 0.004 | 0.030 | 0.270 |
+| 10000.000 | 0.150 | 2.869 | 0.004 | 0.035 | 0.294 |
+| 10000.000 | 0.200 | 3.506 | 0.005 | 0.038 | 0.287 |
+| 10000.000 | 0.300 | 3.506 | 0.005 | 0.044 | 0.356 |
+| 10000.000 | 0.500 | 3.506 | 0.008 | 0.064 | 0.484 |
+| 10000.000 | 0.800 | 2.869 | 0.013 | 0.098 | 0.910 |
+| 100000.000 | 0.050 | 75.960 | 0.018 | 0.134 | 1.092 |
+| 100000.000 | 0.150 | 75.960 | 0.026 | 0.211 | 1.736 |
+| 100000.000 | 0.200 | 78.475 | 0.034 | 0.265 | 2.097 |
+| 100000.000 | 0.300 | 78.475 | 0.044 | 0.357 | 2.887 |
+| 100000.000 | 0.500 | 78.475 | 0.068 | 0.565 | 4.383 |
+| 100000.000 | 0.800 | 75.960 | 0.111 | 0.878 | 6.815 |
+| 500000.000 | 0.050 | 497.744 | 0.069 | 0.561 | 4.439 |
+| 500000.000 | 0.150 | 497.744 | 0.134 | 1.064 | 8.469 |
+| 500000.000 | 0.200 | 440.108 | 0.152 | 1.199 | 9.472 |
+| 500000.000 | 0.300 | 440.108 | 0.212 | 1.650 | 13.267 |
+| 500000.000 | 0.500 | 440.108 | 0.328 | 2.637 | 21.961 |
+| 500000.000 | 0.800 | 497.744 | 0.580 | 4.602 | 36.986 |
+| 1000000.000 | 0.050 | 1052.388 | 0.131 | 1.031 | 8.212 |
+| 1000000.000 | 0.150 | 1052.388 | 0.263 | 2.191 | 16.643 |
+| 1000000.000 | 0.200 | 980.598 | 0.351 | 2.659 | 21.193 |
+| 1000000.000 | 0.300 | 980.598 | 0.461 | 3.713 | 29.794 |
+| 1000000.000 | 0.500 | 980.598 | 0.732 | 5.975 | 47.356 |
+| 1000000.000 | 0.800 | 1052.388 | 1.151 | 9.255 | 73.552 |
 
 ## Research foundations of PQLite