From 7bd0fd6d67398111b348557a6a3e15379877e559 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Thu, 16 Dec 2021 13:04:13 +0100 Subject: [PATCH 1/5] feat: add benchmark with filtering --- examples/benchmark_with_filtering.py | 85 ++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 examples/benchmark_with_filtering.py diff --git a/examples/benchmark_with_filtering.py b/examples/benchmark_with_filtering.py new file mode 100644 index 00000000..e0d9cfcc --- /dev/null +++ b/examples/benchmark_with_filtering.py @@ -0,0 +1,85 @@ + +import numpy as np +from jina import DocumentArray, Document +from jina.logging.profile import TimeContext +from executor import PQLiteIndexer +from pqlite.filter import Filter + +n_index = [10_000, 100_000, 500_000, 1_000_000] +n_index = [5000, 10_000] + +n_query = [1, 8, 64] +D = 768 +R = 5 +B = 4096 +n_cells = 1 +probs =[[0.05, 0.95/3, 0.95/3, 0.95/3], + [0.10, 0.90/3, 0.90/3, 0.90/3], + [0.30, 0.70/3, 0.70/3, 0.70/3], + [0.50, 0.50/3, 0.50/3, 0.50/4], + [0.80, 0.20/3, 0.20/3, 0.20/3]] + +times = {} + +def docs_with_tags(N, D, probs): + categories = ['comic', 'movie', 'audiobook', 'shoes'] + X = np.random.random((N, D)).astype(np.float32) + docs = [ + Document( + id=f'{i}', + embedding=X[i], + tags={ + 'category': np.random.choice(categories, p=probs), + }, + ) + for i in range(N) + ] + da = DocumentArray(docs) + + return da + +for n_i in n_index: + + + columns = [ ('category', 'str')] + idxer = PQLiteIndexer( + dim=D, + initial_size=n_i, + n_cells=n_cells, + metas={'workspace': './workspace'}, + columns=columns + ) + f = {'category': {'$eq': 'comic'}} + + for current_probs in probs: + + da = docs_with_tags(n_i, D, current_probs) + with TimeContext(f'indexing {n_i} docs') as t_i: + for _batch in da.batch(batch_size=B): + idxer.index(_batch) + + times[n_i] = {} + times[n_i][current_probs[0]] + times[n_i][current_probs[0]]['index'] = t_i.duration + + for n_q in n_query: + q_embs = np.random.random([n_q, D]).astype(np.float32) + qa = DocumentArray.empty(n_q) + qa.embeddings = q_embs + + t_qs = [] + + for _ in range(R): + with TimeContext(f'searching {n_q} docs') as t_q: + idxer.search(qa, filter=f) + t_qs.append(t_q.duration) + times[n_i][current_probs[0]][f'query_{n_q}'] = np.mean(t_qs[1:]) # remove warm-up + + idxer.clear() + idxer.close() + +print('|Stored data| Indexing time | Query size=1 | Query size=8 | Query size=64|') +print('|---' * (len(list(times.values())[0]) + 1) + '|') +for k, v in times.items(): + s = ' | '.join(f'{v[vv]:.3f}' for vv in ['index', 'query_1', 'query_8', 'query_64']) + print(f'|{k} | {s}|') From 976a0c49db85f0e8495c6940c160dec3f52f7066 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Thu, 16 Dec 2021 13:18:02 +0100 Subject: [PATCH 2/5] fix: update indexer class in example --- examples/benchmark_with_filtering.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/examples/benchmark_with_filtering.py b/examples/benchmark_with_filtering.py index e0d9cfcc..c701952e 100644 --- a/examples/benchmark_with_filtering.py +++ b/examples/benchmark_with_filtering.py @@ -2,8 +2,8 @@ import numpy as np from jina import DocumentArray, Document from jina.logging.profile import TimeContext -from executor import PQLiteIndexer -from pqlite.filter import Filter +from pqlite import PQLite + n_index = [10_000, 100_000, 500_000, 1_000_000] n_index = [5000, 10_000] @@ -41,8 +41,8 @@ def docs_with_tags(N, D, probs): for n_i in n_index: - columns = [ ('category', 'str')] - idxer = PQLiteIndexer( + columns = [ ('category', str)] + idxer = PQLite( dim=D, initial_size=n_i, n_cells=n_cells, @@ -52,21 +52,19 @@ def docs_with_tags(N, D, probs): f = {'category': {'$eq': 'comic'}} for current_probs in probs: - da = docs_with_tags(n_i, D, current_probs) with TimeContext(f'indexing {n_i} docs') as t_i: for _batch in da.batch(batch_size=B): idxer.index(_batch) times[n_i] = {} - times[n_i][current_probs[0]] + times[n_i][current_probs[0]] ={} times[n_i][current_probs[0]]['index'] = t_i.duration for n_q in n_query: q_embs = np.random.random([n_q, D]).astype(np.float32) qa = DocumentArray.empty(n_q) qa.embeddings = q_embs - t_qs = [] for _ in range(R): From 5c8856464d46580fbd35a0872b498e381880f072 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Thu, 16 Dec 2021 17:35:50 +0100 Subject: [PATCH 3/5] feat: add results per percentage --- examples/benchmark_with_filtering.py | 80 ++++++++++++++++++---------- 1 file changed, 51 insertions(+), 29 deletions(-) diff --git a/examples/benchmark_with_filtering.py b/examples/benchmark_with_filtering.py index c701952e..64ec0dcd 100644 --- a/examples/benchmark_with_filtering.py +++ b/examples/benchmark_with_filtering.py @@ -3,27 +3,35 @@ from jina import DocumentArray, Document from jina.logging.profile import TimeContext from pqlite import PQLite - +import os +import shutil n_index = [10_000, 100_000, 500_000, 1_000_000] -n_index = [5000, 10_000] n_query = [1, 8, 64] D = 768 R = 5 -B = 4096 +B = 5000 n_cells = 1 -probs =[[0.05, 0.95/3, 0.95/3, 0.95/3], - [0.10, 0.90/3, 0.90/3, 0.90/3], +probs =[[0.10, 0.90/3, 0.90/3, 0.90/3], [0.30, 0.70/3, 0.70/3, 0.70/3], - [0.50, 0.50/3, 0.50/3, 0.50/4], + [0.50, 0.50/3, 0.50/3, 0.50/3], [0.80, 0.20/3, 0.20/3, 0.20/3]] times = {} +def clean_workspace(): + if os.path.exists('./data'): + shutil.rmtree('./data') + + if os.path.exists('./workspace'): + shutil.rmtree('./workspace') + + def docs_with_tags(N, D, probs): categories = ['comic', 'movie', 'audiobook', 'shoes'] X = np.random.random((N, D)).astype(np.float32) + np.random.seed(123) docs = [ Document( id=f'{i}', @@ -38,32 +46,39 @@ def docs_with_tags(N, D, probs): return da -for n_i in n_index: +results = {} - columns = [ ('category', str)] - idxer = PQLite( - dim=D, - initial_size=n_i, - n_cells=n_cells, - metas={'workspace': './workspace'}, - columns=columns - ) - f = {'category': {'$eq': 'comic'}} +for n_i in n_index: + results[n_i] = {} for current_probs in probs: + results[n_i][current_probs[0]] = {} + + times = {} + clean_workspace() + columns = [('category', str)] + idxer = PQLite( + dim=D, + initial_size=n_i, + n_cells=n_cells, + metas={'workspace': './workspace'}, + columns=columns + ) + f = {'category': {'$eq': 'comic'}} + da = docs_with_tags(n_i, D, current_probs) + with TimeContext(f'indexing {n_i} docs') as t_i: - for _batch in da.batch(batch_size=B): + for i, _batch in enumerate(da.batch(batch_size=B)): idxer.index(_batch) - times[n_i] = {} - times[n_i][current_probs[0]] ={} - times[n_i][current_probs[0]]['index'] = t_i.duration + times[current_probs[0]] = {} + times[current_probs[0]]['index'] = t_i.duration for n_q in n_query: - q_embs = np.random.random([n_q, D]).astype(np.float32) qa = DocumentArray.empty(n_q) + q_embs = np.random.random([n_q, D]).astype(np.float32) qa.embeddings = q_embs t_qs = [] @@ -71,13 +86,20 @@ def docs_with_tags(N, D, probs): with TimeContext(f'searching {n_q} docs') as t_q: idxer.search(qa, filter=f) t_qs.append(t_q.duration) - times[n_i][current_probs[0]][f'query_{n_q}'] = np.mean(t_qs[1:]) # remove warm-up + times[current_probs[0]][f'query_{n_q}'] = np.mean(t_qs[1:]) # remove warm-up + + results[n_i][current_probs[0]] = times - idxer.clear() - idxer.close() -print('|Stored data| Indexing time | Query size=1 | Query size=8 | Query size=64|') -print('|---' * (len(list(times.values())[0]) + 1) + '|') -for k, v in times.items(): - s = ' | '.join(f'{v[vv]:.3f}' for vv in ['index', 'query_1', 'query_8', 'query_64']) - print(f'|{k} | {s}|') +title = '| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64|' +print(title) +print('|-----' * 6 + '|') + +for n_i in n_index: + times = results[n_i] + + for current_probs in probs: + prob = current_probs[0] + for k, v in times[prob].items(): + s = ' | '.join(f'{v[vv]:.3f}' for vv in ['index', 'query_1', 'query_8', 'query_64']) + print(f'| {n_i} | {k} | {s} |') From 4652b21a34473c58c8e8805b7cd46ee7de144d20 Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Thu, 16 Dec 2021 18:59:18 +0100 Subject: [PATCH 4/5] fix: do not repeat indexing --- examples/benchmark_with_filtering.py | 98 +++++++++++++--------------- 1 file changed, 47 insertions(+), 51 deletions(-) diff --git a/examples/benchmark_with_filtering.py b/examples/benchmark_with_filtering.py index 64ec0dcd..fc243226 100644 --- a/examples/benchmark_with_filtering.py +++ b/examples/benchmark_with_filtering.py @@ -1,10 +1,11 @@ -import numpy as np from jina import DocumentArray, Document from jina.logging.profile import TimeContext from pqlite import PQLite + import os import shutil +import numpy as np n_index = [10_000, 100_000, 500_000, 1_000_000] @@ -13,12 +14,9 @@ R = 5 B = 5000 n_cells = 1 -probs =[[0.10, 0.90/3, 0.90/3, 0.90/3], - [0.30, 0.70/3, 0.70/3, 0.70/3], - [0.50, 0.50/3, 0.50/3, 0.50/3], - [0.80, 0.20/3, 0.20/3, 0.20/3]] - -times = {} +probs =[[0.20, 0.30, 0.50], + [0.05, 0.15, 0.80]] +categories = ['comic', 'movie', 'audiobook'] def clean_workspace(): if os.path.exists('./data'): @@ -28,34 +26,33 @@ def clean_workspace(): shutil.rmtree('./workspace') -def docs_with_tags(N, D, probs): - categories = ['comic', 'movie', 'audiobook', 'shoes'] - X = np.random.random((N, D)).astype(np.float32) - np.random.seed(123) - docs = [ - Document( - id=f'{i}', - embedding=X[i], - tags={ - 'category': np.random.choice(categories, p=probs), - }, - ) - for i in range(N) - ] - da = DocumentArray(docs) +def docs_with_tags(N, D, probs, categories): + + all_docs = [] + for k,prob in enumerate(probs): + n_current = int(N*prob) + X = np.random.random((n_current, D)).astype(np.float32) - return da + docs = [ + Document( + embedding=X[i], + tags={ + 'category': categories[k], + }, + ) + for i in range(n_current) + ] + all_docs.extend(docs) + return DocumentArray(all_docs) -results = {} +results = [] for n_i in n_index: - results[n_i] = {} + results_ni = [] for current_probs in probs: - results[n_i][current_probs[0]] = {} - times = {} clean_workspace() columns = [('category', str)] idxer = PQLite( @@ -65,41 +62,40 @@ def docs_with_tags(N, D, probs): metas={'workspace': './workspace'}, columns=columns ) - f = {'category': {'$eq': 'comic'}} - da = docs_with_tags(n_i, D, current_probs) + da = docs_with_tags(n_i, D, current_probs, categories) with TimeContext(f'indexing {n_i} docs') as t_i: for i, _batch in enumerate(da.batch(batch_size=B)): idxer.index(_batch) - times[current_probs[0]] = {} - times[current_probs[0]]['index'] = t_i.duration + for cat,prob in zip(categories, current_probs): + f = {'category': {'$eq': cat}} - for n_q in n_query: - qa = DocumentArray.empty(n_q) - q_embs = np.random.random([n_q, D]).astype(np.float32) - qa.embeddings = q_embs - t_qs = [] + query_times = [] + for n_q in n_query: + qa = DocumentArray.empty(n_q) + q_embs = np.random.random([n_q, D]).astype(np.float32) + qa.embeddings = q_embs + t_qs = [] - for _ in range(R): - with TimeContext(f'searching {n_q} docs') as t_q: - idxer.search(qa, filter=f) - t_qs.append(t_q.duration) - times[current_probs[0]][f'query_{n_q}'] = np.mean(t_qs[1:]) # remove warm-up + for _ in range(R): + with TimeContext(f'searching {n_q} docs') as t_q: + idxer.search(qa, filter=f) + t_qs.append(t_q.duration) + query_times.append(np.mean(t_qs[1:])) - results[n_i][current_probs[0]] = times + print(f'\n\nprob={prob}, current_probs={current_probs}, n_i={n_i}\n\n') + results_ni.append([n_i, prob, t_i.duration] + query_times) + + results.append(results_ni) title = '| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64|' print(title) print('|-----' * 6 + '|') - -for n_i in n_index: - times = results[n_i] - - for current_probs in probs: - prob = current_probs[0] - for k, v in times[prob].items(): - s = ' | '.join(f'{v[vv]:.3f}' for vv in ['index', 'query_1', 'query_8', 'query_64']) - print(f'| {n_i} | {k} | {s} |') +for block in results: + sorted_elements_in_block = np.argsort([b[1] for b in block]) + for pos in sorted_elements_in_block: + res = block[pos] + print(''.join([f'| {x:.3f} ' for x in res] + ['|'])) From a345d8ac949133290bf1765fd4ab73911b28b7cd Mon Sep 17 00:00:00 2001 From: David Buchaca Prats Date: Thu, 16 Dec 2021 22:03:28 +0100 Subject: [PATCH 5/5] docs: update readme with results --- README.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/README.md b/README.md index ce508618..8b136933 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,34 @@ One can run `executor/benchmark.py` to get a quick performance overview. |500000 | 467.936 | 0.046 | 0.356 | 2.823| |1000000 | 1025.506 | 0.091 | 0.695 | 5.778| +Results with filtering from `examples/benchmark_with_filtering.py` + +| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64| +|-----|-----|-----|-----|-----|-----| +| 10000.000 | 0.050 | 2.869 | 0.004 | 0.030 | 0.270 | +| 10000.000 | 0.150 | 2.869 | 0.004 | 0.035 | 0.294 | +| 10000.000 | 0.200 | 3.506 | 0.005 | 0.038 | 0.287 | +| 10000.000 | 0.300 | 3.506 | 0.005 | 0.044 | 0.356 | +| 10000.000 | 0.500 | 3.506 | 0.008 | 0.064 | 0.484 | +| 10000.000 | 0.800 | 2.869 | 0.013 | 0.098 | 0.910 | +| 100000.000 | 0.050 | 75.960 | 0.018 | 0.134 | 1.092 | +| 100000.000 | 0.150 | 75.960 | 0.026 | 0.211 | 1.736 | +| 100000.000 | 0.200 | 78.475 | 0.034 | 0.265 | 2.097 | +| 100000.000 | 0.300 | 78.475 | 0.044 | 0.357 | 2.887 | +| 100000.000 | 0.500 | 78.475 | 0.068 | 0.565 | 4.383 | +| 100000.000 | 0.800 | 75.960 | 0.111 | 0.878 | 6.815 | +| 500000.000 | 0.050 | 497.744 | 0.069 | 0.561 | 4.439 | +| 500000.000 | 0.150 | 497.744 | 0.134 | 1.064 | 8.469 | +| 500000.000 | 0.200 | 440.108 | 0.152 | 1.199 | 9.472 | +| 500000.000 | 0.300 | 440.108 | 0.212 | 1.650 | 13.267 | +| 500000.000 | 0.500 | 440.108 | 0.328 | 2.637 | 21.961 | +| 500000.000 | 0.800 | 497.744 | 0.580 | 4.602 | 36.986 | +| 1000000.000 | 0.050 | 1052.388 | 0.131 | 1.031 | 8.212 | +| 1000000.000 | 0.150 | 1052.388 | 0.263 | 2.191 | 16.643 | +| 1000000.000 | 0.200 | 980.598 | 0.351 | 2.659 | 21.193 | +| 1000000.000 | 0.300 | 980.598 | 0.461 | 3.713 | 29.794 | +| 1000000.000 | 0.500 | 980.598 | 0.732 | 5.975 | 47.356 | +| 1000000.000 | 0.800 | 1052.388 | 1.151 | 9.255 | 73.552 | ## Research foundations of PQLite