diff --git a/README.md b/README.md index ce508618..8b136933 100644 --- a/README.md +++ b/README.md @@ -161,6 +161,34 @@ One can run `executor/benchmark.py` to get a quick performance overview. |500000 | 467.936 | 0.046 | 0.356 | 2.823| |1000000 | 1025.506 | 0.091 | 0.695 | 5.778| +Results with filtering from `examples/benchmark_with_filtering.py` + +| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64| +|-----|-----|-----|-----|-----|-----| +| 10000.000 | 0.050 | 2.869 | 0.004 | 0.030 | 0.270 | +| 10000.000 | 0.150 | 2.869 | 0.004 | 0.035 | 0.294 | +| 10000.000 | 0.200 | 3.506 | 0.005 | 0.038 | 0.287 | +| 10000.000 | 0.300 | 3.506 | 0.005 | 0.044 | 0.356 | +| 10000.000 | 0.500 | 3.506 | 0.008 | 0.064 | 0.484 | +| 10000.000 | 0.800 | 2.869 | 0.013 | 0.098 | 0.910 | +| 100000.000 | 0.050 | 75.960 | 0.018 | 0.134 | 1.092 | +| 100000.000 | 0.150 | 75.960 | 0.026 | 0.211 | 1.736 | +| 100000.000 | 0.200 | 78.475 | 0.034 | 0.265 | 2.097 | +| 100000.000 | 0.300 | 78.475 | 0.044 | 0.357 | 2.887 | +| 100000.000 | 0.500 | 78.475 | 0.068 | 0.565 | 4.383 | +| 100000.000 | 0.800 | 75.960 | 0.111 | 0.878 | 6.815 | +| 500000.000 | 0.050 | 497.744 | 0.069 | 0.561 | 4.439 | +| 500000.000 | 0.150 | 497.744 | 0.134 | 1.064 | 8.469 | +| 500000.000 | 0.200 | 440.108 | 0.152 | 1.199 | 9.472 | +| 500000.000 | 0.300 | 440.108 | 0.212 | 1.650 | 13.267 | +| 500000.000 | 0.500 | 440.108 | 0.328 | 2.637 | 21.961 | +| 500000.000 | 0.800 | 497.744 | 0.580 | 4.602 | 36.986 | +| 1000000.000 | 0.050 | 1052.388 | 0.131 | 1.031 | 8.212 | +| 1000000.000 | 0.150 | 1052.388 | 0.263 | 2.191 | 16.643 | +| 1000000.000 | 0.200 | 980.598 | 0.351 | 2.659 | 21.193 | +| 1000000.000 | 0.300 | 980.598 | 0.461 | 3.713 | 29.794 | +| 1000000.000 | 0.500 | 980.598 | 0.732 | 5.975 | 47.356 | +| 1000000.000 | 0.800 | 1052.388 | 1.151 | 9.255 | 73.552 | ## Research foundations of PQLite diff --git a/examples/benchmark_with_filtering.py b/examples/benchmark_with_filtering.py new file 
# Patch header carried over from the collapsed diff this script was embedded in:
# mode 100644 index 00000000..fc243226 --- /dev/null +++ b/examples/benchmark_with_filtering.py @@ -0,0 +1,101 @@
"""Benchmark PQLite indexing and filtered search, printing a markdown table.

For every index size in ``n_index`` and every category distribution in
``probs`` the script wipes the local workspace, builds a fresh ``PQLite``
index over random embeddings tagged with a ``category``, and then times
``search`` with a ``{'category': {'$eq': <cat>}}`` filter for each query
batch size in ``n_query``.  One table row is emitted per
(index size, category fraction) pair.
"""
import os
import shutil

import numpy as np
from jina import Document, DocumentArray
from jina.logging.profile import TimeContext
from pqlite import PQLite

n_index = [10_000, 100_000, 500_000, 1_000_000]  # index sizes to benchmark

n_query = [1, 8, 64]  # query batch sizes (one table column each)
D = 768  # embedding dimensionality
R = 5  # search repetitions per query size (expected to be >= 1)
B = 5000  # batch size used while indexing
n_cells = 1  # forwarded to PQLite(n_cells=...)
# Each inner list gives the fraction of documents assigned to each entry of
# ``categories`` (same position) for one benchmark run.
probs = [
    [0.20, 0.30, 0.50],
    [0.05, 0.15, 0.80],
]
categories = ['comic', 'movie', 'audiobook']


def clean_workspace():
    """Delete the ``./data`` and ``./workspace`` directories if they exist."""
    for path in ('./data', './workspace'):
        if os.path.exists(path):
            shutil.rmtree(path)


def docs_with_tags(N, D, probs, categories):
    """Build documents with random embeddings, each tagged with a category.

    :param N: nominal total number of documents.
    :param D: embedding dimensionality.
    :param probs: fraction of ``N`` to create for each category, by position.
    :param categories: category names; ``categories[k]`` pairs with ``probs[k]``.
    :return: a ``DocumentArray`` holding the groups in category order.
    """
    all_docs = []
    for k, prob in enumerate(probs):
        # round() instead of a bare int(): float products such as 100 * 0.29
        # (28.999999999999996) would otherwise be truncated one document short.
        n_current = int(round(N * prob))
        X = np.random.random((n_current, D)).astype(np.float32)
        all_docs.extend(
            Document(embedding=row, tags={'category': categories[k]})
            for row in X
        )

    return DocumentArray(all_docs)


# results[j] collects the rows for n_index[j]; each row is
# [n_docs, category_fraction, indexing_seconds, *mean_query_seconds].
results = []
for n_i in n_index:

    results_ni = []
    for current_probs in probs:

        # Start every run from an empty on-disk workspace.
        clean_workspace()
        columns = [('category', str)]
        idxer = PQLite(
            dim=D,
            initial_size=n_i,
            n_cells=n_cells,
            metas={'workspace': './workspace'},
            columns=columns,
        )

        da = docs_with_tags(n_i, D, current_probs, categories)

        with TimeContext(f'indexing {n_i} docs') as t_i:
            for _batch in da.batch(batch_size=B):
                idxer.index(_batch)

        for cat, prob in zip(categories, current_probs):
            f = {'category': {'$eq': cat}}

            query_times = []
            for n_q in n_query:
                qa = DocumentArray.empty(n_q)
                q_embs = np.random.random([n_q, D]).astype(np.float32)
                qa.embeddings = q_embs
                t_qs = []

                for _ in range(R):
                    with TimeContext(f'searching {n_q} docs') as t_q:
                        idxer.search(qa, filter=f)
                    # duration is read after the timing context has exited
                    t_qs.append(t_q.duration)
                # The first repetition is left out of the average (presumably
                # a warm-up run); fall back to it when it is the only sample
                # so that R == 1 does not average an empty slice into NaN.
                query_times.append(np.mean(t_qs[1:] or t_qs))

            print(f'\n\nprob={prob}, current_probs={current_probs}, n_i={n_i}\n\n')
            results_ni.append([n_i, prob, t_i.duration] + query_times)

    results.append(results_ni)


title = '| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64|'
print(title)
print('|-----' * 6 + '|')
for block in results:
    # Rows of one index size are listed by ascending category fraction;
    # sorted() is stable, so equal fractions keep their measurement order.
    for res in sorted(block, key=lambda row: row[1]):
        print(''.join([f'| {x:.3f} ' for x in res] + ['|']))