Skip to content

Commit

Permalink
add kgn algorithm (#506)
Browse files Browse the repository at this point in the history
* add kgn algorithm

* Update README.md

* Remove redundant get_memory_usage method from kgn class
  • Loading branch information
Henry-yan authored Jun 17, 2024
1 parent fcdf494 commit da31331
Show file tree
Hide file tree
Showing 5 changed files with 213 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ jobs:
- glass
- hnswlib
- kdtree
- kgn
- luceneknn
- milvus
- mrpt
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ Evaluated
* [RediSearch](https://github.com/redisearch/redisearch) ![https://img.shields.io/github/stars/redisearch/redisearch?style=social](https://img.shields.io/github/stars/redisearch/redisearch?style=social)
* [pg_embedding](https://github.com/neondatabase/pg_embedding) ![https://img.shields.io/github/stars/neondatabase/pg_embedding?style=social](https://img.shields.io/github/stars/neondatabase/pg_embedding?style=social)
* [Descartes(01AI)](https://github.com/xiaoming-01ai/descartes)
* [kgn](https://github.com/Henry-yan/kgn)

Data sets
=========
Expand Down
9 changes: 9 additions & 0 deletions ann_benchmarks/algorithms/kgn/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Image for the kgn wrapper, layered on top of the shared ann-benchmarks image.
FROM ann-benchmarks

# Refresh the package index and install in ONE layer: with separate RUN lines
# a cached `update` layer can be reused together with a changed `install`
# line and resolve against a stale index. apt-get is used instead of apt
# because apt's command-line interface is not intended for scripts. The
# package lists are removed afterwards since no later step installs packages.
RUN apt-get update \
    && apt-get install -y git cmake g++ python3 python3-setuptools python3-pip libblas-dev liblapack-dev \
    && rm -rf /var/lib/apt/lists/*
RUN pip3 install wheel pybind11 faiss-cpu

WORKDIR /home/app
RUN git clone https://github.com/Henry-yan/kgn.git
# NOTE(review): the wheel is tagged cp310 / linux_x86_64, so this step only
# works while the base image ships CPython 3.10 on x86_64 - confirm when the
# base image is upgraded.
RUN pip3 install kgn/pykgn-1.0.0-cp310-cp310-linux_x86_64.whl
39 changes: 39 additions & 0 deletions ann_benchmarks/algorithms/kgn/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Benchmark definitions for the Kgn wrapper (ann_benchmarks.algorithms.kgn).
# Every key under `args` is read by name in Kgn.__init__ (method_param[...]).
float:
  euclidean:
  # base_args supply the first two constructor parameters: Kgn(metric, dim, ...).
  - base_args: ['@metric','@dimension']
    constructor: Kgn
    disabled: false
    docker_tag: ann-benchmarks-kgn
    module: ann_benchmarks.algorithms.kgn
    name: kgn
    run_groups:
      Kgn:
        args:
          # L and R are forwarded to kgn.Index(..., R=R, L=L) when the index is built.
          L: 100
          R: 50
          # First argument of kgn.Index; also part of the cached index file name.
          index_type : "KGN"
          # optimize: whether searcher.optimize is called after the searcher is made.
          optimize : true
          # batch: true -> searcher.optimize(), false -> searcher.optimize(1).
          batch : false
          # kmeans_type 0 skips the k-means entry-point step; kmeans_ep is only
          # used (as the recursion depth) when kmeans_type is 2.
          kmeans_ep: 0
          kmeans_type: 0
          # level 1 -> "SQ8U" searcher, level 2 -> "SQ4U" searcher.
          # Presumably the framework expands this list into one run per value
          # - TODO confirm.
          level: [1,2]
        # Presumably each value is handed to Kgn.set_query_arguments as `ef`
        # - TODO confirm against the ann-benchmarks runner.
        query_args: [[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115]]
  angular:
  # "angular" is mapped to the "IP" metric by metric_mapping, and the data and
  # queries are L2-normalised by the wrapper.
  - base_args: ['@metric','@dimension']
    constructor: Kgn
    disabled: false
    docker_tag: ann-benchmarks-kgn
    module: ann_benchmarks.algorithms.kgn
    name: kgn
    run_groups:
      Kgn:
        args:
          # L and R are forwarded to kgn.Index(..., R=R, L=L) when the index is built.
          L: 500
          R: 96
          # First argument of kgn.Index; also part of the cached index file name.
          index_type : "NSG"
          # optimize: whether searcher.optimize is called after the searcher is made.
          optimize : true
          # batch: true -> searcher.optimize(), false -> searcher.optimize(1).
          batch : false
          # kmeans_type 0 skips the k-means entry-point step; kmeans_ep is only
          # used (as the recursion depth) when kmeans_type is 2.
          kmeans_ep: 0
          kmeans_type: 0
          # level 1 -> "SQ8U" searcher, level 2 -> "SQ4U" searcher.
          # Presumably the framework expands this list into one run per value
          # - TODO confirm.
          level: [1,2]
        # Presumably each value is handed to Kgn.set_query_arguments as `ef`
        # - TODO confirm against the ann-benchmarks runner.
        query_args: [[10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 200, 300, 400, 500]]
163 changes: 163 additions & 0 deletions ann_benchmarks/algorithms/kgn/module.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import os
from collections import defaultdict
from time import time

# Third-party imports keep their original relative order so the load order of
# the native extensions (pykgn, faiss) is unchanged.
import psutil
from sklearn import preprocessing
import pykgn as kgn
import numpy as np
import faiss
from faiss import Kmeans

from ..base.module import BaseANN


class EPSearcher:
    """Base class for entry-point searchers.

    Stores the data set and an entry-point value; subclasses are expected to
    provide the actual ``search`` implementation.
    """

    def __init__(self, data: np.ndarray, cur_ep: int) -> None:
        """Keep references to ``data`` and ``cur_ep`` without copying them."""
        self.data, self.cur_ep = data, cur_ep

    def search(self, query: np.ndarray) -> int:
        """Not implemented here; always raises ``NotImplementedError``."""
        raise NotImplementedError()

class EPSearcherKmeans_re(EPSearcher):
    """Pick entry points by recursively bisecting the data with 2-means.

    The data is split into two clusters, each cluster is split again, and so
    on for ``max_deep`` levels. The centroids found at every level are
    collected, and each centroid is then replaced by the id of its nearest
    row in ``data`` (exact L2 search). Those ids are exposed via ``get_cent``.
    """

    def __init__(self, data: np.ndarray, cur_ep: int, max_deep: int, metric) -> None:
        """Run the recursive clustering and resolve centroids to data ids.

        Args:
            data: training vectors; ``data.shape[1]`` is used as the
                dimensionality (assumes a float32 2-D array as faiss
                expects - TODO confirm with callers).
            cur_ep: stored on the instance by the base class; not used here.
            max_deep: number of recursion levels, must be at least 1.
            metric: accepted for interface compatibility but not used; the
                nearest-row lookup below always uses an L2 index.

        Raises:
            ValueError: if ``max_deep`` is smaller than 1 (the recursion has
                no base case for such values).
        """
        if max_deep < 1:
            raise ValueError(f"max_deep must be >= 1, got {max_deep}")
        super().__init__(data, cur_ep)

        # depth -> list of centroid rows (as Python lists) found at that depth.
        self.centers = defaultdict(list)
        # Called for its side effect of filling self.centers; the concatenated
        # return value is not needed here.
        self.recursive_kmeans_centers(data, 2, max_deep)

        # Flatten from the top level (max_deep) down to the leaves (1).
        ordered = [
            centroid
            for depth in range(max_deep, 0, -1)
            for centroid in self.centers[depth]
        ]
        final = np.asarray(ordered, dtype='float32').reshape(len(ordered), -1)

        # Map every centroid to its single nearest data row.
        raw_index = faiss.IndexFlatL2(data.shape[1])
        raw_index.add(data)
        _, self.RI = raw_index.search(final, 1)

    def recursive_kmeans_centers(self, data, num_clustters, max_deep):
        """Cluster ``data`` and recurse into every cluster.

        Centroids of each level are appended to ``self.centers[max_deep]``.

        Args:
            data: vectors to cluster at this level.
            num_clustters: number of clusters per split.
            max_deep: remaining depth; 1 means this is the last split.

        Returns:
            The centroids of this level followed by those of all deeper
            levels, concatenated along the first axis.
        """
        if max_deep == 1:
            # Leaf level: unlike the inner levels no explicit seed is passed.
            kmeans = faiss.Kmeans(d=data.shape[1], k=num_clustters, verbose=False)
            kmeans.train(data)
            self.centers[max_deep].extend(kmeans.centroids.tolist())
            return kmeans.centroids

        kmeans = faiss.Kmeans(data.shape[1], num_clustters, seed=123, verbose=False)
        kmeans.train(data)
        _, labels = kmeans.index.search(data, 1)
        labels = labels.reshape(-1)

        centers = kmeans.centroids
        self.centers[max_deep].extend(centers.tolist())

        # NOTE(review): a cluster with very few members is passed on as-is;
        # presumably faiss rejects training sets smaller than the cluster
        # count - confirm whether that needs handling for small data sets.
        collected = [centers]
        for i in range(num_clustters):
            collected.append(
                self.recursive_kmeans_centers(data[labels == i], num_clustters, max_deep - 1)
            )
        # One concatenate at the end instead of re-copying inside the loop.
        return np.concatenate(collected)

    def get_cent(self, ) -> np.ndarray:
        """Return the nearest-data-row ids computed in ``__init__``."""
        return self.RI

def metric_mapping(metric):
    """Translate an ann-benchmarks metric name into the KGN metric name.

    "angular" becomes "IP" and "euclidean" becomes "L2"; any other value
    raises ``ValueError``.
    """
    known_metrics = {"angular": "IP", "euclidean": "L2"}
    if metric not in known_metrics:
        raise ValueError(f"The specified metric type '{metric}' is not recognized or supported by KGN.")
    return known_metrics[metric]

class Kgn(BaseANN):
    """ann-benchmarks wrapper around the ``pykgn`` graph index.

    ``fit`` builds the graph (or reuses a previously saved one from the
    ``indices`` directory), wraps it in a quantized ``kgn.Searcher`` and
    optionally optimizes it; the query methods then delegate to that searcher.
    """

    # Quantizer name handed to kgn.Searcher for each supported ``level``.
    _QUANTIZERS = {1: "SQ8U", 2: "SQ4U"}

    def __init__(self, metric, dim, method_param):
        """Store the build parameters.

        Args:
            metric: "angular" or "euclidean"; translated by ``metric_mapping``
                (raises ``ValueError`` for anything else).
            dim: data dimensionality; only used in the index file name.
            method_param: mapping with the keys ``R``, ``L``, ``index_type``,
                ``optimize``, ``batch``, ``kmeans_ep``, ``kmeans_type`` and
                ``level``.
        """
        self.metric = metric_mapping(metric)
        self.R = method_param['R']
        self.L = method_param['L']
        self.index_type = method_param['index_type']
        self.optimize = method_param['optimize']
        self.batch = method_param['batch']
        self.kmeans_ep = method_param['kmeans_ep']
        self.kmeans_type = method_param['kmeans_type']
        self.level = method_param['level']
        # One-element tuple so the whole mapping is always formatted as a
        # single value.
        self.name = 'kgn_(%s)' % (method_param,)
        self.dir = 'indices'
        # NOTE(review): the file name does not identify the data set, so two
        # data sets with the same metric/dim/parameters would share a cached
        # index if the ``indices`` directory persists between runs - confirm.
        self.path = f'{metric}_{dim}_{self.index_type}_R_{self.R}_L_{self.L}.kgn'

    def fit(self, X):
        """Build (or load) the graph for ``X`` and create the searcher.

        Raises:
            ValueError: if ``level`` is not one of the supported values. The
                check runs before any index is built; previously the failure
                only surfaced later as a missing ``searcher`` attribute.
        """
        print(self.name, self.level, self.metric)
        try:
            quantizer = self._QUANTIZERS[self.level]
        except (KeyError, TypeError):
            raise ValueError(
                f"Unsupported level {self.level!r}; expected one of {sorted(self._QUANTIZERS)}"
            ) from None

        if self.metric == "IP":
            X = preprocessing.normalize(X, "l2", axis=1)
        self.d = X.shape[1]

        index_file = os.path.join(self.dir, self.path)
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.dir, exist_ok=True)
        if not os.path.exists(index_file):
            print("build Index")
            p = kgn.Index(self.index_type, dim=self.d,
                          metric=self.metric, R=self.R, L=self.L)
            # The meaning of the literal 20 is not visible here (presumably a
            # thread count - TODO confirm against the pykgn API).
            g = p.build(X, 20)
            g.save(index_file)
            del p
            del g

        # find kmeans centers
        # NOTE(review): the entry points computed for kmeans_type == 2 are
        # never handed to the searcher below; the computation is kept so the
        # timing output is unchanged - confirm whether it should be wired in.
        if self.kmeans_type == 0:
            pass
        elif self.kmeans_type == 2:
            t = time()
            EPSearcherKmeans_re(X, 0, self.kmeans_ep, self.metric)
            T = time() - t
            print("Time of bi_kmeans = ", T, " k=", self.kmeans_ep)
        else:
            print("Error: no such kmeans algorithm in main_opt.py")
        print("kmeans_ep", self.kmeans_ep)

        # The graph is always reloaded from disk, whether it was just built
        # or already cached.
        g = kgn.Graph()
        g.load(index_file)
        self.searcher = kgn.Searcher(g, X, self.metric, quantizer, 20)
        print("Make Searcher")

        # Only levels 1 and 2 reach this point, so the former ``level <= 4``
        # guard was always true and its else branch unreachable.
        if self.optimize:
            if self.batch:
                self.searcher.optimize()
            else:
                self.searcher.optimize(1)
        print("Optimize Parameters")

    def set_query_arguments(self, ef):
        """Forward ``ef`` to the searcher and remember it on the instance."""
        self.searcher.set_ef(ef)
        self.ef = ef

    def prepare_query(self, q, n):
        """Store the query ``q`` and result count ``n`` for the next search.

        For the "IP" metric the query is scaled to unit length. A zero-norm
        query is left unchanged instead of being divided by zero (which would
        fill it with NaNs).
        """
        if self.metric == 'IP':
            norm = np.linalg.norm(q)
            if norm > 0:
                q = q / norm
        self.q = q
        self.n = n

    def run_prepared_query(self):
        """Run the search for the query stored by ``prepare_query``."""
        # Both former ``level`` branches issued exactly this call.
        self.res = self.searcher.search(self.q, self.n)

    def get_prepared_query_results(self):
        """Return the result of the last ``run_prepared_query`` call."""
        return self.res

    def freeIndex(self):
        """Drop the reference to the searcher."""
        del self.searcher

0 comments on commit da31331

Please sign in to comment.