Skip to content

Commit 7876e3d

Browse files
authored
Merge pull request #13 from bgokden/refactor
Fix known bugs
2 parents a010aaf + c459380 commit 7876e3d

File tree

6 files changed

+23
-21
lines changed

6 files changed

+23
-21
lines changed

README.md

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ Veri also supports creating sub sample spaces of data by default.
1515

1616
Veri works as a cluster that holds a Vector Space with a fixed dimension. It allows easy k-nearest-neighbour search queries, as well as querying a sample space to be used in a machine learning algorithm.
1717

18-
Veri is currently in Alpha Stage
18+
Veri is currently in Beta Stage
1919

2020
*Veri means data in Turkish.*
2121

@@ -25,7 +25,7 @@ In machine learning, data scientist usually convert data into a feature label ve
2525

2626
I have worked in different roles as a Data Engineer, Data scientist and a Software Developer. In many projects, I wanted a scalable approach to vector space search which is not available. I wanted to optimise the data ingestion and data querying into one tool.
2727

28-
Veri is meant to be scale. Each Veri instance tries to synchronise its data with other peers and keep a statistically identical subset of the general vector space.
28+
Veri is meant to scale. Each Veri instance tries to synchronise its data with other peers and keep a statistically identical subset of the general vector space.
2929

3030
## What does statistically identical mean?
3131

@@ -34,11 +34,12 @@ Every instance continue, exchanging data as long as their average and histogram
3434

3535
## Knn querying
3636

37-
Veri internally has a kd-tree, but it also queries its neighbours and merges the result. It is very similar to map-reduce process done on the fly without planning.
37+
Veri has an internal key-value store, but it also queries its neighbours and merges the results.
38+
It is very similar to a map-reduce process done on the fly without planning.
3839

39-
When a knn query is stated, veri creates a unique id,
40+
When a knn query is started, Veri creates a unique hash,
4041
Starts a timer,
41-
Then do a local kd-tree search,
42+
Then does a local knn search,
4243
Then calls its peers to do the same with a smaller timeout,
4344
Merges results into a map,
4445
Waits for the timeout and then does a refine process on the result map,
@@ -50,19 +51,14 @@ Every knn query has a timeout and timeout defines the precision of the result. U
5051

5152
## High Availability
5253

53-
Veri has a different way of approaching high availability.
54-
Veri as a cluster try to use all the memory it is allowed to use.
55-
If there is enough memory, all the data is replicated to every instance.
56-
If there is not enough memory, data is split within instances using histogram balancing.
57-
If memory is nearly full, Veri will reject insertion requests.
58-
So if you want more high availability, use more instances.
59-
Currently, it is recommended to use another database for long term storage. Usually vector spaces, change over time and only the original data is kept. So I didn't implement a direct backend into it. Instead, you can regularly insert new data and evict old data. So you will keep your vector space up to date. Veri can respond queries while data being inserted or deleted, unlike most knn search systems.
54+
Veri replicates the data to its peers periodically, and data is persisted to disk so that it survives crashes.
6055

6156
TODO:
62-
- Add Dump data function to allow machine learning algorithms to get a Sample Space.
63-
- Add Query Caching and Return Cached Result instead of rejecting result.
64-
- Add Internal classification endpoint.
57+
- Test multinode synchronization
6558
- Authentication.
6659
- Documentation.
6760

61+
### Note:
62+
Veri uses [badger](https://github.com/dgraph-io/badger) internally. Many functions are made possible thanks to badger.
63+
6864
Contact me for any questions: berkgokden@gmail.com

data/aggregator.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66

77
"github.com/bgokden/go-cache"
88
pb "github.com/bgokden/veri/veriservice"
9+
"github.com/jinzhu/copier"
910
)
1011

1112
type AggregatorInterface interface {
@@ -48,7 +49,7 @@ func (a *Aggregator) IsNewScoredBetter(old, new float64) bool {
4849
}
4950

5051
func (a *Aggregator) BestScore(scoredDatum *pb.ScoredDatum) float64 {
51-
if a.Context != nil || len(a.Context.GetDatum()) > 0 {
52+
if a.Context != nil && len(a.Context.GetDatum()) > 0 {
5253
var isSet = false
5354
var current float64
5455
// When Context is prioritized, the search score is ignored.
@@ -102,7 +103,10 @@ func (a *Aggregator) Insert(scoredDatum *pb.ScoredDatum) error {
102103
aGroupAggregator := aGroupAggregatorInterface.(AggregatorInterface)
103104
return aGroupAggregator.Insert(scoredDatum)
104105
} else {
105-
aGroupAggregator := NewAggrator(a.Config, false, nil)
106+
var aConfig pb.SearchConfig
107+
copier.Copy(&aConfig, a.Config)
108+
aConfig.Limit = a.Config.GroupLimit // TODO: find a better solution.
109+
aGroupAggregator := NewAggrator(&aConfig, false, nil)
106110
a.DeDuplicationMap.Set(keyString, aGroupAggregator, cache.NoExpiration)
107111
return aGroupAggregator.Insert(scoredDatum)
108112
}

data/data.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ func NewPreData(config *pb.DataConfig, dataPath string) *Data {
6666
func (dt *Data) InitData() error {
6767
log.Printf("Init Data %v\n", dt.Config)
6868
if dt.Initialized == false {
69-
db, err := badger.Open(badger.DefaultOptions(dt.DBPath))
69+
options := badger.DefaultOptions(dt.DBPath).WithKeepL0InMemory(true)
70+
db, err := badger.Open(options)
7071
if err != nil {
7172
return err
7273
}

data/search.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -198,18 +198,15 @@ func GetSearchKey(datum *pb.Datum, config *pb.SearchConfig) string {
198198
func (dt *Data) AggregatedSearch(datum *pb.Datum, scoredDatumStreamOutput chan<- *pb.ScoredDatum, upperWaitGroup *sync.WaitGroup, config *pb.SearchConfig) error {
199199
duration := time.Duration(config.Timeout) * time.Millisecond
200200
timeLimit := time.After(duration)
201-
// log.Printf("DatumKey: %v\n", datum.GetKey())
202201
queryKey := GetSearchKey(datum, config)
203202
if dt.QueryCache == nil {
204203
dt.InitData()
205204
}
206205
if result, ok := dt.QueryCache.Get(queryKey); ok {
207-
log.Printf("Found in cache\n")
208206
cachedResult := result.([]*pb.ScoredDatum)
209207
for _, i := range cachedResult {
210208
scoredDatumStreamOutput <- i
211209
}
212-
log.Printf("Returned in cache\n")
213210
if upperWaitGroup != nil {
214211
upperWaitGroup.Done()
215212
}
@@ -271,6 +268,7 @@ func (dt *Data) AggregatedSearch(datum *pb.Datum, scoredDatumStreamOutput chan<-
271268
}
272269
cacheDuration := time.Duration(config.CacheDuration) * time.Second
273270
dt.QueryCache.Set(queryKey, result, cacheDuration)
271+
dt.QueryCache.IncrementExpiration(queryKey, cacheDuration)
274272
log.Printf("AggregatedSearch: finished. Cache Duration: %v\n", cacheDuration)
275273
return nil
276274
}

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ require (
88
github.com/dgraph-io/badger/v2 v2.2007.2
99
github.com/golang/protobuf v1.4.1
1010
github.com/gorilla/mux v1.7.2
11+
github.com/jinzhu/copier v0.0.0-20201025035756-632e723a6687
1112
github.com/magneticio/go-common v0.0.1
1213
github.com/mitchellh/go-homedir v1.1.0
1314
github.com/pkg/errors v0.8.1

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ github.com/imdario/mergo v0.3.7 h1:Y+UAYTZ7gDEuOfhxKWy+dvb5dRQ6rJjFSdX2HZY1/gI=
141141
github.com/imdario/mergo v0.3.7/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA=
142142
github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM=
143143
github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
144+
github.com/jinzhu/copier v0.0.0-20201025035756-632e723a6687 h1:bWXum+xWafUxxJpcXnystwg5m3iVpPYtrGJFc1rjfLc=
145+
github.com/jinzhu/copier v0.0.0-20201025035756-632e723a6687/go.mod h1:24xnZezI2Yqac9J61UC6/dG/k76ttpq0DdJI3QmUvro=
144146
github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo=
145147
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
146148
github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=

0 commit comments

Comments
 (0)