Skip to content

Commit

Permalink
allow progressive embedding processing
Browse files (browse the repository at this point in the history)
  • Loading branch information
ubaldus committed Jan 7, 2025
1 parent 8dbd114 commit ffdff15
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 21 deletions.
29 changes: 13 additions & 16 deletions db.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@ func (h *DBHandler) initializeDB() error {
text TEXT NOT NULL,
pow INTEGER DEFAULT 0
)`,
`CREATE TABLE IF NOT EXISTS vectors (
id INTEGER PRIMARY KEY,
embedding BLOB
)`,
`CREATE TABLE IF NOT EXISTS vectors_ann (
id INTEGER PRIMARY KEY,
embedding BLOB
)`,
}

for _, query := range queries {
Expand Down Expand Up @@ -623,32 +631,21 @@ func (h *DBHandler) RebuildHashSearch() error {
return nil
}

func (h *DBHandler) RebuildEmbeddings() error {
func (h *DBHandler) ProcessEmbeddings() error {
batchSize := 1000
offset := 0
totalCount := 0
offset := 0

err := h.db.QueryRow("SELECT COUNT(*) FROM hashes").Scan(&totalCount)
err := h.db.QueryRow("SELECT COUNT(*) FROM hashes WHERE id NOT IN (select id from vectors_ann)").Scan(&totalCount)
if err != nil {
return fmt.Errorf("error getting total count of hashes: %w", err)
}

if _, err := h.db.Exec("DROP TABLE IF EXISTS vectors"); err != nil {
return err
}
if _, err := h.db.Exec("CREATE TABLE vectors (id INTEGER PRIMARY KEY, embedding BLOB)"); err != nil {
return fmt.Errorf("error creating vectors table: %w", err)
}
if _, err := h.db.Exec("DROP TABLE IF EXISTS vectors_ann"); err != nil {
return err
}
if _, err := h.db.Exec("CREATE TABLE vectors_ann (id INTEGER PRIMARY KEY, embedding BLOB)"); err != nil {
return fmt.Errorf("error creating vectors_ann table: %w", err)
}
log.Printf("Pending embeddings: %d", totalCount)

startTime := time.Now()
for {
rows, err := h.db.Query(`SELECT id, hash, text FROM hashes LIMIT ? OFFSET ?`, batchSize, offset)
rows, err := h.db.Query(`SELECT id, hash, text FROM hashes WHERE id NOT IN (select id from vectors_ann) LIMIT ?`, batchSize)
if err != nil {
return fmt.Errorf("error querying hashes: %w", err)
}
Expand Down
1 change: 0 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ go 1.22.5
toolchain go1.22.10

require (
github.com/asg017/sqlite-vec-go-bindings v0.1.6
github.com/mattn/go-sqlite3 v1.14.24
github.com/openai/openai-go v0.1.0-alpha.39
golang.org/x/net v0.33.0
Expand Down
2 changes: 0 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
github.com/asg017/sqlite-vec-go-bindings v0.1.6 h1:Nx0jAzyS38XpkKznJ9xQjFXz2X9tI7KqjwVxV8RNoww=
github.com/asg017/sqlite-vec-go-bindings v0.1.6/go.mod h1:A8+cTt/nKFsYCQF6OgzSNpKZrzNo5gQsXBTfsXHXY0Q=
github.com/mattn/go-sqlite3 v1.14.24 h1:tpSp2G2KyMnnQu99ngJ47EIkWVmliIizyZBfPrBWDRM=
github.com/mattn/go-sqlite3 v1.14.24/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
github.com/openai/openai-go v0.1.0-alpha.39 h1:FvoNWy7BPhA0TjGOK5huRGU5sAUEx2jeubLXz34K9LE=
Expand Down
4 changes: 2 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import (
"os"
)

const Version = "0.1.2"
const Version = "0.1.3"

type Config struct {
importPath string //https://dumps.wikimedia.org/other/enterprise_html/runs/...
Expand Down Expand Up @@ -121,7 +121,7 @@ func main() {
}
}
if options.aiEmbeddingSync {
if err := db.RebuildEmbeddings(); err != nil {
if err := db.ProcessEmbeddings(); err != nil {
log.Fatalf("Error processing embeddings: %v\n", err)
}
}
Expand Down

0 comments on commit ffdff15

Please sign in to comment.