-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Chore/14 implement embeddings, loaders, indexes (#15)
- Loading branch information
Showing
20 changed files
with
2,446 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
package document | ||
|
||
// Document is a piece of text together with free-form metadata about it.
type Document struct {
	// Content is the document text; it is serialized under the "content" JSON key.
	Content string `json:"content"`
	// Metadata holds arbitrary key/value data attached to the document; it is
	// serialized under the "metadata" JSON key.
	Metadata map[string]interface{} `json:"metadata"`
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package embedder | ||
|
||
// Embedding is the result of an embedding operation.
type Embedding struct {
	// Embedding is the vector itself; it is serialized under the "embedding" JSON key.
	Embedding []float32 `json:"embedding"`
	// Index is serialized under the "index" JSON key.
	// NOTE(review): presumably the position of the corresponding input in the
	// request — confirm against the producer of this value.
	Index int `json:"index"`
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
package openaiembedder | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"os" | ||
|
||
"github.com/henomis/lingoose/document" | ||
"github.com/henomis/lingoose/embedder" | ||
"github.com/sashabaranov/go-openai" | ||
) | ||
|
||
// Model identifies which OpenAI embedding model an OpenAIEmbedder uses.
//
// Embed converts a Model directly to openai.EmbeddingModel, so the numeric
// value of each constant is significant: do not reorder or insert entries
// without checking the target type.
// NOTE(review): presumably this list mirrors the go-openai constants one for
// one — confirm against the vendored go-openai version.
type Model int

// Supported embedding models. Unknown is the zero value.
const (
	Unknown Model = iota
	AdaSimilarity
	BabbageSimilarity
	CurieSimilarity
	DavinciSimilarity
	AdaSearchDocument
	AdaSearchQuery
	BabbageSearchDocument
	BabbageSearchQuery
	CurieSearchDocument
	CurieSearchQuery
	DavinciSearchDocument
	DavinciSearchQuery
	AdaCodeSearchCode
	AdaCodeSearchText
	BabbageCodeSearchCode
	BabbageCodeSearchText
	AdaEmbeddingV2
)
|
||
type OpenAIEmbedder struct { | ||
openAIClient *openai.Client | ||
model Model | ||
} | ||
|
||
func New(model Model) (*OpenAIEmbedder, error) { | ||
openAIKey := os.Getenv("OPENAI_API_KEY") | ||
if openAIKey == "" { | ||
return nil, fmt.Errorf("OPENAI_API_KEY not set") | ||
} | ||
|
||
return &OpenAIEmbedder{ | ||
openAIClient: openai.NewClient(openAIKey), | ||
model: model, | ||
}, nil | ||
} | ||
|
||
func (t *OpenAIEmbedder) Embed(ctx context.Context, docs []document.Document) ([]embedder.Embedding, error) { | ||
|
||
input := []string{} | ||
for _, doc := range docs { | ||
input = append(input, doc.Content) | ||
} | ||
|
||
resp, err := t.openAIClient.CreateEmbeddings( | ||
ctx, | ||
openai.EmbeddingRequest{ | ||
Input: input, | ||
Model: openai.EmbeddingModel(t.model), | ||
}, | ||
) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
var embeddings []embedder.Embedding | ||
|
||
for _, obj := range resp.Data { | ||
embeddings = append(embeddings, embedder.Embedding{ | ||
Embedding: obj.Embedding, | ||
Index: obj.Index, | ||
}) | ||
} | ||
|
||
return embeddings, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
package main | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"os" | ||
|
||
openaiembedder "github.com/henomis/lingoose/embedder/openai" | ||
"github.com/henomis/lingoose/index" | ||
"github.com/henomis/lingoose/llm/openai" | ||
"github.com/henomis/lingoose/loader" | ||
"github.com/henomis/lingoose/prompt" | ||
"github.com/henomis/lingoose/textsplitter" | ||
pineconego "github.com/henomis/pinecone-go" | ||
pineconerequest "github.com/henomis/pinecone-go/request" | ||
pineconeresponse "github.com/henomis/pinecone-go/response" | ||
) | ||
|
||
func main() { | ||
|
||
openaiEmbedder, err := openaiembedder.New(openaiembedder.AdaEmbeddingV2) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
pineconeApiKey := os.Getenv("PINECONE_API_KEY") | ||
if pineconeApiKey == "" { | ||
panic("PINECONE_API_KEY is not set") | ||
} | ||
|
||
pineconeEnvironment := os.Getenv("PINECONE_ENVIRONMENT") | ||
if pineconeEnvironment == "" { | ||
panic("PINECONE_ENVIRONMENT is not set") | ||
} | ||
|
||
pineconeClient := pineconego.New(pineconeEnvironment, pineconeApiKey) | ||
|
||
whoamiReq := &pineconerequest.Whoami{} | ||
whoamiResp := &pineconeresponse.Whoami{} | ||
|
||
err = pineconeClient.Whoami(context.Background(), whoamiReq, whoamiResp) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
pineconeIndex, err := index.NewPinecone("test", whoamiResp.ProjectID, openaiEmbedder) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
indexSize, err := pineconeIndex.Size() | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
if indexSize == 0 { | ||
loader, err := loader.NewDirectoryLoader(".", ".txt") | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
documents, err := loader.Load() | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
textSplitter := textsplitter.NewRecursiveCharacterTextSplitter(1000, 20, nil, nil) | ||
|
||
documentChunks := textSplitter.SplitDocuments(documents) | ||
|
||
for _, doc := range documentChunks { | ||
fmt.Println(doc.Content) | ||
fmt.Println("----------") | ||
fmt.Println(doc.Metadata) | ||
fmt.Println("----------") | ||
fmt.Println() | ||
|
||
} | ||
|
||
err = pineconeIndex.LoadFromDocuments(context.Background(), documentChunks) | ||
if err != nil { | ||
panic(err) | ||
} | ||
} | ||
|
||
query := "What is the purpose of the NATO Alliance?" | ||
topk := 3 | ||
similarities, err := pineconeIndex.SimilaritySearch( | ||
context.Background(), | ||
query, | ||
&topk, | ||
) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
for _, similarity := range similarities { | ||
fmt.Printf("Similarity: %f\n", similarity.Score) | ||
fmt.Printf("Document: %s\n", similarity.Document.Content) | ||
fmt.Println("Metadata: ", similarity.Document.Metadata) | ||
fmt.Println("----------") | ||
} | ||
|
||
llmOpenAI, err := openai.New(openai.GPT3TextDavinci003, openai.DefaultOpenAITemperature, openai.DefaultOpenAIMaxTokens, true) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
prompt1, err := prompt.NewPromptTemplate( | ||
"Based on the following context answer to the question.\n\nContext:\n{{.context}}\n\nQuestion: {{.query}}", | ||
map[string]string{ | ||
"query": query, | ||
"context": similarities[0].Document.Content, | ||
}, | ||
) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
err = prompt1.Format(nil) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
llmOpenAI.Completion(prompt1.Prompt()) | ||
|
||
} |
Oops, something went wrong.