Chore/14 implement embeddings, loaders, indexes (#15)

henomis · Apr 26, 2023 · b4073ea · b4073ea
1 parent 4ad4d76
commit b4073ea
Show file tree

Hide file tree

Showing 20 changed files with 2,446 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@
 here below an image from docs/assets/img/lingoose.png
 
 # Overview
-**LinGoose** is a powerful Go framework for developing Large Language Model (LLM) based applications using pipelines. It is designed to be a complete solution and provides multiple components, including Prompts, Templates, Chat, Output Decoders, LLM, Pipelines, and Memory. With **LinGoose**, you can interact with LLM AI through prompts and generate complex templates. Additionally, it includes a chat feature, allowing you to create chatbots. The Output Decoders component enables you to extract specific information from the output of the LLM, while the LLM interface allows you to send prompts to various AI, such as the ones provided by OpenAI. You can chain multiple LLM steps together using Pipelines and store the output of each step in Memory for later retrieval.
+**LinGoose** is a powerful Go framework for developing Large Language Model (LLM) based applications using pipelines. It is designed to be a complete solution and provides multiple components, including Prompts, Templates, Chat, Output Decoders, LLM, Pipelines, and Memory. With **LinGoose**, you can interact with LLM AI through prompts and generate complex templates. Additionally, it includes a chat feature, allowing you to create chatbots. The Output Decoders component enables you to extract specific information from the output of the LLM, while the LLM interface allows you to send prompts to various AI, such as the ones provided by OpenAI. You can chain multiple LLM steps together using Pipelines and store the output of each step in Memory for later retrieval. **LinGoose** also includes a Document component, which is used to store text, and a Loader component, which is used to load Documents from various sources. Finally, it includes TextSplitters, which are used to split text or Documents into multiple parts, Embedders, which are used to embed text or Documents into embeddings, and Indexes, which are used to store embeddings and documents and to perform searches.
 
 # Components
 **LinGoose** is composed of multiple components, each one with its own purpose.
@@ -25,6 +25,11 @@ here below an image from docs/assets/img/lingoose.png
 |**LLMs** |[llm/openai](llm/openai/) | LLM is an interface to various AI such as the ones provided by OpenAI. It is responsible for sending the prompt to the AI and retrieving the output. |
 |**Pipelines** | [pipeline](pipeline/)|Pipelines are used to chain multiple LLM steps together. |
 |**Memory** | [memory/ram](memory/ram/)|Memory is used to store the output of each step. It can be used to retrieve the output of a previous step. |
+|**Document** | [document](document/)|Document is used to store text. |
+|**Loaders** | [loader](loader/)|Loaders are used to load Documents from various sources. |
+|**TextSplitters**| [textsplitter](textsplitter/)|TextSplitters are used to split text or Documents into multiple parts. |
+|**Embedders** | [embedder](embedder/)|Embedders are used to embed text or Documents into embeddings. |
+|**Indexes**| [index](index/)|Indexes are used to store embeddings and documents and to perform searches. |
 
 # Usage
 

diff --git a/docs/index.html b/docs/index.html
@@ -107,6 +107,31 @@ <h1 id="components">Components</h1>
                 <td><a href="../memory/ram/">memory/ram</a></td>
                 <td>Memory is used to store the output of each step. It can be used to retrieve the output of a previous step.</td>
                 </tr>
+                <tr>
+                <td><strong>Document</strong></td>
+                <td><a href="../document/">document</a></td>
+                <td>Document is used to store text.</td>
+                </tr>
+                <tr>
+                <td><strong>Loaders</strong></td>
+                <td><a href="../loader/">loader</a></td>
+                <td>Loaders are used to load Documents from various sources.</td>
+                </tr>
+                <tr>
+                <td><strong>TextSplitters</strong></td>
+                <td><a href="../textsplitter/">textsplitter</a></td>
+                <td>TextSplitters are used to split text or Documents into multiple parts.</td>
+                </tr>
+                <tr>
+                <td><strong>Embedders</strong></td>
+                <td><a href="../embedder/">embedder</a></td>
+                <td>Embedders are used to embed text or Documents into embeddings.</td>
+                </tr>
+                <tr>
+                <td><strong>Indexes</strong></td>
+                <td><a href="../index/">index</a></td>
+                <td>Indexes are used to store embeddings and documents and to perform searches.</td>
+                </tr>
                 </tbody></table>
                 <br>
                 <h1 id="usage">Usage</h1>

diff --git a/document/document.go b/document/document.go
@@ -0,0 +1,6 @@
+package document
+
+// Document is a piece of text (Content) together with a free-form Metadata map.
+type Document struct {
+	// Content is the text of the document; it is serialized under the JSON key "content".
+	Content  string                 `json:"content"`
+	// Metadata holds arbitrary key/value data; it is serialized under the JSON key "metadata".
+	Metadata map[string]interface{} `json:"metadata"`
+}
diff --git a/embedder/embedding.go b/embedder/embedding.go
@@ -0,0 +1,7 @@
+package embedder
+
+// Embedding is the result of an embedding operation.
+type Embedding struct {
+	// Embedding is the vector itself, stored as float32 components.
+	Embedding []float32 `json:"embedding"`
+	// Index is an integer carried alongside the vector. The OpenAI embedder copies
+	// it from the Index of each API response item; presumably it is the position
+	// of the corresponding input — TODO confirm.
+	Index     int       `json:"index"`
+}
diff --git a/embedder/openai/openai.go b/embedder/openai/openai.go
@@ -0,0 +1,81 @@
+package openaiembedder
+
+import (
+	"context"
+	"fmt"
+	"os"
+
+	"github.com/henomis/lingoose/document"
+	"github.com/henomis/lingoose/embedder"
+	"github.com/sashabaranov/go-openai"
+)
+
+// Model identifies the OpenAI embedding model an OpenAIEmbedder uses.
+//
+// NOTE(review): Embed converts a Model directly with openai.EmbeddingModel(...),
+// so the numeric values below presumably have to line up with go-openai's own
+// enumeration — confirm before reordering, inserting or removing constants.
+type Model int
+
+const (
+	// Unknown is the zero value of Model.
+	Unknown Model = iota
+	AdaSimilarity
+	BabbageSimilarity
+	CurieSimilarity
+	DavinciSimilarity
+	AdaSearchDocument
+	AdaSearchQuery
+	BabbageSearchDocument
+	BabbageSearchQuery
+	CurieSearchDocument
+	CurieSearchQuery
+	DavinciSearchDocument
+	DavinciSearchQuery
+	AdaCodeSearchCode
+	AdaCodeSearchText
+	BabbageCodeSearchCode
+	BabbageCodeSearchText
+	AdaEmbeddingV2
+)
+
+// OpenAIEmbedder creates embeddings through an OpenAI API client using a fixed Model.
+type OpenAIEmbedder struct {
+	// openAIClient is the client Embed uses to call CreateEmbeddings.
+	openAIClient *openai.Client
+	// model is the Model sent with every embedding request.
+	model        Model
+}
+
+// New builds an OpenAIEmbedder for the given model. The API key is read from
+// the OPENAI_API_KEY environment variable; when that variable is empty or
+// unset, New returns a nil embedder and an error.
+func New(model Model) (*OpenAIEmbedder, error) {
+	apiKey := os.Getenv("OPENAI_API_KEY")
+	if len(apiKey) == 0 {
+		return nil, fmt.Errorf("OPENAI_API_KEY not set")
+	}
+
+	e := &OpenAIEmbedder{model: model}
+	e.openAIClient = openai.NewClient(apiKey)
+
+	return e, nil
+}
+
+// Embed sends the Content of every document in docs to the OpenAI embeddings
+// endpoint in a single CreateEmbeddings call and converts each item of the
+// response into an embedder.Embedding, copying over its vector and its index.
+// An error reported by the client is returned unchanged, with a nil slice.
+func (t *OpenAIEmbedder) Embed(ctx context.Context, docs []document.Document) ([]embedder.Embedding, error) {
+	contents := make([]string, len(docs))
+	for i := range docs {
+		contents[i] = docs[i].Content
+	}
+
+	request := openai.EmbeddingRequest{
+		Input: contents,
+		Model: openai.EmbeddingModel(t.model),
+	}
+
+	response, err := t.openAIClient.CreateEmbeddings(ctx, request)
+	if err != nil {
+		return nil, err
+	}
+
+	// Left nil on purpose: an empty response yields a nil slice.
+	var result []embedder.Embedding
+	for i := range response.Data {
+		item := response.Data[i]
+		result = append(result, embedder.Embedding{
+			Embedding: item.Embedding,
+			Index:     item.Index,
+		})
+	}
+
+	return result, nil
+}
diff --git a/examples/embeddings/pinecone/main.go b/examples/embeddings/pinecone/main.go
@@ -0,0 +1,127 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"os"
+
+	openaiembedder "github.com/henomis/lingoose/embedder/openai"
+	"github.com/henomis/lingoose/index"
+	"github.com/henomis/lingoose/llm/openai"
+	"github.com/henomis/lingoose/loader"
+	"github.com/henomis/lingoose/prompt"
+	"github.com/henomis/lingoose/textsplitter"
+	pineconego "github.com/henomis/pinecone-go"
+	pineconerequest "github.com/henomis/pinecone-go/request"
+	pineconeresponse "github.com/henomis/pinecone-go/response"
+)
+
+// main is an end-to-end example of the embedder, loader, text splitter and
+// index components:
+//
+//  1. it creates an OpenAI embedder and a Pinecone index named "test", using
+//     PINECONE_API_KEY and PINECONE_ENVIRONMENT from the environment;
+//  2. if the index reports a size of zero, it loads the ".txt" files found
+//     under the current directory, splits them into chunks, prints the chunks
+//     and loads them into the index;
+//  3. it runs a similarity search for a fixed query, prints the matches and
+//     asks the LLM to answer the query using the first match as context.
+//
+// Every error is fatal and reported with panic. If the search returns no
+// match, the program prints a message and exits without querying the LLM.
+func main() {
+	ctx := context.Background()
+
+	openaiEmbedder, err := openaiembedder.New(openaiembedder.AdaEmbeddingV2)
+	if err != nil {
+		panic(err)
+	}
+
+	pineconeAPIKey := os.Getenv("PINECONE_API_KEY")
+	if pineconeAPIKey == "" {
+		panic("PINECONE_API_KEY is not set")
+	}
+
+	pineconeEnvironment := os.Getenv("PINECONE_ENVIRONMENT")
+	if pineconeEnvironment == "" {
+		panic("PINECONE_ENVIRONMENT is not set")
+	}
+
+	pineconeClient := pineconego.New(pineconeEnvironment, pineconeAPIKey)
+
+	// The whoami call provides the project ID needed to address the index.
+	whoamiReq := &pineconerequest.Whoami{}
+	whoamiResp := &pineconeresponse.Whoami{}
+
+	err = pineconeClient.Whoami(ctx, whoamiReq, whoamiResp)
+	if err != nil {
+		panic(err)
+	}
+
+	pineconeIndex, err := index.NewPinecone("test", whoamiResp.ProjectID, openaiEmbedder)
+	if err != nil {
+		panic(err)
+	}
+
+	indexSize, err := pineconeIndex.Size()
+	if err != nil {
+		panic(err)
+	}
+
+	// Only populate the index the first time, when it is still empty.
+	if indexSize == 0 {
+		// Named directoryLoader so the imported loader package is not shadowed.
+		directoryLoader, err := loader.NewDirectoryLoader(".", ".txt")
+		if err != nil {
+			panic(err)
+		}
+
+		documents, err := directoryLoader.Load()
+		if err != nil {
+			panic(err)
+		}
+
+		textSplitter := textsplitter.NewRecursiveCharacterTextSplitter(1000, 20, nil, nil)
+
+		documentChunks := textSplitter.SplitDocuments(documents)
+
+		for _, doc := range documentChunks {
+			fmt.Println(doc.Content)
+			fmt.Println("----------")
+			fmt.Println(doc.Metadata)
+			fmt.Println("----------")
+			fmt.Println()
+		}
+
+		err = pineconeIndex.LoadFromDocuments(ctx, documentChunks)
+		if err != nil {
+			panic(err)
+		}
+	}
+
+	query := "What is the purpose of the NATO Alliance?"
+	topk := 3
+	similarities, err := pineconeIndex.SimilaritySearch(ctx, query, &topk)
+	if err != nil {
+		panic(err)
+	}
+
+	// The prompt below uses the first match as context; without this guard an
+	// empty result would make similarities[0] panic with an index out of range.
+	if len(similarities) == 0 {
+		fmt.Println("No similar documents found for the query.")
+		return
+	}
+
+	for _, similarity := range similarities {
+		fmt.Printf("Similarity: %f\n", similarity.Score)
+		fmt.Printf("Document: %s\n", similarity.Document.Content)
+		fmt.Println("Metadata: ", similarity.Document.Metadata)
+		fmt.Println("----------")
+	}
+
+	llmOpenAI, err := openai.New(openai.GPT3TextDavinci003, openai.DefaultOpenAITemperature, openai.DefaultOpenAIMaxTokens, true)
+	if err != nil {
+		panic(err)
+	}
+
+	prompt1, err := prompt.NewPromptTemplate(
+		"Based on the following context answer to the question.\n\nContext:\n{{.context}}\n\nQuestion: {{.query}}",
+		map[string]string{
+			"query":   query,
+			"context": similarities[0].Document.Content,
+		},
+	)
+	if err != nil {
+		panic(err)
+	}
+
+	err = prompt1.Format(nil)
+	if err != nil {
+		panic(err)
+	}
+
+	// NOTE(review): whatever Completion returns is discarded here — confirm
+	// whether it reports an error that this example should check.
+	llmOpenAI.Completion(prompt1.Prompt())
+}