From 693c6bf1ee22a10227a5b9086dfc3073b5fd8cec Mon Sep 17 00:00:00 2001
From: Ubaldo Porcheddu
Date: Sun, 12 Jan 2025 23:21:58 +0000
Subject: [PATCH] allow downloading database and gguf from huggingface

---
 .gitignore |   1 +
 main.go    |   8 ++-
 setup.go   | 203 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 211 insertions(+), 1 deletion(-)
 create mode 100644 setup.go

diff --git a/.gitignore b/.gitignore
index f7b423a..4ed326c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ wikilite
 wikilite.exe
 wikilite.db*
 dist/
+*.gguf
diff --git a/main.go b/main.go
index bfdbf29..45f7844 100644
--- a/main.go
+++ b/main.go
@@ -10,7 +10,7 @@ import (
 	"os"
 )
 
-const Version = "0.3.0"
+const Version = "0.4.0"
 
 type Config struct {
 	aiApiKey string
@@ -23,6 +23,7 @@ type Config struct {
 	limit int
 	log bool
 	logFile string
+	setup bool
 	web bool
 	webHost string
 	webPort int
@@ -52,6 +53,7 @@ func parseConfig() (*Config, error) {
 	flag.IntVar(&options.limit, "limit", 5, "Maximum number of search results")
 	flag.BoolVar(&options.log, "log", false, "Enable logging")
 	flag.StringVar(&options.logFile, "log-file", "", "Log file path")
+	flag.BoolVar(&options.setup, "setup", false, "Download a ready made database and embeddings model")
 
 	flag.BoolVar(&options.web, "web", false, "Enable web interface")
 	flag.StringVar(&options.webHost, "web-host", "localhost", "Web server host")
@@ -86,6 +88,10 @@ func main() {
 		os.Exit(0)
 	}
 
+	if options.setup {
+		Setup()
+	}
+
 	if options.log || options.logFile != "" {
 		if options.logFile != "" {
 			logFile, err := os.OpenFile(options.logFile, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
diff --git a/setup.go b/setup.go
new file mode 100644
index 0000000..c15317e
--- /dev/null
+++ b/setup.go
@@ -0,0 +1,203 @@
+// Copyright (C) 2025 by Ubaldo Porcheddu
+
+package main
+
+import (
+	"bufio"
+	"compress/gzip"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"strings"
+)
+
+// SetupSibling is one file entry in the Hugging Face dataset listing.
+type SetupSibling struct {
+	Rfilename string `json:"rfilename"`
+}
+
+// SetupDatasetInfo is the subset of the Hugging Face dataset API response
+// that Setup needs: the list of files stored in the dataset repository.
+type SetupDatasetInfo struct {
+	Siblings []SetupSibling `json:"siblings"`
+}
+
+// setupHTTPGet issues a GET request and returns the response only when the
+// server answers 200 OK, so an error page is never mistaken for a payload.
+// The caller must close the returned body.
+func setupHTTPGet(url string) (*http.Response, error) {
+	resp, err := http.Get(url)
+	if err != nil {
+		return nil, err
+	}
+	if resp.StatusCode != http.StatusOK {
+		resp.Body.Close()
+		return nil, fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
+	}
+	return resp, nil
+}
+
+// setupWriteFile streams src into outputPath. Data goes to a temporary
+// ".part" file that is renamed only after a complete, successful copy, so a
+// failed download never leaves a truncated file under the final name.
+func setupWriteFile(outputPath string, src io.Reader) error {
+	partPath := outputPath + ".part"
+	out, err := os.Create(partPath)
+	if err != nil {
+		return err
+	}
+
+	_, err = io.Copy(out, src)
+	if closeErr := out.Close(); err == nil {
+		err = closeErr
+	}
+	if err != nil {
+		os.Remove(partPath) // best effort: the partial file is useless
+		return err
+	}
+	return os.Rename(partPath, outputPath)
+}
+
+// setupDownloadFile downloads url and saves the response body as outputPath.
+func setupDownloadFile(url string, outputPath string) error {
+	resp, err := setupHTTPGet(url)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	return setupWriteFile(outputPath, resp.Body)
+}
+
+// setupGunzipFile downloads the gzip-compressed file at url and saves the
+// decompressed content as outputPath.
+func setupGunzipFile(url string, outputPath string) error {
+	resp, err := setupHTTPGet(url)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	gzReader, err := gzip.NewReader(resp.Body)
+	if err != nil {
+		return err
+	}
+	defer gzReader.Close()
+
+	return setupWriteFile(outputPath, gzReader)
+}
+
+// setupFetchDatasetInfo retrieves and decodes the dataset metadata at url.
+func setupFetchDatasetInfo(url string) (*SetupDatasetInfo, error) {
+	resp, err := setupHTTPGet(url)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	var datasetInfo SetupDatasetInfo
+	if err := json.NewDecoder(resp.Body).Decode(&datasetInfo); err != nil {
+		return nil, err
+	}
+
+	return &datasetInfo, nil
+}
+
+// setupFilterAndDisplayFiles returns the siblings whose name ends in ".db.gz"
+// and prints each of them as a numbered menu entry, counting from 1.
+func setupFilterAndDisplayFiles(siblings []SetupSibling) []SetupSibling {
+	var dbFiles []SetupSibling
+	for _, sibling := range siblings {
+		if strings.HasSuffix(sibling.Rfilename, ".db.gz") {
+			dbFiles = append(dbFiles, sibling)
+			fmt.Printf("%d. %s\n", len(dbFiles), sibling.Rfilename)
+		}
+	}
+	return dbFiles
+}
+
+// setupGetGGUFFileName derives the model file name from a database file name
+// of the form "<name>.<model>.db.gz", yielding "<model>.gguf". It returns ""
+// when the name does not have exactly two dot-separated parts before ".db.gz".
+func setupGetGGUFFileName(dbFile string) string {
+	baseName := strings.TrimSuffix(dbFile, ".db.gz")
+	parts := strings.Split(baseName, ".")
+	if len(parts) == 2 {
+		return parts[1] + ".gguf"
+	}
+	return ""
+}
+
+// setupFileExists reports whether filename exists. Any Stat error other than
+// "does not exist" counts as existing, so Setup never overwrites a path it
+// cannot inspect.
+func setupFileExists(filename string) bool {
+	_, err := os.Stat(filename)
+	return !os.IsNotExist(err)
+}
+
+// Setup interactively downloads a ready made database from the eja/wikilite
+// Hugging Face dataset into wikilite.db, then the matching gguf embeddings
+// model when the database name identifies one. Existing files are never
+// overwritten, and every problem is reported on standard output.
+func Setup() {
+	url := "https://huggingface.co/api/datasets/eja/wikilite"
+
+	datasetInfo, err := setupFetchDatasetInfo(url)
+	if err != nil {
+		fmt.Println("Error fetching dataset info:", err)
+		return
+	}
+
+	dbFiles := setupFilterAndDisplayFiles(datasetInfo.Siblings)
+	if len(dbFiles) == 0 {
+		fmt.Println("No .db.gz files found.")
+		return
+	}
+
+	fmt.Print("Choose a file by number: ")
+	reader := bufio.NewReader(os.Stdin)
+	// A read error (such as EOF without a trailing newline) can still leave a
+	// usable answer in input; anything unparsable is rejected just below.
+	input, _ := reader.ReadString('\n')
+	var choice int
+	_, err = fmt.Sscanf(input, "%d", &choice)
+	if err != nil || choice < 1 || choice > len(dbFiles) {
+		fmt.Println("Invalid choice.")
+		return
+	}
+
+	selectedDB := dbFiles[choice-1].Rfilename
+	baseURL := "https://huggingface.co/datasets/eja/wikilite/resolve/main/"
+
+	if setupFileExists("wikilite.db") {
+		fmt.Println("A wikilite.db already exists in the current directory.")
+		return
+	}
+
+	fmt.Println("Downloading and extracting", selectedDB)
+	err = setupGunzipFile(baseURL+selectedDB, "wikilite.db")
+	if err != nil {
+		fmt.Println("Error downloading and extracting file:", err)
+		return
+	}
+	fmt.Println("Saved as wikilite.db")
+
+	ggufFile := setupGetGGUFFileName(selectedDB)
+	if ggufFile != "" {
+		if setupFileExists(ggufFile) {
+			fmt.Printf("%s already exists in the current directory.\n", ggufFile)
+			return
+		}
+
+		fmt.Println("Downloading corresponding gguf model:", ggufFile)
+		err = setupDownloadFile(baseURL+ggufFile, ggufFile)
+		if err != nil {
+			fmt.Println("Error downloading .gguf file:", err)
+			return
+		}
+		fmt.Println("Saved as", ggufFile)
+	}
+}