diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..81bc6f3
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "llama.cpp"]
+	path = llama.cpp
+	url = https://github.com/ggerganov/llama.cpp
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b9c46f0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 go-skynet authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8a7dd9f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,153 @@
+ifndef UNAME_S
+UNAME_S := $(shell uname -s)
+endif
+
+ifndef UNAME_P
+UNAME_P := $(shell uname -p)
+endif
+
+ifndef UNAME_M
+UNAME_M := $(shell uname -m)
+endif
+
+CCV := $(shell $(CC) --version | head -n 1)
+CXXV := $(shell $(CXX) --version | head -n 1)
+
+# Mac OS + Arm can report x86_64
+# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
+ifeq ($(UNAME_S),Darwin)
+    ifneq ($(UNAME_P),arm)
+        SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
+        ifeq ($(SYSCTL_M),1)
+            # UNAME_P := arm
+            # UNAME_M := arm64
+            warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
+        endif
+    endif
+endif
+
+#
+# Compile flags
+#
+
+# keep standard at C11 and C++11
+CFLAGS = -I./llama.cpp -I. -O3 -DNDEBUG -std=c11 -fPIC
+CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/examples -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+LDFLAGS =
+
+# warnings
+CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+
+# OS specific
+# TODO: support Windows
+ifeq ($(UNAME_S),Linux)
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),Darwin)
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),FreeBSD)
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),NetBSD)
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),OpenBSD)
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),Haiku)
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
+endif
+
+# Architecture specific
+# TODO: probably these flags need to be tweaked on some architectures
+# feel free to update the Makefile for your architecture and send a pull request or issue
+ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
+    # Use all CPU extensions that are available:
+    CFLAGS += -march=native -mtune=native
+endif
+ifneq ($(filter ppc64%,$(UNAME_M)),)
+    POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
+    ifneq (,$(findstring POWER9,$(POWER9_M)))
+        CFLAGS += -mcpu=power9
+        CXXFLAGS += -mcpu=power9
+    endif
+    # Require c++23's std::byteswap for big-endian support.
+    ifeq ($(UNAME_M),ppc64)
+        CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
+    endif
+endif
+ifndef LLAMA_NO_ACCELERATE
+    # Mac M1 - include Accelerate framework.
+    # `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
+    ifeq ($(UNAME_S),Darwin)
+        CFLAGS += -DGGML_USE_ACCELERATE
+        LDFLAGS += -framework Accelerate
+    endif
+endif
+ifdef LLAMA_OPENBLAS
+    CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
+    LDFLAGS += -lopenblas
+endif
+ifdef LLAMA_GPROF
+    CFLAGS += -pg
+    CXXFLAGS += -pg
+endif
+ifneq ($(filter aarch64%,$(UNAME_M)),)
+    CFLAGS += -mcpu=native
+    CXXFLAGS += -mcpu=native
+endif
+ifneq ($(filter armv6%,$(UNAME_M)),)
+    # Raspberry Pi 1, 2, 3
+    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+endif
+ifneq ($(filter armv7%,$(UNAME_M)),)
+    # Raspberry Pi 4
+    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+ifneq ($(filter armv8%,$(UNAME_M)),)
+    # Raspberry Pi 4
+    CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+endif
+
+#
+# Print build information
+#
+
+$(info I llama.cpp build info: )
+$(info I UNAME_S: $(UNAME_S))
+$(info I UNAME_P: $(UNAME_P))
+$(info I UNAME_M: $(UNAME_M))
+$(info I CFLAGS: $(CFLAGS))
+$(info I CXXFLAGS: $(CXXFLAGS))
+$(info I LDFLAGS: $(LDFLAGS))
+$(info I CC: $(CCV))
+$(info I CXX: $(CXXV))
+$(info )
+
+llama.cpp/ggml.o:
+	$(MAKE) -C llama.cpp ggml.o
+
+llama.cpp/llama.o:
+	$(MAKE) -C llama.cpp llama.o
+
+llama.cpp/common.o:
+	$(MAKE) -C llama.cpp common.o
+
+binding.o: llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
+	$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples binding.cpp -o binding.o -c $(LDFLAGS)
+
+libbinding.a: binding.o
+	ar src libbinding.a llama.cpp/ggml.o llama.cpp/common.o llama.cpp/llama.o binding.o
+
+clean:
+	rm -rf *.o
+	rm -rf *.a
+	$(MAKE) -C llama.cpp clean
\ No newline at end of file
diff --git a/binding.cpp b/binding.cpp
new file mode 100644
index 0000000..d96fd1b
--- /dev/null
+++ b/binding.cpp
@@ -0,0 +1,195 @@
+#include "common.h"
+#include "llama.h"
+#include "binding.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#include <signal.h>
+#endif
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+void sigint_handler(int signo) {
+    if (signo == SIGINT) {
+        _exit(130);
+    }
+}
+#endif
+
+int llama_predict(void* params_ptr, void* state_pr, char* result) {
+    gpt_params* params_p = (gpt_params*) params_ptr;
+    llama_context* ctx = (llama_context*) state_pr;
+
+    gpt_params params = *params_p;
+
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+
+    std::mt19937 rng(params.seed);
+
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
+
+    // tokenize the prompt
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+
+    const int n_ctx = llama_n_ctx(ctx);
+
+    // number of tokens to keep when resetting context
+    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
+        params.n_keep = (int)embd_inp.size();
+    }
+
+    // determine newline token
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+
+    // TODO: replace with ring-buffer
+    std::vector<llama_token> last_n_tokens(n_ctx);
+    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+
+    int n_past = 0;
+    int n_remain = params.n_predict;
+    int n_consumed = 0;
+
+    std::vector<llama_token> embd;
+    std::string res = "";
+
+    while (n_remain != 0) {
+        // predict
+        if (embd.size() > 0) {
+            // infinite text generation via context swapping
+            // if we run out of context:
+            // - take the n_keep first tokens from the original prompt (via n_past)
+            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
+            if (n_past + (int) embd.size() > n_ctx) {
+                const int n_left = n_past - params.n_keep;
+
+                n_past = params.n_keep;
+
+                // insert n_left/2 tokens at the start of embd from last_n_tokens
+                embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
+            }
+
+            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return 1;
+            }
+        }
+
+        n_past += embd.size();
+        embd.clear();
+
+        if ((int) embd_inp.size() <= n_consumed) {
+            // out of user input, sample next token
+            const int32_t top_k = params.top_k;
+            const float top_p = params.top_p;
+            const float temp = params.temp;
+            const float repeat_penalty = params.repeat_penalty;
+
+            llama_token id = 0;
+
+            {
+                auto logits = llama_get_logits(ctx);
+
+                if (params.ignore_eos) {
+                    logits[llama_token_eos()] = 0;
+                }
+
+                id = llama_sample_top_p_top_k(ctx,
+                        last_n_tokens.data() + n_ctx - params.repeat_last_n,
+                        params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
+
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(id);
+            }
+
+            // add it to the context
+            embd.push_back(id);
+
+            // decrement remaining sampling budget
+            --n_remain;
+        } else {
+            // some user input remains from prompt or interaction, forward it to processing
+            while ((int) embd_inp.size() > n_consumed) {
+                embd.push_back(embd_inp[n_consumed]);
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(embd_inp[n_consumed]);
+                ++n_consumed;
+                if ((int) embd.size() >= params.n_batch) {
+                    break;
+                }
+            }
+        }
+
+        for (auto id : embd) {
+            res += llama_token_to_str(ctx, id);
+        }
+
+        // end of text token
+        if (embd.back() == llama_token_eos()) {
+            break;
+        }
+    }
+
+#if defined (_WIN32)
+    signal(SIGINT, SIG_DFL);
+#endif
+    strcpy(result, res.c_str());
+    return 0;
+}
+
+void llama_free_model(void *state_ptr) {
+    llama_context* ctx = (llama_context*) state_ptr;
+    llama_free(ctx);
+}
+
+void llama_free_params(void* params_ptr) {
+    gpt_params* params = (gpt_params*) params_ptr;
+    delete params;
+}
+
+
+void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k,
+    float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos, bool memory_f16) {
+    gpt_params* params = new gpt_params;
+    params->seed = seed;
+    params->n_threads = threads;
+    params->n_predict = tokens;
+    params->repeat_last_n = repeat_last_n;
+
+    params->top_k = top_k;
+    params->top_p = top_p;
+    params->memory_f16 = memory_f16;
+    params->temp = temp;
+    params->repeat_penalty = repeat_penalty;
+
+    params->prompt = prompt;
+    params->ignore_eos = ignore_eos;
+
+    return params;
+}
+
+void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock) {
+    // load the model
+    auto lparams = llama_context_default_params();
+
+    lparams.n_ctx = n_ctx;
+    lparams.n_parts = n_parts;
+    lparams.seed = n_seed;
+    lparams.f16_kv = memory_f16;
+    lparams.use_mlock = mlock;
+
+    return llama_init_from_file(fname, lparams);
+}
diff --git a/binding.h b/binding.h
new file mode 100644
index 0000000..c061131
--- /dev/null
+++ b/binding.h
@@ -0,0 +1,20 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+
+void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock);
+
+void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens,
+    int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos, bool memory_f16);
+
+void llama_free_params(void* params_ptr);
+
+void llama_free_model(void* state);
+
+int llama_predict(void* params_ptr, void* state_pr, char* result);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/main.go b/examples/main.go
new file mode 100644
index 0000000..14c4685
--- /dev/null
+++ b/examples/main.go
@@ -0,0 +1,80 @@
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"runtime"
+	"strings"
+
+	llama "github.com/go-skynet/go-llama.cpp"
+)
+
+var (
+	threads = 4
+	tokens  = 128
+)
+
+func main() {
+	var model string
+
+	flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError)
+	flags.StringVar(&model, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load")
+	flags.IntVar(&threads, "t", runtime.NumCPU(), "number of threads to use during computation")
+	flags.IntVar(&tokens, "n", 512, "number of tokens to predict")
+
+	err := flags.Parse(os.Args[1:])
+	if err != nil {
+		fmt.Printf("Parsing program arguments failed: %s", err)
+		os.Exit(1)
+	}
+	l, err := llama.New(model, llama.SetContext(128), llama.SetParts(-1))
+	if err != nil {
+		fmt.Println("Loading the model failed:", err.Error())
+		os.Exit(1)
+	}
+	fmt.Printf("Model loaded successfully.\n")
+
+	reader := bufio.NewReader(os.Stdin)
+
+	for {
+		text := readMultiLineInput(reader)
+
+		res, err := l.Predict(text, llama.SetTokens(tokens), llama.SetThreads(threads), llama.SetTopK(90), llama.SetTopP(0.86))
+		if err != nil {
+			panic(err)
+		}
+		fmt.Printf("\ngolang: %s\n", res)
+
+		fmt.Printf("\n\n")
+	}
+}
+
+// readMultiLineInput reads input until an empty line is entered.
+func readMultiLineInput(reader *bufio.Reader) string {
+	var lines []string
+	fmt.Print(">>> ")
+
+	for {
+		line, err := reader.ReadString('\n')
+		if err != nil {
+			if err == io.EOF {
+				os.Exit(0)
+			}
+			fmt.Printf("Reading the prompt failed: %s", err)
+			os.Exit(1)
+		}
+
+		if len(strings.TrimSpace(line)) == 0 {
+			break
+		}
+
+		lines = append(lines, line)
+	}
+
+	text := strings.Join(lines, "")
+	fmt.Println("Sending", text)
+	return text
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..3d1fa4b
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/go-skynet/go-llama.cpp
+
+go 1.19
diff --git a/llama.cpp b/llama.cpp
new file mode 160000
index 0000000..53dbba7
--- /dev/null
+++ b/llama.cpp
@@ -0,0 +1 @@
+Subproject commit 53dbba769537e894ead5c6913ab2fd3a4658b738
diff --git a/llama.go b/llama.go
new file mode 100644
index 0000000..dc31ca3
--- /dev/null
+++ b/llama.go
@@ -0,0 +1,56 @@
+package llama
+
+// #cgo CXXFLAGS: -I./llama.cpp/examples -I./llama.cpp
+// #cgo LDFLAGS: -L./ -lbinding -lm -lstdc++
+// #include "binding.h"
+import "C"
+import (
+	"fmt"
+	"strings"
+	"unsafe"
+)
+
+type LLama struct {
+	state unsafe.Pointer
+}
+
+func New(model string, opts ...ModelOption) (*LLama, error) {
+	mo := NewModelOptions(opts...)
+	modelPath := C.CString(model)
+	result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Parts), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock))
+	if result == nil {
+		return nil, fmt.Errorf("failed loading model")
+	}
+
+	return &LLama{state: result}, nil
+}
+
+func (l *LLama) Free() {
+	C.llama_free_model(l.state)
+}
+
+func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
+	po := NewPredictOptions(opts...)
+
+	input := C.CString(text)
+	if po.Tokens == 0 {
+		po.Tokens = 99999999
+	}
+	out := make([]byte, po.Tokens)
+
+	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
+		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), C.bool(po.IgnoreEOS), C.bool(po.F16KV))
+	ret := C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0])))
+	if ret != 0 {
+		return "", fmt.Errorf("inference failed")
+	}
+	res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
+
+	res = strings.TrimPrefix(res, " ")
+	res = strings.TrimPrefix(res, text)
+	res = strings.TrimPrefix(res, "\n")
+
+	C.llama_free_params(params)
+
+	return res, nil
+}
diff --git a/options.go b/options.go
new file mode 100644
index 0000000..ae46fd3
--- /dev/null
+++ b/options.go
@@ -0,0 +1,148 @@
+package llama
+
+import "runtime"
+
+type ModelOptions struct {
+	ContextSize int
+	Parts       int
+	Seed        int
+	F16Memory   bool
+	MLock       bool
+}
+
+type PredictOptions struct {
+	Seed, Threads, Tokens, TopK, Repeat int
+	TopP, Temperature, Penalty          float64
+	F16KV                               bool
+	IgnoreEOS                           bool
+}
+
+type PredictOption func(p *PredictOptions)
+type ModelOption func(p *ModelOptions)
+
+var DefaultModelOptions ModelOptions = ModelOptions{
+	ContextSize: 512,
+	Seed:        0,
+	F16Memory:   false,
+	MLock:       false,
+}
+
+var DefaultOptions PredictOptions = PredictOptions{
+	Seed:        -1,
+	Threads:     runtime.NumCPU(),
+	Tokens:      128,
+	TopK:        10000,
+	TopP:        0.90,
+	Temperature: 0.96,
+	Penalty:     1,
+	Repeat:      64,
+}
+
+// SetContext sets the context size.
+func SetContext(c int) ModelOption {
+	return func(p *ModelOptions) {
+		p.ContextSize = c
+	}
+}
+
+func SetModelSeed(c int) ModelOption {
+	return func(p *ModelOptions) {
+		p.Seed = c
+	}
+}
+
+func SetParts(c int) ModelOption {
+	return func(p *ModelOptions) {
+		p.Parts = c
+	}
+}
+
+var EnableF16Memory ModelOption = func(p *ModelOptions) {
+	p.F16Memory = true
+}
+
+var EnableF16KV PredictOption = func(p *PredictOptions) {
+	p.F16KV = true
+}
+
+var EnableMLock ModelOption = func(p *ModelOptions) {
+	p.MLock = true
+}
+
+// NewModelOptions creates a new ModelOptions object with the given options.
+func NewModelOptions(opts ...ModelOption) ModelOptions {
+	p := DefaultModelOptions
+	for _, opt := range opts {
+		opt(&p)
+	}
+	return p
+}
+
+var IgnoreEOS PredictOption = func(p *PredictOptions) {
+	p.IgnoreEOS = true
+}
+
+// SetSeed sets the random seed for sampling text generation.
+func SetSeed(seed int) PredictOption {
+	return func(p *PredictOptions) {
+		p.Seed = seed
+	}
+}
+
+// SetThreads sets the number of threads to use for text generation.
+func SetThreads(threads int) PredictOption {
+	return func(p *PredictOptions) {
+		p.Threads = threads
+	}
+}
+
+// SetTokens sets the number of tokens to generate.
+func SetTokens(tokens int) PredictOption {
+	return func(p *PredictOptions) {
+		p.Tokens = tokens
+	}
+}
+
+// SetTopK sets the value for top-K sampling.
+func SetTopK(topk int) PredictOption {
+	return func(p *PredictOptions) {
+		p.TopK = topk
+	}
+}
+
+// SetTopP sets the value for nucleus sampling.
+func SetTopP(topp float64) PredictOption {
+	return func(p *PredictOptions) {
+		p.TopP = topp
+	}
+}
+
+// SetTemperature sets the temperature value for text generation.
+func SetTemperature(temp float64) PredictOption {
+	return func(p *PredictOptions) {
+		p.Temperature = temp
+	}
+}
+
+// SetPenalty sets the repetition penalty for text generation.
+func SetPenalty(penalty float64) PredictOption {
+	return func(p *PredictOptions) {
+		p.Penalty = penalty
+	}
+}
+
+// SetRepeat sets how many of the previous tokens are considered for the
+// repetition penalty (passed to the binding as repeat_last_n).
+func SetRepeat(repeat int) PredictOption {
+	return func(p *PredictOptions) {
+		p.Repeat = repeat
+	}
+}
+
+// NewPredictOptions creates a new PredictOptions object with the given options.
+func NewPredictOptions(opts ...PredictOption) PredictOptions {
+	p := DefaultOptions
+	for _, opt := range opts {
+		opt(&p)
+	}
+	return p
+}
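
Note (not part of the patch above): a minimal consumer sketch of the Go API this change adds. It assumes the static library has already been built with `make libbinding.a` and that the program runs from the repository root so the `#cgo LDFLAGS: -L./ -lbinding` directive in llama.go resolves; the model path and the option values (64 tokens, top-k 40, top-p 0.95) are illustrative placeholders. Unlike examples/main.go, it also releases the C-side context via Free.

package main

import (
	"fmt"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// New loads the model through load_model in binding.cpp; SetContext and
	// SetParts map to the n_ctx and n_parts fields of llama_context_default_params.
	l, err := llama.New("./models/7B/ggml-model-q4_0.bin", llama.SetContext(512), llama.SetParts(-1))
	if err != nil {
		panic(err)
	}
	// Free releases the llama_context allocated on the C++ side.
	defer l.Free()

	// Predict drives llama_predict; the options are the ones defined in options.go.
	res, err := l.Predict("Hello,", llama.SetTokens(64), llama.SetTopK(40), llama.SetTopP(0.95))
	if err != nil {
		panic(err)
	}
	fmt.Println(res)
}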