Commit: allow to set GPU layers
mudler committed May 14, 2023
1 parent 7d9b011 commit 3501b34
Showing 6 changed files with 20 additions and 4 deletions.
6 changes: 6 additions & 0 deletions README.md
@@ -25,6 +25,12 @@ cd go-llama.cpp
make libbinding.a
```

To build with OpenBLAS, for example:

```
CMAKE_ARGS="-DLLAMA_OPENBLAS=ON" make libbinding.a
```
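
Offloading layers only has an effect when llama.cpp itself is compiled with GPU support. A sketch, assuming the upstream `LLAMA_CUBLAS` CMake flag that llama.cpp exposed at the time (requires the CUDA toolkit):

```
CMAKE_ARGS="-DLLAMA_CUBLAS=ON" make libbinding.a
```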

Now you can run the example with:

```
4 changes: 3 additions & 1 deletion binding.cpp
@@ -369,7 +369,7 @@ void* llama_allocate_params(const char *prompt, int seed, int threads, int token
}


-void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock, bool embeddings) {
+void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock, bool embeddings, int n_gpu_layers) {
    // load the model
    auto lparams = llama_context_default_params();

@@ -379,6 +379,8 @@ void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool mem
    lparams.f16_kv = memory_f16;
    lparams.embedding = embeddings;
    lparams.use_mlock = mlock;
+   lparams.n_gpu_layers = n_gpu_layers;

    void* res = nullptr;
    try {
        res = llama_init_from_file(fname, lparams);
2 changes: 1 addition & 1 deletion binding.h
@@ -8,7 +8,7 @@ extern "C" {

extern unsigned char tokenCallback(void *, char *);

-void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock, bool embeddings);
+void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock, bool embeddings, int n_gpu);

int get_embeddings(void* params_ptr, void* state_pr, float * res_embeddings);

2 changes: 1 addition & 1 deletion llama.cpp (submodule pointer update)
2 changes: 1 addition & 1 deletion llama.go
@@ -22,7 +22,7 @@ type LLama struct {
func New(model string, opts ...ModelOption) (*LLama, error) {
	mo := NewModelOptions(opts...)
	modelPath := C.CString(model)
-	result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Parts), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings))
+	result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Parts), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock), C.bool(mo.Embeddings), C.int(mo.NGPULayers))
	if result == nil {
		return nil, fmt.Errorf("failed loading model")
	}
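
With this change, callers can request GPU offload when constructing the model. A minimal usage sketch; the module path and layer count are illustrative assumptions, not part of this commit:

```
package main

import (
	"fmt"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// Offload the first 32 transformer layers to the GPU;
	// the remaining layers run on the CPU as before.
	model, err := llama.New("./models/7B/ggml-model-q4_0.bin", llama.SetGPULayers(32))
	if err != nil {
		fmt.Println("failed loading model:", err)
		return
	}
	_ = model
}
```
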
8 changes: 8 additions & 0 deletions options.go
@@ -7,6 +7,7 @@ type ModelOptions struct {
	F16Memory  bool
	MLock      bool
	Embeddings bool
+	NGPULayers int
}

type PredictOptions struct {
@@ -113,6 +114,13 @@ var IgnoreEOS PredictOption = func(p *PredictOptions) {
	p.IgnoreEOS = true
}

+// SetGPULayers sets the number of model layers to offload to the GPU.
+func SetGPULayers(n int) ModelOption {
+	return func(p *ModelOptions) {
+		p.NGPULayers = n
+	}
+}
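
The helper follows the functional-options pattern used throughout options.go: a ModelOption is just a function that mutates ModelOptions, so callers can compose the built-in options or define their own. A sketch under that assumption; the helper name and layer count are hypothetical:

```
// WithFullOffload is a hypothetical caller-defined option that asks
// for a very large layer count, effectively requesting that the
// whole model be offloaded to the GPU.
func WithFullOffload() llama.ModelOption {
	return func(o *llama.ModelOptions) {
		o.NGPULayers = 9999
	}
}
```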

// SetTokenCallback sets a callback that receives each predicted token; returning false stops the prediction.
func SetTokenCallback(fn func(string) bool) PredictOption {
	return func(p *PredictOptions) {
