diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..81bc6f3
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "llama.cpp"]
+	path = llama.cpp
+	url = https://github.com/ggerganov/llama.cpp
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..b9c46f0
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 go-skynet authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8a7dd9f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,153 @@
+ifndef UNAME_S
+UNAME_S := $(shell uname -s)
+endif
+
+ifndef UNAME_P
+UNAME_P := $(shell uname -p)
+endif
+
+ifndef UNAME_M
+UNAME_M := $(shell uname -m)
+endif
+
+CCV := $(shell $(CC) --version | head -n 1)
+CXXV := $(shell $(CXX) --version | head -n 1)
+
+# Mac OS + Arm can report x86_64
+# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
+ifeq ($(UNAME_S),Darwin)
+    ifneq ($(UNAME_P),arm)
+        SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
+        ifeq ($(SYSCTL_M),1)
+            # UNAME_P := arm
+            # UNAME_M := arm64
+            warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
+        endif
+    endif
+endif
+
+#
+# Compile flags
+#
+
+# keep standard at C11 and C++11
+CFLAGS = -I./llama.cpp -I. -O3 -DNDEBUG -std=c11 -fPIC
+CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/examples -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+LDFLAGS =
+
+# warnings
+CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+
+# OS specific
+# TODO: support Windows
+ifeq ($(UNAME_S),Linux)
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),Darwin)
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),FreeBSD)
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),NetBSD)
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),OpenBSD)
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),Haiku)
+    CFLAGS += -pthread
+    CXXFLAGS += -pthread
+endif
+
+# Architecture specific
+# TODO: probably these flags need to be tweaked on some architectures
+# feel free to update the Makefile for your architecture and send a pull request or issue
+ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
+    # Use all CPU extensions that are available:
+    CFLAGS += -march=native -mtune=native
+endif
+ifneq ($(filter ppc64%,$(UNAME_M)),)
+    POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
+    ifneq (,$(findstring POWER9,$(POWER9_M)))
+        CFLAGS += -mcpu=power9
+        CXXFLAGS += -mcpu=power9
+    endif
+    # Require c++23's std::byteswap for big-endian support.
+    ifeq ($(UNAME_M),ppc64)
+        CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
+    endif
+endif
+ifndef LLAMA_NO_ACCELERATE
+    # Mac M1 - include Accelerate framework.
+    # `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
+    ifeq ($(UNAME_S),Darwin)
+        CFLAGS += -DGGML_USE_ACCELERATE
+        LDFLAGS += -framework Accelerate
+    endif
+endif
+ifdef LLAMA_OPENBLAS
+    CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
+    LDFLAGS += -lopenblas
+endif
+ifdef LLAMA_GPROF
+    CFLAGS += -pg
+    CXXFLAGS += -pg
+endif
+ifneq ($(filter aarch64%,$(UNAME_M)),)
+    CFLAGS += -mcpu=native
+    CXXFLAGS += -mcpu=native
+endif
+ifneq ($(filter armv6%,$(UNAME_M)),)
+    # Raspberry Pi 1, 2, 3
+    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+endif
+ifneq ($(filter armv7%,$(UNAME_M)),)
+    # Raspberry Pi 4
+    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+ifneq ($(filter armv8%,$(UNAME_M)),)
+    # Raspberry Pi 4
+    CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+endif
+
+#
+# Print build information
+#
+
+$(info I llama.cpp build info: )
+$(info I UNAME_S: $(UNAME_S))
+$(info I UNAME_P: $(UNAME_P))
+$(info I UNAME_M: $(UNAME_M))
+$(info I CFLAGS: $(CFLAGS))
+$(info I CXXFLAGS: $(CXXFLAGS))
+$(info I LDFLAGS: $(LDFLAGS))
+$(info I CC: $(CCV))
+$(info I CXX: $(CXXV))
+$(info )
+
+llama.cpp/ggml.o:
+	$(MAKE) -C llama.cpp ggml.o
+
+llama.cpp/llama.o:
+	$(MAKE) -C llama.cpp llama.o
+
+llama.cpp/common.o:
+	$(MAKE) -C llama.cpp common.o
+
+binding.o: llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
+	$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples binding.cpp -o binding.o -c $(LDFLAGS)
+
+libbinding.a: binding.o
+	ar src libbinding.a llama.cpp/ggml.o llama.cpp/common.o llama.cpp/llama.o binding.o
+
+clean:
+	rm -rf *.o
+	rm -rf *.a
+	$(MAKE) -C llama.cpp clean
\ No newline at end of file
diff --git a/binding.cpp b/binding.cpp
new file mode 100644
index 0000000..d96fd1b
--- /dev/null
+++ b/binding.cpp
@@ -0,0 +1,195 @@
+#include "common.h"
+#include "llama.h"
+#include "binding.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#include <signal.h>
+#endif
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
+void sigint_handler(int signo) {
+    if (signo == SIGINT) {
+        _exit(130);
+    }
+}
+#endif
+
+int llama_predict(void* params_ptr, void* state_pr, char* result) {
+    gpt_params* params_p = (gpt_params*) params_ptr;
+    llama_context* ctx = (llama_context*) state_pr;
+
+    gpt_params params = *params_p;
+
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+
+    std::mt19937 rng(params.seed);
+
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
+
+    // tokenize the prompt
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+
+    const int n_ctx = llama_n_ctx(ctx);
+
+    // number of tokens to keep when resetting context
+    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
+        params.n_keep = (int)embd_inp.size();
+    }
+
+    // determine newline token
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+
+    // TODO: replace with ring-buffer
+    std::vector<llama_token> last_n_tokens(n_ctx);
+    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+
+    int n_past = 0;
+    int n_remain = params.n_predict;
+    int n_consumed = 0;
+
+    std::vector<llama_token> embd;
+    std::string res = "";
+
+    while (n_remain != 0) {
+        // predict
+        if (embd.size() > 0) {
+            // infinite text generation via context swapping
+            // if we run out of context:
+            // - take the n_keep first tokens from the original prompt (via n_past)
+            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
+            if (n_past + (int) embd.size() > n_ctx) {
+                const int n_left = n_past - params.n_keep;
+
+                n_past = params.n_keep;
+
+                // insert n_left/2 tokens at the start of embd from last_n_tokens
+                embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
+            }
+
+            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return 1;
+            }
+        }
+
+        n_past += embd.size();
+        embd.clear();
+
+        if ((int) embd_inp.size() <= n_consumed) {
+            // out of user input, sample next token
+            const int32_t top_k = params.top_k;
+            const float top_p = params.top_p;
+            const float temp = params.temp;
+            const float repeat_penalty = params.repeat_penalty;
+
+            llama_token id = 0;
+
+            {
+                auto logits = llama_get_logits(ctx);
+
+                if (params.ignore_eos) {
+                    logits[llama_token_eos()] = 0;
+                }
+
+                id = llama_sample_top_p_top_k(ctx,
+                        last_n_tokens.data() + n_ctx - params.repeat_last_n,
+                        params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
+
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(id);
+            }
+
+            // add it to the context
+            embd.push_back(id);
+
+            // decrement remaining sampling budget
+            --n_remain;
+        } else {
+            // some user input remains from prompt or interaction, forward it to processing
+            while ((int) embd_inp.size() > n_consumed) {
+                embd.push_back(embd_inp[n_consumed]);
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(embd_inp[n_consumed]);
+                ++n_consumed;
+                if ((int) embd.size() >= params.n_batch) {
+                    break;
+                }
+            }
+        }
+
+        for (auto id : embd) {
+            res += llama_token_to_str(ctx, id);
+        }
+
+        // end of text token
+        if (embd.back() == llama_token_eos()) {
+            break;
+        }
+    }
+
+#if defined (_WIN32)
+    signal(SIGINT, SIG_DFL);
+#endif
+    strcpy(result, res.c_str());
+    return 0;
+}
+
+void llama_free_model(void *state_ptr) {
+    llama_context* ctx = (llama_context*) state_ptr;
+    llama_free(ctx);
+}
+
+void llama_free_params(void* params_ptr) {
+    gpt_params* params = (gpt_params*) params_ptr;
+    delete params;
+}
+
+
+void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k,
+    float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos, bool memory_f16) {
+    gpt_params* params = new gpt_params;
+    params->seed = seed;
+    params->n_threads = threads;
+    params->n_predict = tokens;
+    params->repeat_last_n = repeat_last_n;
+
+    params->top_k = top_k;
+    params->top_p = top_p;
+    params->memory_f16 = memory_f16;
+    params->temp = temp;
+    params->repeat_penalty = repeat_penalty;
+
+    params->prompt = prompt;
+    params->ignore_eos = ignore_eos;
+
+    return params;
+}
+
+void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock) {
+    // load the model
+    auto lparams = llama_context_default_params();
+
+    lparams.n_ctx = n_ctx;
+    lparams.n_parts = n_parts;
+    lparams.seed = n_seed;
+    lparams.f16_kv = memory_f16;
+    lparams.use_mlock = mlock;
+
+    return llama_init_from_file(fname, lparams);
+}
diff --git a/binding.h b/binding.h
new file mode 100644
index 0000000..c061131
--- /dev/null
+++ b/binding.h
@@ -0,0 +1,20 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+
+void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock);
+
+void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens,
+    int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos, bool memory_f16);
+
+void llama_free_params(void* params_ptr);
+
+void llama_free_model(void* state);
+
+int llama_predict(void* params_ptr, void* state_pr, char* result);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/examples/main.go b/examples/main.go
new file mode 100644
index 0000000..14c4685
--- /dev/null
+++ b/examples/main.go
@@ -0,0 +1,80 @@
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"runtime"
+	"strings"
+
+	llama "github.com/go-skynet/go-llama.cpp"
+)
+
+var (
+	threads = 4
+	tokens  = 128
+)
+
+func main() {
+	var model string
+
+	flags := flag.NewFlagSet(os.Args[0], flag.ExitOnError)
+	flags.StringVar(&model, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load")
+	flags.IntVar(&threads, "t", runtime.NumCPU(), "number of threads to use during computation")
+	flags.IntVar(&tokens, "n", 512, "number of tokens to predict")
+
+	err := flags.Parse(os.Args[1:])
+	if err != nil {
+		fmt.Printf("Parsing program arguments failed: %s", err)
+		os.Exit(1)
+	}
+	l, err := llama.New(model, llama.SetContext(128), llama.SetParts(-1))
+	if err != nil {
+		fmt.Println("Loading the model failed:", err.Error())
+		os.Exit(1)
+	}
+	fmt.Printf("Model loaded successfully.\n")
+
+	reader := bufio.NewReader(os.Stdin)
+
+	for {
+		text := readMultiLineInput(reader)
+
+		res, err := l.Predict(text, llama.SetTokens(tokens), llama.SetThreads(threads), llama.SetTopK(90), llama.SetTopP(0.86))
+		if err != nil {
+			panic(err)
+		}
+		fmt.Printf("\ngolang: %s\n", res)
+
+		fmt.Printf("\n\n")
+	}
+}
+
+// readMultiLineInput reads input until an empty line is entered.
+func readMultiLineInput(reader *bufio.Reader) string {
+	var lines []string
+	fmt.Print(">>> ")
+
+	for {
+		line, err := reader.ReadString('\n')
+		if err != nil {
+			if err == io.EOF {
+				os.Exit(0)
+			}
+			fmt.Printf("Reading the prompt failed: %s", err)
+			os.Exit(1)
+		}
+
+		if len(strings.TrimSpace(line)) == 0 {
+			break
+		}
+
+		lines = append(lines, line)
+	}
+
+	text := strings.Join(lines, "")
+	fmt.Println("Sending", text)
+	return text
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..3d1fa4b
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/go-skynet/go-llama.cpp
+
+go 1.19
diff --git a/llama.cpp b/llama.cpp
new file mode 160000
index 0000000..53dbba7
--- /dev/null
+++ b/llama.cpp
@@ -0,0 +1 @@
+Subproject commit 53dbba769537e894ead5c6913ab2fd3a4658b738
diff --git a/llama.go b/llama.go
new file mode 100644
index 0000000..dc31ca3
--- /dev/null
+++ b/llama.go
@@ -0,0 +1,56 @@
+package llama
+
+// #cgo CXXFLAGS: -I./llama.cpp/examples -I./llama.cpp
+// #cgo LDFLAGS: -L./ -lbinding -lm -lstdc++
+// #include "binding.h"
+import "C"
+import (
+	"fmt"
+	"strings"
+	"unsafe"
+)
+
+type LLama struct {
+	state unsafe.Pointer
+}
+
+func New(model string, opts ...ModelOption) (*LLama, error) {
+	mo := NewModelOptions(opts...)
+	modelPath := C.CString(model)
+	result := C.load_model(modelPath, C.int(mo.ContextSize), C.int(mo.Parts), C.int(mo.Seed), C.bool(mo.F16Memory), C.bool(mo.MLock))
+	if result == nil {
+		return nil, fmt.Errorf("failed loading model")
+	}
+
+	return &LLama{state: result}, nil
+}
+
+func (l *LLama) Free() {
+	C.llama_free_model(l.state)
+}
+
+func (l *LLama) Predict(text string, opts ...PredictOption) (string, error) {
+	po := NewPredictOptions(opts...)
+
+	input := C.CString(text)
+	if po.Tokens == 0 {
+		po.Tokens = 99999999
+	}
+	out := make([]byte, po.Tokens)
+
+	params := C.llama_allocate_params(input, C.int(po.Seed), C.int(po.Threads), C.int(po.Tokens), C.int(po.TopK),
+		C.float(po.TopP), C.float(po.Temperature), C.float(po.Penalty), C.int(po.Repeat), C.bool(po.IgnoreEOS), C.bool(po.F16KV))
+	ret := C.llama_predict(params, l.state, (*C.char)(unsafe.Pointer(&out[0])))
+	if ret != 0 {
+		return "", fmt.Errorf("inference failed")
+	}
+	res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
+
+	res = strings.TrimPrefix(res, " ")
+	res = strings.TrimPrefix(res, text)
+	res = strings.TrimPrefix(res, "\n")
+
+	C.llama_free_params(params)
+
+	return res, nil
+}
diff --git a/options.go b/options.go
new file mode 100644
index 0000000..ae46fd3
--- /dev/null
+++ b/options.go
@@ -0,0 +1,148 @@
+package llama
+
+import "runtime"
+
+type ModelOptions struct {
+	ContextSize int
+	Parts       int
+	Seed        int
+	F16Memory   bool
+	MLock       bool
+}
+
+type PredictOptions struct {
+	Seed, Threads, Tokens, TopK, Repeat int
+	TopP, Temperature, Penalty          float64
+	F16KV                               bool
+	IgnoreEOS                           bool
+}
+
+type PredictOption func(p *PredictOptions)
+type ModelOption func(p *ModelOptions)
+
+var DefaultModelOptions ModelOptions = ModelOptions{
+	ContextSize: 512,
+	Seed:        0,
+	F16Memory:   false,
+	MLock:       false,
+}
+
+var DefaultOptions PredictOptions = PredictOptions{
+	Seed:        -1,
+	Threads:     runtime.NumCPU(),
+	Tokens:      128,
+	TopK:        10000,
+	TopP:        0.90,
+	Temperature: 0.96,
+	Penalty:     1,
+	Repeat:      64,
+}
+
+// SetContext sets the context size.
+func SetContext(c int) ModelOption {
+	return func(p *ModelOptions) {
+		p.ContextSize = c
+	}
+}
+
+func SetModelSeed(c int) ModelOption {
+	return func(p *ModelOptions) {
+		p.Seed = c
+	}
+}
+
+func SetParts(c int) ModelOption {
+	return func(p *ModelOptions) {
+		p.Parts = c
+	}
+}
+
+var EnableF16Memory ModelOption = func(p *ModelOptions) {
+	p.F16Memory = true
+}
+
+var EnableF16KV PredictOption = func(p *PredictOptions) {
+	p.F16KV = true
+}
+
+var EnableMLock ModelOption = func(p *ModelOptions) {
+	p.MLock = true
+}
+
+// NewModelOptions creates a new ModelOptions object with the given options.
+func NewModelOptions(opts ...ModelOption) ModelOptions {
+	p := DefaultModelOptions
+	for _, opt := range opts {
+		opt(&p)
+	}
+	return p
+}
+
+var IgnoreEOS PredictOption = func(p *PredictOptions) {
+	p.IgnoreEOS = true
+}
+
+// SetSeed sets the random seed for sampling text generation.
+func SetSeed(seed int) PredictOption {
+	return func(p *PredictOptions) {
+		p.Seed = seed
+	}
+}
+
+// SetThreads sets the number of threads to use for text generation.
+func SetThreads(threads int) PredictOption {
+	return func(p *PredictOptions) {
+		p.Threads = threads
+	}
+}
+
+// SetTokens sets the number of tokens to generate.
+func SetTokens(tokens int) PredictOption {
+	return func(p *PredictOptions) {
+		p.Tokens = tokens
+	}
+}
+
+// SetTopK sets the value for top-K sampling.
+func SetTopK(topk int) PredictOption {
+	return func(p *PredictOptions) {
+		p.TopK = topk
+	}
+}
+
+// SetTopP sets the value for nucleus sampling.
+func SetTopP(topp float64) PredictOption {
+	return func(p *PredictOptions) {
+		p.TopP = topp
+	}
+}
+
+// SetTemperature sets the temperature value for text generation.
+func SetTemperature(temp float64) PredictOption {
+	return func(p *PredictOptions) {
+		p.Temperature = temp
+	}
+}
+
+// SetPenalty sets the repetition penalty for text generation.
+func SetPenalty(penalty float64) PredictOption {
+	return func(p *PredictOptions) {
+		p.Penalty = penalty
+	}
+}
+
+// SetRepeat sets how many of the previous tokens are considered for the
+// repetition penalty (passed to the binding as repeat_last_n).
+func SetRepeat(repeat int) PredictOption {
+	return func(p *PredictOptions) {
+		p.Repeat = repeat
+	}
+}
+
+// NewPredictOptions creates a new PredictOptions object with the given options.
+func NewPredictOptions(opts ...PredictOption) PredictOptions {
+	p := DefaultOptions
+	for _, opt := range opts {
+		opt(&p)
+	}
+	return p
+}
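
Note (not part of the patch above): a minimal consumer sketch of the Go API this change adds. It assumes the static library has already been built with `make libbinding.a` and that the program runs from the repository root so the `#cgo LDFLAGS: -L./ -lbinding` directive in llama.go resolves; the model path and the option values (64 tokens, top-k 40, top-p 0.95) are illustrative placeholders. Unlike examples/main.go, it also releases the C-side context via Free.

package main

import (
	"fmt"

	llama "github.com/go-skynet/go-llama.cpp"
)

func main() {
	// New loads the model through load_model in binding.cpp; SetContext and
	// SetParts map to the n_ctx and n_parts fields of llama_context_default_params.
	l, err := llama.New("./models/7B/ggml-model-q4_0.bin", llama.SetContext(512), llama.SetParts(-1))
	if err != nil {
		panic(err)
	}
	// Free releases the llama_context allocated on the C++ side.
	defer l.Free()

	// Predict drives llama_predict; the options are the ones defined in options.go.
	res, err := l.Predict("Hello,", llama.SetTokens(64), llama.SetTopK(40), llama.SetTopP(0.95))
	if err != nil {
		panic(err)
	}
	fmt.Println(res)
}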