-
-
Notifications
You must be signed in to change notification settings - Fork 85
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
680 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[submodule "llama.cpp"] | ||
path = llama.cpp | ||
url = https://github.com/ggerganov/llama.cpp |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2023 go-skynet authors | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
# Host identification. Each value can be preset by the caller; when it is not,
# it is taken from the matching `uname` query.
ifndef UNAME_S
  UNAME_S := $(shell uname -s)
endif
ifndef UNAME_P
  UNAME_P := $(shell uname -p)
endif
ifndef UNAME_M
  UNAME_M := $(shell uname -m)
endif

# First line of each compiler's version banner, printed in the build summary.
CCV  := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)

# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
# When that mismatch is detected the UNAME_* values are left as reported and
# only a warning is printed.
ifeq ($(UNAME_S),Darwin)
  ifneq ($(UNAME_P),arm)
    SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
    ifeq ($(SYSCTL_M),1)
      warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
    endif
  endif
endif
|
||
# | ||
# Compile flags | ||
# | ||
|
||
# Baseline flags. The language standards stay at C11 and C++11.
CFLAGS   = -I./llama.cpp -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/examples -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS  =

# Warning sets for the C and C++ compilers.
CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function

# OS specific
# TODO: support Windows
# Every operating system listed here gets -pthread for both compilers.
ifneq ($(filter Linux Darwin FreeBSD NetBSD OpenBSD Haiku,$(UNAME_S)),)
  CFLAGS   += -pthread
  CXXFLAGS += -pthread
endif

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
#       feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
  # Use all CPU extensions that are available (added to CFLAGS only).
  CFLAGS += -march=native -mtune=native
endif

ifneq ($(filter ppc64%,$(UNAME_M)),)
  POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
  ifneq (,$(findstring POWER9,$(POWER9_M)))
    CFLAGS   += -mcpu=power9
    CXXFLAGS += -mcpu=power9
  endif
  # Require c++23's std::byteswap for big-endian support.
  ifeq ($(UNAME_M),ppc64)
    CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
  endif
endif

# Optional features, switched by LLAMA_* make variables.
ifndef LLAMA_NO_ACCELERATE
  # Mac M1 - include Accelerate framework.
  # `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
  ifeq ($(UNAME_S),Darwin)
    CFLAGS  += -DGGML_USE_ACCELERATE
    LDFLAGS += -framework Accelerate
  endif
endif

ifdef LLAMA_OPENBLAS
  CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
  LDFLAGS += -lopenblas
endif

ifdef LLAMA_GPROF
  CFLAGS   += -pg
  CXXFLAGS += -pg
endif

# ARM variants.
ifneq ($(filter aarch64%,$(UNAME_M)),)
  CFLAGS   += -mcpu=native
  CXXFLAGS += -mcpu=native
endif

ifneq ($(filter armv6%,$(UNAME_M)),)
  # Raspberry Pi 1, 2, 3
  CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif

ifneq ($(filter armv7%,$(UNAME_M)),)
  # Raspberry Pi 4
  CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif

ifneq ($(filter armv8%,$(UNAME_M)),)
  # Raspberry Pi 4
  CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif
|
||
# | ||
# Print build information | ||
# | ||
|
||
# Summary of the detected platform and the effective flags, printed whenever
# this Makefile is parsed.
$(info I llama.cpp build info: )
$(info I UNAME_S: $(UNAME_S))
$(info I UNAME_P: $(UNAME_P))
$(info I UNAME_M: $(UNAME_M))
$(info I CFLAGS: $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )

# `clean` names an action, not a file; keep it runnable even if a file called
# "clean" ever appears in this directory.
.PHONY: clean

# The llama.cpp objects are built by the submodule's own Makefile.
llama.cpp/ggml.o:
	$(MAKE) -C llama.cpp ggml.o

llama.cpp/llama.o:
	$(MAKE) -C llama.cpp llama.o

llama.cpp/common.o:
	$(MAKE) -C llama.cpp common.o

# binding.cpp and binding.h are listed as prerequisites so that editing either
# one rebuilds the object. LDFLAGS is not passed: this is a compile-only (-c)
# step and nothing is linked here.
binding.o: binding.cpp binding.h llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
	$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples -c binding.cpp -o binding.o

# Static archive bundling the binding together with the llama.cpp objects.
libbinding.a: binding.o
	ar src libbinding.a llama.cpp/ggml.o llama.cpp/common.o llama.cpp/llama.o binding.o

# Removes local objects and archives, then cleans the submodule as well.
clean:
	rm -rf *.o
	rm -rf *.a
	$(MAKE) -C llama.cpp clean
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
#include "common.h" | ||
#include "llama.h" | ||
#include "binding.h" | ||
|
||
#include <cassert> | ||
#include <cinttypes> | ||
#include <cmath> | ||
#include <cstdio> | ||
#include <cstring> | ||
#include <fstream> | ||
#include <iostream> | ||
#include <string> | ||
#include <vector> | ||
|
||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) | ||
#include <signal.h> | ||
#include <unistd.h> | ||
#elif defined (_WIN32) | ||
#include <signal.h> | ||
#endif | ||
|
||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
// Signal handler: terminates the process immediately with status 130 when the
// delivered signal is SIGINT; any other signal number is ignored.
void sigint_handler(int signo) {
    if (signo != SIGINT) {
        return;
    }
    _exit(130);
}
#endif
|
||
int llama_predict(void* params_ptr, void* state_pr, char* result) { | ||
gpt_params* params_p = (gpt_params*) params_ptr; | ||
llama_context* ctx = (llama_context*) state_pr; | ||
|
||
gpt_params params = *params_p; | ||
|
||
if (params.seed <= 0) { | ||
params.seed = time(NULL); | ||
} | ||
|
||
std::mt19937 rng(params.seed); | ||
|
||
// Add a space in front of the first character to match OG llama tokenizer behavior | ||
params.prompt.insert(0, 1, ' '); | ||
|
||
// tokenize the prompt | ||
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); | ||
|
||
const int n_ctx = llama_n_ctx(ctx); | ||
|
||
// number of tokens to keep when resetting context | ||
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) { | ||
params.n_keep = (int)embd_inp.size(); | ||
} | ||
|
||
// determine newline token | ||
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); | ||
|
||
// TODO: replace with ring-buffer | ||
std::vector<llama_token> last_n_tokens(n_ctx); | ||
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); | ||
|
||
int n_past = 0; | ||
int n_remain = params.n_predict; | ||
int n_consumed = 0; | ||
|
||
std::vector<llama_token> embd; | ||
std::string res = ""; | ||
|
||
while (n_remain != 0) { | ||
// predict | ||
if (embd.size() > 0) { | ||
// infinite text generation via context swapping | ||
// if we run out of context: | ||
// - take the n_keep first tokens from the original prompt (via n_past) | ||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch | ||
if (n_past + (int) embd.size() > n_ctx) { | ||
const int n_left = n_past - params.n_keep; | ||
|
||
n_past = params.n_keep; | ||
|
||
// insert n_left/2 tokens at the start of embd from last_n_tokens | ||
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); | ||
} | ||
|
||
if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { | ||
fprintf(stderr, "%s : failed to eval\n", __func__); | ||
return 1; | ||
} | ||
} | ||
|
||
n_past += embd.size(); | ||
embd.clear(); | ||
|
||
if ((int) embd_inp.size() <= n_consumed) { | ||
// out of user input, sample next token | ||
const int32_t top_k = params.top_k; | ||
const float top_p = params.top_p; | ||
const float temp = params.temp; | ||
const float repeat_penalty = params.repeat_penalty; | ||
|
||
llama_token id = 0; | ||
|
||
{ | ||
auto logits = llama_get_logits(ctx); | ||
|
||
if (params.ignore_eos) { | ||
logits[llama_token_eos()] = 0; | ||
} | ||
|
||
id = llama_sample_top_p_top_k(ctx, | ||
last_n_tokens.data() + n_ctx - params.repeat_last_n, | ||
params.repeat_last_n, top_k, top_p, temp, repeat_penalty); | ||
|
||
last_n_tokens.erase(last_n_tokens.begin()); | ||
last_n_tokens.push_back(id); | ||
} | ||
|
||
// add it to the context | ||
embd.push_back(id); | ||
|
||
// decrement remaining sampling budget | ||
--n_remain; | ||
} else { | ||
// some user input remains from prompt or interaction, forward it to processing | ||
while ((int) embd_inp.size() > n_consumed) { | ||
embd.push_back(embd_inp[n_consumed]); | ||
last_n_tokens.erase(last_n_tokens.begin()); | ||
last_n_tokens.push_back(embd_inp[n_consumed]); | ||
++n_consumed; | ||
if ((int) embd.size() >= params.n_batch) { | ||
break; | ||
} | ||
} | ||
} | ||
|
||
for (auto id : embd) { | ||
res += llama_token_to_str(ctx, id); | ||
} | ||
|
||
// end of text token | ||
if (embd.back() == llama_token_eos()) { | ||
break; | ||
} | ||
} | ||
|
||
#if defined (_WIN32) | ||
signal(SIGINT, SIG_DFL); | ||
#endif | ||
strcpy(result, res.c_str()); | ||
return 0; | ||
} | ||
|
||
void llama_free_model(void *state_ptr) { | ||
llama_context* ctx = (llama_context*) state_ptr; | ||
llama_free(ctx); | ||
} | ||
|
||
void llama_free_params(void* params_ptr) { | ||
gpt_params* params = (gpt_params*) params_ptr; | ||
delete params; | ||
} | ||
|
||
|
||
/**
 * Allocates a gpt_params object with `new` and fills it from the arguments.
 *
 * @param prompt          Prompt text; a null pointer is treated as an empty prompt.
 * @param seed            Stored in gpt_params::seed.
 * @param threads         Stored in gpt_params::n_threads.
 * @param tokens          Stored in gpt_params::n_predict.
 * @param top_k           Stored in gpt_params::top_k.
 * @param top_p           Stored in gpt_params::top_p.
 * @param temp            Stored in gpt_params::temp.
 * @param repeat_penalty  Stored in gpt_params::repeat_penalty.
 * @param repeat_last_n   Stored in gpt_params::repeat_last_n.
 * @param ignore_eos      Stored in gpt_params::ignore_eos.
 * @param memory_f16      Stored in gpt_params::memory_f16.
 * @return The new object as an opaque pointer; release it with llama_free_params().
 */
void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k,
                            float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos, bool memory_f16) {
    gpt_params* params = new gpt_params;

    params->seed          = seed;
    params->n_threads     = threads;
    params->n_predict     = tokens;
    params->repeat_last_n = repeat_last_n;

    params->top_k          = top_k;
    params->top_p          = top_p;
    params->memory_f16     = memory_f16;
    params->temp           = temp;
    params->repeat_penalty = repeat_penalty;

    // Assigning a null char pointer to std::string is undefined behaviour,
    // so substitute an empty prompt.
    params->prompt     = (prompt != nullptr) ? prompt : "";
    params->ignore_eos = ignore_eos;

    return params;
}
|
||
void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock) { | ||
// load the model | ||
auto lparams = llama_context_default_params(); | ||
|
||
lparams.n_ctx = n_ctx; | ||
lparams.n_parts = n_parts; | ||
lparams.seed = n_seed; | ||
lparams.f16_kv = memory_f16; | ||
lparams.use_mlock = mlock; | ||
|
||
return llama_init_from_file(fname, lparams); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
/* C interface to the llama.cpp binding. All handles are opaque pointers. */
#ifndef LLAMA_GO_BINDING_H
#define LLAMA_GO_BINDING_H

#include <stdbool.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Creates a model context from the file `fname`.
 *
 * @param fname       Path of the model file.
 * @param n_ctx       Context size passed to the loader.
 * @param n_parts     Number of model parts passed to the loader.
 * @param n_seed      Seed passed to the loader.
 * @param memory_f16  Passed to the loader as its f16_kv setting.
 * @param mlock       Passed to the loader as its use_mlock setting.
 * @return Opaque context handle (the loader's return value); release it with
 *         llama_free_model().
 */
void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock);

/**
 * Allocates a parameter object for llama_predict() and fills it from the
 * arguments (prompt text, seed, thread count, number of tokens to predict,
 * and the sampling settings).
 *
 * @return Opaque parameter handle; release it with llama_free_params().
 */
void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens,
                            int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos, bool memory_f16);

/** Destroys a parameter object returned by llama_allocate_params(). */
void llama_free_params(void* params_ptr);

/** Releases a context returned by load_model(). */
void llama_free_model(void* state);

/**
 * Runs a prediction.
 *
 * @param params_ptr  Handle from llama_allocate_params().
 * @param state_pr    Handle from load_model().
 * @param result      Receives the NUL-terminated text (prompt followed by the
 *                    generated tokens). No buffer size is passed, so the
 *                    caller must provide a buffer large enough for the output.
 * @return 0 on success, 1 when evaluation fails.
 */
int llama_predict(void* params_ptr, void* state_pr, char* result);

#ifdef __cplusplus
}
#endif

#endif /* LLAMA_GO_BINDING_H */
Oops, something went wrong.