Commit

First import
mudler committed Apr 4, 2023
1 parent 4b01db1 commit 24b85a9
Showing 10 changed files with 680 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "llama.cpp"]
	path = llama.cpp
	url = https://github.com/ggerganov/llama.cpp
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 go-skynet authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
153 changes: 153 additions & 0 deletions Makefile
@@ -0,0 +1,153 @@
ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif

ifndef UNAME_P
UNAME_P := $(shell uname -p)
endif

ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif

CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)

# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
ifneq ($(UNAME_P),arm)
SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
ifeq ($(SYSCTL_M),1)
# UNAME_P := arm
# UNAME_M := arm64
warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
endif
endif
endif

#
# Compile flags
#

# keep standard at C11 and C++11
CFLAGS = -I./llama.cpp -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I./llama.cpp -I. -I./llama.cpp/examples -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =

# warnings
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function

# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Darwin)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),FreeBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),NetBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),OpenBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
ifeq ($(UNAME_S),Haiku)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
# Use all CPU extensions that are available:
CFLAGS += -march=native -mtune=native
endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
ifneq (,$(findstring POWER9,$(POWER9_M)))
CFLAGS += -mcpu=power9
CXXFLAGS += -mcpu=power9
endif
# Requires C++23's std::byteswap for big-endian support.
ifeq ($(UNAME_M),ppc64)
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
endif
endif
ifndef LLAMA_NO_ACCELERATE
# Mac M1 - include Accelerate framework.
# `-framework Accelerate` works on Mac Intel as well, with a negligible performance boost (in prediction time).
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate
endif
endif
ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -lopenblas
endif
ifdef LLAMA_GPROF
CFLAGS += -pg
CXXFLAGS += -pg
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, 2, 3
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
# Raspberry Pi 4
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 4
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

#
# Print build information
#

$(info I llama.cpp build info: )
$(info I UNAME_S: $(UNAME_S))
$(info I UNAME_P: $(UNAME_P))
$(info I UNAME_M: $(UNAME_M))
$(info I CFLAGS: $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )

llama.cpp/ggml.o:
	$(MAKE) -C llama.cpp ggml.o

llama.cpp/llama.o:
	$(MAKE) -C llama.cpp llama.o

llama.cpp/common.o:
	$(MAKE) -C llama.cpp common.o

binding.o: llama.cpp/ggml.o llama.cpp/llama.o llama.cpp/common.o
	$(CXX) $(CXXFLAGS) -I./llama.cpp -I./llama.cpp/examples binding.cpp -o binding.o -c $(LDFLAGS)

libbinding.a: binding.o
	ar src libbinding.a llama.cpp/ggml.o llama.cpp/common.o llama.cpp/llama.o binding.o

clean:
	rm -rf *.o
	rm -rf *.a
	$(MAKE) -C llama.cpp clean
195 changes: 195 additions & 0 deletions binding.cpp
@@ -0,0 +1,195 @@
#include "common.h"
#include "llama.h"
#include "binding.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#include <signal.h>
#endif

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) {
    if (signo == SIGINT) {
        _exit(130);
    }
}
#endif

int llama_predict(void* params_ptr, void* state_pr, char* result) {
    gpt_params* params_p = (gpt_params*) params_ptr;
    llama_context* ctx = (llama_context*) state_pr;

    gpt_params params = *params_p;

    if (params.seed <= 0) {
        params.seed = time(NULL);
    }

    std::mt19937 rng(params.seed);

    // Add a space in front of the first character to match OG llama tokenizer behavior
    params.prompt.insert(0, 1, ' ');

    // tokenize the prompt
    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);

    const int n_ctx = llama_n_ctx(ctx);

    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
        params.n_keep = (int)embd_inp.size();
    }

    // determine newline token
    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);

    // TODO: replace with ring-buffer
    std::vector<llama_token> last_n_tokens(n_ctx);
    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);

    int n_past = 0;
    int n_remain = params.n_predict;
    int n_consumed = 0;

    std::vector<llama_token> embd;
    std::string res = "";

    while (n_remain != 0) {
        // predict
        if (embd.size() > 0) {
            // infinite text generation via context swapping
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
            if (n_past + (int) embd.size() > n_ctx) {
                const int n_left = n_past - params.n_keep;

                n_past = params.n_keep;

                // insert n_left/2 tokens at the start of embd from last_n_tokens
                embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
            }

            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return 1;
            }
        }

        n_past += embd.size();
        embd.clear();

        if ((int) embd_inp.size() <= n_consumed) {
            // out of user input, sample next token
            const int32_t top_k = params.top_k;
            const float top_p = params.top_p;
            const float temp = params.temp;
            const float repeat_penalty = params.repeat_penalty;

            llama_token id = 0;

            {
                auto logits = llama_get_logits(ctx);

                if (params.ignore_eos) {
                    logits[llama_token_eos()] = 0;
                }

                id = llama_sample_top_p_top_k(ctx,
                        last_n_tokens.data() + n_ctx - params.repeat_last_n,
                        params.repeat_last_n, top_k, top_p, temp, repeat_penalty);

                last_n_tokens.erase(last_n_tokens.begin());
                last_n_tokens.push_back(id);
            }

            // add it to the context
            embd.push_back(id);

            // decrement remaining sampling budget
            --n_remain;
        } else {
            // some user input remains from prompt or interaction, forward it to processing
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);
                last_n_tokens.erase(last_n_tokens.begin());
                last_n_tokens.push_back(embd_inp[n_consumed]);
                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
                }
            }
        }

        for (auto id : embd) {
            res += llama_token_to_str(ctx, id);
        }

        // end of text token
        if (embd.back() == llama_token_eos()) {
            break;
        }
    }

#if defined (_WIN32)
    signal(SIGINT, SIG_DFL);
#endif
    strcpy(result, res.c_str());
    return 0;
}

void llama_free_model(void *state_ptr) {
    llama_context* ctx = (llama_context*) state_ptr;
    llama_free(ctx);
}

void llama_free_params(void* params_ptr) {
    gpt_params* params = (gpt_params*) params_ptr;
    delete params;
}


void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens, int top_k,
                            float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos, bool memory_f16) {
    gpt_params* params = new gpt_params;
    params->seed = seed;
    params->n_threads = threads;
    params->n_predict = tokens;
    params->repeat_last_n = repeat_last_n;

    params->top_k = top_k;
    params->top_p = top_p;
    params->memory_f16 = memory_f16;
    params->temp = temp;
    params->repeat_penalty = repeat_penalty;

    params->prompt = prompt;
    params->ignore_eos = ignore_eos;

    return params;
}

void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock) {
    // load the model
    auto lparams = llama_context_default_params();

    lparams.n_ctx = n_ctx;
    lparams.n_parts = n_parts;
    lparams.seed = n_seed;
    lparams.f16_kv = memory_f16;
    lparams.use_mlock = mlock;

    return llama_init_from_file(fname, lparams);
}
20 changes: 20 additions & 0 deletions binding.h
@@ -0,0 +1,20 @@
#ifdef __cplusplus
extern "C" {
#endif

#include <stdbool.h>

void* load_model(const char *fname, int n_ctx, int n_parts, int n_seed, bool memory_f16, bool mlock);

void* llama_allocate_params(const char *prompt, int seed, int threads, int tokens,
                            int top_k, float top_p, float temp, float repeat_penalty, int repeat_last_n, bool ignore_eos, bool memory_f16);

void llama_free_params(void* params_ptr);

void llama_free_model(void* state);

int llama_predict(void* params_ptr, void* state_pr, char* result);

#ifdef __cplusplus
}
#endif
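
A minimal usage sketch, not part of this commit, showing how the C API declared in binding.h can be driven from a plain C++ program. The model path, context size, sampling values, and the size of the result buffer are assumptions; llama_predict copies the generated text into a caller-owned buffer, and load_model forwards the return value of llama_init_from_file, so a NULL return is treated as a load failure here.

// usage_sketch.cpp (hypothetical, for illustration only)
#include <cstdio>

#include "binding.h"

int main() {
    // Load the model; the path and parameter values are assumptions.
    void* state = load_model("./models/ggml-model-q4_0.bin",
                             /*n_ctx=*/512, /*n_parts=*/-1, /*n_seed=*/0,
                             /*memory_f16=*/true, /*mlock=*/false);
    if (state == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Allocate a gpt_params object through the C wrapper.
    void* params = llama_allocate_params("Hello, world", /*seed=*/0, /*threads=*/4,
                                         /*tokens=*/128, /*top_k=*/40, /*top_p=*/0.95f,
                                         /*temp=*/0.8f, /*repeat_penalty=*/1.1f,
                                         /*repeat_last_n=*/64, /*ignore_eos=*/false,
                                         /*memory_f16=*/true);

    // llama_predict writes the generated text into a caller-provided buffer;
    // the buffer size used here is an assumption, not dictated by the API.
    char result[4096] = {0};
    if (llama_predict(params, state, result) != 0) {
        fprintf(stderr, "prediction failed\n");
    } else {
        printf("%s\n", result);
    }

    // Release the parameter object and the model context.
    llama_free_params(params);
    llama_free_model(state);
    return 0;
}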