Skip to content

Commit d5b51bc

Browse files
committed
Merge branch 'concedo' into ycros-rope-linear-8k
2 parents 385229a + ddaa4f2 commit d5b51bc

40 files changed

+6566
-2179
lines changed

CMakeLists.txt

+13
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ endif()
4444
option(LLAMA_CUBLAS "llama: use cuBLAS" ON)
4545
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
4646
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
47+
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
4748
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
4849
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
4950
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
@@ -76,8 +77,11 @@ if (LLAMA_CUBLAS)
7677
set(GGML_V2_LEGACY_CUDA_SOURCES otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h)
7778

7879
add_compile_definitions(GGML_USE_CUBLAS)
80+
add_compile_definitions(GGML_CUDA_FORCE_DMMV) #non dmmv broken for me
81+
7982
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
8083
add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
84+
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
8185
if (LLAMA_CUDA_DMMV_F16)
8286
add_compile_definitions(GGML_CUDA_DMMV_F16)
8387
endif()
@@ -89,6 +93,15 @@ if (LLAMA_CUBLAS)
8993
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
9094
endif()
9195

96+
# Pick default CUDA target architectures, but only when the user has not
# already provided CMAKE_CUDA_ARCHITECTURES themselves.
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    if (NOT LLAMA_CUDA_DMMV_F16)
        # lowest CUDA 12 standard + lowest for integer intrinsics
        set(CMAKE_CUDA_ARCHITECTURES "52;61")
    else()
        # needed for f16 CUDA intrinsics
        set(CMAKE_CUDA_ARCHITECTURES "61")
    endif()
endif()
103+
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
104+
92105
else()
93106
message(WARNING "cuBLAS not found")
94107
endif()

Makefile

+3-1
Original file line numberDiff line numberDiff line change
@@ -144,16 +144,18 @@ ifdef LLAMA_CUBLAS
144144
CUBLASLD_FLAGS = -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
145145
CUBLAS_OBJS = ggml-cuda.o ggml_v2-cuda.o ggml_v2-cuda-legacy.o
146146
NVCC = nvcc
147-
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
147+
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_FORCE_DMMV
148148
ifdef LLAMA_CUDA_DMMV_X
149149
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
150150
else
151151
NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
152152
endif # LLAMA_CUDA_DMMV_X
153153
# y block size for the dmmv CUDA kernels; falls back to 1 when the user did
# not pass LLAMA_CUDA_DMMV_Y.
ifdef LLAMA_CUDA_DMMV_Y
  NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
else
  NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
endif # LLAMA_CUDA_DMMV_Y
# y block size for the mmv CUDA kernels; falls back to 1 when the user did
# not pass LLAMA_CUDA_MMV_Y. This is tested on its own variable: gating it on
# LLAMA_CUDA_DMMV_Y would emit an empty -DGGML_CUDA_MMV_Y= when only DMMV_Y is
# set, and would ignore LLAMA_CUDA_MMV_Y when only MMV_Y is set.
ifdef LLAMA_CUDA_MMV_Y
  NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
else
  NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
endif # LLAMA_CUDA_MMV_Y
158160
ifdef LLAMA_CUDA_DMMV_F16
159161
NVCCFLAGS += -DGGML_CUDA_DMMV_F16

convert.py

+6
Original file line numberDiff line numberDiff line change
@@ -154,9 +154,15 @@ def guessed(model: 'LazyModel') -> 'Params':
154154
# try transformer naming first
155155
if "model.layers.0.self_attn.q_proj.weight" in model:
156156
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
157+
elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
158+
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
157159
else:
158160
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
159161

162+
if n_layer < 1:
163+
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
164+
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
165+
160166
n_head=n_embd // 128 # guessed
161167

162168
return Params(

examples/alpaca.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
cd `dirname $0`
88
cd ..
99

10-
./main -m ./models/ggml-alpaca-7b-q4.bin \
10+
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
1111
--color \
1212
-f ./prompts/alpaca.txt \
1313
--ctx_size 2048 \

examples/common.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ struct gpt_params {
3131
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
3232
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
3333
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
34-
bool low_vram = 0; // if true, reduce VRAM usage at the cost of performance
34+
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
3535

3636
// sampling parameters
3737
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -59,6 +59,7 @@ struct gpt_params {
5959
std::string lora_adapter = ""; // lora adapter path
6060
std::string lora_base = ""; // base model path for the lora adapter
6161

62+
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
6263
bool memory_f16 = true; // use f16 instead of f32 for memory kv
6364
bool random_prompt = false; // do not randomize prompt if none provided
6465
bool use_color = false; // use color to distinguish generations and inputs

examples/embd-input/embd-input-lib.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
2929

3030
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
3131

32-
if (params.seed < 0) {
32+
if (params.seed == LLAMA_DEFAULT_SEED) {
3333
params.seed = time(NULL);
3434
}
3535
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

examples/embedding/embedding.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
1818
params.embedding = true;
1919

2020
if (params.n_ctx > 2048) {
21-
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
21+
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
2222
"expect poor results\n", __func__, params.n_ctx);
2323
}
2424

examples/main/main.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
8585
}
8686

8787
if (params.n_ctx > 2048) {
88-
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
88+
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
8989
"expect poor results\n", __func__, params.n_ctx);
9090
} else if (params.n_ctx < 8) {
9191
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);

examples/perplexity/perplexity.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
130130
params.n_batch = std::min(params.n_batch, params.n_ctx);
131131

132132
if (params.n_ctx > 2048) {
133-
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
133+
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
134134
"expect poor results\n", __func__, params.n_ctx);
135135
}
136136

examples/quantize-stats/quantize-stats.cpp

+7-7
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ void test_roundtrip_on_chunk(
147147
const ggml_tensor * layer,
148148
int64_t offset,
149149
int64_t chunk_size,
150-
const quantize_fns_t & qfns,
150+
const ggml_type_traits_t & qfns,
151151
bool use_reference,
152152
float * input_scratch,
153153
char * quantized_scratch,
@@ -163,11 +163,11 @@ void test_roundtrip_on_chunk(
163163
}
164164

165165
if (use_reference) {
166-
qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
166+
qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
167167
} else {
168-
qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
168+
qfns.from_float(input_scratch, quantized_scratch, chunk_size);
169169
}
170-
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
170+
qfns.to_float(quantized_scratch, output_scratch, chunk_size);
171171

172172
update_error_stats(chunk_size, input_scratch, output_scratch, stats);
173173
}
@@ -177,7 +177,7 @@ void test_roundtrip_on_chunk(
177177
void test_roundtrip_on_layer(
178178
std::string & name,
179179
bool print_layer_stats,
180-
const quantize_fns_t & qfns,
180+
const ggml_type_traits_t & qfns,
181181
bool use_reference,
182182
const ggml_tensor * layer,
183183
std::vector<float> & input_scratch,
@@ -388,8 +388,8 @@ int main(int argc, char ** argv) {
388388
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
389389
continue;
390390
}
391-
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
392-
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
391+
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
392+
if (qfns.from_float && qfns.to_float) {
393393
if (params.verbose) {
394394
printf("testing %s ...\n", ggml_type_name(type));
395395
}

examples/server/README.md

+52-8
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
# llama.cpp/example/server
22

3-
This example demonstrates a simple HTTP API server to interact with llama.cpp.
3+
This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp.
44

55
Command line options:
66

77
- `--threads N`, `-t N`: Set the number of threads to use during computation.
88
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
99
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
10-
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
10+
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models; for example, Baichuan models were built with a context of 4096.
1111
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
1212
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
1313
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
@@ -21,24 +21,22 @@ Command line options:
2121
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
2222
- `--host`: Set the hostname or IP address to listen on. Default `127.0.0.1`.
2323
- `--port`: Set the port to listen. Default: `8080`.
24+
- `--path`: Set the path from which to serve static files. Default: `examples/server/public`.
2425
- `--embedding`: Enable embedding extraction, Default: disabled.
2526

2627
## Build
2728

28-
Build llama.cpp with server from repository root with either make or CMake.
29+
The server is built alongside everything else from the root of the project.
2930

3031
- Using `make`:
3132

3233
```bash
33-
LLAMA_BUILD_SERVER=1 make
34+
make
3435
```
3536

3637
- Using `CMake`:
3738

3839
```bash
39-
mkdir build-server
40-
cd build-server
41-
cmake -DLLAMA_BUILD_SERVER=ON ..
4240
cmake --build . --config Release
4341
```
4442

@@ -59,7 +57,7 @@ server.exe -m models\7B\ggml-model.bin -c 2048
5957
```
6058

6159
The above command will start a server that by default listens on `127.0.0.1:8080`.
62-
You can consume the endpoints with Postman or NodeJS with axios library.
60+
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
6361

6462
## Testing with CURL
6563

@@ -190,3 +188,49 @@ Run with bash:
190188
```sh
191189
bash chat.sh
192190
```
191+
192+
### API like OAI
193+
194+
API example using Python Flask: [api_like_OAI.py](api_like_OAI.py)
195+
This example must be used with server.cpp
196+
197+
```sh
198+
python api_like_OAI.py
199+
```
200+
201+
After running the API server, you can use it in Python by setting the API base URL.
202+
```python
203+
openai.api_base = "http://<Your api-server IP>:port"
204+
```
205+
206+
Then you can use llama.cpp through an OpenAI-style **chat.completion** or **text_completion** API.
207+
208+
### Extending or building alternative Web Front End
209+
210+
The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
211+
212+
Read the documentation in `/completion.js` to see convenient ways to access llama.
213+
214+
A simple example is below:
215+
216+
```html
217+
<html>
218+
<body>
219+
<pre>
220+
<script type="module">
221+
import { llama } from '/completion.js'
222+
223+
const prompt = `### Instruction:
224+
Write dad jokes, each one paragraph.
225+
You can use html formatting if needed.
226+
227+
### Response:`
228+
229+
for await (const chunk of llama(prompt)) {
230+
document.write(chunk.data.content)
231+
}
232+
</script>
233+
</pre>
234+
</body>
235+
</html>
236+
```

0 commit comments

Comments
 (0)