Work out kinks with the latest sync

Tested and restored some functionality of llamafiler; Q8_0 on ARM; fixed memory leaks; copied the latest upstream quantization code; migrated to newer llama.cpp APIs; etc. llamafiler is now able to serve 1800 embeddings per second, which is 6.81x faster than the upstream llama.cpp/examples/server/.
Mozilla-Ocho · Aug 3, 2024 · e914f13 · e914f13
1 parent 5da8d62
commit e914f13
Show file tree

Hide file tree

Showing 8 changed files with 3,048 additions and 5,234 deletions.
diff --git a/llama.cpp/ggml-common.h b/llama.cpp/ggml-common.h
@@ -322,9 +322,9 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_half) + QK_K / 16 + 3*QK_K/4, "w
 // This is only used for intermediate quantization and dot products
 // [kawrakow] Note: I have switched the order of bsums and qs. This results in some performance gain on Arm
 typedef struct {
-    float   d;              // delta
-    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
-    int8_t  qs[QK_K];       // quants
+    float   d;              // #1 delta
+    int16_t bsums[QK_K/16]; // #2 sum of quants in groups of 16
+    int8_t  qs[QK_K];       // #3 quants
 } block_q8_K;
 static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");