From d6cc466bd48dd27474ecb00c3baba2e8a887f6c4 Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Tue, 3 Jun 2025 02:55:32 -0700 Subject: [PATCH 1/9] Get_Rows & Dequantize implementation adapted to work for repacked weights of type q4_0 --- ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 145 ++++++++++++++++++++++++- src/whisper.cpp | 43 +++++--- 2 files changed, 172 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 0a3ff867cfe..5f143e25477 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -6046,6 +6046,9 @@ template src[0]->ne[2]) * op->src[1]->ne[2]; return true; + case GGML_OP_GET_ROWS: + size = 0; // GET_ROWS (standard and repacked) doesn't need a work buffer + return true; default: // GGML_ABORT("fatal error"); break; @@ -6061,6 +6064,9 @@ template src[0]; + + switch (src0->type) { + case GGML_TYPE_Q4_0: { + ggml_compute_forward_get_rows_q4_0x8(params, dst); + } break; + default: + GGML_ABORT("fatal error"); + break; + } + } + + static void ggml_compute_forward_get_rows_q4_0x8( + const ggml_compute_params *params, + ggml_tensor *dst) { + const ggml_tensor *src0 = dst->src[0]; + const ggml_tensor *src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); + + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == ggml_type_size(src0->type)); + assert(ggml_nrows(dst) == nr); + + const int ith = params->ith; + const int nth = params->nth; + + // rows per thread + const int dr = (nr + nth - 1) / nth; + + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + constexpr int nrows_interleaved = 8; + const size_t sizeof_one_repacked_block = sizeof(block_q4_0x8); + + const int num_repacked_blocks_per_row_width = nc / QK4_0; + + const size_t stride_between_actual_row_groups = num_repacked_blocks_per_row_width * sizeof_one_repacked_block; + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i / (ne11 * ne10); + const int64_t i11 = (i - i12 * ne11 * ne10) / ne10; + const int64_t i10 = (i - i12 * ne11 * ne10 - i11 * ne10); + const int64_t i01 = *(int32_t *)((char *)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12); // original logical row + + GGML_ASSERT(i01 >= 0 && i01 < ne01); + + int row_group_idx = i01 / nrows_interleaved; + const int row_idx_in_group = i01 % nrows_interleaved; + + const char *base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03; + + // Pointer to the first block_q4_0x8 of the identified row_group_idx + const block_q4_0x8 *p_first_repacked_block_of_group_x8 = (const block_q4_0x8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups); + + dequantize_row_q4_0x8( + p_first_repacked_block_of_group_x8, + (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group); + } + } + + /** + * Dequantizes a single logical row from data repacked with quant interleaving. + * + * @param p_repacked_group_column_blocks Pointer to the start of 'block_q4_0x8' for the row group. + * @param y Output buffer for the dequantized float values. + * @param k Total number of elements (columns) in the logical row. + * @param row_idx_in_group Index (0-7) of the logical row to dequantize. 
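+ *
+ * Worked example (derived from the constants in the function body below,
+ * assuming QK4_0 == 32): each block_q4_0x8 interleaves one q4_0 block from
+ * each of 8 rows, so bytes_for_half_elements = (32 / 2) / 2 = 8 and the
+ * second halves of all rows start at byte offset 8 * 8 = 64 in qs[]. For
+ * row_idx_in_group == 3 the scale is d[3], the first-half bytes sit at
+ * qs[24..31], the second-half bytes at qs[88..95], and each 8-byte chunk is
+ * XORed with 0x8888888888888888 to undo the transform applied at repack time.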
+ */ + static void dequantize_row_q4_0x8( + const block_q4_0x8 *GGML_RESTRICT p_repacked_group_column_blocks, + float *GGML_RESTRICT y, + int64_t k, + int row_idx_in_group) { + const int GGML_Q4_0_X8_INTERLEAVE_SIZE = 8; + assert(k % QK4_0 == 0); + assert(row_idx_in_group >= 0 && row_idx_in_group < GGML_Q4_0_X8_INTERLEAVE_SIZE); + + const int nb = k / QK4_0; + const int bytes_for_half_elements = (QK4_0 / 2) / 2; + + const int offset_to_second_half_data = bytes_for_half_elements * GGML_Q4_0_X8_INTERLEAVE_SIZE; + const uint64_t xor_mask = 0x8888888888888888ULL; + const int qk4_0_half_elements = QK4_0 / 2; + + for (int i = 0; i < nb; ++i) { + const block_q4_0x8 *current_column_repacked_block = &p_repacked_group_column_blocks[i]; + const float d_val = GGML_FP16_TO_FP32(current_column_repacked_block->d[row_idx_in_group]); + float *y_curr = y + i * QK4_0; + + const int8_t *qs_first_half_repacked_ptr = &(current_column_repacked_block->qs[row_idx_in_group * bytes_for_half_elements]); + + uint64_t first_half_chunk_u64; + memcpy(&first_half_chunk_u64, qs_first_half_repacked_ptr, sizeof(uint64_t)); + first_half_chunk_u64 ^= xor_mask; // Reverse the XOR + const uint8_t *original_qs_first_half_bytes = (const uint8_t *)&first_half_chunk_u64; + + const int8_t *qs_second_half_repacked_ptr = &(current_column_repacked_block->qs[offset_to_second_half_data + (row_idx_in_group * bytes_for_half_elements)]); + + uint64_t second_half_chunk_u64; + memcpy(&second_half_chunk_u64, qs_second_half_repacked_ptr, sizeof(uint64_t)); + second_half_chunk_u64 ^= xor_mask; // Reverse the XOR + const uint8_t *original_qs_second_half_bytes = (const uint8_t *)&second_half_chunk_u64; + + // dequantizing all QK4_0's for this block. + for (int j = 0; j < bytes_for_half_elements; ++j) { + const uint8_t quant_byte_first = original_qs_first_half_bytes[j]; + y_curr[j] = ((quant_byte_first & 0x0F) - 8) * d_val; + y_curr[j + qk4_0_half_elements] = ((quant_byte_first >> 4) - 8) * d_val; + + const uint8_t quant_byte_second = original_qs_second_half_bytes[j]; + const int out_idx_base_second_half = j + bytes_for_half_elements; // Offset for the second set of low nibbles + y_curr[out_idx_base_second_half] = ((quant_byte_second & 0x0F) - 8) * d_val; + y_curr[out_idx_base_second_half + qk4_0_half_elements] = ((quant_byte_second >> 4) - 8) * d_val; + } + } + } + void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) { const ggml_tensor * src0 = op->src[0]; const ggml_tensor * src1 = op->src[1]; @@ -6398,12 +6530,23 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type { //if (op->src[1]->type == GGML_TYPE_Q8_0) { // return true; //} + } else if (op->op == GGML_OP_GET_ROWS + && op->src[0]->buffer + && (ggml_n_dims(op->src[0]) == 2) + && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() + && ggml_aarch64_get_optimal_repack_type(op->src[0])) { + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if (op->src[0]->type == GGML_TYPE_Q4_0) { + return true; + } } return false; } ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { - if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { + if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_GET_ROWS) { if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type()) { return (ggml::cpu::tensor_traits *) op->src[0]->extra; } diff --git a/src/whisper.cpp b/src/whisper.cpp index a2f28d7db54..3ac77dc7003 100644 --- 
a/src/whisper.cpp +++ b/src/whisper.cpp @@ -1437,24 +1437,25 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * // GPU and default CPU backend support all operators op_supported = true; } else { - switch (op) { - // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT - case GGML_OP_MUL_MAT: { - ggml_init_params params = { - /*.mem_size =*/ 2 * ggml_tensor_overhead(), - /*.mem_buffer =*/ nullptr, - /*.no_alloc =*/ true, - }; + ggml_init_params params = { + /*.mem_size =*/ 2 * ggml_tensor_overhead(), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; - ggml_context_ptr ctx_ptr { ggml_init(params) }; - if (!ctx_ptr) { - throw std::runtime_error("failed to create ggml context"); - } - ggml_context * ctx = ctx_ptr.get(); + ggml_context_ptr ctx_ptr { ggml_init(params) }; + if (!ctx_ptr) { + throw std::runtime_error("failed to create ggml context"); + } + ggml_context * ctx = ctx_ptr.get(); - ggml_tensor * op_tensor = nullptr; + ggml_tensor * op_tensor = nullptr; + + int64_t n_ctx = hparams.n_audio_ctx; - int64_t n_ctx = hparams.n_audio_ctx; + switch (op) { + // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT & GGML_OP_GET_ROWS (q4_0) + case GGML_OP_MUL_MAT: { ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]); op_tensor = ggml_mul_mat(ctx, w, b); @@ -1466,6 +1467,18 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * w->buffer = nullptr; break; } + case GGML_OP_GET_ROWS: { + ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + op_tensor = ggml_get_rows(ctx, w, b); + + // create a temporary dummy buffer for the weight so that supports_op can check the buffer type + GGML_ASSERT(w->buffer == nullptr); + w->buffer = ggml_backend_buft_alloc_buffer(buft, 0); + op_supported = ggml_backend_dev_supports_op(dev, op_tensor); + ggml_backend_buffer_free(w->buffer); + w->buffer = nullptr; + break; + } default: { op_supported = false; break; From 994e02a5eb1526fb9dcdb7448c930fe88fb19095 Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Fri, 6 Jun 2025 03:46:05 -0700 Subject: [PATCH 2/9] Resolve PR comments --- ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp | 42 +++++++++++++------------- src/whisper.cpp | 2 +- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index 5f143e25477..a2b5492543b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -6074,9 +6074,9 @@ template src[0]; + void forward_get_rows(const ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; switch (src0->type) { case GGML_TYPE_Q4_0: { @@ -6089,10 +6089,10 @@ template src[0]; - const ggml_tensor *src1 = dst->src[1]; + const ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; GGML_TENSOR_BINARY_OP_LOCALS @@ -6132,10 +6132,10 @@ template data + i11 * nb02 + i12 * nb03; + const char * base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03; // Pointer to the first block_q4_0x8 of the identified row_group_idx - const block_q4_0x8 *p_first_repacked_block_of_group_x8 = (const block_q4_0x8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups); + const block_q4_0x8 * p_first_repacked_block_of_group_x8 = (const block_q4_0x8 *)(base_ptr_for_higher_dims_in_src0 + 
row_group_idx * stride_between_actual_row_groups); dequantize_row_q4_0x8( p_first_repacked_block_of_group_x8, @@ -6152,8 +6152,8 @@ template d[row_idx_in_group]); - float *y_curr = y + i * QK4_0; + float * y_curr = y + i * QK4_0; - const int8_t *qs_first_half_repacked_ptr = &(current_column_repacked_block->qs[row_idx_in_group * bytes_for_half_elements]); + const int8_t * qs_first_half_repacked_ptr = &(current_column_repacked_block->qs[row_idx_in_group * bytes_for_half_elements]); uint64_t first_half_chunk_u64; memcpy(&first_half_chunk_u64, qs_first_half_repacked_ptr, sizeof(uint64_t)); first_half_chunk_u64 ^= xor_mask; // Reverse the XOR - const uint8_t *original_qs_first_half_bytes = (const uint8_t *)&first_half_chunk_u64; + const uint8_t * original_qs_first_half_bytes = (const uint8_t *)&first_half_chunk_u64; - const int8_t *qs_second_half_repacked_ptr = &(current_column_repacked_block->qs[offset_to_second_half_data + (row_idx_in_group * bytes_for_half_elements)]); + const int8_t * qs_second_half_repacked_ptr = &(current_column_repacked_block->qs[offset_to_second_half_data + (row_idx_in_group * bytes_for_half_elements)]); uint64_t second_half_chunk_u64; memcpy(&second_half_chunk_u64, qs_second_half_repacked_ptr, sizeof(uint64_t)); second_half_chunk_u64 ^= xor_mask; // Reverse the XOR - const uint8_t *original_qs_second_half_bytes = (const uint8_t *)&second_half_chunk_u64; + const uint8_t * original_qs_second_half_bytes = (const uint8_t *)&second_half_chunk_u64; // dequantizing all QK4_0's for this block. for (int j = 0; j < bytes_for_half_elements; ++j) { @@ -6530,10 +6530,10 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type { //if (op->src[1]->type == GGML_TYPE_Q8_0) { // return true; //} - } else if (op->op == GGML_OP_GET_ROWS - && op->src[0]->buffer - && (ggml_n_dims(op->src[0]) == 2) - && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() + } else if (op->op == GGML_OP_GET_ROWS + && op->src[0]->buffer + && (ggml_n_dims(op->src[0]) == 2) + && op->src[0]->buffer->buft == ggml_backend_cpu_aarch64_buffer_type() && ggml_aarch64_get_optimal_repack_type(op->src[0])) { if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { return false; diff --git a/src/whisper.cpp b/src/whisper.cpp index 3ac77dc7003..30bfa107838 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -1450,7 +1450,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * ggml_context * ctx = ctx_ptr.get(); ggml_tensor * op_tensor = nullptr; - + int64_t n_ctx = hparams.n_audio_ctx; switch (op) { From ed1d3a2beb1065538994e07a9cf9584b18f8f452 Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Tue, 3 Jun 2025 02:55:32 -0700 Subject: [PATCH 3/9] Get_Rows & Dequantize implementation adapted to work for repacked weights of type q4_0 --- ggml/src/ggml-cpu/repack.cpp | 145 ++++++++++++++++++++++++++++++++++- src/whisper.cpp | 43 +++++++---- 2 files changed, 172 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 628142d5f63..5a80365d09b 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1181,6 +1181,9 @@ template src[0]->ne[2]) * op->src[1]->ne[2]; return true; + case GGML_OP_GET_ROWS: + size = 0; // GET_ROWS (standard and repacked) doesn't need a work buffer + return true; default: // GGML_ABORT("fatal error"); break; @@ -1196,6 +1199,9 @@ template src[0]; + + switch (src0->type) { + case GGML_TYPE_Q4_0: { + ggml_compute_forward_get_rows_q4_0x8(params, dst); + } break; + 
default: + GGML_ABORT("fatal error"); + break; + } + } + + static void ggml_compute_forward_get_rows_q4_0x8( + const ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); + + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == ggml_type_size(src0->type)); + assert(ggml_nrows(dst) == nr); + + const int ith = params->ith; + const int nth = params->nth; + + // rows per thread + const int dr = (nr + nth - 1) / nth; + + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + constexpr int nrows_interleaved = 8; + const size_t sizeof_one_repacked_block = sizeof(block_q4_0x8); + + const int num_repacked_blocks_per_row_width = nc / QK4_0; + + const size_t stride_between_actual_row_groups = num_repacked_blocks_per_row_width * sizeof_one_repacked_block; + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i / (ne11 * ne10); + const int64_t i11 = (i - i12 * ne11 * ne10) / ne10; + const int64_t i10 = (i - i12 * ne11 * ne10 - i11 * ne10); + const int64_t i01 = *(int32_t *)((char *)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12); // original logical row + + GGML_ASSERT(i01 >= 0 && i01 < ne01); + + int row_group_idx = i01 / nrows_interleaved; + const int row_idx_in_group = i01 % nrows_interleaved; + + const char * base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03; + + // Pointer to the first block_q4_0x8 of the identified row_group_idx + const block_q4_0x8 * p_first_repacked_block_of_group_x8 = (const block_q4_0x8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups); + + dequantize_row_q4_0x8( + p_first_repacked_block_of_group_x8, + (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group); + } + } + + /** + * Dequantizes a single logical row from data repacked with quant interleaving. + * + * @param p_repacked_group_column_blocks Pointer to the start of 'block_q4_0x8' for the row group. + * @param y Output buffer for the dequantized float values. + * @param k Total number of elements (columns) in the logical row. + * @param row_idx_in_group Index (0-7) of the logical row to dequantize. 
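+ *
+ * Indexing sketch for the caller above (illustrative numbers, assuming
+ * QK4_0 == 32): with nc == 512 a row group spans 512 / 32 = 16
+ * block_q4_0x8 structs, so logical row i01 == 21 resolves to
+ * row_group_idx = 21 / 8 = 2 and row_idx_in_group = 21 % 8 = 5, and its
+ * group starts 2 * 16 * sizeof(block_q4_0x8) bytes past the tensor base.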
+ */ + static void dequantize_row_q4_0x8( + const block_q4_0x8 * GGML_RESTRICT p_repacked_group_column_blocks, + float * GGML_RESTRICT y, + int64_t k, + int row_idx_in_group) { + const int GGML_Q4_0_X8_INTERLEAVE_SIZE = 8; + assert(k % QK4_0 == 0); + assert(row_idx_in_group >= 0 && row_idx_in_group < GGML_Q4_0_X8_INTERLEAVE_SIZE); + + const int nb = k / QK4_0; + const int bytes_for_half_elements = (QK4_0 / 2) / 2; + + const int offset_to_second_half_data = bytes_for_half_elements * GGML_Q4_0_X8_INTERLEAVE_SIZE; + const uint64_t xor_mask = 0x8888888888888888ULL; + const int qk4_0_half_elements = QK4_0 / 2; + + for (int i = 0; i < nb; ++i) { + const block_q4_0x8 * current_column_repacked_block = &p_repacked_group_column_blocks[i]; + const float d_val = GGML_FP16_TO_FP32(current_column_repacked_block->d[row_idx_in_group]); + float * y_curr = y + i * QK4_0; + + const int8_t * qs_first_half_repacked_ptr = &(current_column_repacked_block->qs[row_idx_in_group * bytes_for_half_elements]); + + uint64_t first_half_chunk_u64; + memcpy(&first_half_chunk_u64, qs_first_half_repacked_ptr, sizeof(uint64_t)); + first_half_chunk_u64 ^= xor_mask; // Reverse the XOR + const uint8_t * original_qs_first_half_bytes = (const uint8_t *)&first_half_chunk_u64; + + const int8_t * qs_second_half_repacked_ptr = &(current_column_repacked_block->qs[offset_to_second_half_data + (row_idx_in_group * bytes_for_half_elements)]); + + uint64_t second_half_chunk_u64; + memcpy(&second_half_chunk_u64, qs_second_half_repacked_ptr, sizeof(uint64_t)); + second_half_chunk_u64 ^= xor_mask; // Reverse the XOR + const uint8_t * original_qs_second_half_bytes = (const uint8_t *)&second_half_chunk_u64; + + // dequantizing all QK4_0's for this block. + for (int j = 0; j < bytes_for_half_elements; ++j) { + const uint8_t quant_byte_first = original_qs_first_half_bytes[j]; + y_curr[j] = ((quant_byte_first & 0x0F) - 8) * d_val; + y_curr[j + qk4_0_half_elements] = ((quant_byte_first >> 4) - 8) * d_val; + + const uint8_t quant_byte_second = original_qs_second_half_bytes[j]; + const int out_idx_base_second_half = j + bytes_for_half_elements; // Offset for the second set of low nibbles + y_curr[out_idx_base_second_half] = ((quant_byte_second & 0x0F) - 8) * d_val; + y_curr[out_idx_base_second_half + qk4_0_half_elements] = ((quant_byte_second >> 4) - 8) * d_val; + } + } + } + int repack(struct ggml_tensor * t, const void * data, size_t data_size) override { GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type), (int) NB_COLS, (int) INTER_SIZE); @@ -1533,12 +1665,23 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type { //if (op->src[1]->type == GGML_TYPE_Q8_0) { // return true; //} + } else if (op->op == GGML_OP_GET_ROWS + && op->src[0]->buffer + && (ggml_n_dims(op->src[0]) == 2) + && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() + && ggml_repack_get_optimal_repack_type(op->src[0])) { + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if (op->src[0]->type == GGML_TYPE_Q4_0) { + return true; + } } return false; } ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { - if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { + if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_GET_ROWS) { if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) { return (ggml::cpu::tensor_traits *) op->src[0]->extra; } diff --git 
a/src/whisper.cpp b/src/whisper.cpp index 6483ae8ab9d..41174b37e41 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -1437,24 +1437,25 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * // GPU and default CPU backend support all operators op_supported = true; } else { - switch (op) { - // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT - case GGML_OP_MUL_MAT: { - ggml_init_params params = { - /*.mem_size =*/ 2 * ggml_tensor_overhead(), - /*.mem_buffer =*/ nullptr, - /*.no_alloc =*/ true, - }; + ggml_init_params params = { + /*.mem_size =*/ 2 * ggml_tensor_overhead(), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; - ggml_context_ptr ctx_ptr { ggml_init(params) }; - if (!ctx_ptr) { - throw std::runtime_error("failed to create ggml context"); - } - ggml_context * ctx = ctx_ptr.get(); + ggml_context_ptr ctx_ptr { ggml_init(params) }; + if (!ctx_ptr) { + throw std::runtime_error("failed to create ggml context"); + } + ggml_context * ctx = ctx_ptr.get(); - ggml_tensor * op_tensor = nullptr; + ggml_tensor * op_tensor = nullptr; + + int64_t n_ctx = hparams.n_audio_ctx; - int64_t n_ctx = hparams.n_audio_ctx; + switch (op) { + // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT & GGML_OP_GET_ROWS (q4_0) + case GGML_OP_MUL_MAT: { ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]); op_tensor = ggml_mul_mat(ctx, w, b); @@ -1466,6 +1467,18 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * w->buffer = nullptr; break; } + case GGML_OP_GET_ROWS: { + ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + op_tensor = ggml_get_rows(ctx, w, b); + + // create a temporary dummy buffer for the weight so that supports_op can check the buffer type + GGML_ASSERT(w->buffer == nullptr); + w->buffer = ggml_backend_buft_alloc_buffer(buft, 0); + op_supported = ggml_backend_dev_supports_op(dev, op_tensor); + ggml_backend_buffer_free(w->buffer); + w->buffer = nullptr; + break; + } default: { op_supported = false; break; From 6959d4190e2647589a5a3e46ba73810288db54ba Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Fri, 6 Jun 2025 03:46:05 -0700 Subject: [PATCH 4/9] Resolve PR comments --- src/whisper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index 41174b37e41..31c5ee3ba61 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -1450,7 +1450,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * ggml_context * ctx = ctx_ptr.get(); ggml_tensor * op_tensor = nullptr; - + int64_t n_ctx = hparams.n_audio_ctx; switch (op) { From ed85572819eced6355ab213cea672d351a74c50f Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Tue, 3 Jun 2025 02:55:32 -0700 Subject: [PATCH 5/9] Get_Rows & Dequantize implementation adapted to work for repacked weights of type q4_0 --- ggml/src/ggml-cpu/repack.cpp | 145 ++++++++++++++++++++++++++++++++++- src/whisper.cpp | 43 +++++++---- 2 files changed, 172 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 5c6715d5c01..ae1fe2336f4 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1170,6 +1170,9 @@ template src[0]->ne[2]) * op->src[1]->ne[2]; return true; + case GGML_OP_GET_ROWS: + size = 0; // GET_ROWS (standard and repacked) doesn't need a work buffer + return true; default: // GGML_ABORT("fatal error"); break; @@ -1185,6 +1188,9 @@ template 
src[0]; + + switch (src0->type) { + case GGML_TYPE_Q4_0: { + ggml_compute_forward_get_rows_q4_0x8(params, dst); + } break; + default: + GGML_ABORT("fatal error"); + break; + } + } + + static void ggml_compute_forward_get_rows_q4_0x8( + const ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); + + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == ggml_type_size(src0->type)); + assert(ggml_nrows(dst) == nr); + + const int ith = params->ith; + const int nth = params->nth; + + // rows per thread + const int dr = (nr + nth - 1) / nth; + + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + constexpr int nrows_interleaved = 8; + const size_t sizeof_one_repacked_block = sizeof(block_q4_0x8); + + const int num_repacked_blocks_per_row_width = nc / QK4_0; + + const size_t stride_between_actual_row_groups = num_repacked_blocks_per_row_width * sizeof_one_repacked_block; + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i / (ne11 * ne10); + const int64_t i11 = (i - i12 * ne11 * ne10) / ne10; + const int64_t i10 = (i - i12 * ne11 * ne10 - i11 * ne10); + const int64_t i01 = *(int32_t *)((char *)src1->data + i10 * nb10 + i11 * nb11 + i12 * nb12); // original logical row + + GGML_ASSERT(i01 >= 0 && i01 < ne01); + + int row_group_idx = i01 / nrows_interleaved; + const int row_idx_in_group = i01 % nrows_interleaved; + + const char * base_ptr_for_higher_dims_in_src0 = (const char *)src0->data + i11 * nb02 + i12 * nb03; + + // Pointer to the first block_q4_0x8 of the identified row_group_idx + const block_q4_0x8 * p_first_repacked_block_of_group_x8 = (const block_q4_0x8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups); + + dequantize_row_q4_0x8( + p_first_repacked_block_of_group_x8, + (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group); + } + } + + /** + * Dequantizes a single logical row from data repacked with quant interleaving. + * + * @param p_repacked_group_column_blocks Pointer to the start of 'block_q4_0x8' for the row group. + * @param y Output buffer for the dequantized float values. + * @param k Total number of elements (columns) in the logical row. + * @param row_idx_in_group Index (0-7) of the logical row to dequantize. 
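+ *
+ * Nibble decode example (plain q4_0 semantics, matching the inner loop
+ * below): after the XOR is reversed, a quant byte of 0xA3 yields low
+ * nibble 0x3 -> (3 - 8) * d = -5 * d stored at y_curr[j] and high nibble
+ * 0xA -> (10 - 8) * d = 2 * d stored QK4_0 / 2 = 16 positions later, i.e.
+ * low nibbles hold elements 0..15 of the block and high nibbles 16..31.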
+ */ + static void dequantize_row_q4_0x8( + const block_q4_0x8 * GGML_RESTRICT p_repacked_group_column_blocks, + float * GGML_RESTRICT y, + int64_t k, + int row_idx_in_group) { + const int GGML_Q4_0_X8_INTERLEAVE_SIZE = 8; + assert(k % QK4_0 == 0); + assert(row_idx_in_group >= 0 && row_idx_in_group < GGML_Q4_0_X8_INTERLEAVE_SIZE); + + const int nb = k / QK4_0; + const int bytes_for_half_elements = (QK4_0 / 2) / 2; + + const int offset_to_second_half_data = bytes_for_half_elements * GGML_Q4_0_X8_INTERLEAVE_SIZE; + const uint64_t xor_mask = 0x8888888888888888ULL; + const int qk4_0_half_elements = QK4_0 / 2; + + for (int i = 0; i < nb; ++i) { + const block_q4_0x8 * current_column_repacked_block = &p_repacked_group_column_blocks[i]; + const float d_val = GGML_FP16_TO_FP32(current_column_repacked_block->d[row_idx_in_group]); + float * y_curr = y + i * QK4_0; + + const int8_t * qs_first_half_repacked_ptr = &(current_column_repacked_block->qs[row_idx_in_group * bytes_for_half_elements]); + + uint64_t first_half_chunk_u64; + memcpy(&first_half_chunk_u64, qs_first_half_repacked_ptr, sizeof(uint64_t)); + first_half_chunk_u64 ^= xor_mask; // Reverse the XOR + const uint8_t * original_qs_first_half_bytes = (const uint8_t *)&first_half_chunk_u64; + + const int8_t * qs_second_half_repacked_ptr = &(current_column_repacked_block->qs[offset_to_second_half_data + (row_idx_in_group * bytes_for_half_elements)]); + + uint64_t second_half_chunk_u64; + memcpy(&second_half_chunk_u64, qs_second_half_repacked_ptr, sizeof(uint64_t)); + second_half_chunk_u64 ^= xor_mask; // Reverse the XOR + const uint8_t * original_qs_second_half_bytes = (const uint8_t *)&second_half_chunk_u64; + + // dequantizing all QK4_0's for this block. + for (int j = 0; j < bytes_for_half_elements; ++j) { + const uint8_t quant_byte_first = original_qs_first_half_bytes[j]; + y_curr[j] = ((quant_byte_first & 0x0F) - 8) * d_val; + y_curr[j + qk4_0_half_elements] = ((quant_byte_first >> 4) - 8) * d_val; + + const uint8_t quant_byte_second = original_qs_second_half_bytes[j]; + const int out_idx_base_second_half = j + bytes_for_half_elements; // Offset for the second set of low nibbles + y_curr[out_idx_base_second_half] = ((quant_byte_second & 0x0F) - 8) * d_val; + y_curr[out_idx_base_second_half + qk4_0_half_elements] = ((quant_byte_second >> 4) - 8) * d_val; + } + } + } + int repack(struct ggml_tensor * t, const void * data, size_t data_size) override { GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type), (int) NB_COLS, (int) INTER_SIZE); @@ -1522,12 +1654,23 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type { //if (op->src[1]->type == GGML_TYPE_Q8_0) { // return true; //} + } else if (op->op == GGML_OP_GET_ROWS + && op->src[0]->buffer + && (ggml_n_dims(op->src[0]) == 2) + && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() + && ggml_repack_get_optimal_repack_type(op->src[0])) { + if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { + return false; + } + if (op->src[0]->type == GGML_TYPE_Q4_0) { + return true; + } } return false; } ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { - if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { + if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID || op->op == GGML_OP_GET_ROWS) { if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) { return (ggml::cpu::tensor_traits *) op->src[0]->extra; } diff --git 
a/src/whisper.cpp b/src/whisper.cpp index fe3e135bee6..00d3f14f05c 100644 --- +++ b/src/whisper.cpp @@ -1437,24 +1437,25 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * // GPU and default CPU backend support all operators op_supported = true; } else { - switch (op) { - // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT - case GGML_OP_MUL_MAT: { - ggml_init_params params = { - /*.mem_size =*/ 2 * ggml_tensor_overhead(), - /*.mem_buffer =*/ nullptr, - /*.no_alloc =*/ true, - }; + ggml_init_params params = { + /*.mem_size =*/ 2 * ggml_tensor_overhead(), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; - ggml_context_ptr ctx_ptr { ggml_init(params) }; - if (!ctx_ptr) { - throw std::runtime_error("failed to create ggml context"); - } - ggml_context * ctx = ctx_ptr.get(); + ggml_context_ptr ctx_ptr { ggml_init(params) }; + if (!ctx_ptr) { + throw std::runtime_error("failed to create ggml context"); + } + ggml_context * ctx = ctx_ptr.get(); - ggml_tensor * op_tensor = nullptr; + ggml_tensor * op_tensor = nullptr; + + int64_t n_ctx = hparams.n_audio_ctx; - int64_t n_ctx = hparams.n_audio_ctx; + switch (op) { + // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT & GGML_OP_GET_ROWS (q4_0) + case GGML_OP_MUL_MAT: { ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]); op_tensor = ggml_mul_mat(ctx, w, b); @@ -1466,6 +1467,18 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * w->buffer = nullptr; break; } + case GGML_OP_GET_ROWS: { + ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + op_tensor = ggml_get_rows(ctx, w, b); + + // create a temporary dummy buffer for the weight so that supports_op can check the buffer type + GGML_ASSERT(w->buffer == nullptr); + w->buffer = ggml_backend_buft_alloc_buffer(buft, 0); + op_supported = ggml_backend_dev_supports_op(dev, op_tensor); + ggml_backend_buffer_free(w->buffer); + w->buffer = nullptr; + break; + } default: { op_supported = false; break; From aea31754424fcf2b9f0cf0daa6ad1114b11efbec Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Fri, 6 Jun 2025 03:46:05 -0700 Subject: [PATCH 6/9] Resolve PR comments --- src/whisper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index 00d3f14f05c..46bc934ed9a 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -1450,7 +1450,7 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * ggml_context * ctx = ctx_ptr.get(); ggml_tensor * op_tensor = nullptr; - + int64_t n_ctx = hparams.n_audio_ctx; switch (op) { From b0c631cfb692480716fba2941eea22ad7cd8664f Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Tue, 15 Jul 2025 04:01:25 -0700 Subject: [PATCH 7/9] Templating to differentiate the block_q4_0 --- ggml/src/ggml-cpu/repack.cpp | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 396cc305639..c77c330d666 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1419,7 +1419,14 @@ template type) { case GGML_TYPE_Q4_0: { - ggml_compute_forward_get_rows_q4_0x8(params, dst); + if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { + if (src0->ne[1] % 8 == 0) { + ggml_compute_forward_get_rows_q4_0(params, dst, 8); + } + } else { +
GGML_ABORT("Unsupported block interleaved size for get_rows function"); + } + } break; default: GGML_ABORT("fatal error"); @@ -1427,9 +1434,11 @@ template + static void ggml_compute_forward_get_rows_q4_0( const ggml_compute_params * params, - ggml_tensor * dst) { + ggml_tensor * dst, + int nrows_interleaved) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; @@ -1453,8 +1462,7 @@ template data + i11 * nb02 + i12 * nb03; - // Pointer to the first block_q4_0x8 of the identified row_group_idx - const block_q4_0x8 * p_first_repacked_block_of_group_x8 = (const block_q4_0x8 *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups); + // Pointer to the first of the identified row_group_idx + const BLOCK_TYPE * p_first_repacked_block_of_group_block_type = (const BLOCK_TYPE *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups); - dequantize_row_q4_0x8( - p_first_repacked_block_of_group_x8, + dequantize_row_q4_0( + p_first_repacked_block_of_group_block_type, (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group); } } @@ -1490,8 +1498,9 @@ template + static void dequantize_row_q4_0( + const BLOCK_TYPE * GGML_RESTRICT p_repacked_group_column_blocks, float * GGML_RESTRICT y, int64_t k, int row_idx_in_group) { From d39e4e6eb0615c48f70e7ad92ef79ac5d0fe7879 Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Wed, 16 Jul 2025 01:01:39 -0700 Subject: [PATCH 8/9] Templating to differenciate the block_q4_0's in get_rows function --- ggml/src/ggml-cpu/repack.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index c77c330d666..4cc50a6b77b 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1419,7 +1419,7 @@ template type) { case GGML_TYPE_Q4_0: { - if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { + if (ggml_cpu_has_avx2()) { if (src0->ne[1] % 8 == 0) { ggml_compute_forward_get_rows_q4_0(params, dst, 8); } @@ -1484,23 +1484,22 @@ template of the identified row_group_idx const BLOCK_TYPE * p_first_repacked_block_of_group_block_type = (const BLOCK_TYPE *)(base_ptr_for_higher_dims_in_src0 + row_group_idx * stride_between_actual_row_groups); - dequantize_row_q4_0( + dequantize_row_q4_0( p_first_repacked_block_of_group_block_type, (float *)((char *)dst->data + i10 * nb1 + i11 * nb2 + i12 * nb3), nc, row_idx_in_group); } } /** - * Dequantizes a single logical row from data repacked with quant interleaving. + * Dequantizes a single logical row from data repacked with quant interleaving for repacked block_q4_0x8 * * @param p_repacked_group_column_blocks Pointer to the start of 'block_q4_0x8' for the row group. * @param y Output buffer for the dequantized float values. * @param k Total number of elements (columns) in the logical row. * @param row_idx_in_group Index (0-7) of the logical row to dequantize. 
*/ - template static void dequantize_row_q4_0( - const BLOCK_TYPE * GGML_RESTRICT p_repacked_group_column_blocks, + const block_q4_0x8 * GGML_RESTRICT p_repacked_group_column_blocks, float * GGML_RESTRICT y, int64_t k, int row_idx_in_group) { From ce73bd3aca24d25009d16d5b79f7f03cd1ff7bbc Mon Sep 17 00:00:00 2001 From: Swetha B S Date: Thu, 17 Jul 2025 03:42:31 -0700 Subject: [PATCH 9/9] Minor changes --- src/whisper.cpp | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/src/whisper.cpp b/src/whisper.cpp index d6c27776482..5c08478aefd 100644 --- a/src/whisper.cpp +++ b/src/whisper.cpp @@ -1437,22 +1437,6 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * // GPU and default CPU backend support all operators op_supported = true; } else { - ggml_init_params params = { - /*.mem_size =*/ 2 * ggml_tensor_overhead(), - /*.mem_buffer =*/ nullptr, - /*.no_alloc =*/ true, - }; - - ggml_context_ptr ctx_ptr { ggml_init(params) }; - if (!ctx_ptr) { - throw std::runtime_error("failed to create ggml context"); - } - ggml_context * ctx = ctx_ptr.get(); - - ggml_tensor * op_tensor = nullptr; - - int64_t n_ctx = hparams.n_audio_ctx; - switch (op) { // The current extra_buffer_type implementations only support GGML_OP_MUL_MAT and GGML_OP_GET_ROWS case GGML_OP_GET_ROWS: @@ -1475,10 +1459,8 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * int64_t n_ctx = hparams.n_audio_ctx; ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], n_ctx, w->ne[2], w->ne[3]); op_tensor = ggml_mul_mat(ctx, w, b); - } - else if (op == GGML_OP_GET_ROWS) { + } else if (op == GGML_OP_GET_ROWS) { int64_t num_indices = 8; - int64_t n_ctx = hparams.n_audio_ctx; ggml_tensor * indices = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, num_indices); op_tensor = ggml_get_rows(ctx, w, indices); } @@ -1491,18 +1473,6 @@ static bool weight_buft_supported(const whisper_hparams & hparams, ggml_tensor * w->buffer = nullptr; break; } - // case GGML_OP_GET_ROWS: { - // ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); - // op_tensor = ggml_get_rows(ctx, w, b); - - // // create a temporary dummy buffer for the weight so that supports_op can check the buffer type - // GGML_ASSERT(w->buffer == nullptr); - // w->buffer = ggml_backend_buft_alloc_buffer(buft, 0); - // op_supported = ggml_backend_dev_supports_op(dev, op_tensor); - // ggml_backend_buffer_free(w->buffer); - // w->buffer = nullptr; - // break; - // } default: { op_supported = false; break;
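Taken together, the series reduces to a row-group lookup plus a per-lane dequantize. Below is a minimal standalone sketch of that path, not the ggml code itself: the local q4_0x8_block struct mirrors the block_q4_0x8 layout assumed by the patches (QK4_0 == 32), and plain float scales stand in for ggml_half so the example compiles on its own.

#include <cstdint>
#include <cstdio>
#include <cstring>

constexpr int QK4_0 = 32;

// Mirrors ggml's block_q4_0x8 under the assumptions stated above:
// 8 interleaved q4_0 blocks; float scales stand in for ggml_half.
struct q4_0x8_block {
    float   d[8];           // one scale per interleaved row
    uint8_t qs[QK4_0 * 4];  // first halves of rows 0..7, then second halves; each byte XORed with 0x88 at repack time
};

// Dequantize one logical row (lane 0..7) of a single repacked block.
static void dequant_lane(const q4_0x8_block & blk, int lane, float * out) {
    const int half_bytes      = (QK4_0 / 2) / 2;        // 8 bytes per half-row chunk
    const int second_half_off = half_bytes * 8;         // all first halves precede all second halves
    const uint64_t xor_mask   = 0x8888888888888888ULL;  // undoes the repack-time XOR

    uint64_t halves[2];
    std::memcpy(&halves[0], blk.qs + lane * half_bytes, sizeof(uint64_t));
    std::memcpy(&halves[1], blk.qs + second_half_off + lane * half_bytes, sizeof(uint64_t));
    halves[0] ^= xor_mask;
    halves[1] ^= xor_mask;

    for (int h = 0; h < 2; ++h) {
        const uint8_t * bytes = reinterpret_cast<const uint8_t *>(&halves[h]);
        for (int j = 0; j < half_bytes; ++j) {
            const int base = h * half_bytes + j;
            out[base]             = ((bytes[j] & 0x0F) - 8) * blk.d[lane]; // low nibbles: elements 0..15
            out[base + QK4_0 / 2] = ((bytes[j] >> 4)   - 8) * blk.d[lane]; // high nibbles: elements 16..31
        }
    }
}

int main() {
    q4_0x8_block blk = {};
    for (int r = 0; r < 8; ++r) {
        blk.d[r] = 0.1f * (r + 1);
    }
    // Fill lane 3 with the repacked encoding of nibble value 0x3 everywhere:
    // 0x3 ^ 0x8 == 0xB, so every stored byte is 0xBB.
    for (int j = 0; j < 8; ++j) {
        blk.qs[3 * 8 + j]      = 0xBB;
        blk.qs[64 + 3 * 8 + j] = 0xBB;
    }

    float row[QK4_0];
    dequant_lane(blk, /*lane=*/3, row);
    std::printf("row[0] = %.1f\n", row[0]); // (3 - 8) * 0.4 = -2.0
    return 0;
}

For a full row of nc columns, the caller loops b over the nc / QK4_0 blocks of the group selected by i01 / 8 with lane i01 % 8, writing QK4_0 floats per block — the same indexing ggml_compute_forward_get_rows_q4_0x8 performs in the patches above.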