From 476df8e53ece2e4b8a4131110ed0487defe774d7 Mon Sep 17 00:00:00 2001 From: bepis Date: Thu, 23 Oct 2025 17:59:25 -0700 Subject: [PATCH 01/25] global bool --- common.hpp | 2 +- examples/cli/main.cpp | 5 ++++ flux.hpp | 12 ++++----- ggml | 2 +- ggml_extend.hpp | 57 +++++++++++++++++++++++++++++++++++++++---- mmdit.hpp | 4 +-- qwen_image.hpp | 4 +-- stable-diffusion.cpp | 9 +++++++ stable-diffusion.h | 1 + wan.hpp | 12 ++++----- 10 files changed, 85 insertions(+), 23 deletions(-) diff --git a/common.hpp b/common.hpp index d32167145..4a891bc8b 100644 --- a/common.hpp +++ b/common.hpp @@ -28,7 +28,7 @@ class DownSampleBlock : public GGMLBlock { if (vae_downsample) { auto conv = std::dynamic_pointer_cast(blocks["conv"]); - x = ggml_pad(ctx, x, 1, 1, 0, 0); + x = sd_pad(ctx, x, 1, 1, 0, 0); x = conv->forward(ctx, x); } else { auto conv = std::dynamic_pointer_cast(blocks["op"]); diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index ff36cea25..cd6310736 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -113,6 +113,7 @@ struct SDParams { bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; bool vae_conv_direct = false; + bool circular_pad = false; bool canny_preprocess = false; bool color = false; int upscale_repeats = 1; @@ -183,6 +184,7 @@ void print_params(SDParams params) { printf(" diffusion flash attention: %s\n", params.diffusion_flash_attn ? "true" : "false"); printf(" diffusion Conv2d direct: %s\n", params.diffusion_conv_direct ? "true" : "false"); printf(" vae_conv_direct: %s\n", params.vae_conv_direct ? "true" : "false"); + printf(" circular padding: %s\n", params.circular_pad ? "true" : "false"); printf(" control_strength: %.2f\n", params.control_strength); printf(" prompt: %s\n", params.prompt.c_str()); printf(" negative_prompt: %s\n", params.negative_prompt.c_str()); @@ -304,6 +306,7 @@ void print_usage(int argc, const char* argv[]) { printf(" This might crash if it is not supported by the backend.\n"); printf(" --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)\n"); printf(" This might crash if it is not supported by the backend.\n"); + printf(" --circular use circular padding for convolutions and pad ops\n"); printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color colors the logging tags according to level\n"); @@ -573,6 +576,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--diffusion-fa", "", true, ¶ms.diffusion_flash_attn}, {"", "--diffusion-conv-direct", "", true, ¶ms.diffusion_conv_direct}, {"", "--vae-conv-direct", "", true, ¶ms.vae_conv_direct}, + {"", "--circular", "", true, ¶ms.circular_pad}, {"", "--canny", "", true, ¶ms.canny_preprocess}, {"-v", "--verbose", "", true, ¶ms.verbose}, {"", "--color", "", true, ¶ms.color}, @@ -1386,6 +1390,7 @@ int main(int argc, const char* argv[]) { params.diffusion_flash_attn, params.diffusion_conv_direct, params.vae_conv_direct, + params.circular_pad, params.force_sdxl_vae_conv_scale, params.chroma_use_dit_mask, params.chroma_use_t5_mask, diff --git a/flux.hpp b/flux.hpp index 2ed410419..c03b3ce2a 100644 --- a/flux.hpp +++ b/flux.hpp @@ -696,7 +696,7 @@ namespace Flux { vec = approx->forward(ctx, vec); // [344, N, hidden_size] if (y != NULL) { - txt_img_mask = ggml_pad(ctx, y, img->ne[1], 0, 0, 0); + txt_img_mask = sd_pad(ctx, y, img->ne[1], 0, 0, 0); } } else { auto time_in = std::dynamic_pointer_cast(blocks["time_in"]); @@ -759,7 +759,7 
@@ namespace Flux { int64_t patch_size = 2; int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + x = sd_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size) auto img = patchify(ctx, x, patch_size); // [N, h*w, C * patch_size * patch_size] @@ -815,9 +815,9 @@ namespace Flux { ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); ggml_tensor* control = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1)); - masked = ggml_pad(ctx, masked, pad_w, pad_h, 0, 0); - mask = ggml_pad(ctx, mask, pad_w, pad_h, 0, 0); - control = ggml_pad(ctx, control, pad_w, pad_h, 0, 0); + masked = sd_pad(ctx, masked, pad_w, pad_h, 0, 0); + mask = sd_pad(ctx, mask, pad_w, pad_h, 0, 0); + control = sd_pad(ctx, control, pad_w, pad_h, 0, 0); masked = patchify(ctx, masked, patch_size); mask = patchify(ctx, mask, patch_size); @@ -827,7 +827,7 @@ namespace Flux { } else if (params.version == VERSION_FLUX_CONTROLS) { GGML_ASSERT(c_concat != NULL); - ggml_tensor* control = ggml_pad(ctx, c_concat, pad_w, pad_h, 0, 0); + ggml_tensor* control = sd_pad(ctx, c_concat, pad_w, pad_h, 0, 0); control = patchify(ctx, control, patch_size); img = ggml_concat(ctx, img, control, 0); } diff --git a/ggml b/ggml index 7bffd79a4..25d358c62 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 7bffd79a4bec72e9a3bfbedb582a218b84401c13 +Subproject commit 25d358c627186901b6506ee70faed598613eff05 diff --git a/ggml_extend.hpp b/ggml_extend.hpp index d8df0d8f6..7a253404b 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -60,6 +60,39 @@ #define SD_UNUSED(x) (void)(x) #endif +inline bool& sd_global_circular_padding_enabled() { + static bool enabled = false; + return enabled; +} + +__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx, + struct ggml_tensor* a, + int p0, + int p1, + int p2, + int p3) { + if (sd_global_circular_padding_enabled()) { + return ggml_pad_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); + } + return ggml_pad(ctx, a, p0, p1, p2, p3); +} + +__STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx, + struct ggml_tensor* a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3) { + if (sd_global_circular_padding_enabled()) { + return ggml_pad_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + } + return ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); +} + __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) { switch (level) { case GGML_LOG_LEVEL_DEBUG: @@ -986,10 +1019,24 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx, if (scale != 1.f) { x = ggml_scale(ctx, x, scale); } + const bool use_circular = sd_global_circular_padding_enabled() && (p0 != 0 || p1 != 0); + const bool is_depthwise = (w->ne[2] == 1 && x->ne[2] == w->ne[3]); if (direct) { - x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); + if (use_circular) { + if (is_depthwise) { + x = ggml_conv_2d_dw_direct_circular(ctx, w, x, s0, s1, p0, p1, d0, d1); + } else { + x = ggml_conv_2d_direct_circular(ctx, w, x, s0, s1, p0, p1, d0, d1); + } + } else { + x = 
ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); + } } else { - x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1); + if (use_circular) { + x = ggml_conv_2d_circular(ctx, w, x, s0, s1, p0, p1, d0, d1); + } else { + x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1); + } } if (scale != 1.f) { x = ggml_scale(ctx, x, 1.f / scale); @@ -1190,7 +1237,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* auto build_kqv = [&](ggml_tensor* q_in, ggml_tensor* k_in, ggml_tensor* v_in, ggml_tensor* mask_in) -> ggml_tensor* { if (kv_pad != 0) { - k_in = ggml_pad(ctx, k_in, 0, kv_pad, 0, 0); + k_in = sd_pad(ctx, k_in, 0, kv_pad, 0, 0); } if (kv_scale != 1.0f) { k_in = ggml_scale(ctx, k_in, kv_scale); @@ -1200,7 +1247,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* v_in = ggml_nn_cont(ctx, ggml_permute(ctx, v_in, 0, 2, 1, 3)); v_in = ggml_reshape_3d(ctx, v_in, d_head, L_k, n_kv_head * N); if (kv_pad != 0) { - v_in = ggml_pad(ctx, v_in, 0, kv_pad, 0, 0); + v_in = sd_pad(ctx, v_in, 0, kv_pad, 0, 0); } if (kv_scale != 1.0f) { v_in = ggml_scale(ctx, v_in, kv_scale); @@ -1223,7 +1270,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* mask_pad = GGML_PAD(L_q, GGML_KQ_MASK_PAD) - mask_in->ne[1]; } if (mask_pad > 0) { - mask_in = ggml_pad(ctx, mask_in, 0, mask_pad, 0, 0); + mask_in = sd_pad(ctx, mask_in, 0, mask_pad, 0, 0); } mask_in = ggml_cast(ctx, mask_in, GGML_TYPE_F16); } diff --git a/mmdit.hpp b/mmdit.hpp index d9d19340c..1b3f2276f 100644 --- a/mmdit.hpp +++ b/mmdit.hpp @@ -80,7 +80,7 @@ struct PatchEmbed : public GGMLBlock { int64_t H = x->ne[1]; int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // TODO: reflect pad mode + x = sd_pad(ctx, x, pad_w, pad_h, 0, 0); // TODO: reflect pad mode } x = proj->forward(ctx, x); @@ -997,4 +997,4 @@ struct MMDiTRunner : public GGMLRunner { } }; -#endif \ No newline at end of file +#endif diff --git a/qwen_image.hpp b/qwen_image.hpp index ce4e62dce..cc336ff28 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -363,7 +363,7 @@ namespace Qwen { int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + x = sd_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] return x; } @@ -691,4 +691,4 @@ namespace Qwen { } // namespace name -#endif // __QWEN_IMAGE_HPP__ \ No newline at end of file +#endif // __QWEN_IMAGE_HPP__ diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 87b6a3779..e0c19f3a1 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -114,6 +114,7 @@ class StableDiffusionGGML { bool use_tiny_autoencoder = false; sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0}; bool offload_params_to_cpu = false; + bool circular_pad = false; bool stacked_id = false; bool is_using_v_parameterization = false; @@ -187,6 +188,11 @@ class StableDiffusionGGML { taesd_path = SAFE_STR(sd_ctx_params->taesd_path); use_tiny_autoencoder = taesd_path.size() > 0; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; + circular_pad = sd_ctx_params->circular_pad; + sd_global_circular_padding_enabled() = circular_pad; + if (circular_pad) { + LOG_INFO("Using circular padding for convolutions"); + } if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) { rng = 
std::make_shared<STDDefaultRNG>();
@@ -1820,6 +1826,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->keep_control_net_on_cpu = false;
     sd_ctx_params->keep_vae_on_cpu = false;
     sd_ctx_params->diffusion_flash_attn = false;
+    sd_ctx_params->circular_pad = false;
     sd_ctx_params->chroma_use_dit_mask = true;
     sd_ctx_params->chroma_use_t5_mask = false;
     sd_ctx_params->chroma_t5_mask_pad = 1;
@@ -1860,6 +1867,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "keep_control_net_on_cpu: %s\n"
              "keep_vae_on_cpu: %s\n"
              "diffusion_flash_attn: %s\n"
+             "circular_pad: %s\n"
              "chroma_use_dit_mask: %s\n"
              "chroma_use_t5_mask: %s\n"
              "chroma_t5_mask_pad: %d\n",
@@ -1889,6 +1897,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
              BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
              BOOL_STR(sd_ctx_params->diffusion_flash_attn),
+             BOOL_STR(sd_ctx_params->circular_pad),
              BOOL_STR(sd_ctx_params->chroma_use_dit_mask),
              BOOL_STR(sd_ctx_params->chroma_use_t5_mask),
              sd_ctx_params->chroma_t5_mask_pad);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index a891a58f1..1512c7192 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -164,6 +164,7 @@ typedef struct {
     bool diffusion_flash_attn;
     bool diffusion_conv_direct;
     bool vae_conv_direct;
+    bool circular_pad;
     bool force_sdxl_vae_conv_scale;
     bool chroma_use_dit_mask;
     bool chroma_use_t5_mask;
diff --git a/wan.hpp b/wan.hpp
index 31fa90b3a..8d2e29641 100644
--- a/wan.hpp
+++ b/wan.hpp
@@ -73,7 +73,7 @@ namespace WAN {
                 lp2 -= (int)cache_x->ne[2];
             }
-            x = ggml_pad_ext(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0);
+            x = sd_pad_ext(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0);
             return ggml_nn_conv_3d(ctx, x, w, b, in_channels,
                                    std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
                                    0, 0, 0,
@@ -172,7 +172,7 @@ namespace WAN {
                                        2);
                 }
                 if (chunk_idx == 1 && cache_x->ne[2] < 2) {  // Rep
-                    cache_x = ggml_pad_ext(ctx, cache_x, 0, 0, 0, 0, (int)cache_x->ne[2], 0, 0, 0);
+                    cache_x = sd_pad_ext(ctx, cache_x, 0, 0, 0, 0, (int)cache_x->ne[2], 0, 0, 0);
                     // aka cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device),cache_x],dim=2)
                 }
                 if (chunk_idx == 1) {
@@ -198,9 +198,9 @@ namespace WAN {
             } else if (mode == "upsample3d") {
                 x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST);
             } else if (mode == "downsample2d") {
-                x = ggml_pad(ctx, x, 1, 1, 0, 0);
+                x = sd_pad(ctx, x, 1, 1, 0, 0);
             } else if (mode == "downsample3d") {
-                x = ggml_pad(ctx, x, 1, 1, 0, 0);
+                x = sd_pad(ctx, x, 1, 1, 0, 0);
             }
             x = resample_1->forward(ctx, x);
             x = ggml_nn_cont(ctx, ggml_torch_permute(ctx, x, 0, 1, 3, 2));  // (c, t, h, w)
@@ -260,7 +260,7 @@ namespace WAN {
             int64_t pad_t = (factor_t - T % factor_t) % factor_t;
-            x = ggml_pad_ext(ctx, x, 0, 0, 0, 0, pad_t, 0, 0, 0);
+            x = sd_pad_ext(ctx, x, 0, 0, 0, 0, pad_t, 0, 0, 0);
             T = x->ne[2];
             x = ggml_reshape_4d(ctx, x, W * H, factor_t, T / factor_t, C);  // [C, T/factor_t, factor_t, H*W]
@@ -1838,7 +1838,7 @@ namespace WAN {
             int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size);
             int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size);
             int pad_w = (std::get<2>(params.patch_size) - W % std::get<2>(params.patch_size)) % std::get<2>(params.patch_size);
-            x = ggml_pad(ctx, x, pad_w, pad_h, pad_t, 0);  // [N*C, T + pad_t, H + pad_h, W + pad_w]
+            x = sd_pad(ctx, x, pad_w, pad_h, pad_t, 0);  // [N*C, T + pad_t, H + pad_h, W + pad_w]
             return x;
         }
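A note on what patch 01 wires up: "circular" padding means the padded border wraps around to the opposite edge of the tensor (padding on a torus), which is what lets the generated image tile seamlessly. The sketch below shows only the indexing rule; it is illustrative, not the ggml kernel (ggml_pad_circular comes from the bumped ggml submodule), and pad_circular_1d is a name made up for this aside.

#include <cstdio>
#include <vector>

// Wrap-around 1D padding: an index i past either edge reads src[((i % n) + n) % n].
std::vector<float> pad_circular_1d(const std::vector<float>& src, int pad) {
    int n = (int)src.size();
    std::vector<float> out(n + 2 * pad);
    for (int i = -pad; i < n + pad; ++i) {
        out[i + pad] = src[((i % n) + n) % n];
    }
    return out;
}

int main() {
    // {0,1,2,3} with pad=2 -> 2 3 0 1 2 3 0 1: each border continues from the opposite edge.
    for (float v : pad_circular_1d({0, 1, 2, 3}, 2)) printf("%g ", v);
    printf("\n");
    return 0;
}

With zero padding a convolution sees a hard border at the image edge; with the wrap-around rule the left edge sees the right edge's pixels, so the convolution output itself tiles. That is also why sd_pad exists as a wrapper: every ggml_pad call site in the model graphs has to switch behavior together.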
From 6d85b94039d8c6bfa13fa2f815bb4a32824a6de6 Mon Sep 17 00:00:00 2001
From: bepis
Date: Thu, 23 Oct 2025 18:01:28 -0700
Subject: [PATCH 02/25] reworked circular to global flag

---
 ggml_extend.hpp      | 19 ++++++++++++++-----
 stable-diffusion.cpp |  2 +-
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 7a253404b..7c43b3e19 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include <atomic>
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -60,18 +61,26 @@
 #define SD_UNUSED(x) (void)(x)
 #endif

-inline bool& sd_global_circular_padding_enabled() {
-    static bool enabled = false;
+inline std::atomic<bool>& sd_circular_padding_flag() {
+    static std::atomic<bool> enabled{false};
     return enabled;
 }

+inline void sd_set_circular_padding_enabled(bool enabled) {
+    sd_circular_padding_flag().store(enabled, std::memory_order_relaxed);
+}
+
+inline bool sd_is_circular_padding_enabled() {
+    return sd_circular_padding_flag().load(std::memory_order_relaxed);
+}
+
 __STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx,
                                              struct ggml_tensor* a,
                                              int p0,
                                              int p1,
                                              int p2,
                                              int p3) {
-    if (sd_global_circular_padding_enabled()) {
+    if (sd_is_circular_padding_enabled()) {
         return ggml_pad_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3);
     }
     return ggml_pad(ctx, a, p0, p1, p2, p3);
@@ -87,7 +96,7 @@ __STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx,
                                                  int rp2,
                                                  int lp3,
                                                  int rp3) {
-    if (sd_global_circular_padding_enabled()) {
+    if (sd_is_circular_padding_enabled()) {
         return ggml_pad_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
     }
     return ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
@@ -1019,7 +1028,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
     if (scale != 1.f) {
         x = ggml_scale(ctx, x, scale);
     }
-    const bool use_circular = sd_global_circular_padding_enabled() && (p0 != 0 || p1 != 0);
+    const bool use_circular = sd_is_circular_padding_enabled() && (p0 != 0 || p1 != 0);
     const bool is_depthwise = (w->ne[2] == 1 && x->ne[2] == w->ne[3]);
     if (direct) {
         if (use_circular) {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index e0c19f3a1..adc007be9 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -189,7 +189,7 @@ class StableDiffusionGGML {
         use_tiny_autoencoder = taesd_path.size() > 0;
         offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
         circular_pad = sd_ctx_params->circular_pad;
-        sd_global_circular_padding_enabled() = circular_pad;
+        sd_set_circular_padding_enabled(circular_pad);
         if (circular_pad) {
             LOG_INFO("Using circular padding for convolutions");
         }

From 009271189fb7e77f566012518655d5981f0b152e Mon Sep 17 00:00:00 2001
From: bepis
Date: Thu, 23 Oct 2025 18:22:26 -0700
Subject: [PATCH 03/25] cleaner implementation of tiling support in sd cpp

---
 rope.hpp | 42 +++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/rope.hpp b/rope.hpp
index 295c9a217..4b68e2ac1 100644
--- a/rope.hpp
+++ b/rope.hpp
@@ -50,24 +50,36 @@ namespace Rope {
             omega[i] = 1.0 / std::pow(theta, scale[i]);
         }

-        int pos_size = pos.size();
-        std::vector<std::vector<float>> out(pos_size, std::vector<float>(half_dim));
-        for (int i = 0; i < pos_size; ++i) {
+        for (size_t i = 0; i < pos.size(); ++i) {
+            float position = pos[i];
             for (int j = 0; j < half_dim;
++j) { - result[i][4 * j] = std::cos(out[i][j]); - result[i][4 * j + 1] = -std::sin(out[i][j]); - result[i][4 * j + 2] = std::sin(out[i][j]); - result[i][4 * j + 3] = std::cos(out[i][j]); + float omega_val = omega[j]; + float original_angle = position * omega_val; + float angle = original_angle; + if (sd_is_circular_padding_enabled()) { + constexpr float TWO_PI = 6.28318530717958647692f; + float wrap_f = static_cast(wrap); + float cycles = omega_val * wrap_f / TWO_PI; + float rounded = std::round(cycles); // closest periodic harmonic + float periodic_omega = TWO_PI * rounded / wrap_f; + float periodic_angle = position * periodic_omega; + float rel_pos = std::fmod(position, wrap_f); + if (rel_pos < 0.0f) { + rel_pos += wrap_f; + } + float t = wrap_f > 0.0f ? rel_pos / wrap_f : 0.0f; + float window = 0.5f - 0.5f * std::cos(TWO_PI * t); // 0 at edges, 1 in the middle + window = std::clamp(window, 0.0f, 1.0f); + angle = periodic_angle + window * (original_angle - periodic_angle); + } + float sin_val = std::sin(angle); + float cos_val = std::cos(angle); + result[i][4 * j] = cos_val; + result[i][4 * j + 1] = -sin_val; + result[i][4 * j + 2] = sin_val; + result[i][4 * j + 3] = cos_val; } } - return result; } From ee0e82a40bdade750b2454013058cf4aff04fc73 Mon Sep 17 00:00:00 2001 From: bepis Date: Fri, 24 Oct 2025 13:52:58 -0700 Subject: [PATCH 04/25] cleaned rope --- ggml | 2 +- ggml_extend.hpp | 84 +++++++++++++++++++++++++------------------------ rope.hpp | 63 +++++++++++++++++++++++++++++++++---- 3 files changed, 101 insertions(+), 48 deletions(-) diff --git a/ggml b/ggml index 25d358c62..6eb26b3c7 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 25d358c627186901b6506ee70faed598613eff05 +Subproject commit 6eb26b3c74ed06f600e61f48d62dc39f9c1166c0 diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 7c43b3e19..638beec0d 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -61,47 +61,6 @@ #define SD_UNUSED(x) (void)(x) #endif -inline std::atomic& sd_circular_padding_flag() { - static std::atomic enabled{false}; - return enabled; -} - -inline void sd_set_circular_padding_enabled(bool enabled) { - sd_circular_padding_flag().store(enabled, std::memory_order_relaxed); -} - -inline bool sd_is_circular_padding_enabled() { - return sd_circular_padding_flag().load(std::memory_order_relaxed); -} - -__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx, - struct ggml_tensor* a, - int p0, - int p1, - int p2, - int p3) { - if (sd_is_circular_padding_enabled()) { - return ggml_pad_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); - } - return ggml_pad(ctx, a, p0, p1, p2, p3); -} - -__STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx, - struct ggml_tensor* a, - int lp0, - int rp0, - int lp1, - int rp1, - int lp2, - int rp2, - int lp3, - int rp3) { - if (sd_is_circular_padding_enabled()) { - return ggml_pad_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); - } - return ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); -} - __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) { switch (level) { case GGML_LOG_LEVEL_DEBUG: @@ -628,6 +587,49 @@ __STATIC_INLINE__ void ggml_tensor_clamp(struct ggml_tensor* src, float min, flo } } + + +inline std::atomic& sd_circular_padding_flag() { + static std::atomic enabled{false}; + return enabled; +} + +inline void sd_set_circular_padding_enabled(bool enabled) { + sd_circular_padding_flag().store(enabled, std::memory_order_relaxed); +} + +inline bool 
sd_is_circular_padding_enabled() { + return sd_circular_padding_flag().load(std::memory_order_relaxed); +} + +__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx, + struct ggml_tensor* a, + int p0, + int p1, + int p2, + int p3) { + if (sd_is_circular_padding_enabled()) { + return ggml_pad_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); + } + return ggml_pad(ctx, a, p0, p1, p2, p3); +} + +__STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx, + struct ggml_tensor* a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3) { + if (sd_is_circular_padding_enabled()) { + return ggml_pad_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + } + return ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); +} + __STATIC_INLINE__ struct ggml_tensor* ggml_tensor_concat(struct ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b, diff --git a/rope.hpp b/rope.hpp index 4b68e2ac1..8280de528 100644 --- a/rope.hpp +++ b/rope.hpp @@ -1,6 +1,8 @@ #ifndef __ROPE_HPP__ #define __ROPE_HPP__ +#include +#include #include #include "ggml_extend.hpp" @@ -39,15 +41,20 @@ namespace Rope { return flat_vec; } - __STATIC_INLINE__ std::vector> rope(const std::vector& pos, int dim, int theta) { + __STATIC_INLINE__ std::vector> rope(const std::vector& pos, + int dim, + int theta, + const std::vector* wraps = nullptr) { assert(dim % 2 == 0); int half_dim = dim / 2; + std::vector> result(pos.size(), std::vector(half_dim * 4)); + std::vector scale = linspace(0.f, (dim * 1.f - 2) / dim, half_dim); std::vector omega(half_dim); for (int i = 0; i < half_dim; ++i) { - omega[i] = 1.0 / std::pow(theta, scale[i]); + omega[i] = 1.0f / std::pow(theta, scale[i]); } for (size_t i = 0; i < pos.size(); ++i) { @@ -56,7 +63,13 @@ namespace Rope { float omega_val = omega[j]; float original_angle = position * omega_val; float angle = original_angle; - if (sd_is_circular_padding_enabled()) { + int wrap = 0; + if (wraps != nullptr && !wraps->empty()) { + size_t wrap_size = wraps->size(); + size_t wrap_idx = wrap_size > 0 ? 
(i % wrap_size) : 0; + wrap = (*wraps)[wrap_idx]; + } + if (wrap > 0) { constexpr float TWO_PI = 6.28318530717958647692f; float wrap_f = static_cast(wrap); float cycles = omega_val * wrap_f / TWO_PI; @@ -80,6 +93,7 @@ namespace Rope { result[i][4 * j + 3] = cos_val; } } + return result; } @@ -134,7 +148,8 @@ namespace Rope { __STATIC_INLINE__ std::vector embed_nd(const std::vector>& ids, int bs, int theta, - const std::vector& axes_dim) { + const std::vector& axes_dim, + const std::vector>* axes_wraps = nullptr) { std::vector> trans_ids = transpose(ids); size_t pos_len = ids.size() / bs; int num_axes = axes_dim.size(); @@ -149,7 +164,12 @@ namespace Rope { std::vector> emb(bs * pos_len, std::vector(emb_dim * 2 * 2, 0.0)); int offset = 0; for (int i = 0; i < num_axes; ++i) { - std::vector> rope_emb = rope(trans_ids[i], axes_dim[i], theta); // [bs*pos_len, axes_dim[i]/2 * 2 * 2] + const std::vector* axis_wrap = nullptr; + if (axes_wraps != nullptr && i < (int)axes_wraps->size()) { + axis_wrap = &(*axes_wraps)[i]; + } + std::vector> rope_emb = + rope(trans_ids[i], axes_dim[i], theta, axis_wrap); // [bs*pos_len, axes_dim[i]/2 * 2 * 2] for (int b = 0; b < bs; ++b) { for (int j = 0; j < pos_len; ++j) { for (int k = 0; k < rope_emb[0].size(); ++k) { @@ -264,7 +284,38 @@ namespace Rope { int theta, const std::vector& axes_dim) { std::vector> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index); - return embed_nd(ids, bs, theta, axes_dim); + std::vector> axes_wraps; + if (sd_is_circular_padding_enabled() && bs > 0 && axes_dim.size() >= 3) { + int pad_h = (patch_size - (h % patch_size)) % patch_size; + int pad_w = (patch_size - (w % patch_size)) % patch_size; + int h_len = (h + pad_h) / patch_size; + int w_len = (w + pad_w) / patch_size; + if (h_len > 0 && w_len > 0) { + const size_t total_tokens = ids.size(); + // Track per-token wrap lengths for the row/column axes so only spatial tokens become periodic. + axes_wraps.assign(axes_dim.size(), std::vector(total_tokens / bs, 0)); + size_t cursor = 0; + for (ggml_tensor* ref : ref_latents) { + if (ref == nullptr) { + continue; + } + int ref_h = static_cast(ref->ne[1]); + int ref_w = static_cast(ref->ne[0]); + int ref_pad_h = (patch_size - (ref_h % patch_size)) % patch_size; + int ref_pad_w = (patch_size - (ref_w % patch_size)) % patch_size; + int ref_h_len = (ref_h + ref_pad_h) / patch_size; + int ref_w_len = (ref_w + ref_pad_w) / patch_size; + size_t ref_n_tokens = static_cast(ref_h_len) * static_cast(ref_w_len); + for (size_t token_i = 0; token_i < ref_n_tokens; ++token_i) { + axes_wraps[1][cursor + token_i] = ref_h_len; + axes_wraps[2][cursor + token_i] = ref_w_len; + } + cursor += ref_n_tokens; + } + } + } + const std::vector>* wraps_ptr = axes_wraps.empty() ? 
nullptr : &axes_wraps; + return embed_nd(ids, bs, theta, axes_dim, wraps_ptr); } __STATIC_INLINE__ std::vector> gen_vid_ids(int t, From cbb261dfdb6dedaa4539ef082749037c05f0cb34 Mon Sep 17 00:00:00 2001 From: bepis Date: Fri, 24 Oct 2025 14:46:28 -0700 Subject: [PATCH 05/25] working simplified but still need wraps --- ggml_extend.hpp | 7 +++++-- rope.hpp | 31 +++++++++++++++++-------------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 638beec0d..9699b12cd 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -611,7 +611,9 @@ __STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx, if (sd_is_circular_padding_enabled()) { return ggml_pad_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); } - return ggml_pad(ctx, a, p0, p1, p2, p3); + else { + return ggml_pad(ctx, a, p0, p1, p2, p3); + } } __STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx, @@ -1030,7 +1032,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx, if (scale != 1.f) { x = ggml_scale(ctx, x, scale); } - const bool use_circular = sd_is_circular_padding_enabled() && (p0 != 0 || p1 != 0); + const bool use_circular = sd_is_circular_padding_enabled(); + LOG_DEBUG("use circular conv %d", use_circular ? 1 : 0); const bool is_depthwise = (w->ne[2] == 1 && x->ne[2] == w->ne[3]); if (direct) { if (use_circular) { diff --git a/rope.hpp b/rope.hpp index 8280de528..34d119ef9 100644 --- a/rope.hpp +++ b/rope.hpp @@ -63,9 +63,10 @@ namespace Rope { float omega_val = omega[j]; float original_angle = position * omega_val; float angle = original_angle; - int wrap = 0; + float wrap = 0; if (wraps != nullptr && !wraps->empty()) { size_t wrap_size = wraps->size(); + // mod batch size since we only store this for one item in the batch size_t wrap_idx = wrap_size > 0 ? (i % wrap_size) : 0; wrap = (*wraps)[wrap_idx]; } @@ -73,17 +74,11 @@ namespace Rope { constexpr float TWO_PI = 6.28318530717958647692f; float wrap_f = static_cast(wrap); float cycles = omega_val * wrap_f / TWO_PI; - float rounded = std::round(cycles); // closest periodic harmonic - float periodic_omega = TWO_PI * rounded / wrap_f; - float periodic_angle = position * periodic_omega; - float rel_pos = std::fmod(position, wrap_f); - if (rel_pos < 0.0f) { - rel_pos += wrap_f; - } - float t = wrap_f > 0.0f ? rel_pos / wrap_f : 0.0f; - float window = 0.5f - 0.5f * std::cos(TWO_PI * t); // 0 at edges, 1 in the middle - window = std::clamp(window, 0.0f, 1.0f); - angle = periodic_angle + window * (original_angle - periodic_angle); + // closest periodic harmonic, necessary to ensure things neatly tile + // without this round, things don't tile at the boundaries and you end up + // with the model knowing what is "center" + float rounded = std::round(cycles); + angle = position * TWO_PI * rounded / wrap_f; } float sin_val = std::sin(angle); float cos_val = std::cos(angle); @@ -282,7 +277,9 @@ namespace Rope { const std::vector& ref_latents, bool increase_ref_index, int theta, - const std::vector& axes_dim) { + const std::vector& axes_dim, + bool circular = false) { + circular = true; std::vector> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index); std::vector> axes_wraps; if (sd_is_circular_padding_enabled() && bs > 0 && axes_dim.size() >= 3) { @@ -294,7 +291,13 @@ namespace Rope { const size_t total_tokens = ids.size(); // Track per-token wrap lengths for the row/column axes so only spatial tokens become periodic. 
                axes_wraps.assign(axes_dim.size(), std::vector<int>(total_tokens / bs, 0));
-               size_t cursor = 0;
+               size_t cursor = context_len;  // ignore text tokens
+               const size_t img_tokens = static_cast<size_t>(h_len) * static_cast<size_t>(w_len);
+               for (size_t token_i = 0; token_i < img_tokens; ++token_i) {
+                   axes_wraps[1][cursor + token_i] = h_len;
+                   axes_wraps[2][cursor + token_i] = w_len;
+               }
+               cursor += img_tokens;
                for (ggml_tensor* ref : ref_latents) {
                    if (ref == nullptr) {
                        continue;
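Why the std::round in patch 05 makes RoPE tileable: RoPE rotates each feature pair by position * omega, and that signal repeats with period wrap_dim (the image width or height in tokens) only if omega completes a whole number of cycles over wrap_dim, i.e. only if omega * wrap_dim / (2*pi) is an integer. Snapping omega to the nearest such frequency is exactly what the rounding does. A self-contained check of that identity (an illustrative sketch with made-up numbers, not the rope.hpp code):

#include <cmath>
#include <cstdio>

int main() {
    const float TWO_PI = 6.28318530717958647692f;
    float omega  = 0.37f;  // an arbitrary RoPE frequency
    int wrap_dim = 16;     // wrap length in tokens
    // snap omega so it completes an integer number of cycles over wrap_dim
    float snapped = TWO_PI * std::round(omega * wrap_dim / TWO_PI) / wrap_dim;
    for (int p = 0; p < 3; ++p) {
        // position p and position p + wrap_dim now yield the same rotation,
        // because the two angles differ by an exact multiple of 2*pi
        printf("p=%d: cos=%.6f wrapped cos=%.6f\n",
               p, std::cos(p * snapped), std::cos((p + wrap_dim) * snapped));
    }
    return 0;
}

Without the snap, the phase at position wrap_dim differs from position 0, the seam is detectable, and the model effectively knows where "center" is, which is what the commit comment is getting at. The next patch is a mostly mechanical rename of this machinery:

From 8d7f6793bcb478647e03ce626bafdf459042e1c9 Mon Sep 17 00:00:00 2001
From: bepis
Date: Fri, 24 Oct 2025 15:24:59 -0700
Subject: [PATCH 06/25] Further clean of rope

---
 rope.hpp | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/rope.hpp b/rope.hpp
index 34d119ef9..82084403e 100644
--- a/rope.hpp
+++ b/rope.hpp
@@ -44,7 +44,7 @@ namespace Rope {
     __STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos,
                                                            int dim,
                                                            int theta,
-                                                           const std::vector<int>* wraps = nullptr) {
+                                                           const std::vector<int>* wrap_dims = nullptr) {
         assert(dim % 2 == 0);
         int half_dim = dim / 2;
@@ -63,16 +63,16 @@ namespace Rope {
                 float omega_val = omega[j];
                 float original_angle = position * omega_val;
                 float angle = original_angle;
-                float wrap = 0;
-                if (wraps != nullptr && !wraps->empty()) {
-                    size_t wrap_size = wraps->size();
+                int wrap_dim = 0;
+                if (wrap_dims != nullptr && !wrap_dims->empty()) {
+                    size_t wrap_size = wrap_dims->size();
                     // mod batch size since we only store this for one item in the batch
                     size_t wrap_idx = wrap_size > 0 ? (i % wrap_size) : 0;
-                    wrap = (*wraps)[wrap_idx];
+                    wrap_dim = (*wrap_dims)[wrap_idx];
                 }
-                if (wrap > 0) {
+                if (wrap_dim > 0) {
                     constexpr float TWO_PI = 6.28318530717958647692f;
-                    float wrap_f = static_cast<float>(wrap);
+                    float wrap_f = static_cast<float>(wrap_dim);
                     float cycles = omega_val * wrap_f / TWO_PI;
                     // closest periodic harmonic, necessary to ensure things neatly tile
                     // without this round, things don't tile at the boundaries and you end up
                     // with the model knowing what is "center"
                     float rounded = std::round(cycles);
                     angle = position * TWO_PI * rounded / wrap_f;
                 }
@@ -144,7 +144,7 @@ namespace Rope {
                                                    int bs,
                                                    int theta,
                                                    const std::vector<int>& axes_dim,
-                                                   const std::vector<std::vector<int>>* axes_wraps = nullptr) {
+                                                   const std::vector<std::vector<int>>* wrap_dims = nullptr) {
         std::vector<std::vector<float>> trans_ids = transpose(ids);
         size_t pos_len = ids.size() / bs;
         int num_axes = axes_dim.size();
@@ -159,12 +159,12 @@ namespace Rope {
         std::vector<std::vector<float>> emb(bs * pos_len, std::vector<float>(emb_dim * 2 * 2, 0.0));
         int offset = 0;
         for (int i = 0; i < num_axes; ++i) {
-            const std::vector<int>* axis_wrap = nullptr;
-            if (axes_wraps != nullptr && i < (int)axes_wraps->size()) {
-                axis_wrap = &(*axes_wraps)[i];
+            const std::vector<int>* axis_wrap_dims = nullptr;
+            if (wrap_dims != nullptr && i < (int)wrap_dims->size()) {
+                axis_wrap_dims = &(*wrap_dims)[i];
             }
             std::vector<std::vector<float>> rope_emb =
-                rope(trans_ids[i], axes_dim[i], theta, axis_wrap);  // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
+                rope(trans_ids[i], axes_dim[i], theta, axis_wrap_dims);  // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
             for (int b = 0; b < bs; ++b) {
                 for (int j = 0; j < pos_len; ++j) {
                     for (int k = 0; k < rope_emb[0].size(); ++k) {
@@ -277,11 +277,10 @@ namespace Rope {
                                                 const std::vector<ggml_tensor*>& ref_latents,
                                                 bool increase_ref_index,
                                                 int theta,
-                                                const std::vector<int>& axes_dim,
-                                                bool circular = false) {
-        circular = true;
+                                                const std::vector<int>& axes_dim) {
        std::vector<std::vector<float>> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
-       std::vector<std::vector<int>> axes_wraps;
+       std::vector<std::vector<int>> wrap_dims;
+       // This logic simply stores the (pad and patch_adjusted) sizes of images so we can make sure rope correctly tiles
        if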
(sd_is_circular_padding_enabled() && bs > 0 && axes_dim.size() >= 3) { int pad_h = (patch_size - (h % patch_size)) % patch_size; int pad_w = (patch_size - (w % patch_size)) % patch_size; @@ -290,14 +289,15 @@ namespace Rope { if (h_len > 0 && w_len > 0) { const size_t total_tokens = ids.size(); // Track per-token wrap lengths for the row/column axes so only spatial tokens become periodic. - axes_wraps.assign(axes_dim.size(), std::vector(total_tokens / bs, 0)); + wrap_dims.assign(axes_dim.size(), std::vector(total_tokens / bs, 0)); size_t cursor = context_len; // ignore text tokens const size_t img_tokens = static_cast(h_len) * static_cast(w_len); for (size_t token_i = 0; token_i < img_tokens; ++token_i) { - axes_wraps[1][cursor + token_i] = h_len; - axes_wraps[2][cursor + token_i] = w_len; + wrap_dims[1][cursor + token_i] = h_len; + wrap_dims[2][cursor + token_i] = w_len; } cursor += img_tokens; + // For each reference image, store wrap sizes as well for (ggml_tensor* ref : ref_latents) { if (ref == nullptr) { continue; @@ -310,14 +310,14 @@ namespace Rope { int ref_w_len = (ref_w + ref_pad_w) / patch_size; size_t ref_n_tokens = static_cast(ref_h_len) * static_cast(ref_w_len); for (size_t token_i = 0; token_i < ref_n_tokens; ++token_i) { - axes_wraps[1][cursor + token_i] = ref_h_len; - axes_wraps[2][cursor + token_i] = ref_w_len; + wrap_dims[1][cursor + token_i] = ref_h_len; + wrap_dims[2][cursor + token_i] = ref_w_len; } cursor += ref_n_tokens; } } } - const std::vector>* wraps_ptr = axes_wraps.empty() ? nullptr : &axes_wraps; + const std::vector>* wraps_ptr = wrap_dims.empty() ? nullptr : &wrap_dims; return embed_nd(ids, bs, theta, axes_dim, wraps_ptr); } From 4f2db1bef6f97c0026dfba158a44eadecc707adf Mon Sep 17 00:00:00 2001 From: bepis Date: Fri, 24 Oct 2025 16:03:08 -0700 Subject: [PATCH 07/25] resolve flux conflict --- flux.hpp | 79 ++++++++++++++++++++++++++++---------------------------- ggml | 2 +- 2 files changed, 41 insertions(+), 40 deletions(-) diff --git a/flux.hpp b/flux.hpp index c03b3ce2a..355184be2 100644 --- a/flux.hpp +++ b/flux.hpp @@ -1,6 +1,7 @@ #ifndef __FLUX_HPP__ #define __FLUX_HPP__ +#include #include #include "ggml_extend.hpp" @@ -18,7 +19,7 @@ namespace Flux { blocks["out_layer"] = std::shared_ptr(new Linear(hidden_dim, hidden_dim, true)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [..., in_dim] // return: [..., hidden_dim] auto in_layer = std::dynamic_pointer_cast(blocks["in_layer"]); @@ -36,7 +37,7 @@ namespace Flux { int64_t hidden_size; float eps; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { ggml_type wtype = GGML_TYPE_F32; params["scale"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -47,7 +48,7 @@ namespace Flux { : hidden_size(hidden_size), eps(eps) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* w = params["scale"]; x = ggml_rms_norm(ctx, x, eps); x = ggml_mul(ctx, x, w); @@ -136,11 +137,11 @@ namespace Flux { }; struct ModulationOut { - ggml_tensor* shift = NULL; - ggml_tensor* scale = NULL; - ggml_tensor* gate = NULL; + ggml_tensor* shift = nullptr; + ggml_tensor* scale 
= nullptr; + ggml_tensor* gate = nullptr; - ModulationOut(ggml_tensor* shift = NULL, ggml_tensor* scale = NULL, ggml_tensor* gate = NULL) + ModulationOut(ggml_tensor* shift = nullptr, ggml_tensor* scale = nullptr, ggml_tensor* gate = nullptr) : shift(shift), scale(scale), gate(gate) {} ModulationOut(struct ggml_context* ctx, ggml_tensor* vec, int64_t offset) { @@ -259,7 +260,7 @@ namespace Flux { struct ggml_tensor* txt, struct ggml_tensor* vec, struct ggml_tensor* pe, - struct ggml_tensor* mask = NULL) { + struct ggml_tensor* mask = nullptr) { // img: [N, n_img_token, hidden_size] // txt: [N, n_txt_token, hidden_size] // pe: [n_img_token + n_txt_token, d_head/2, 2, 2] @@ -398,7 +399,7 @@ namespace Flux { ModulationOut get_distil_mod(struct ggml_context* ctx, struct ggml_tensor* vec) { int64_t offset = 3 * idx; - return ModulationOut(ctx, vec, offset); + return {ctx, vec, offset}; } struct ggml_tensor* forward(struct ggml_context* ctx, @@ -406,7 +407,7 @@ namespace Flux { struct ggml_tensor* x, struct ggml_tensor* vec, struct ggml_tensor* pe, - struct ggml_tensor* mask = NULL) { + struct ggml_tensor* mask = nullptr) { // x: [N, n_token, hidden_size] // pe: [n_token, d_head/2, 2, 2] // return: [N, n_token, hidden_size] @@ -485,7 +486,7 @@ namespace Flux { auto shift = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 0)); // [N, dim] auto scale = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 1)); // [N, dim] // No gate - return ModulationOut(shift, scale, NULL); + return {shift, scale, nullptr}; } struct ggml_tensor* forward(struct ggml_context* ctx, @@ -664,7 +665,7 @@ namespace Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = NULL, + struct ggml_tensor* mod_index_arange = nullptr, std::vector skip_layers = {}) { auto img_in = std::dynamic_pointer_cast(blocks["img_in"]); auto txt_in = std::dynamic_pointer_cast(blocks["txt_in"]); @@ -672,7 +673,7 @@ namespace Flux { img = img_in->forward(ctx, img); struct ggml_tensor* vec; - struct ggml_tensor* txt_img_mask = NULL; + struct ggml_tensor* txt_img_mask = nullptr; if (params.is_chroma) { int64_t mod_index_length = 344; auto approx = std::dynamic_pointer_cast(blocks["distilled_guidance_layer"]); @@ -681,7 +682,7 @@ namespace Flux { // auto mod_index_arange = ggml_arange(ctx, 0, (float)mod_index_length, 1); // ggml_arange tot working on a lot of backends, precomputing it on CPU instead - GGML_ASSERT(arange != NULL); + GGML_ASSERT(arange != nullptr); auto modulation_index = ggml_nn_timestep_embedding(ctx, mod_index_arange, 32, 10000, 1000.f); // [1, 344, 32] // Batch broadcast (will it ever be useful) @@ -695,15 +696,15 @@ namespace Flux { vec = ggml_cont(ctx, ggml_permute(ctx, vec, 0, 2, 1, 3)); // [344, N, 64] vec = approx->forward(ctx, vec); // [344, N, hidden_size] - if (y != NULL) { - txt_img_mask = sd_pad(ctx, y, img->ne[1], 0, 0, 0); + if (y != nullptr) { + txt_img_mask = ggml_pad(ctx, y, img->ne[1], 0, 0, 0); } } else { auto time_in = std::dynamic_pointer_cast(blocks["time_in"]); auto vector_in = std::dynamic_pointer_cast(blocks["vector_in"]); vec = time_in->forward(ctx, ggml_nn_timestep_embedding(ctx, timesteps, 256, 10000, 1000.f)); if (params.guidance_embed) { - GGML_ASSERT(guidance != NULL); + GGML_ASSERT(guidance != nullptr); auto guidance_in = std::dynamic_pointer_cast(blocks["guidance_in"]); // bf16 and fp16 result is different auto g_in = ggml_nn_timestep_embedding(ctx, guidance, 256, 10000, 
1000.f); @@ -759,7 +760,7 @@ namespace Flux { int64_t patch_size = 2; int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - x = sd_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size) auto img = patchify(ctx, x, patch_size); // [N, h*w, C * patch_size * patch_size] @@ -775,14 +776,14 @@ namespace Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = NULL, + struct ggml_tensor* mod_index_arange = nullptr, std::vector ref_latents = {}, std::vector skip_layers = {}) { // Forward pass of DiT. // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) // timestep: (N,) tensor of diffusion timesteps // context: (N, L, D) - // c_concat: NULL, or for (N,C+M, H, W) for Fill + // c_concat: nullptr, or for (N,C+M, H, W) for Fill // y: (N, adm_in_channels) tensor of class labels // guidance: (N,) // pe: (L, d_head/2, 2, 2) @@ -801,7 +802,7 @@ namespace Flux { uint64_t img_tokens = img->ne[1]; if (params.version == VERSION_FLUX_FILL) { - GGML_ASSERT(c_concat != NULL); + GGML_ASSERT(c_concat != nullptr); ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); @@ -810,14 +811,14 @@ namespace Flux { img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0); } else if (params.version == VERSION_FLEX_2) { - GGML_ASSERT(c_concat != NULL); + GGML_ASSERT(c_concat != nullptr); ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); ggml_tensor* control = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1)); - masked = sd_pad(ctx, masked, pad_w, pad_h, 0, 0); - mask = sd_pad(ctx, mask, pad_w, pad_h, 0, 0); - control = sd_pad(ctx, control, pad_w, pad_h, 0, 0); + masked = ggml_pad(ctx, masked, pad_w, pad_h, 0, 0); + mask = ggml_pad(ctx, mask, pad_w, pad_h, 0, 0); + control = ggml_pad(ctx, control, pad_w, pad_h, 0, 0); masked = patchify(ctx, masked, patch_size); mask = patchify(ctx, mask, patch_size); @@ -825,9 +826,9 @@ namespace Flux { img = ggml_concat(ctx, img, ggml_concat(ctx, ggml_concat(ctx, masked, mask, 0), control, 0), 0); } else if (params.version == VERSION_FLUX_CONTROLS) { - GGML_ASSERT(c_concat != NULL); + GGML_ASSERT(c_concat != nullptr); - ggml_tensor* control = sd_pad(ctx, c_concat, pad_w, pad_h, 0, 0); + ggml_tensor* control = ggml_pad(ctx, c_concat, pad_w, pad_h, 0, 0); control = patchify(ctx, control, patch_size); img = ggml_concat(ctx, img, control, 0); } @@ -924,7 +925,7 @@ namespace Flux { flux.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "flux"; } @@ -944,18 +945,18 @@ namespace Flux { GGML_ASSERT(x->ne[3] == 1); struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false); 
- struct ggml_tensor* mod_index_arange = NULL; + struct ggml_tensor* mod_index_arange = nullptr; x = to_backend(x); context = to_backend(context); - if (c_concat != NULL) { + if (c_concat != nullptr) { c_concat = to_backend(c_concat); } if (flux_params.is_chroma) { guidance = ggml_set_f32(guidance, 0); if (!use_mask) { - y = NULL; + y = nullptr; } // ggml_arange is not working on some backends, precompute it @@ -987,7 +988,7 @@ namespace Flux { auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len); // pe->data = pe_vec.data(); // print_ggml_tensor(pe); - // pe->data = NULL; + // pe->data = nullptr; set_backend_tensor_data(pe, pe_vec.data()); struct ggml_tensor* out = flux.forward(compute_ctx, @@ -1017,8 +1018,8 @@ namespace Flux { struct ggml_tensor* guidance, std::vector ref_latents = {}, bool increase_ref_index = false, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL, + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr, std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] @@ -1035,11 +1036,11 @@ namespace Flux { void test() { struct ggml_init_params params; params.mem_size = static_cast(20 * 1024 * 1024); // 20 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // cpu f16: @@ -1063,10 +1064,10 @@ namespace Flux { ggml_set_f32(y, 0.01f); // print_ggml_tensor(y); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); - compute(8, x, timesteps, context, NULL, y, guidance, {}, false, &out, work_ctx); + compute(8, x, timesteps, context, nullptr, y, guidance, {}, false, &out, work_ctx); int t1 = ggml_time_ms(); print_ggml_tensor(out); @@ -1078,7 +1079,7 @@ namespace Flux { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_Q8_0; - std::shared_ptr flux = std::shared_ptr(new FluxRunner(backend, false)); + std::shared_ptr flux = std::make_shared(backend, false); { LOG_INFO("loading from '%s'", file_path.c_str()); diff --git a/ggml b/ggml index 6eb26b3c7..55c79c624 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 6eb26b3c74ed06f600e61f48d62dc39f9c1166c0 +Subproject commit 55c79c6249dbc5e3ac8cd82556861608a6fd425e From e6fb4e82f8b6248e0e40a19473d8aa966d2460f4 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Tue, 9 Dec 2025 18:48:15 -0800 Subject: [PATCH 08/25] switch to pad op circular only --- ggml_extend.hpp | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 9699b12cd..15746487e 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -609,7 +609,7 @@ __STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx, int p2, int p3) { if (sd_is_circular_padding_enabled()) { - return ggml_pad_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); + return ggml_pad_ext_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); } else { return ggml_pad(ctx, a, p0, p1, p2, p3); @@ -627,7 +627,7 @@ __STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx, int lp3, int rp3) { if (sd_is_circular_padding_enabled()) { - return ggml_pad_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + return ggml_pad_ext_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); } return 
ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
 }
@@ -1035,22 +1035,21 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
     const bool use_circular = sd_is_circular_padding_enabled();
     LOG_DEBUG("use circular conv %d", use_circular ? 1 : 0);
     const bool is_depthwise = (w->ne[2] == 1 && x->ne[2] == w->ne[3]);
+
+    if (use_circular && (p0 != 0 || p1 != 0)) {
+        x = ggml_pad_ext_circular(ctx, x, p0, p0, p1, p1, 0, 0, 0, 0);
+        p0 = 0;
+        p1 = 0;
+    }
+
     if (direct) {
-        if (use_circular) {
-            if (is_depthwise) {
-                x = ggml_conv_2d_dw_direct_circular(ctx, w, x, s0, s1, p0, p1, d0, d1);
-            } else {
-                x = ggml_conv_2d_direct_circular(ctx, w, x, s0, s1, p0, p1, d0, d1);
-            }
+        if (is_depthwise) {
+            x = ggml_conv_2d_dw_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
         } else {
             x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
         }
     } else {
-        if (use_circular) {
-            x = ggml_conv_2d_circular(ctx, w, x, s0, s1, p0, p1, d0, d1);
-        } else {
-            x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
-        }
+        x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
     }
     if (scale != 1.f) {
         x = ggml_scale(ctx, x, 1.f / scale);

From 00c92ef9150efefb65c3cd2ee17020490b2108c3 Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Fri, 12 Dec 2025 13:05:29 -0800
Subject: [PATCH 09/25] Set ggml to most recent

---
 ggml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml b/ggml
index 55c79c624..d80bac55f 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 55c79c6249dbc5e3ac8cd82556861608a6fd425e
+Subproject commit d80bac55f6d0c57e57143f80cbb6e3155dec1cc7

From 144c2786aaa84b856f471bd486f1979a707a3cb3 Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Fri, 12 Dec 2025 13:07:39 -0800
Subject: [PATCH 10/25] Revert ggml temp

---
 ggml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml b/ggml
index d80bac55f..2d3876d55 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit d80bac55f6d0c57e57143f80cbb6e3155dec1cc7
+Subproject commit 2d3876d554551d35c06dccc5852be50d5fd2a275

From 247d67fd56808046aede01f1e7070b952814f2fc Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Fri, 12 Dec 2025 13:14:43 -0800
Subject: [PATCH 11/25] Update ggml to most recent

---
 ggml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml b/ggml
index 2d3876d55..d80bac55f 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 2d3876d554551d35c06dccc5852be50d5fd2a275
+Subproject commit d80bac55f6d0c57e57143f80cbb6e3155dec1cc7

From 686a208fa2a178afe2839d75998c277d39795a2f Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Fri, 12 Dec 2025 13:18:11 -0800
Subject: [PATCH 12/25] Revert unneeded flux change

---
 flux.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flux.hpp b/flux.hpp
index 7e6e52372..1df2874ae 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -457,7 +457,7 @@ namespace Flux {
     ModulationOut get_distil_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
         int64_t offset = 3 * idx;
-        return {ctx, vec, offset};
+        return ModulationOut(ctx, vec, offset);
     }

     struct ggml_tensor* forward(GGMLRunnerContext* ctx,
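The next patch retires the process-wide atomic: GGMLRunnerContext already carries flash_attn_enabled and conv2d_direct_enabled down through every block's forward(), so circular_pad_enabled joins them and each runner (diffusion model, VAE, TAE, ControlNet) gets the flag set explicitly. The shape of that pattern, reduced to a sketch with hypothetical types (not the real headers):

#include <cstdio>

// Options travel with a per-runner context instead of a global,
// so two runners in one process can disagree.
struct RunnerContext {
    bool circular_pad_enabled = false;
};

struct Block {
    void forward(const RunnerContext* ctx) {
        // each block consults the context it is handed, never global state
        printf("pad mode: %s\n", ctx->circular_pad_enabled ? "circular" : "zero");
    }
};

struct Runner {
    bool circular_pad_enabled = false;
    void set_circular_pad_enabled(bool enabled) { circular_pad_enabled = enabled; }
    void run(Block& block) {
        RunnerContext ctx;
        ctx.circular_pad_enabled = circular_pad_enabled;  // copied once per graph build
        block.forward(&ctx);
    }
};

int main() {
    Block block;
    Runner vae, unet;
    vae.set_circular_pad_enabled(true);  // e.g. wrap only in the VAE
    vae.run(block);
    unet.run(block);
    return 0;
}

This is why the patch touches every place a runner is constructed in stable-diffusion.cpp: the flag now has to be propagated per model rather than set once at context init.

From 15076b089ba064268ae5149c9047729d5e017643 Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Fri, 12 Dec 2025 13:51:57 -0800
Subject: [PATCH 13/25] move circular flag to the GGMLRunnerContext

---
 clip.hpp             |  2 +-
 ggml_extend.hpp      | 40 ++++++++++++++++++++++++++++----------
 lora.hpp             |  3 +++
 stable-diffusion.cpp | 12 ++++++++----
 4 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index 24c94f1bb..cda5a3015 100644
--- a/clip.hpp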
+++ b/clip.hpp @@ -664,7 +664,7 @@ class CLIPVisionEmbeddings : public GGMLBlock { // concat(patch_embedding, class_embedding) + position_embedding struct ggml_tensor* patch_embedding; int64_t N = pixel_values->ne[3]; - patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size] + patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size, 0, 0, 1, 1, false, ctx->circular_pad_enabled); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size] patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches] patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim] patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1] diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 87a34e64d..6d71a0890 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1016,20 +1016,30 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* w, struct ggml_tensor* b, - int s0 = 1, - int s1 = 1, - int p0 = 0, - int p1 = 0, - int d0 = 1, - int d1 = 1, - bool direct = false, - float scale = 1.f) { + int s0 = 1, + int s1 = 1, + int p0 = 0, + int p1 = 0, + int d0 = 1, + int d1 = 1, + bool direct = false, + bool circular = false, + float scale = 1.f) { if (scale != 1.f) { x = ggml_scale(ctx, x, scale); } if (w->ne[2] != x->ne[2] && ggml_n_dims(w) == 2) { w = ggml_reshape_4d(ctx, w, 1, 1, w->ne[0], w->ne[1]); } + + // use circular padding (on a torus, x and y wrap around) for seamless textures + // see https://github.com/leejet/stable-diffusion.cpp/pull/914 + if (circular && (p0 != 0 || p1 != 0)) { + x = ggml_pad_ext_circular(ctx, x, p0, p0, p1, p1, 0, 0, 0, 0); + p0 = 0; + p1 = 0; + } + if (direct) { if (is_depthwise) { x = ggml_conv_2d_dw_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); @@ -1543,7 +1553,8 @@ struct WeightAdapter { int d0 = 1; int d1 = 1; bool direct = false; - float scale = 1.f; + bool circular = false; + float scale = 1.f; } conv2d; }; virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) = 0; @@ -1561,6 +1572,7 @@ struct GGMLRunnerContext { ggml_context* ggml_ctx = nullptr; bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; + bool circular_pad_enabled = false; std::shared_ptr weight_adapter = nullptr; }; @@ -1597,6 +1609,7 @@ struct GGMLRunner { bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; + bool circular_pad_enabled = false; void alloc_params_ctx() { struct ggml_init_params params; @@ -1874,6 +1887,7 @@ struct GGMLRunner { runner_ctx.backend = runtime_backend; runner_ctx.flash_attn_enabled = flash_attn_enabled; runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled; + runner_ctx.circular_pad_enabled = circular_pad_enabled; runner_ctx.weight_adapter = weight_adapter; return runner_ctx; } @@ -2018,6 +2032,10 @@ struct GGMLRunner { conv2d_direct_enabled = enabled; } + void set_circular_pad_enabled(bool enabled) { + circular_pad_enabled = enabled; + } + void set_weight_adapter(const std::shared_ptr& adapter) { weight_adapter = adapter; } @@ -2289,7 +2307,8 @@ class Conv2d : public UnaryBlock { forward_params.conv2d.d0 = dilation.second; 
forward_params.conv2d.d1 = dilation.first; forward_params.conv2d.direct = ctx->conv2d_direct_enabled; - forward_params.conv2d.scale = scale; + forward_params.conv2d.circular = ctx->circular_pad_enabled; + forward_params.conv2d.scale = scale; return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); } return ggml_ext_conv_2d(ctx->ggml_ctx, @@ -2303,6 +2322,7 @@ class Conv2d : public UnaryBlock { dilation.second, dilation.first, ctx->conv2d_direct_enabled, + ctx->circular_pad_enabled, scale); } }; diff --git a/lora.hpp b/lora.hpp index b847f044c..321e63bca 100644 --- a/lora.hpp +++ b/lora.hpp @@ -599,6 +599,7 @@ struct LoraModel : public GGMLRunner { forward_params.conv2d.d0, forward_params.conv2d.d1, forward_params.conv2d.direct, + forward_params.conv2d.circular, forward_params.conv2d.scale); if (lora_mid) { lx = ggml_ext_conv_2d(ctx, @@ -612,6 +613,7 @@ struct LoraModel : public GGMLRunner { 1, 1, forward_params.conv2d.direct, + forward_params.conv2d.circular, forward_params.conv2d.scale); } lx = ggml_ext_conv_2d(ctx, @@ -625,6 +627,7 @@ struct LoraModel : public GGMLRunner { 1, 1, forward_params.conv2d.direct, + forward_params.conv2d.circular, forward_params.conv2d.scale); } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 9c4c69349..ec0dab2b1 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -212,10 +212,6 @@ class StableDiffusionGGML { use_tiny_autoencoder = taesd_path.size() > 0; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; circular_pad = sd_ctx_params->circular_pad; - sd_set_circular_padding_enabled(circular_pad); - if (circular_pad) { - LOG_INFO("Using circular padding for convolutions"); - } rng = get_rng(sd_ctx_params->rng_type); if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT && sd_ctx_params->sampler_rng_type != sd_ctx_params->rng_type) { @@ -393,6 +389,10 @@ class StableDiffusionGGML { vae_decode_only = false; } + if (circular_pad) { + LOG_INFO("Using circular padding for convolutions"); + } + bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; { @@ -540,6 +540,7 @@ class StableDiffusionGGML { LOG_INFO("Using Conv2d direct in the diffusion model"); std::dynamic_pointer_cast(diffusion_model)->unet.set_conv2d_direct_enabled(true); } + std::dynamic_pointer_cast(diffusion_model)->unet.set_circular_pad_enabled(circular_pad); } if (sd_ctx_params->diffusion_flash_attn) { @@ -602,6 +603,7 @@ class StableDiffusionGGML { vae_conv_2d_scale); first_stage_model->set_conv2d_scale(vae_conv_2d_scale); } + first_stage_model->set_circular_pad_enabled(circular_pad); first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } @@ -616,6 +618,7 @@ class StableDiffusionGGML { LOG_INFO("Using Conv2d direct in the tae model"); tae_first_stage->set_conv2d_direct_enabled(true); } + tae_first_stage->set_circular_pad_enabled(circular_pad); } // first_stage_model->get_param_tensors(tensors, "first_stage_model."); @@ -635,6 +638,7 @@ class StableDiffusionGGML { LOG_INFO("Using Conv2d direct in the control net"); control_net->set_conv2d_direct_enabled(true); } + control_net->set_circular_pad_enabled(circular_pad); } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { From bf28347524530b13256473e30a3f95044b9642fa Mon Sep 17 00:00:00 2001 From: Phylliida Date: Fri, 12 Dec 2025 13:53:58 -0800 Subject: [PATCH 14/25] Pass through circular param in all places where conv is called --- lora.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lora.hpp b/lora.hpp index 
321e63bca..e6af66798 100644 --- a/lora.hpp +++ b/lora.hpp @@ -782,6 +782,7 @@ struct MultiLoraAdapter : public WeightAdapter { forward_params.conv2d.d0, forward_params.conv2d.d1, forward_params.conv2d.direct, + forward_params.conv2d.circular, forward_params.conv2d.scale); } for (auto& lora_model : lora_models) { From 5f2de586beba9a33f3323ff4e54945839c9219f3 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Fri, 12 Dec 2025 14:19:40 -0800 Subject: [PATCH 15/25] fix of constant and minor cleanup --- common.hpp | 2 +- ggml_extend.hpp | 16 ++++++++-------- qwen_image.hpp | 3 ++- rope.hpp | 3 ++- wan.hpp | 2 +- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/common.hpp b/common.hpp index 0ea197990..33d499fb1 100644 --- a/common.hpp +++ b/common.hpp @@ -28,7 +28,7 @@ class DownSampleBlock : public GGMLBlock { if (vae_downsample) { auto conv = std::dynamic_pointer_cast(blocks["conv"]); - x = sd_pad(ctx, x, 1, 1, 0, 0); + x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); x = conv->forward(ctx, x); } else { auto conv = std::dynamic_pointer_cast(blocks["op"]); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 6d71a0890..b76a25f04 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -21,6 +21,10 @@ #include #include +#ifndef GGML_KQ_MASK_PAD +#define GGML_KQ_MASK_PAD 1 +#endif + #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml-cpu.h" @@ -1041,11 +1045,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx, } if (direct) { - if (is_depthwise) { - x = ggml_conv_2d_dw_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); - } else { - x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); - } + x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); } else { x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1); } @@ -1269,7 +1269,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context auto build_kqv = [&](ggml_tensor* q_in, ggml_tensor* k_in, ggml_tensor* v_in, ggml_tensor* mask_in) -> ggml_tensor* { if (kv_pad != 0) { - k_in = sd_pad(ctx, k_in, 0, kv_pad, 0, 0); + k_in = ggml_pad(ctx, k_in, 0, kv_pad, 0, 0); } if (kv_scale != 1.0f) { k_in = ggml_scale(ctx, k_in, kv_scale); @@ -1279,7 +1279,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context v_in = ggml_ext_cont(ctx, ggml_permute(ctx, v_in, 0, 2, 1, 3)); v_in = ggml_reshape_3d(ctx, v_in, d_head, L_k, n_kv_head * N); if (kv_pad != 0) { - v_in = sd_pad(ctx, v_in, 0, kv_pad, 0, 0); + v_in = ggml_pad(ctx, v_in, 0, kv_pad, 0, 0); } if (kv_scale != 1.0f) { v_in = ggml_scale(ctx, v_in, kv_scale); @@ -1302,7 +1302,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context mask_pad = GGML_PAD(L_q, GGML_KQ_MASK_PAD) - mask_in->ne[1]; } if (mask_pad > 0) { - mask_in = sd_pad(ctx, mask_in, 0, mask_pad, 0, 0); + mask_in = ggml_pad(ctx, mask_in, 0, mask_pad, 0, 0); } mask_in = ggml_cast(ctx, mask_in, GGML_TYPE_F16); } diff --git a/qwen_image.hpp b/qwen_image.hpp index 4dd13f8ac..6853787cd 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -361,7 +361,7 @@ namespace Qwen { int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - x = sd_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] return x; } @@ -565,6 +565,7 @@ namespace Qwen { ref_latents, increase_ref_index, qwen_image_params.theta, + circular_pad_enabled, 
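+                                                 // when enabled, the rope id generator records each image token's
+                                                 // patch-grid extent (wrap_dims in rope.hpp) so the position
+                                                 // embedding tiles: the last row/column of patches is treated as
+                                                 // adjacent to the first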
qwen_image_params.axes_dim); int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2; // LOG_DEBUG("pos_len %d", pos_len); diff --git a/rope.hpp b/rope.hpp index f9892929e..f84fe4f43 100644 --- a/rope.hpp +++ b/rope.hpp @@ -316,11 +316,12 @@ namespace Rope { const std::vector& ref_latents, bool increase_ref_index, int theta, + bool circular, const std::vector& axes_dim) { std::vector> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index); std::vector> wrap_dims; // This logic simply stores the (pad and patch_adjusted) sizes of images so we can make sure rope correctly tiles - if (sd_is_circular_padding_enabled() && bs > 0 && axes_dim.size() >= 3) { + if (circular && bs > 0 && axes_dim.size() >= 3) { int pad_h = (patch_size - (h % patch_size)) % patch_size; int pad_w = (patch_size - (w % patch_size)) % patch_size; int h_len = (h + pad_h) / patch_size; diff --git a/wan.hpp b/wan.hpp index c09c55cfd..75333bfe1 100644 --- a/wan.hpp +++ b/wan.hpp @@ -1835,7 +1835,7 @@ namespace WAN { int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size); int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size); int pad_w = (std::get<2>(params.patch_size) - W % std::get<2>(params.patch_size)) % std::get<2>(params.patch_size); - x = sd_pad(ctx, x, pad_w, pad_h, pad_t, 0); // [N*C, T + pad_t, H + pad_h, W + pad_w] + x = ggml_pad(ctx, x, pad_w, pad_h, pad_t, 0); // [N*C, T + pad_t, H + pad_h, W + pad_w] return x; } From d7d8da10998e12abff4ee508588d72a554189156 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Fri, 12 Dec 2025 14:23:03 -0800 Subject: [PATCH 16/25] Added back --circular option --- examples/cli/main.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 49b202fda..79c456271 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -518,6 +518,7 @@ struct SDContextParams { bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; bool vae_conv_direct = false; + bool circular_pad = false; bool chroma_use_dit_mask = true; bool chroma_use_t5_mask = false; @@ -671,6 +672,10 @@ struct SDContextParams { "--vae-conv-direct", "use ggml_conv2d_direct in the vae model", true, &vae_conv_direct}, + {"", + "--circular", + "enable circular padding for convolutions", + true, &circular_pad}, {"", "--chroma-disable-dit-mask", "disable dit mask for chroma", @@ -934,6 +939,7 @@ struct SDContextParams { << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n" + << " circular_pad: " << (circular_pad ? "true" : "false") << ",\n" << " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n" << " chroma_use_t5_mask: " << (chroma_use_t5_mask ? 
"true" : "false") << ",\n" << " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n" @@ -995,6 +1001,7 @@ struct SDContextParams { taesd_preview, diffusion_conv_direct, vae_conv_direct, + circular_pad, force_sdxl_vae_conv_scale, chroma_use_dit_mask, chroma_use_t5_mask, From 822f9a522aedd1a5923f8df5cb201c234116b74b Mon Sep 17 00:00:00 2001 From: Phylliida Date: Fri, 12 Dec 2025 15:40:45 -0800 Subject: [PATCH 17/25] Conv2d circular in vae and various models --- common.hpp | 8 +++++++- diffusion_model.hpp | 25 +++++++++++++++++++++++++ flux.hpp | 30 +++++++++++++++++------------- qwen_image.hpp | 16 ++++++++++------ stable-diffusion.cpp | 21 +++++++++++++++------ wan.hpp | 30 +++++++++++++++++++++++------- z_image.hpp | 16 ++++++++++------ 7 files changed, 107 insertions(+), 39 deletions(-) diff --git a/common.hpp b/common.hpp index 33d499fb1..a95e76a4f 100644 --- a/common.hpp +++ b/common.hpp @@ -28,7 +28,13 @@ class DownSampleBlock : public GGMLBlock { if (vae_downsample) { auto conv = std::dynamic_pointer_cast(blocks["conv"]); - x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); + // For VAE downsampling we manually pad by 1 before the stride-2 conv. + // Honor the global circular padding flag here to avoid seams in seamless mode. + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, 1, 1, 0, 0); + } else { + x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); + } x = conv->forward(ctx, x); } else { auto conv = std::dynamic_pointer_cast(blocks["op"]); diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 8c741fdc4..b6491291a 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -39,6 +39,7 @@ struct DiffusionModel { virtual void set_weight_adapter(const std::shared_ptr& adapter){}; virtual int64_t get_adm_in_channels() = 0; virtual void set_flash_attn_enabled(bool enabled) = 0; + virtual void set_circular_pad_enabled(bool enabled) = 0; }; struct UNetModel : public DiffusionModel { @@ -87,6 +88,10 @@ struct UNetModel : public DiffusionModel { unet.set_flash_attention_enabled(enabled); } + void set_circular_pad_enabled(bool enabled) override { + unet.set_circular_pad_enabled(enabled); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, @@ -148,6 +153,10 @@ struct MMDiTModel : public DiffusionModel { mmdit.set_flash_attention_enabled(enabled); } + void set_circular_pad_enabled(bool enabled) override { + mmdit.set_circular_pad_enabled(enabled); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, @@ -210,6 +219,10 @@ struct FluxModel : public DiffusionModel { flux.set_flash_attention_enabled(enabled); } + void set_circular_pad_enabled(bool enabled) override { + flux.set_circular_pad_enabled(enabled); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, @@ -277,6 +290,10 @@ struct WanModel : public DiffusionModel { wan.set_flash_attention_enabled(enabled); } + void set_circular_pad_enabled(bool enabled) override { + wan.set_circular_pad_enabled(enabled); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, @@ -343,6 +360,10 @@ struct QwenImageModel : public DiffusionModel { qwen_image.set_flash_attention_enabled(enabled); } + void set_circular_pad_enabled(bool enabled) override { + qwen_image.set_circular_pad_enabled(enabled); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, @@ -406,6 +427,10 @@ struct 
ZImageModel : public DiffusionModel { z_image.set_flash_attention_enabled(enabled); } + void set_circular_pad_enabled(bool enabled) override { + z_image.set_circular_pad_enabled(enabled); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, diff --git a/flux.hpp b/flux.hpp index 1df2874ae..602ab9bdb 100644 --- a/flux.hpp +++ b/flux.hpp @@ -858,14 +858,18 @@ namespace Flux { } } - struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx, + struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, struct ggml_tensor* x) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); + } else { + x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + } return x; } @@ -891,11 +895,11 @@ namespace Flux { return x; } - struct ggml_tensor* process_img(struct ggml_context* ctx, + struct ggml_tensor* process_img(GGMLRunnerContext* ctx, struct ggml_tensor* x) { // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size) x = pad_to_patch_size(ctx, x); - x = patchify(ctx, x); + x = patchify(ctx->ggml_ctx, x); return x; } @@ -1065,7 +1069,7 @@ namespace Flux { int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - auto img = pad_to_patch_size(ctx->ggml_ctx, x); + auto img = pad_to_patch_size(ctx, x); auto orig_img = img; auto img_in_patch = std::dynamic_pointer_cast(blocks["img_in_patch"]); @@ -1128,7 +1132,7 @@ namespace Flux { int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - auto img = process_img(ctx->ggml_ctx, x); + auto img = process_img(ctx, x); uint64_t img_tokens = img->ne[1]; if (params.version == VERSION_FLUX_FILL) { @@ -1136,8 +1140,8 @@ namespace Flux { ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); - masked = process_img(ctx->ggml_ctx, masked); - mask = process_img(ctx->ggml_ctx, mask); + masked = process_img(ctx, masked); + mask = process_img(ctx, mask); img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0); } else if (params.version == VERSION_FLEX_2) { @@ -1146,21 +1150,21 @@ namespace Flux { ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); ggml_tensor* control = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1)); - masked = process_img(ctx->ggml_ctx, masked); - mask = process_img(ctx->ggml_ctx, mask); - control = process_img(ctx->ggml_ctx, control); + masked = process_img(ctx, masked); + mask = process_img(ctx, mask); + control = process_img(ctx, control); img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0); } else if 
(params.version == VERSION_FLUX_CONTROLS) { GGML_ASSERT(c_concat != nullptr); - auto control = process_img(ctx->ggml_ctx, c_concat); + auto control = process_img(ctx, c_concat); img = ggml_concat(ctx->ggml_ctx, img, control, 0); } if (ref_latents.size() > 0) { for (ggml_tensor* ref : ref_latents) { - ref = process_img(ctx->ggml_ctx, ref); + ref = process_img(ctx, ref); img = ggml_concat(ctx->ggml_ctx, img, ref, 1); } } diff --git a/qwen_image.hpp b/qwen_image.hpp index 6853787cd..d35ff18dc 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -354,14 +354,18 @@ namespace Qwen { blocks["proj_out"] = std::shared_ptr(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels)); } - struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx, + struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, struct ggml_tensor* x) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); + } else { + x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + } return x; } @@ -387,10 +391,10 @@ namespace Qwen { return x; } - struct ggml_tensor* process_img(struct ggml_context* ctx, + struct ggml_tensor* process_img(GGMLRunnerContext* ctx, struct ggml_tensor* x) { x = pad_to_patch_size(ctx, x); - x = patchify(ctx, x); + x = patchify(ctx->ggml_ctx, x); return x; } @@ -466,12 +470,12 @@ namespace Qwen { int64_t C = x->ne[2]; int64_t N = x->ne[3]; - auto img = process_img(ctx->ggml_ctx, x); + auto img = process_img(ctx, x); uint64_t img_tokens = img->ne[1]; if (ref_latents.size() > 0) { for (ggml_tensor* ref : ref_latents) { - ref = process_img(ctx->ggml_ctx, ref); + ref = process_img(ctx, ref); img = ggml_concat(ctx->ggml_ctx, img, ref, 1); } } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ec0dab2b1..172ee2e13 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -408,6 +408,7 @@ class StableDiffusionGGML { diffusion_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map); + diffusion_model->set_circular_pad_enabled(circular_pad); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : tensor_storage_map) { @@ -448,6 +449,7 @@ class StableDiffusionGGML { tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); + diffusion_model->set_circular_pad_enabled(circular_pad); } else if (sd_version_is_flux2(version)) { bool is_chroma = false; cond_stage_model = std::make_shared(clip_backend, @@ -455,10 +457,11 @@ class StableDiffusionGGML { tensor_storage_map, version); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - version, - sd_ctx_params->chroma_use_dit_mask); + offload_params_to_cpu, + tensor_storage_map, + version, + sd_ctx_params->chroma_use_dit_mask); + diffusion_model->set_circular_pad_enabled(circular_pad); } else if (sd_version_is_wan(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -471,12 +474,14 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); + diffusion_model->set_circular_pad_enabled(circular_pad); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { high_noise_diffusion_model = std::make_shared(backend, 
offload_params_to_cpu, tensor_storage_map, "model.high_noise_diffusion_model", version); + high_noise_diffusion_model->set_circular_pad_enabled(circular_pad); } if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" || diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" || @@ -503,6 +508,7 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); + diffusion_model->set_circular_pad_enabled(circular_pad); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -513,6 +519,7 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); + diffusion_model->set_circular_pad_enabled(circular_pad); } else { // SD1.x SD2.x SDXL std::map embbeding_map; for (int i = 0; i < sd_ctx_params->embedding_count; i++) { @@ -538,8 +545,9 @@ class StableDiffusionGGML { version); if (sd_ctx_params->diffusion_conv_direct) { LOG_INFO("Using Conv2d direct in the diffusion model"); - std::dynamic_pointer_cast(diffusion_model)->unet.set_conv2d_direct_enabled(true); - } + std::dynamic_pointer_cast(diffusion_model)->unet.set_conv2d_direct_enabled(true); + } + diffusion_model->set_circular_pad_enabled(circular_pad); std::dynamic_pointer_cast(diffusion_model)->unet.set_circular_pad_enabled(circular_pad); } @@ -577,6 +585,7 @@ class StableDiffusionGGML { "first_stage_model", vae_decode_only, version); + first_stage_model->set_circular_pad_enabled(circular_pad); first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else if (version == VERSION_CHROMA_RADIANCE) { diff --git a/wan.hpp b/wan.hpp index 75333bfe1..4cab4032b 100644 --- a/wan.hpp +++ b/wan.hpp @@ -75,7 +75,11 @@ namespace WAN { lp2 -= (int)cache_x->ne[2]; } - x = ggml_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0); + if (ctx->circular_pad_enabled) { + x = ggml_pad_ext_circular(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, 0, 0, 0, 0); + } else { + x = ggml_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, 0, 0, 0, 0); + } return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels, std::get<2>(stride), std::get<1>(stride), std::get<0>(stride), 0, 0, 0, @@ -206,9 +210,17 @@ namespace WAN { } else if (mode == "upsample3d") { x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST); } else if (mode == "downsample2d") { - x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, 1, 1, 0, 0); + } else { + x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); + } } else if (mode == "downsample3d") { - x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, 1, 1, 0, 0); + } else { + x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); + } } x = resample_1->forward(ctx, x); x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2)); // (c, t, h, w) @@ -1826,7 +1838,7 @@ namespace WAN { } } - struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx, + struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, struct ggml_tensor* x) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; @@ -1835,7 +1847,11 @@ namespace WAN { int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size); int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size); int pad_w = (std::get<2>(params.patch_size) - W % std::get<2>(params.patch_size)) % 
std::get<2>(params.patch_size); - x = ggml_pad(ctx, x, pad_w, pad_h, pad_t, 0); // [N*C, T + pad_t, H + pad_h, W + pad_w] + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0); + } else { + x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0); + } return x; } @@ -1986,14 +2002,14 @@ namespace WAN { int64_t T = x->ne[2]; int64_t C = x->ne[3]; - x = pad_to_patch_size(ctx->ggml_ctx, x); + x = pad_to_patch_size(ctx, x); int64_t t_len = ((T + (std::get<0>(params.patch_size) / 2)) / std::get<0>(params.patch_size)); int64_t h_len = ((H + (std::get<1>(params.patch_size) / 2)) / std::get<1>(params.patch_size)); int64_t w_len = ((W + (std::get<2>(params.patch_size) / 2)) / std::get<2>(params.patch_size)); if (time_dim_concat != nullptr) { - time_dim_concat = pad_to_patch_size(ctx->ggml_ctx, time_dim_concat); + time_dim_concat = pad_to_patch_size(ctx, time_dim_concat); x = ggml_concat(ctx->ggml_ctx, x, time_dim_concat, 2); // [N*C, (T+pad_t) + (T2+pad_t2), H + pad_h, W + pad_w] t_len = ((x->ne[2] + (std::get<0>(params.patch_size) / 2)) / std::get<0>(params.patch_size)); } diff --git a/z_image.hpp b/z_image.hpp index bc554f177..3268e3057 100644 --- a/z_image.hpp +++ b/z_image.hpp @@ -324,14 +324,18 @@ namespace ZImage { blocks["final_layer"] = std::make_shared(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels); } - struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx, + struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, struct ggml_tensor* x) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size; int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size; - x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + } else { + x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + } return x; } @@ -357,10 +361,10 @@ namespace ZImage { return x; } - struct ggml_tensor* process_img(struct ggml_context* ctx, + struct ggml_tensor* process_img(GGMLRunnerContext* ctx, struct ggml_tensor* x) { x = pad_to_patch_size(ctx, x); - x = patchify(ctx, x); + x = patchify(ctx->ggml_ctx, x); return x; } @@ -473,12 +477,12 @@ namespace ZImage { int64_t C = x->ne[2]; int64_t N = x->ne[3]; - auto img = process_img(ctx->ggml_ctx, x); + auto img = process_img(ctx, x); uint64_t n_img_token = img->ne[1]; if (ref_latents.size() > 0) { for (ggml_tensor* ref : ref_latents) { - ref = process_img(ctx->ggml_ctx, ref); + ref = process_img(ctx, ref); img = ggml_concat(ctx->ggml_ctx, img, ref, 1); } } From 8e829edb29731b2ca6008c9ce1bacf750ce03887 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Fri, 12 Dec 2025 15:48:57 -0800 Subject: [PATCH 18/25] Fix temporal padding for qwen image and other vaes --- wan.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wan.hpp b/wan.hpp index 4cab4032b..18b7e3d29 100644 --- a/wan.hpp +++ b/wan.hpp @@ -76,9 +76,9 @@ namespace WAN { } if (ctx->circular_pad_enabled) { - x = ggml_pad_ext_circular(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, 0, 0, 0, 0); + x = ggml_pad_ext_circular(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0); } else { - x = ggml_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, 0, 0, 0, 0); + x = ggml_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 
0); } return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels, std::get<2>(stride), std::get<1>(stride), std::get<0>(stride), From 4054e3cc325b424071551b9b59b01eb347f1c209 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Sat, 13 Dec 2025 00:17:19 -0800 Subject: [PATCH 19/25] Z Image circular tiling --- rope.hpp | 22 +++++++++++++++++++++- z_image.hpp | 1 + 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/rope.hpp b/rope.hpp index f84fe4f43..982e98469 100644 --- a/rope.hpp +++ b/rope.hpp @@ -495,9 +495,29 @@ namespace Rope { const std::vector& ref_latents, bool increase_ref_index, int theta, + bool circular, const std::vector& axes_dim) { std::vector> ids = gen_z_image_ids(h, w, patch_size, bs, context_len, seq_multi_of, ref_latents, increase_ref_index); - return embed_nd(ids, bs, theta, axes_dim); + std::vector> wrap_dims; + if (circular && bs > 0 && axes_dim.size() >= 3) { + int pad_h = (patch_size - (h % patch_size)) % patch_size; + int pad_w = (patch_size - (w % patch_size)) % patch_size; + int h_len = (h + pad_h) / patch_size; + int w_len = (w + pad_w) / patch_size; + if (h_len > 0 && w_len > 0) { + size_t pos_len = ids.size() / bs; + wrap_dims.assign(axes_dim.size(), std::vector(pos_len, 0)); + size_t cursor = context_len + bound_mod(context_len, seq_multi_of); // skip text (and its padding) + size_t img_tokens = static_cast(h_len) * static_cast(w_len); + for (size_t token_i = 0; token_i < img_tokens; ++token_i) { + wrap_dims[1][cursor + token_i] = h_len; + wrap_dims[2][cursor + token_i] = w_len; + } + } + } + + const std::vector>* wraps_ptr = wrap_dims.empty() ? nullptr : &wrap_dims; + return embed_nd(ids, bs, theta, axes_dim, wraps_ptr); } __STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx, diff --git a/z_image.hpp b/z_image.hpp index 3268e3057..0955d4e9a 100644 --- a/z_image.hpp +++ b/z_image.hpp @@ -556,6 +556,7 @@ namespace ZImage { ref_latents, increase_ref_index, z_image_params.theta, + circular_pad_enabled, z_image_params.axes_dim); int pos_len = pe_vec.size() / z_image_params.axes_dim_sum / 2; // LOG_DEBUG("pos_len %d", pos_len); From 4b87268db6fd02e9d82c6865207d04e3be873e64 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Sat, 13 Dec 2025 01:07:00 -0800 Subject: [PATCH 20/25] x and y axis seamless only --- diffusion_model.hpp | 9 +++++++++ examples/cli/main.cpp | 20 +++++++++++++++++--- ggml_extend.hpp | 11 +++++++++++ qwen_image.hpp | 3 ++- rope.hpp | 34 ++++++++++++++++++++++++---------- stable-diffusion.cpp | 12 ++++++++++++ stable-diffusion.h | 2 ++ z_image.hpp | 3 ++- 8 files changed, 79 insertions(+), 15 deletions(-) diff --git a/diffusion_model.hpp b/diffusion_model.hpp index b6491291a..968c978cd 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -40,6 +40,7 @@ struct DiffusionModel { virtual int64_t get_adm_in_channels() = 0; virtual void set_flash_attn_enabled(bool enabled) = 0; virtual void set_circular_pad_enabled(bool enabled) = 0; + virtual void set_rope_circular_axes(bool circular_x, bool circular_y) {}; }; struct UNetModel : public DiffusionModel { @@ -364,6 +365,10 @@ struct QwenImageModel : public DiffusionModel { qwen_image.set_circular_pad_enabled(enabled); } + void set_rope_circular_axes(bool circular_x, bool circular_y) override { + qwen_image.set_circular_rope_enabled(circular_x, circular_y); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, @@ -431,6 +436,10 @@ struct ZImageModel : public DiffusionModel { z_image.set_circular_pad_enabled(enabled); } 
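+    // note: circular padding only wraps convolution borders; the rope axes below
+    // additionally mark image tokens with their patch-grid extent (wrap_dims in
+    // rope.hpp) so that the position embedding itself tiles across the seam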
+ void set_rope_circular_axes(bool circular_x, bool circular_y) override { + z_image.set_circular_rope_enabled(circular_x, circular_y); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 79c456271..2aa3446ed 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -519,6 +519,8 @@ struct SDContextParams { bool diffusion_conv_direct = false; bool vae_conv_direct = false; bool circular_pad = false; + bool circular_pad_x = false; + bool circular_pad_y = false; bool chroma_use_dit_mask = true; bool chroma_use_t5_mask = false; @@ -673,9 +675,17 @@ struct SDContextParams { "use ggml_conv2d_direct in the vae model", true, &vae_conv_direct}, {"", - "--circular", - "enable circular padding for convolutions", - true, &circular_pad}, + "--circular", + "enable circular padding for convolutions", + true, &circular_pad}, + {"", + "--circularx", + "enable circular RoPE wrapping on x-axis (width) only", + true, &circular_pad_x}, + {"", + "--circulary", + "enable circular RoPE wrapping on y-axis (height) only", + true, &circular_pad_y}, {"", "--chroma-disable-dit-mask", "disable dit mask for chroma", @@ -940,6 +950,8 @@ struct SDContextParams { << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n" << " circular_pad: " << (circular_pad ? "true" : "false") << ",\n" + << " circular_pad_x: " << (circular_pad_x ? "true" : "false") << ",\n" + << " circular_pad_y: " << (circular_pad_y ? "true" : "false") << ",\n" << " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n" << " chroma_use_t5_mask: " << (chroma_use_t5_mask ? 
"true" : "false") << ",\n" << " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n" @@ -1002,6 +1014,8 @@ struct SDContextParams { diffusion_conv_direct, vae_conv_direct, circular_pad, + circular_pad || circular_pad_x, + circular_pad || circular_pad_y, force_sdxl_vae_conv_scale, chroma_use_dit_mask, chroma_use_t5_mask, diff --git a/ggml_extend.hpp b/ggml_extend.hpp index b76a25f04..a4224d3c6 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1573,6 +1573,8 @@ struct GGMLRunnerContext { bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; bool circular_pad_enabled = false; + bool rope_circular_x_enabled = false; + bool rope_circular_y_enabled = false; std::shared_ptr weight_adapter = nullptr; }; @@ -1610,6 +1612,8 @@ struct GGMLRunner { bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; bool circular_pad_enabled = false; + bool rope_circular_x_enabled = false; + bool rope_circular_y_enabled = false; void alloc_params_ctx() { struct ggml_init_params params; @@ -1888,6 +1892,8 @@ struct GGMLRunner { runner_ctx.flash_attn_enabled = flash_attn_enabled; runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled; runner_ctx.circular_pad_enabled = circular_pad_enabled; + runner_ctx.rope_circular_x_enabled = rope_circular_x_enabled; + runner_ctx.rope_circular_y_enabled = rope_circular_y_enabled; runner_ctx.weight_adapter = weight_adapter; return runner_ctx; } @@ -2036,6 +2042,11 @@ struct GGMLRunner { circular_pad_enabled = enabled; } + void set_circular_rope_enabled(bool circular_x, bool circular_y) { + rope_circular_x_enabled = circular_x; + rope_circular_y_enabled = circular_y; + } + void set_weight_adapter(const std::shared_ptr& adapter) { weight_adapter = adapter; } diff --git a/qwen_image.hpp b/qwen_image.hpp index d35ff18dc..d24c3d83e 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -569,7 +569,8 @@ namespace Qwen { ref_latents, increase_ref_index, qwen_image_params.theta, - circular_pad_enabled, + rope_circular_y_enabled, + rope_circular_x_enabled, qwen_image_params.axes_dim); int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2; // LOG_DEBUG("pos_len %d", pos_len); diff --git a/rope.hpp b/rope.hpp index 982e98469..55695f55c 100644 --- a/rope.hpp +++ b/rope.hpp @@ -316,12 +316,13 @@ namespace Rope { const std::vector& ref_latents, bool increase_ref_index, int theta, - bool circular, + bool circular_h, + bool circular_w, const std::vector& axes_dim) { std::vector> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index); std::vector> wrap_dims; // This logic simply stores the (pad and patch_adjusted) sizes of images so we can make sure rope correctly tiles - if (circular && bs > 0 && axes_dim.size() >= 3) { + if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) { int pad_h = (patch_size - (h % patch_size)) % patch_size; int pad_w = (patch_size - (w % patch_size)) % patch_size; int h_len = (h + pad_h) / patch_size; @@ -333,8 +334,12 @@ namespace Rope { size_t cursor = context_len; // ignore text tokens const size_t img_tokens = static_cast(h_len) * static_cast(w_len); for (size_t token_i = 0; token_i < img_tokens; ++token_i) { - wrap_dims[1][cursor + token_i] = h_len; - wrap_dims[2][cursor + token_i] = w_len; + if (circular_h) { + wrap_dims[1][cursor + token_i] = h_len; + } + if (circular_w) { + wrap_dims[2][cursor + token_i] = w_len; + } } cursor += img_tokens; // For each reference image, store wrap sizes as well @@ -350,8 +355,12 @@ namespace Rope { int ref_w_len = (ref_w + ref_pad_w) / 
patch_size; size_t ref_n_tokens = static_cast(ref_h_len) * static_cast(ref_w_len); for (size_t token_i = 0; token_i < ref_n_tokens; ++token_i) { - wrap_dims[1][cursor + token_i] = ref_h_len; - wrap_dims[2][cursor + token_i] = ref_w_len; + if (circular_h) { + wrap_dims[1][cursor + token_i] = ref_h_len; + } + if (circular_w) { + wrap_dims[2][cursor + token_i] = ref_w_len; + } } cursor += ref_n_tokens; } @@ -495,11 +504,12 @@ namespace Rope { const std::vector& ref_latents, bool increase_ref_index, int theta, - bool circular, + bool circular_h, + bool circular_w, const std::vector& axes_dim) { std::vector> ids = gen_z_image_ids(h, w, patch_size, bs, context_len, seq_multi_of, ref_latents, increase_ref_index); std::vector> wrap_dims; - if (circular && bs > 0 && axes_dim.size() >= 3) { + if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) { int pad_h = (patch_size - (h % patch_size)) % patch_size; int pad_w = (patch_size - (w % patch_size)) % patch_size; int h_len = (h + pad_h) / patch_size; @@ -510,8 +520,12 @@ namespace Rope { size_t cursor = context_len + bound_mod(context_len, seq_multi_of); // skip text (and its padding) size_t img_tokens = static_cast(h_len) * static_cast(w_len); for (size_t token_i = 0; token_i < img_tokens; ++token_i) { - wrap_dims[1][cursor + token_i] = h_len; - wrap_dims[2][cursor + token_i] = w_len; + if (circular_h) { + wrap_dims[1][cursor + token_i] = h_len; + } + if (circular_w) { + wrap_dims[2][cursor + token_i] = w_len; + } } } } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 172ee2e13..ffa322a64 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -128,6 +128,8 @@ class StableDiffusionGGML { sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0}; bool offload_params_to_cpu = false; bool circular_pad = false; + bool circular_pad_x = false; + bool circular_pad_y = false; bool stacked_id = false; bool is_using_v_parameterization = false; @@ -212,6 +214,8 @@ class StableDiffusionGGML { use_tiny_autoencoder = taesd_path.size() > 0; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; circular_pad = sd_ctx_params->circular_pad; + circular_pad_x = sd_ctx_params->circular_pad_x || circular_pad; + circular_pad_y = sd_ctx_params->circular_pad_y || circular_pad; rng = get_rng(sd_ctx_params->rng_type); if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT && sd_ctx_params->sampler_rng_type != sd_ctx_params->rng_type) { @@ -509,6 +513,7 @@ class StableDiffusionGGML { "model.diffusion_model", version); diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -520,6 +525,7 @@ class StableDiffusionGGML { "model.diffusion_model", version); diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y); } else { // SD1.x SD2.x SDXL std::map embbeding_map; for (int i = 0; i < sd_ctx_params->embedding_count; i++) { @@ -2531,6 +2537,8 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->keep_vae_on_cpu = false; sd_ctx_params->diffusion_flash_attn = false; sd_ctx_params->circular_pad = false; + sd_ctx_params->circular_pad_x = false; + sd_ctx_params->circular_pad_y = false; sd_ctx_params->chroma_use_dit_mask = true; sd_ctx_params->chroma_use_t5_mask = false; sd_ctx_params->chroma_t5_mask_pad = 1; @@ -2572,6 +2580,8 @@ char* 
sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "keep_vae_on_cpu: %s\n" "diffusion_flash_attn: %s\n" "circular_pad: %s\n" + "circular_pad_x: %s\n" + "circular_pad_y: %s\n" "chroma_use_dit_mask: %s\n" "chroma_use_t5_mask: %s\n" "chroma_t5_mask_pad: %d\n", @@ -2603,6 +2613,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { BOOL_STR(sd_ctx_params->keep_vae_on_cpu), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_pad), + BOOL_STR(sd_ctx_params->circular_pad_x), + BOOL_STR(sd_ctx_params->circular_pad_y), BOOL_STR(sd_ctx_params->chroma_use_dit_mask), BOOL_STR(sd_ctx_params->chroma_use_t5_mask), sd_ctx_params->chroma_t5_mask_pad); diff --git a/stable-diffusion.h b/stable-diffusion.h index 87b5b0485..3eb1324f5 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -190,6 +190,8 @@ typedef struct { bool diffusion_conv_direct; bool vae_conv_direct; bool circular_pad; + bool circular_pad_x; + bool circular_pad_y; bool force_sdxl_vae_conv_scale; bool chroma_use_dit_mask; bool chroma_use_t5_mask; diff --git a/z_image.hpp b/z_image.hpp index 0955d4e9a..c87f1b9d9 100644 --- a/z_image.hpp +++ b/z_image.hpp @@ -556,7 +556,8 @@ namespace ZImage { ref_latents, increase_ref_index, z_image_params.theta, - circular_pad_enabled, + rope_circular_y_enabled, + rope_circular_x_enabled, z_image_params.axes_dim); int pos_len = pe_vec.size() / z_image_params.axes_dim_sum / 2; // LOG_DEBUG("pos_len %d", pos_len); From 935f98037525fa0a3aca3f3effd5083365c9bd44 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Sat, 13 Dec 2025 01:25:00 -0800 Subject: [PATCH 21/25] First attempt at chroma seamless x and y --- diffusion_model.hpp | 4 ++++ flux.hpp | 2 ++ rope.hpp | 45 +++++++++++++++++++++++++++++++++++++++++++- stable-diffusion.cpp | 2 ++ 4 files changed, 52 insertions(+), 1 deletion(-) diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 968c978cd..959d1f99a 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -224,6 +224,10 @@ struct FluxModel : public DiffusionModel { flux.set_circular_pad_enabled(enabled); } + void set_rope_circular_axes(bool circular_x, bool circular_y) override { + flux.set_circular_rope_enabled(circular_x, circular_y); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, diff --git a/flux.hpp b/flux.hpp index 602ab9bdb..7d47350ca 100644 --- a/flux.hpp +++ b/flux.hpp @@ -1445,6 +1445,8 @@ namespace Flux { increase_ref_index, flux_params.ref_index_scale, flux_params.theta, + rope_circular_y_enabled, + rope_circular_x_enabled, flux_params.axes_dim); int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2; // LOG_DEBUG("pos_len %d", pos_len); diff --git a/rope.hpp b/rope.hpp index 55695f55c..682cb641a 100644 --- a/rope.hpp +++ b/rope.hpp @@ -266,6 +266,8 @@ namespace Rope { bool increase_ref_index, float ref_index_scale, int theta, + bool circular_h, + bool circular_w, const std::vector& axes_dim) { std::vector> ids = gen_flux_ids(h, w, @@ -277,7 +279,48 @@ namespace Rope { ref_latents, increase_ref_index, ref_index_scale); - return embed_nd(ids, bs, theta, axes_dim); + std::vector> wrap_dims; + if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) { + int h_len = (h + (patch_size / 2)) / patch_size; + int w_len = (w + (patch_size / 2)) / patch_size; + if (h_len > 0 && w_len > 0) { + size_t pos_len = ids.size() / bs; + wrap_dims.assign(axes_dim.size(), std::vector(pos_len, 0)); + size_t cursor = context_len; // text first + const size_t img_tokens = 
static_cast<size_t>(h_len) * static_cast<size_t>(w_len);
+                for (size_t token_i = 0; token_i < img_tokens; ++token_i) {
+                    if (circular_h) {
+                        wrap_dims[1][cursor + token_i] = h_len;
+                    }
+                    if (circular_w) {
+                        wrap_dims[2][cursor + token_i] = w_len;
+                    }
+                }
+                cursor += img_tokens;
+                // reference latents
+                for (ggml_tensor* ref : ref_latents) {
+                    if (ref == nullptr) {
+                        continue;
+                    }
+                    int ref_h   = static_cast<int>(ref->ne[1]);
+                    int ref_w   = static_cast<int>(ref->ne[0]);
+                    int ref_h_l = (ref_h + (patch_size / 2)) / patch_size;
+                    int ref_w_l = (ref_w + (patch_size / 2)) / patch_size;
+                    size_t ref_tokens = static_cast<size_t>(ref_h_l) * static_cast<size_t>(ref_w_l);
+                    for (size_t token_i = 0; token_i < ref_tokens; ++token_i) {
+                        if (circular_h) {
+                            wrap_dims[1][cursor + token_i] = ref_h_l;
+                        }
+                        if (circular_w) {
+                            wrap_dims[2][cursor + token_i] = ref_w_l;
+                        }
+                    }
+                    cursor += ref_tokens;
+                }
+            }
+        }
+        const std::vector<std::vector<int>>* wraps_ptr = wrap_dims.empty() ? nullptr : &wrap_dims;
+        return embed_nd(ids, bs, theta, axes_dim, wraps_ptr);
     }
 
     __STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen_image_ids(int h,
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index ffa322a64..eabb51012 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -454,6 +454,7 @@ class StableDiffusionGGML {
                                                           version,
                                                           sd_ctx_params->chroma_use_dit_mask);
             diffusion_model->set_circular_pad_enabled(circular_pad);
+            diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y);
         } else if (sd_version_is_flux2(version)) {
             bool is_chroma = false;
             cond_stage_model = std::make_shared(clip_backend,
                                                 offload_params_to_cpu,
                                                 tensor_storage_map,
                                                 version);
             diffusion_model = std::make_shared(backend,
                                                offload_params_to_cpu,
                                                tensor_storage_map,
                                                version,
                                                sd_ctx_params->chroma_use_dit_mask);
             diffusion_model->set_circular_pad_enabled(circular_pad);
+            diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y);
         } else if (sd_version_is_wan(version)) {
             cond_stage_model = std::make_shared(clip_backend,
                                                 offload_params_to_cpu,

From 820fb6bba598a446febf0117d5fdab8bd421ae6f Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Sat, 13 Dec 2025 02:02:00 -0800
Subject: [PATCH 22/25] Refactor into pure x and y, almost there

---
 clip.hpp             |  2 +-
 common.hpp           |  8 +---
 diffusion_model.hpp  | 37 ++++++++++++++---
 flux.hpp             |  6 +--
 ggml_extend.hpp      | 95 +++++++++++++++++++++++++++++++++++++++-----
 lora.hpp             | 12 ++++--
 qwen_image.hpp       |  6 +--
 stable-diffusion.cpp | 47 +++++++++++-----------
 wan.hpp              | 25 ++----------
 z_image.hpp          |  6 +--
 10 files changed, 158 insertions(+), 86 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index cda5a3015..4b51727c4 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -664,7 +664,7 @@ class CLIPVisionEmbeddings : public GGMLBlock {
         // concat(patch_embedding, class_embedding) + position_embedding
         struct ggml_tensor* patch_embedding;
         int64_t N = pixel_values->ne[3];
-        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size, 0, 0, 1, 1, false, ctx->circular_pad_enabled);  // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
+        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size, 0, 0, 1, 1, false, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);  // [N, embed_dim, image_size // patch_size, image_size // patch_size]
         patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N);  // [N, embed_dim, num_patches]
         patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3));  // [N, num_patches, embed_dim]
         patch_embedding = ggml_reshape_4d(ctx->ggml_ctx,
patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1] diff --git a/common.hpp b/common.hpp index a95e76a4f..8d66422b1 100644 --- a/common.hpp +++ b/common.hpp @@ -29,12 +29,8 @@ class DownSampleBlock : public GGMLBlock { auto conv = std::dynamic_pointer_cast(blocks["conv"]); // For VAE downsampling we manually pad by 1 before the stride-2 conv. - // Honor the global circular padding flag here to avoid seams in seamless mode. - if (ctx->circular_pad_enabled) { - x = ggml_pad_circular(ctx->ggml_ctx, x, 1, 1, 0, 0); - } else { - x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); - } + // Honor the global circular padding flags here to avoid seams in seamless mode. + x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled); x = conv->forward(ctx, x); } else { auto conv = std::dynamic_pointer_cast(blocks["op"]); diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 959d1f99a..c73a50106 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -40,6 +40,7 @@ struct DiffusionModel { virtual int64_t get_adm_in_channels() = 0; virtual void set_flash_attn_enabled(bool enabled) = 0; virtual void set_circular_pad_enabled(bool enabled) = 0; + virtual void set_circular_pad_axes(bool circular_x, bool circular_y) = 0; virtual void set_rope_circular_axes(bool circular_x, bool circular_y) {}; }; @@ -90,7 +91,11 @@ struct UNetModel : public DiffusionModel { } void set_circular_pad_enabled(bool enabled) override { - unet.set_circular_pad_enabled(enabled); + unet.set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) override { + unet.set_circular_pad_axes(circular_x, circular_y); } bool compute(int n_threads, @@ -155,7 +160,11 @@ struct MMDiTModel : public DiffusionModel { } void set_circular_pad_enabled(bool enabled) override { - mmdit.set_circular_pad_enabled(enabled); + mmdit.set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) override { + mmdit.set_circular_pad_axes(circular_x, circular_y); } bool compute(int n_threads, @@ -221,7 +230,11 @@ struct FluxModel : public DiffusionModel { } void set_circular_pad_enabled(bool enabled) override { - flux.set_circular_pad_enabled(enabled); + flux.set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) override { + flux.set_circular_pad_axes(circular_x, circular_y); } void set_rope_circular_axes(bool circular_x, bool circular_y) override { @@ -296,7 +309,11 @@ struct WanModel : public DiffusionModel { } void set_circular_pad_enabled(bool enabled) override { - wan.set_circular_pad_enabled(enabled); + wan.set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) override { + wan.set_circular_pad_axes(circular_x, circular_y); } bool compute(int n_threads, @@ -366,7 +383,11 @@ struct QwenImageModel : public DiffusionModel { } void set_circular_pad_enabled(bool enabled) override { - qwen_image.set_circular_pad_enabled(enabled); + qwen_image.set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) override { + qwen_image.set_circular_pad_axes(circular_x, circular_y); } void set_rope_circular_axes(bool circular_x, bool circular_y) override { @@ -437,7 +458,11 @@ struct ZImageModel : public DiffusionModel { } void set_circular_pad_enabled(bool enabled) override { - z_image.set_circular_pad_enabled(enabled); + 
z_image.set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) override { + z_image.set_circular_pad_axes(circular_x, circular_y); } void set_rope_circular_axes(bool circular_x, bool circular_y) override { diff --git a/flux.hpp b/flux.hpp index 7d47350ca..65e91e106 100644 --- a/flux.hpp +++ b/flux.hpp @@ -865,11 +865,7 @@ namespace Flux { int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - if (ctx->circular_pad_enabled) { - x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); - } else { - x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] - } + x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled); return x; } diff --git a/ggml_extend.hpp b/ggml_extend.hpp index a4224d3c6..3c3b7aa28 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1012,6 +1012,69 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx, return x; } +__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx, + struct ggml_tensor* x, + int pad_w, + int pad_h, + int pad_t = 0, + int pad_d = 0, + bool circular_x = false, + bool circular_y = false) { + if ((circular_x && circular_y) || (!circular_x && !circular_y)) { + return circular_x && circular_y ? ggml_pad_circular(ctx, x, pad_w, pad_h, pad_t, pad_d) + : ggml_pad(ctx, x, pad_w, pad_h, pad_t, pad_d); + } + + int rem_w = pad_w; + int rem_h = pad_h; + + if (circular_x && pad_w != 0) { + x = ggml_pad_circular(ctx, x, pad_w, 0, 0, 0); + rem_w = 0; + } + if (circular_y && pad_h != 0) { + x = ggml_pad_circular(ctx, x, 0, pad_h, 0, 0); + rem_h = 0; + } + + if (rem_w != 0 || rem_h != 0 || pad_t != 0 || pad_d != 0) { + x = ggml_pad(ctx, x, rem_w, rem_h, pad_t, pad_d); + } + return x; +} + +__STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx, + struct ggml_tensor* x, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3, + bool circular_x = false, + bool circular_y = false) { + if ((circular_x && circular_y) || (!circular_x && !circular_y)) { + return circular_x && circular_y ? 
ggml_pad_ext_circular(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) + : ggml_pad_ext(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + } + + if (circular_x && (lp0 != 0 || rp0 != 0)) { + x = ggml_pad_ext_circular(ctx, x, lp0, rp0, 0, 0, 0, 0, 0, 0); + lp0 = rp0 = 0; + } + if (circular_y && (lp1 != 0 || rp1 != 0)) { + x = ggml_pad_ext_circular(ctx, x, 0, 0, lp1, rp1, 0, 0, 0, 0); + lp1 = rp1 = 0; + } + + if (lp0 != 0 || rp0 != 0 || lp1 != 0 || rp1 != 0 || lp2 != 0 || rp2 != 0 || lp3 != 0 || rp3 != 0) { + x = ggml_pad_ext(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + } + return x; +} + // w: [OC,IC, KH, KW] // x: [N, IC, IH, IW] // b: [OC,] @@ -1027,7 +1090,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx, int d0 = 1, int d1 = 1, bool direct = false, - bool circular = false, + bool circular_x = false, + bool circular_y = false, float scale = 1.f) { if (scale != 1.f) { x = ggml_scale(ctx, x, scale); @@ -1038,8 +1102,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx, // use circular padding (on a torus, x and y wrap around) for seamless textures // see https://github.com/leejet/stable-diffusion.cpp/pull/914 - if (circular && (p0 != 0 || p1 != 0)) { - x = ggml_pad_ext_circular(ctx, x, p0, p0, p1, p1, 0, 0, 0, 0); + if ((p0 != 0 || p1 != 0) && (circular_x || circular_y)) { + x = sd_pad(ctx, x, p0, p1, 0, 0, circular_x, circular_y); p0 = 0; p1 = 0; } @@ -1553,7 +1617,8 @@ struct WeightAdapter { int d0 = 1; int d1 = 1; bool direct = false; - bool circular = false; + bool circular_x = false; + bool circular_y = false; float scale = 1.f; } conv2d; }; @@ -1572,7 +1637,8 @@ struct GGMLRunnerContext { ggml_context* ggml_ctx = nullptr; bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; - bool circular_pad_enabled = false; + bool circular_pad_x_enabled = false; + bool circular_pad_y_enabled = false; bool rope_circular_x_enabled = false; bool rope_circular_y_enabled = false; std::shared_ptr weight_adapter = nullptr; @@ -1611,7 +1677,8 @@ struct GGMLRunner { bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; - bool circular_pad_enabled = false; + bool circular_pad_x_enabled = false; + bool circular_pad_y_enabled = false; bool rope_circular_x_enabled = false; bool rope_circular_y_enabled = false; @@ -1891,7 +1958,8 @@ struct GGMLRunner { runner_ctx.backend = runtime_backend; runner_ctx.flash_attn_enabled = flash_attn_enabled; runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled; - runner_ctx.circular_pad_enabled = circular_pad_enabled; + runner_ctx.circular_pad_x_enabled = circular_pad_x_enabled; + runner_ctx.circular_pad_y_enabled = circular_pad_y_enabled; runner_ctx.rope_circular_x_enabled = rope_circular_x_enabled; runner_ctx.rope_circular_y_enabled = rope_circular_y_enabled; runner_ctx.weight_adapter = weight_adapter; @@ -2039,7 +2107,12 @@ struct GGMLRunner { } void set_circular_pad_enabled(bool enabled) { - circular_pad_enabled = enabled; + set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) { + circular_pad_x_enabled = circular_x; + circular_pad_y_enabled = circular_y; } void set_circular_rope_enabled(bool circular_x, bool circular_y) { @@ -2318,7 +2391,8 @@ class Conv2d : public UnaryBlock { forward_params.conv2d.d0 = dilation.second; forward_params.conv2d.d1 = dilation.first; forward_params.conv2d.direct = ctx->conv2d_direct_enabled; - forward_params.conv2d.circular = ctx->circular_pad_enabled; + 
forward_params.conv2d.circular_x = ctx->circular_pad_x_enabled; + forward_params.conv2d.circular_y = ctx->circular_pad_y_enabled; forward_params.conv2d.scale = scale; return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); } @@ -2333,7 +2407,8 @@ class Conv2d : public UnaryBlock { dilation.second, dilation.first, ctx->conv2d_direct_enabled, - ctx->circular_pad_enabled, + ctx->circular_pad_x_enabled, + ctx->circular_pad_y_enabled, scale); } }; diff --git a/lora.hpp b/lora.hpp index e6af66798..7d83ec5cd 100644 --- a/lora.hpp +++ b/lora.hpp @@ -599,7 +599,8 @@ struct LoraModel : public GGMLRunner { forward_params.conv2d.d0, forward_params.conv2d.d1, forward_params.conv2d.direct, - forward_params.conv2d.circular, + forward_params.conv2d.circular_x, + forward_params.conv2d.circular_y, forward_params.conv2d.scale); if (lora_mid) { lx = ggml_ext_conv_2d(ctx, @@ -613,7 +614,8 @@ struct LoraModel : public GGMLRunner { 1, 1, forward_params.conv2d.direct, - forward_params.conv2d.circular, + forward_params.conv2d.circular_x, + forward_params.conv2d.circular_y, forward_params.conv2d.scale); } lx = ggml_ext_conv_2d(ctx, @@ -627,7 +629,8 @@ struct LoraModel : public GGMLRunner { 1, 1, forward_params.conv2d.direct, - forward_params.conv2d.circular, + forward_params.conv2d.circular_x, + forward_params.conv2d.circular_y, forward_params.conv2d.scale); } @@ -782,7 +785,8 @@ struct MultiLoraAdapter : public WeightAdapter { forward_params.conv2d.d0, forward_params.conv2d.d1, forward_params.conv2d.direct, - forward_params.conv2d.circular, + forward_params.conv2d.circular_x, + forward_params.conv2d.circular_y, forward_params.conv2d.scale); } for (auto& lora_model : lora_models) { diff --git a/qwen_image.hpp b/qwen_image.hpp index d24c3d83e..169e1e325 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -361,11 +361,7 @@ namespace Qwen { int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - if (ctx->circular_pad_enabled) { - x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); - } else { - x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] - } + x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled); return x; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index eabb51012..6a51b6b48 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -216,6 +216,7 @@ class StableDiffusionGGML { circular_pad = sd_ctx_params->circular_pad; circular_pad_x = sd_ctx_params->circular_pad_x || circular_pad; circular_pad_y = sd_ctx_params->circular_pad_y || circular_pad; + bool circular_pad_any = circular_pad || circular_pad_x || circular_pad_y; rng = get_rng(sd_ctx_params->rng_type); if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT && sd_ctx_params->sampler_rng_type != sd_ctx_params->rng_type) { @@ -393,7 +394,7 @@ class StableDiffusionGGML { vae_decode_only = false; } - if (circular_pad) { + if (circular_pad_any) { LOG_INFO("Using circular padding for convolutions"); } @@ -412,7 +413,7 @@ class StableDiffusionGGML { diffusion_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map); - diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : tensor_storage_map) { @@ -453,7 +454,7 @@ class 
StableDiffusionGGML { tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); - diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y); } else if (sd_version_is_flux2(version)) { bool is_chroma = false; @@ -466,7 +467,7 @@ class StableDiffusionGGML { tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); - diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y); } else if (sd_version_is_wan(version)) { cond_stage_model = std::make_shared(clip_backend, @@ -480,14 +481,14 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); - diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { high_noise_diffusion_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "model.high_noise_diffusion_model", version); - high_noise_diffusion_model->set_circular_pad_enabled(circular_pad); + high_noise_diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); } if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" || diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" || @@ -514,7 +515,7 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); - diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(clip_backend, @@ -526,7 +527,7 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); - diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y); } else { // SD1.x SD2.x SDXL std::map embbeding_map; @@ -547,17 +548,17 @@ class StableDiffusionGGML { embbeding_map, version); } - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - version); - if (sd_ctx_params->diffusion_conv_direct) { - LOG_INFO("Using Conv2d direct in the diffusion model"); - std::dynamic_pointer_cast(diffusion_model)->unet.set_conv2d_direct_enabled(true); - } - diffusion_model->set_circular_pad_enabled(circular_pad); - std::dynamic_pointer_cast(diffusion_model)->unet.set_circular_pad_enabled(circular_pad); - } + diffusion_model = std::make_shared(backend, + offload_params_to_cpu, + tensor_storage_map, + version); + if (sd_ctx_params->diffusion_conv_direct) { + LOG_INFO("Using Conv2d direct in the diffusion model"); + std::dynamic_pointer_cast(diffusion_model)->unet.set_conv2d_direct_enabled(true); + } + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); + std::dynamic_pointer_cast(diffusion_model)->unet.set_circular_pad_axes(circular_pad_x, circular_pad_y); + } if (sd_ctx_params->diffusion_flash_attn) { LOG_INFO("Using flash attention in the diffusion model"); @@ -593,7 +594,7 @@ class StableDiffusionGGML { "first_stage_model", vae_decode_only, version); - first_stage_model->set_circular_pad_enabled(circular_pad); + 
+        first_stage_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
         first_stage_model->alloc_params_buffer();
         first_stage_model->get_param_tensors(tensors, "first_stage_model");
     } else if (version == VERSION_CHROMA_RADIANCE) {
@@ -620,7 +621,7 @@ class StableDiffusionGGML {
                                                          vae_conv_2d_scale);
                 first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
             }
-            first_stage_model->set_circular_pad_enabled(circular_pad);
+            first_stage_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         }
@@ -635,7 +636,7 @@ class StableDiffusionGGML {
                 LOG_INFO("Using Conv2d direct in the tae model");
                 tae_first_stage->set_conv2d_direct_enabled(true);
             }
-            tae_first_stage->set_circular_pad_enabled(circular_pad);
+            tae_first_stage->set_circular_pad_axes(circular_pad_x, circular_pad_y);
         }
         // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
@@ -655,7 +656,7 @@ class StableDiffusionGGML {
                 LOG_INFO("Using Conv2d direct in the control net");
                 control_net->set_conv2d_direct_enabled(true);
             }
-            control_net->set_circular_pad_enabled(circular_pad);
+            control_net->set_circular_pad_axes(circular_pad_x, circular_pad_y);
         }

         if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
diff --git a/wan.hpp b/wan.hpp
index 18b7e3d29..90091c70e 100644
--- a/wan.hpp
+++ b/wan.hpp
@@ -75,11 +75,7 @@ namespace WAN {
                 lp2 -= (int)cache_x->ne[2];
             }
-            if (ctx->circular_pad_enabled) {
-                x = ggml_pad_ext_circular(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0);
-            } else {
-                x = ggml_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0);
-            }
+            x = sd_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
             return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels,
                                     std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
                                     0, 0, 0,
@@ -210,17 +206,9 @@ namespace WAN {
             } else if (mode == "upsample3d") {
                 x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST);
             } else if (mode == "downsample2d") {
-                if (ctx->circular_pad_enabled) {
-                    x = ggml_pad_circular(ctx->ggml_ctx, x, 1, 1, 0, 0);
-                } else {
-                    x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
-                }
+                x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
             } else if (mode == "downsample3d") {
-                if (ctx->circular_pad_enabled) {
-                    x = ggml_pad_circular(ctx->ggml_ctx, x, 1, 1, 0, 0);
-                } else {
-                    x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
-                }
+                x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
             }
             x = resample_1->forward(ctx, x);
             x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));  // (c, t, h, w)
@@ -1847,12 +1835,7 @@ namespace WAN {
             int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size);
             int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size);
             int pad_w = (std::get<2>(params.patch_size) - W % std::get<2>(params.patch_size)) % std::get<2>(params.patch_size);
-            if (ctx->circular_pad_enabled) {
-                x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0);
-            } else {
-                x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0);
-            }
-
+            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
             return x;
         }
diff --git a/z_image.hpp b/z_image.hpp
index c87f1b9d9..cb64d7b0e 100644
--- a/z_image.hpp
+++ b/z_image.hpp
@@ -331,11 +331,7 @@ namespace ZImage {
             int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size;
             int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size;
-            if (ctx->circular_pad_enabled) {
-                x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
-            } else {
-                x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
-            }
+            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
             return x;
         }

From 32e1b7556b3d7919dcc981b3c6f1a031345b1b46 Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Sat, 13 Dec 2025 02:16:50 -0800
Subject: [PATCH 23/25] Fix crash on chroma

---
 ggml_extend.hpp | 43 ++++++++++++-------------------------------
 1 file changed, 12 insertions(+), 31 deletions(-)

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 3c3b7aa28..8c0e8518c 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1012,37 +1012,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx,
     return x;
 }
-__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx,
-                                             struct ggml_tensor* x,
-                                             int pad_w,
-                                             int pad_h,
-                                             int pad_t       = 0,
-                                             int pad_d       = 0,
-                                             bool circular_x = false,
-                                             bool circular_y = false) {
-    if ((circular_x && circular_y) || (!circular_x && !circular_y)) {
-        return circular_x && circular_y ? ggml_pad_circular(ctx, x, pad_w, pad_h, pad_t, pad_d)
-                                        : ggml_pad(ctx, x, pad_w, pad_h, pad_t, pad_d);
-    }
-
-    int rem_w = pad_w;
-    int rem_h = pad_h;
-
-    if (circular_x && pad_w != 0) {
-        x     = ggml_pad_circular(ctx, x, pad_w, 0, 0, 0);
-        rem_w = 0;
-    }
-    if (circular_y && pad_h != 0) {
-        x     = ggml_pad_circular(ctx, x, 0, pad_h, 0, 0);
-        rem_h = 0;
-    }
-
-    if (rem_w != 0 || rem_h != 0 || pad_t != 0 || pad_d != 0) {
-        x = ggml_pad(ctx, x, rem_w, rem_h, pad_t, pad_d);
-    }
-    return x;
-}
-
 __STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx,
                                                  struct ggml_tensor* x,
                                                  int lp0,
@@ -1075,6 +1044,18 @@ __STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx,
     return x;
 }
+__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx,
+                                             struct ggml_tensor* x,
+                                             int pad_w,
+                                             int pad_h,
+                                             int pad_t       = 0,
+                                             int pad_d       = 0,
+                                             bool circular_x = false,
+                                             bool circular_y = false) {
+    // Like ggml_pad, sd_pad pads only at the end of each dimension.
+    return sd_pad_ext(ctx, x, 0, pad_w, 0, pad_h, 0, pad_t, 0, pad_d, circular_x, circular_y);
+}
+
 // w: [OC,IC, KH, KW]
 // x: [N, IC, IH, IW]
 // b: [OC,]

From dc6e8870b222d9b15fdd0fbf3592ff5d0a5ec328 Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Sat, 13 Dec 2025 02:38:59 -0800
Subject: [PATCH 24/25] Refactor into cleaner variable choices

---
 clip.hpp              |  2 +-
 common.hpp            |  2 +-
 diffusion_model.hpp   | 65 +++++++++++++++++--------------------------
 examples/cli/main.cpp | 24 ++++++++--------
 flux.hpp              |  6 ++--
 ggml_extend.hpp       | 41 ++++++++++-----------------
 qwen_image.hpp        |  6 ++--
 stable-diffusion.cpp  | 64 ++++++++++++++++++++----------------------
 stable-diffusion.h    |  6 ++--
 wan.hpp               |  8 +++---
 z_image.hpp           |  6 ++--
 11 files changed, 101 insertions(+), 129 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index 4b51727c4..c5d7a19c6 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -664,7 +664,7 @@ class CLIPVisionEmbeddings : public GGMLBlock {
         // concat(patch_embedding, class_embedding) + position_embedding
         struct ggml_tensor* patch_embedding;
         int64_t N       = pixel_values->ne[3];
-        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size, 0, 0, 1, 1, false, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);  // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
+        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size, 0, 0, 1, 1, false, ctx->circular_x_enabled, ctx->circular_y_enabled);  // [N, embed_dim, image_size // patch_size, image_size // patch_size]
         patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N);          // [N, embed_dim, num_patches]
         patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3));  // [N, num_patches, embed_dim]
         patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N);       // [N, num_patches, embed_dim, 1]
diff --git a/common.hpp b/common.hpp
index 8d66422b1..3741e975a 100644
--- a/common.hpp
+++ b/common.hpp
@@ -30,7 +30,7 @@ class DownSampleBlock : public GGMLBlock {
             // For VAE downsampling we manually pad by 1 before the stride-2 conv.
             // Honor the global circular padding flags here to avoid seams in seamless mode.
-            x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+            x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             x = conv->forward(ctx, x);
         } else {
             auto conv = std::dynamic_pointer_cast(blocks["op"]);
diff --git a/diffusion_model.hpp b/diffusion_model.hpp
index c73a50106..0b32babf8 100644
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@@ -39,9 +39,8 @@ struct DiffusionModel {
     virtual void set_weight_adapter(const std::shared_ptr& adapter){};
     virtual int64_t get_adm_in_channels()                                 = 0;
     virtual void set_flash_attn_enabled(bool enabled)                     = 0;
-    virtual void set_circular_pad_enabled(bool enabled)                   = 0;
-    virtual void set_circular_pad_axes(bool circular_x, bool circular_y)  = 0;
-    virtual void set_rope_circular_axes(bool circular_x, bool circular_y) {};
+    virtual void set_circular_enabled(bool enabled)                       = 0;
+    virtual void set_circular_axes(bool circular_x, bool circular_y)      = 0;
 };

 struct UNetModel : public DiffusionModel {
@@ -90,12 +89,12 @@ struct UNetModel : public DiffusionModel {
         unet.set_flash_attention_enabled(enabled);
     }

-    void set_circular_pad_enabled(bool enabled) override {
-        unet.set_circular_pad_axes(enabled, enabled);
+    void set_circular_enabled(bool enabled) override {
+        unet.set_circular_axes(enabled, enabled);
     }

-    void set_circular_pad_axes(bool circular_x, bool circular_y) override {
-        unet.set_circular_pad_axes(circular_x, circular_y);
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        unet.set_circular_axes(circular_x, circular_y);
     }

     bool compute(int n_threads,
@@ -159,12 +158,12 @@ struct MMDiTModel : public DiffusionModel {
         mmdit.set_flash_attention_enabled(enabled);
     }

-    void set_circular_pad_enabled(bool enabled) override {
-        mmdit.set_circular_pad_axes(enabled, enabled);
+    void set_circular_enabled(bool enabled) override {
+        mmdit.set_circular_axes(enabled, enabled);
     }

-    void set_circular_pad_axes(bool circular_x, bool circular_y) override {
-        mmdit.set_circular_pad_axes(circular_x, circular_y);
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        mmdit.set_circular_axes(circular_x, circular_y);
     }

     bool compute(int n_threads,
@@ -229,16 +228,12 @@ struct FluxModel : public DiffusionModel {
         flux.set_flash_attention_enabled(enabled);
     }

-    void set_circular_pad_enabled(bool enabled) override {
-        flux.set_circular_pad_axes(enabled, enabled);
+    void set_circular_enabled(bool enabled) override {
+        flux.set_circular_axes(enabled, enabled);
     }

-    void set_circular_pad_axes(bool circular_x, bool circular_y) override {
-        flux.set_circular_pad_axes(circular_x, circular_y);
-    }
-
-    void set_rope_circular_axes(bool circular_x, bool circular_y) override {
-        flux.set_circular_rope_enabled(circular_x, circular_y);
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        flux.set_circular_axes(circular_x, circular_y);
     }

     bool compute(int n_threads,
@@ -308,12 +303,12 @@ struct WanModel : public DiffusionModel {
         wan.set_flash_attention_enabled(enabled);
     }

-    void set_circular_pad_enabled(bool enabled) override {
-        wan.set_circular_pad_axes(enabled, enabled);
+    void set_circular_enabled(bool enabled) override {
+        wan.set_circular_axes(enabled, enabled);
     }

-    void set_circular_pad_axes(bool circular_x, bool circular_y) override {
-        wan.set_circular_pad_axes(circular_x, circular_y);
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        wan.set_circular_axes(circular_x, circular_y);
     }

     bool compute(int n_threads,
@@ -382,16 +377,12 @@ struct QwenImageModel : public DiffusionModel {
         qwen_image.set_flash_attention_enabled(enabled);
     }

-    void set_circular_pad_enabled(bool enabled) override {
-        qwen_image.set_circular_pad_axes(enabled, enabled);
-    }
-
-    void set_circular_pad_axes(bool circular_x, bool circular_y) override {
-        qwen_image.set_circular_pad_axes(circular_x, circular_y);
+    void set_circular_enabled(bool enabled) override {
+        qwen_image.set_circular_axes(enabled, enabled);
     }

-    void set_rope_circular_axes(bool circular_x, bool circular_y) override {
-        qwen_image.set_circular_rope_enabled(circular_x, circular_y);
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        qwen_image.set_circular_axes(circular_x, circular_y);
     }

     bool compute(int n_threads,
@@ -457,16 +448,12 @@ struct ZImageModel : public DiffusionModel {
         z_image.set_flash_attention_enabled(enabled);
     }

-    void set_circular_pad_enabled(bool enabled) override {
-        z_image.set_circular_pad_axes(enabled, enabled);
-    }
-
-    void set_circular_pad_axes(bool circular_x, bool circular_y) override {
-        z_image.set_circular_pad_axes(circular_x, circular_y);
+    void set_circular_enabled(bool enabled) override {
+        z_image.set_circular_axes(enabled, enabled);
     }

-    void set_rope_circular_axes(bool circular_x, bool circular_y) override {
-        z_image.set_circular_rope_enabled(circular_x, circular_y);
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        z_image.set_circular_axes(circular_x, circular_y);
     }

     bool compute(int n_threads,
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 2aa3446ed..e472ca2e6 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -518,9 +518,9 @@ struct SDContextParams {
     bool diffusion_flash_attn  = false;
     bool diffusion_conv_direct = false;
     bool vae_conv_direct       = false;
-    bool circular_pad          = false;
-    bool circular_pad_x        = false;
-    bool circular_pad_y        = false;
+    bool circular              = false;
+    bool circular_x            = false;
+    bool circular_y            = false;
     bool chroma_use_dit_mask   = true;
     bool chroma_use_t5_mask    = false;
@@ -677,15 +677,15 @@ struct SDContextParams {
             {"",
              "--circular",
              "enable circular padding for convolutions",
-             true, &circular_pad},
+             true, &circular},
             {"",
              "--circularx",
              "enable circular RoPE wrapping on x-axis (width) only",
-             true, &circular_pad_x},
+             true, &circular_x},
             {"",
              "--circulary",
              "enable circular RoPE wrapping on y-axis (height) only",
only", - true, &circular_pad_y}, + true, &circular_y}, {"", "--chroma-disable-dit-mask", "disable dit mask for chroma", @@ -949,9 +949,9 @@ struct SDContextParams { << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n" - << " circular_pad: " << (circular_pad ? "true" : "false") << ",\n" - << " circular_pad_x: " << (circular_pad_x ? "true" : "false") << ",\n" - << " circular_pad_y: " << (circular_pad_y ? "true" : "false") << ",\n" + << " circular: " << (circular ? "true" : "false") << ",\n" + << " circular_x: " << (circular_x ? "true" : "false") << ",\n" + << " circular_y: " << (circular_y ? "true" : "false") << ",\n" << " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n" << " chroma_use_t5_mask: " << (chroma_use_t5_mask ? "true" : "false") << ",\n" << " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n" @@ -1013,9 +1013,9 @@ struct SDContextParams { taesd_preview, diffusion_conv_direct, vae_conv_direct, - circular_pad, - circular_pad || circular_pad_x, - circular_pad || circular_pad_y, + circular, + circular || circular_x, + circular || circular_y, force_sdxl_vae_conv_scale, chroma_use_dit_mask, chroma_use_t5_mask, diff --git a/flux.hpp b/flux.hpp index 65e91e106..2038fe152 100644 --- a/flux.hpp +++ b/flux.hpp @@ -865,7 +865,7 @@ namespace Flux { int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled); + x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled); return x; } @@ -1441,8 +1441,8 @@ namespace Flux { increase_ref_index, flux_params.ref_index_scale, flux_params.theta, - rope_circular_y_enabled, - rope_circular_x_enabled, + circular_y_enabled, + circular_x_enabled, flux_params.axes_dim); int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2; // LOG_DEBUG("pos_len %d", pos_len); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 8c0e8518c..0d520fb10 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1618,10 +1618,8 @@ struct GGMLRunnerContext { ggml_context* ggml_ctx = nullptr; bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; - bool circular_pad_x_enabled = false; - bool circular_pad_y_enabled = false; - bool rope_circular_x_enabled = false; - bool rope_circular_y_enabled = false; + bool circular_x_enabled = false; + bool circular_y_enabled = false; std::shared_ptr weight_adapter = nullptr; }; @@ -1658,10 +1656,8 @@ struct GGMLRunner { bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; - bool circular_pad_x_enabled = false; - bool circular_pad_y_enabled = false; - bool rope_circular_x_enabled = false; - bool rope_circular_y_enabled = false; + bool circular_x_enabled = false; + bool circular_y_enabled = false; void alloc_params_ctx() { struct ggml_init_params params; @@ -1939,10 +1935,8 @@ struct GGMLRunner { runner_ctx.backend = runtime_backend; runner_ctx.flash_attn_enabled = flash_attn_enabled; runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled; - runner_ctx.circular_pad_x_enabled = circular_pad_x_enabled; - runner_ctx.circular_pad_y_enabled = circular_pad_y_enabled; - runner_ctx.rope_circular_x_enabled = rope_circular_x_enabled; - runner_ctx.rope_circular_y_enabled = 
+        runner_ctx.circular_x_enabled    = circular_x_enabled;
+        runner_ctx.circular_y_enabled    = circular_y_enabled;
         runner_ctx.weight_adapter        = weight_adapter;
         return runner_ctx;
     }
@@ -2087,18 +2081,13 @@ struct GGMLRunner {
         conv2d_direct_enabled = enabled;
     }

-    void set_circular_pad_enabled(bool enabled) {
-        set_circular_pad_axes(enabled, enabled);
+    void set_circular_enabled(bool enabled) {
+        set_circular_axes(enabled, enabled);
     }

-    void set_circular_pad_axes(bool circular_x, bool circular_y) {
-        circular_pad_x_enabled = circular_x;
-        circular_pad_y_enabled = circular_y;
-    }
-
-    void set_circular_rope_enabled(bool circular_x, bool circular_y) {
-        rope_circular_x_enabled = circular_x;
-        rope_circular_y_enabled = circular_y;
+    void set_circular_axes(bool circular_x, bool circular_y) {
+        circular_x_enabled = circular_x;
+        circular_y_enabled = circular_y;
     }

     void set_weight_adapter(const std::shared_ptr& adapter) {
@@ -2372,8 +2361,8 @@ class Conv2d : public UnaryBlock {
             forward_params.conv2d.d0         = dilation.second;
             forward_params.conv2d.d1         = dilation.first;
             forward_params.conv2d.direct     = ctx->conv2d_direct_enabled;
-            forward_params.conv2d.circular_x = ctx->circular_pad_x_enabled;
-            forward_params.conv2d.circular_y = ctx->circular_pad_y_enabled;
+            forward_params.conv2d.circular_x = ctx->circular_x_enabled;
+            forward_params.conv2d.circular_y = ctx->circular_y_enabled;
             forward_params.conv2d.scale      = scale;
             return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params);
         }
@@ -2388,8 +2377,8 @@ class Conv2d : public UnaryBlock {
                                 dilation.second,
                                 dilation.first,
                                 ctx->conv2d_direct_enabled,
-                                ctx->circular_pad_x_enabled,
-                                ctx->circular_pad_y_enabled,
+                                ctx->circular_x_enabled,
+                                ctx->circular_y_enabled,
                                 scale);
     }
 };
diff --git a/qwen_image.hpp b/qwen_image.hpp
index 169e1e325..847f61171 100644
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@@ -361,7 +361,7 @@ namespace Qwen {
             int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
             int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
-            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             return x;
         }
@@ -565,8 +565,8 @@ namespace Qwen {
                                                            ref_latents,
                                                            increase_ref_index,
                                                            qwen_image_params.theta,
-                                                           rope_circular_y_enabled,
-                                                           rope_circular_x_enabled,
+                                                           circular_y_enabled,
+                                                           circular_x_enabled,
                                                            qwen_image_params.axes_dim);
         int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2;
         // LOG_DEBUG("pos_len %d", pos_len);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 6a51b6b48..d94134602 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -127,9 +127,9 @@ class StableDiffusionGGML {
     bool use_tiny_autoencoder            = false;
     sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0};
     bool offload_params_to_cpu           = false;
-    bool circular_pad                    = false;
-    bool circular_pad_x                  = false;
-    bool circular_pad_y                  = false;
+    bool circular                        = false;
+    bool circular_x                      = false;
+    bool circular_y                      = false;
     bool stacked_id                      = false;
     bool is_using_v_parameterization     = false;
@@ -213,10 +213,10 @@ class StableDiffusionGGML {
         taesd_path            = SAFE_STR(sd_ctx_params->taesd_path);
         use_tiny_autoencoder  = taesd_path.size() > 0;
         offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
-        circular_pad   = sd_ctx_params->circular_pad;
-        circular_pad_x = sd_ctx_params->circular_pad_x || circular_pad;
-        circular_pad_y = sd_ctx_params->circular_pad_y || circular_pad;
-        bool circular_pad_any = circular_pad || circular_pad_x || circular_pad_y;
+        circular   = sd_ctx_params->circular;
+        circular_x = sd_ctx_params->circular_x || circular;
+        circular_y = sd_ctx_params->circular_y || circular;
+        bool circular_any = circular || circular_x || circular_y;
         rng = get_rng(sd_ctx_params->rng_type);
         if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT && sd_ctx_params->sampler_rng_type != sd_ctx_params->rng_type) {
@@ -394,7 +394,7 @@ class StableDiffusionGGML {
             vae_decode_only = false;
         }
-        if (circular_pad_any) {
+        if (circular_any) {
             LOG_INFO("Using circular padding for convolutions");
         }
@@ -413,7 +413,7 @@ class StableDiffusionGGML {
             diffusion_model = std::make_shared(backend,
                                                offload_params_to_cpu,
                                                tensor_storage_map);
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
         } else if (sd_version_is_flux(version)) {
             bool is_chroma = false;
             for (auto pair : tensor_storage_map) {
@@ -454,8 +454,7 @@ class StableDiffusionGGML {
                                                              tensor_storage_map,
                                                              version,
                                                              sd_ctx_params->chroma_use_dit_mask);
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
-            diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
         } else if (sd_version_is_flux2(version)) {
             bool is_chroma = false;
             cond_stage_model = std::make_shared(clip_backend,
@@ -466,8 +466,7 @@ class StableDiffusionGGML {
                                                              tensor_storage_map,
                                                              version,
                                                              sd_ctx_params->chroma_use_dit_mask);
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
-            diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
         } else if (sd_version_is_wan(version)) {
             cond_stage_model = std::make_shared(clip_backend,
                                                 offload_params_to_cpu,
@@ -479,14 +478,14 @@ class StableDiffusionGGML {
                                                            tensor_storage_map,
                                                            "model.diffusion_model",
                                                            version);
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
             if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
                 high_noise_diffusion_model = std::make_shared(backend,
                                                               offload_params_to_cpu,
                                                               tensor_storage_map,
                                                               "model.high_noise_diffusion_model",
                                                               version);
-                high_noise_diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+                high_noise_diffusion_model->set_circular_axes(circular_x, circular_y);
             }
             if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
                 diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" ||
@@ -514,8 +513,7 @@ class StableDiffusionGGML {
                                                                  tensor_storage_map,
                                                                  "model.diffusion_model",
                                                                  version);
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
-            diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
         } else if (sd_version_is_z_image(version)) {
             cond_stage_model = std::make_shared(clip_backend,
                                                 offload_params_to_cpu,
@@ -526,7 +524,7 @@ class StableDiffusionGGML {
                                                                tensor_storage_map,
                                                                "model.diffusion_model",
                                                                version);
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
         } else {  // SD1.x SD2.x SDXL
             std::map embbeding_map;
             for (int i = 0; i < sd_ctx_params->embedding_count; i++) {
@@ -556,8 +552,8 @@ class StableDiffusionGGML {
                 LOG_INFO("Using Conv2d direct in the diffusion model");
                 std::dynamic_pointer_cast(diffusion_model)->unet.set_conv2d_direct_enabled(true);
             }
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
-            std::dynamic_pointer_cast(diffusion_model)->unet.set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
+            std::dynamic_pointer_cast(diffusion_model)->unet.set_circular_axes(circular_x, circular_y);
         }

         if (sd_ctx_params->diffusion_flash_attn) {
             LOG_INFO("Using flash attention in the diffusion model");
@@ -593,7 +590,7 @@ class StableDiffusionGGML {
                                                            "first_stage_model",
                                                            vae_decode_only,
                                                            version);
-        first_stage_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+        first_stage_model->set_circular_axes(circular_x, circular_y);
         first_stage_model->alloc_params_buffer();
         first_stage_model->get_param_tensors(tensors, "first_stage_model");
     } else if (version == VERSION_CHROMA_RADIANCE) {
@@ -620,7 +617,7 @@ class StableDiffusionGGML {
                                                          vae_conv_2d_scale);
                 first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
             }
-            first_stage_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            first_stage_model->set_circular_axes(circular_x, circular_y);
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         }
@@ -636,7 +632,7 @@ class StableDiffusionGGML {
                 LOG_INFO("Using Conv2d direct in the tae model");
                 tae_first_stage->set_conv2d_direct_enabled(true);
             }
-            tae_first_stage->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            tae_first_stage->set_circular_axes(circular_x, circular_y);
         }
         // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
@@ -656,7 +652,7 @@ class StableDiffusionGGML {
                 LOG_INFO("Using Conv2d direct in the control net");
                 control_net->set_conv2d_direct_enabled(true);
             }
-            control_net->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            control_net->set_circular_axes(circular_x, circular_y);
         }

         if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
@@ -2539,9 +2535,9 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
    sd_ctx_params->keep_control_net_on_cpu = false;
    sd_ctx_params->keep_vae_on_cpu         = false;
    sd_ctx_params->diffusion_flash_attn    = false;
-   sd_ctx_params->circular_pad            = false;
-   sd_ctx_params->circular_pad_x          = false;
-   sd_ctx_params->circular_pad_y          = false;
+   sd_ctx_params->circular                = false;
+   sd_ctx_params->circular_x              = false;
+   sd_ctx_params->circular_y              = false;
    sd_ctx_params->chroma_use_dit_mask     = true;
    sd_ctx_params->chroma_use_t5_mask      = false;
    sd_ctx_params->chroma_t5_mask_pad      = 1;
@@ -2582,9 +2578,9 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      "keep_control_net_on_cpu: %s\n"
                      "keep_vae_on_cpu: %s\n"
                      "diffusion_flash_attn: %s\n"
-                     "circular_pad: %s\n"
-                     "circular_pad_x: %s\n"
-                     "circular_pad_y: %s\n"
+                     "circular: %s\n"
+                     "circular_x: %s\n"
+                     "circular_y: %s\n"
                      "chroma_use_dit_mask: %s\n"
                      "chroma_use_t5_mask: %s\n"
                      "chroma_t5_mask_pad: %d\n",
@@ -2615,9 +2611,9 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
                      BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
                      BOOL_STR(sd_ctx_params->diffusion_flash_attn),
-                     BOOL_STR(sd_ctx_params->circular_pad),
-                     BOOL_STR(sd_ctx_params->circular_pad_x),
-                     BOOL_STR(sd_ctx_params->circular_pad_y),
+                     BOOL_STR(sd_ctx_params->circular),
+                     BOOL_STR(sd_ctx_params->circular_x),
+                     BOOL_STR(sd_ctx_params->circular_y),
                      BOOL_STR(sd_ctx_params->chroma_use_dit_mask),
                      BOOL_STR(sd_ctx_params->chroma_use_t5_mask),
                      sd_ctx_params->chroma_t5_mask_pad);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 3eb1324f5..4ef3799b0 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -189,9 +189,9 @@ typedef struct {
     bool tae_preview_only;
     bool diffusion_conv_direct;
     bool vae_conv_direct;
-    bool circular_pad;
-    bool circular_pad_x;
-    bool circular_pad_y;
+    bool circular;
+    bool circular_x;
+    bool circular_y;
     bool force_sdxl_vae_conv_scale;
     bool chroma_use_dit_mask;
     bool chroma_use_t5_mask;
diff --git a/wan.hpp b/wan.hpp
index 90091c70e..8e5984622 100644
--- a/wan.hpp
+++ b/wan.hpp
@@ -75,7 +75,7 @@ namespace WAN {
                 lp2 -= (int)cache_x->ne[2];
             }
-            x = sd_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+            x = sd_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels,
                                     std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
                                     0, 0, 0,
@@ -206,9 +206,9 @@ namespace WAN {
             } else if (mode == "upsample3d") {
                 x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST);
             } else if (mode == "downsample2d") {
-                x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+                x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             } else if (mode == "downsample3d") {
-                x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+                x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             }
             x = resample_1->forward(ctx, x);
             x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));  // (c, t, h, w)
@@ -1835,7 +1835,7 @@ namespace WAN {
             int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size);
             int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size);
             int pad_w = (std::get<2>(params.patch_size) - W % std::get<2>(params.patch_size)) % std::get<2>(params.patch_size);
-            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             return x;
         }
diff --git a/z_image.hpp b/z_image.hpp
index cb64d7b0e..5a53fe675 100644
--- a/z_image.hpp
+++ b/z_image.hpp
@@ -331,7 +331,7 @@ namespace ZImage {
             int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size;
             int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size;
-            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             return x;
         }
@@ -552,8 +552,8 @@ namespace ZImage {
                                                          ref_latents,
                                                          increase_ref_index,
                                                          z_image_params.theta,
-                                                         rope_circular_y_enabled,
-                                                         rope_circular_x_enabled,
+                                                         circular_y_enabled,
+                                                         circular_x_enabled,
                                                          z_image_params.axes_dim);
         int pos_len = pe_vec.size() / z_image_params.axes_dim_sum / 2;
         // LOG_DEBUG("pos_len %d", pos_len);
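Usage sketch for the padding helpers above (illustrative only, not part of the patches; it assumes ggml_extend.hpp is included, a live ggml_context* and a 4-D NCHW tensor, and pad_to_patch is a hypothetical helper written just for this example):

    // Pad H and W up to the next multiple of `patch`, wrapping on the
    // x-axis (width) only - the configuration that --circularx selects.
    static struct ggml_tensor* pad_to_patch(struct ggml_context* ctx,
                                            struct ggml_tensor* x,
                                            int patch) {
        int pad_w = (patch - (int)x->ne[0] % patch) % patch;  // ne[0] is width in ggml
        int pad_h = (patch - (int)x->ne[1] % patch) % patch;  // ne[1] is height
        // sd_pad pads at the end of each dimension, like ggml_pad; the two
        // trailing flags choose circular wrapping per axis.
        return sd_pad(ctx, x, pad_w, pad_h, 0, 0, /*circular_x=*/true, /*circular_y=*/false);
    }

Keeping the wrap decision inside sd_pad/sd_pad_ext is what lets every patchify and downsample call site collapse to a single line, as the diffs above show.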
From 665190f79bd3fe565e3b45e7e858abefb6aacb33 Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Sat, 13 Dec 2025 02:43:16 -0800
Subject: [PATCH 25/25] Removed redundant set_circular_enabled

---
 diffusion_model.hpp | 25 -------------------------
 ggml_extend.hpp     |  4 ----
 2 files changed, 29 deletions(-)

diff --git a/diffusion_model.hpp b/diffusion_model.hpp
index 0b32babf8..0724cc938 100644
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@@ -39,7 +39,6 @@ struct DiffusionModel {
     virtual void set_weight_adapter(const std::shared_ptr& adapter){};
     virtual int64_t get_adm_in_channels()                            = 0;
     virtual void set_flash_attn_enabled(bool enabled)                = 0;
-    virtual void set_circular_enabled(bool enabled)                  = 0;
     virtual void set_circular_axes(bool circular_x, bool circular_y) = 0;
 };
@@ -89,10 +88,6 @@ struct UNetModel : public DiffusionModel {
         unet.set_flash_attention_enabled(enabled);
     }

-    void set_circular_enabled(bool enabled) override {
-        unet.set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         unet.set_circular_axes(circular_x, circular_y);
     }
@@ -158,10 +153,6 @@ struct MMDiTModel : public DiffusionModel {
         mmdit.set_flash_attention_enabled(enabled);
     }

-    void set_circular_enabled(bool enabled) override {
-        mmdit.set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         mmdit.set_circular_axes(circular_x, circular_y);
     }
@@ -228,10 +219,6 @@ struct FluxModel : public DiffusionModel {
         flux.set_flash_attention_enabled(enabled);
     }

-    void set_circular_enabled(bool enabled) override {
-        flux.set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         flux.set_circular_axes(circular_x, circular_y);
     }
@@ -303,10 +290,6 @@ struct WanModel : public DiffusionModel {
         wan.set_flash_attention_enabled(enabled);
     }

-    void set_circular_enabled(bool enabled) override {
-        wan.set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         wan.set_circular_axes(circular_x, circular_y);
     }
@@ -377,10 +360,6 @@ struct QwenImageModel : public DiffusionModel {
         qwen_image.set_flash_attention_enabled(enabled);
     }

-    void set_circular_enabled(bool enabled) override {
-        qwen_image.set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         qwen_image.set_circular_axes(circular_x, circular_y);
     }
@@ -448,10 +427,6 @@ struct ZImageModel : public DiffusionModel {
         z_image.set_flash_attention_enabled(enabled);
     }

-    void set_circular_enabled(bool enabled) override {
-        z_image.set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         z_image.set_circular_axes(circular_x, circular_y);
     }
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 0d520fb10..663012d5b 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -2081,10 +2081,6 @@ struct GGMLRunner {
         conv2d_direct_enabled = enabled;
     }

-    void set_circular_enabled(bool enabled) {
-        set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) {
         circular_x_enabled = circular_x;
         circular_y_enabled = circular_y;
     }
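After PATCH 24/25, an embedder using the C API flips the renamed booleans directly. A minimal sketch, assuming the stable-diffusion.h layout above and the existing new_sd_ctx(const sd_ctx_params_t*) entry point; all other fields stay at their sd_ctx_params_init defaults:

    sd_ctx_params_t params;
    sd_ctx_params_init(&params);
    params.circular_x = true;   // wrap width only: horizontally tileable output
    // params.circular = true;  // would wrap both axes, like passing --circular
    sd_ctx_t* sd_ctx = new_sd_ctx(&params);

On the CLI side, --circular, --circularx, and --circulary map onto these same booleans (see the SDContextParams wiring in PATCH 24), and any of the three switches both the convolution padding and the RoPE wrap for the chosen axis.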