From 476df8e53ece2e4b8a4131110ed0487defe774d7 Mon Sep 17 00:00:00 2001 From: bepis Date: Thu, 23 Oct 2025 17:59:25 -0700 Subject: [PATCH 01/25] global bool --- common.hpp | 2 +- examples/cli/main.cpp | 5 ++++ flux.hpp | 12 ++++----- ggml | 2 +- ggml_extend.hpp | 57 +++++++++++++++++++++++++++++++++++++++---- mmdit.hpp | 4 +-- qwen_image.hpp | 4 +-- stable-diffusion.cpp | 9 +++++++ stable-diffusion.h | 1 + wan.hpp | 12 ++++----- 10 files changed, 85 insertions(+), 23 deletions(-) diff --git a/common.hpp b/common.hpp index d32167145..4a891bc8b 100644 --- a/common.hpp +++ b/common.hpp @@ -28,7 +28,7 @@ class DownSampleBlock : public GGMLBlock { if (vae_downsample) { auto conv = std::dynamic_pointer_cast(blocks["conv"]); - x = ggml_pad(ctx, x, 1, 1, 0, 0); + x = sd_pad(ctx, x, 1, 1, 0, 0); x = conv->forward(ctx, x); } else { auto conv = std::dynamic_pointer_cast(blocks["op"]); diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index ff36cea25..cd6310736 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -113,6 +113,7 @@ struct SDParams { bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; bool vae_conv_direct = false; + bool circular_pad = false; bool canny_preprocess = false; bool color = false; int upscale_repeats = 1; @@ -183,6 +184,7 @@ void print_params(SDParams params) { printf(" diffusion flash attention: %s\n", params.diffusion_flash_attn ? "true" : "false"); printf(" diffusion Conv2d direct: %s\n", params.diffusion_conv_direct ? "true" : "false"); printf(" vae_conv_direct: %s\n", params.vae_conv_direct ? "true" : "false"); + printf(" circular padding: %s\n", params.circular_pad ? "true" : "false"); printf(" control_strength: %.2f\n", params.control_strength); printf(" prompt: %s\n", params.prompt.c_str()); printf(" negative_prompt: %s\n", params.negative_prompt.c_str()); @@ -304,6 +306,7 @@ void print_usage(int argc, const char* argv[]) { printf(" This might crash if it is not supported by the backend.\n"); printf(" --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)\n"); printf(" This might crash if it is not supported by the backend.\n"); + printf(" --circular use circular padding for convolutions and pad ops\n"); printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color colors the logging tags according to level\n"); @@ -573,6 +576,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--diffusion-fa", "", true, ¶ms.diffusion_flash_attn}, {"", "--diffusion-conv-direct", "", true, ¶ms.diffusion_conv_direct}, {"", "--vae-conv-direct", "", true, ¶ms.vae_conv_direct}, + {"", "--circular", "", true, ¶ms.circular_pad}, {"", "--canny", "", true, ¶ms.canny_preprocess}, {"-v", "--verbose", "", true, ¶ms.verbose}, {"", "--color", "", true, ¶ms.color}, @@ -1386,6 +1390,7 @@ int main(int argc, const char* argv[]) { params.diffusion_flash_attn, params.diffusion_conv_direct, params.vae_conv_direct, + params.circular_pad, params.force_sdxl_vae_conv_scale, params.chroma_use_dit_mask, params.chroma_use_t5_mask, diff --git a/flux.hpp b/flux.hpp index 2ed410419..c03b3ce2a 100644 --- a/flux.hpp +++ b/flux.hpp @@ -696,7 +696,7 @@ namespace Flux { vec = approx->forward(ctx, vec); // [344, N, hidden_size] if (y != NULL) { - txt_img_mask = ggml_pad(ctx, y, img->ne[1], 0, 0, 0); + txt_img_mask = sd_pad(ctx, y, img->ne[1], 0, 0, 0); } } else { auto time_in = std::dynamic_pointer_cast(blocks["time_in"]); @@ -759,7 +759,7 
@@ namespace Flux { int64_t patch_size = 2; int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + x = sd_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size) auto img = patchify(ctx, x, patch_size); // [N, h*w, C * patch_size * patch_size] @@ -815,9 +815,9 @@ namespace Flux { ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); ggml_tensor* control = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1)); - masked = ggml_pad(ctx, masked, pad_w, pad_h, 0, 0); - mask = ggml_pad(ctx, mask, pad_w, pad_h, 0, 0); - control = ggml_pad(ctx, control, pad_w, pad_h, 0, 0); + masked = sd_pad(ctx, masked, pad_w, pad_h, 0, 0); + mask = sd_pad(ctx, mask, pad_w, pad_h, 0, 0); + control = sd_pad(ctx, control, pad_w, pad_h, 0, 0); masked = patchify(ctx, masked, patch_size); mask = patchify(ctx, mask, patch_size); @@ -827,7 +827,7 @@ namespace Flux { } else if (params.version == VERSION_FLUX_CONTROLS) { GGML_ASSERT(c_concat != NULL); - ggml_tensor* control = ggml_pad(ctx, c_concat, pad_w, pad_h, 0, 0); + ggml_tensor* control = sd_pad(ctx, c_concat, pad_w, pad_h, 0, 0); control = patchify(ctx, control, patch_size); img = ggml_concat(ctx, img, control, 0); } diff --git a/ggml b/ggml index 7bffd79a4..25d358c62 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 7bffd79a4bec72e9a3bfbedb582a218b84401c13 +Subproject commit 25d358c627186901b6506ee70faed598613eff05 diff --git a/ggml_extend.hpp b/ggml_extend.hpp index d8df0d8f6..7a253404b 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -60,6 +60,39 @@ #define SD_UNUSED(x) (void)(x) #endif +inline bool& sd_global_circular_padding_enabled() { + static bool enabled = false; + return enabled; +} + +__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx, + struct ggml_tensor* a, + int p0, + int p1, + int p2, + int p3) { + if (sd_global_circular_padding_enabled()) { + return ggml_pad_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); + } + return ggml_pad(ctx, a, p0, p1, p2, p3); +} + +__STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx, + struct ggml_tensor* a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3) { + if (sd_global_circular_padding_enabled()) { + return ggml_pad_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + } + return ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); +} + __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) { switch (level) { case GGML_LOG_LEVEL_DEBUG: @@ -986,10 +1019,24 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx, if (scale != 1.f) { x = ggml_scale(ctx, x, scale); } + const bool use_circular = sd_global_circular_padding_enabled() && (p0 != 0 || p1 != 0); + const bool is_depthwise = (w->ne[2] == 1 && x->ne[2] == w->ne[3]); if (direct) { - x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); + if (use_circular) { + if (is_depthwise) { + x = ggml_conv_2d_dw_direct_circular(ctx, w, x, s0, s1, p0, p1, d0, d1); + } else { + x = ggml_conv_2d_direct_circular(ctx, w, x, s0, s1, p0, p1, d0, d1); + } + } else { + x = 
ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); + } } else { - x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1); + if (use_circular) { + x = ggml_conv_2d_circular(ctx, w, x, s0, s1, p0, p1, d0, d1); + } else { + x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1); + } } if (scale != 1.f) { x = ggml_scale(ctx, x, 1.f / scale); @@ -1190,7 +1237,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* auto build_kqv = [&](ggml_tensor* q_in, ggml_tensor* k_in, ggml_tensor* v_in, ggml_tensor* mask_in) -> ggml_tensor* { if (kv_pad != 0) { - k_in = ggml_pad(ctx, k_in, 0, kv_pad, 0, 0); + k_in = sd_pad(ctx, k_in, 0, kv_pad, 0, 0); } if (kv_scale != 1.0f) { k_in = ggml_scale(ctx, k_in, kv_scale); @@ -1200,7 +1247,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* v_in = ggml_nn_cont(ctx, ggml_permute(ctx, v_in, 0, 2, 1, 3)); v_in = ggml_reshape_3d(ctx, v_in, d_head, L_k, n_kv_head * N); if (kv_pad != 0) { - v_in = ggml_pad(ctx, v_in, 0, kv_pad, 0, 0); + v_in = sd_pad(ctx, v_in, 0, kv_pad, 0, 0); } if (kv_scale != 1.0f) { v_in = ggml_scale(ctx, v_in, kv_scale); @@ -1223,7 +1270,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context* mask_pad = GGML_PAD(L_q, GGML_KQ_MASK_PAD) - mask_in->ne[1]; } if (mask_pad > 0) { - mask_in = ggml_pad(ctx, mask_in, 0, mask_pad, 0, 0); + mask_in = sd_pad(ctx, mask_in, 0, mask_pad, 0, 0); } mask_in = ggml_cast(ctx, mask_in, GGML_TYPE_F16); } diff --git a/mmdit.hpp b/mmdit.hpp index d9d19340c..1b3f2276f 100644 --- a/mmdit.hpp +++ b/mmdit.hpp @@ -80,7 +80,7 @@ struct PatchEmbed : public GGMLBlock { int64_t H = x->ne[1]; int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // TODO: reflect pad mode + x = sd_pad(ctx, x, pad_w, pad_h, 0, 0); // TODO: reflect pad mode } x = proj->forward(ctx, x); @@ -997,4 +997,4 @@ struct MMDiTRunner : public GGMLRunner { } }; -#endif \ No newline at end of file +#endif diff --git a/qwen_image.hpp b/qwen_image.hpp index ce4e62dce..cc336ff28 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -363,7 +363,7 @@ namespace Qwen { int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + x = sd_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] return x; } @@ -691,4 +691,4 @@ namespace Qwen { } // namespace name -#endif // __QWEN_IMAGE_HPP__ \ No newline at end of file +#endif // __QWEN_IMAGE_HPP__ diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 87b6a3779..e0c19f3a1 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -114,6 +114,7 @@ class StableDiffusionGGML { bool use_tiny_autoencoder = false; sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0}; bool offload_params_to_cpu = false; + bool circular_pad = false; bool stacked_id = false; bool is_using_v_parameterization = false; @@ -187,6 +188,11 @@ class StableDiffusionGGML { taesd_path = SAFE_STR(sd_ctx_params->taesd_path); use_tiny_autoencoder = taesd_path.size() > 0; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; + circular_pad = sd_ctx_params->circular_pad; + sd_global_circular_padding_enabled() = circular_pad; + if (circular_pad) { + LOG_INFO("Using circular padding for convolutions"); + } if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) { rng = 
std::make_shared<STDDefaultRNG>();
@@ -1820,6 +1826,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->keep_control_net_on_cpu = false;
     sd_ctx_params->keep_vae_on_cpu = false;
     sd_ctx_params->diffusion_flash_attn = false;
+    sd_ctx_params->circular_pad = false;
     sd_ctx_params->chroma_use_dit_mask = true;
     sd_ctx_params->chroma_use_t5_mask = false;
     sd_ctx_params->chroma_t5_mask_pad = 1;
@@ -1860,6 +1867,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "keep_control_net_on_cpu: %s\n"
              "keep_vae_on_cpu: %s\n"
              "diffusion_flash_attn: %s\n"
+             "circular_pad: %s\n"
              "chroma_use_dit_mask: %s\n"
              "chroma_use_t5_mask: %s\n"
              "chroma_t5_mask_pad: %d\n",
@@ -1889,6 +1897,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
              BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
              BOOL_STR(sd_ctx_params->diffusion_flash_attn),
+             BOOL_STR(sd_ctx_params->circular_pad),
              BOOL_STR(sd_ctx_params->chroma_use_dit_mask),
              BOOL_STR(sd_ctx_params->chroma_use_t5_mask),
              sd_ctx_params->chroma_t5_mask_pad);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index a891a58f1..1512c7192 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -164,6 +164,7 @@ typedef struct {
     bool diffusion_flash_attn;
     bool diffusion_conv_direct;
     bool vae_conv_direct;
+    bool circular_pad;
     bool force_sdxl_vae_conv_scale;
     bool chroma_use_dit_mask;
     bool chroma_use_t5_mask;
diff --git a/wan.hpp b/wan.hpp
index 31fa90b3a..8d2e29641 100644
--- a/wan.hpp
+++ b/wan.hpp
@@ -73,7 +73,7 @@ namespace WAN {
                 lp2 -= (int)cache_x->ne[2];
             }
-            x = ggml_pad_ext(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0);
+            x = sd_pad_ext(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0);
             return ggml_nn_conv_3d(ctx, x, w, b, in_channels,
                                    std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
                                    0, 0, 0,
@@ -172,7 +172,7 @@ namespace WAN {
                                        2);
                 }
                 if (chunk_idx == 1 && cache_x->ne[2] < 2) {  // Rep
-                    cache_x = ggml_pad_ext(ctx, cache_x, 0, 0, 0, 0, (int)cache_x->ne[2], 0, 0, 0);
+                    cache_x = sd_pad_ext(ctx, cache_x, 0, 0, 0, 0, (int)cache_x->ne[2], 0, 0, 0);
                     // aka cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device),cache_x],dim=2)
                 }
                 if (chunk_idx == 1) {
@@ -198,9 +198,9 @@ namespace WAN {
             } else if (mode == "upsample3d") {
                 x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST);
             } else if (mode == "downsample2d") {
-                x = ggml_pad(ctx, x, 1, 1, 0, 0);
+                x = sd_pad(ctx, x, 1, 1, 0, 0);
             } else if (mode == "downsample3d") {
-                x = ggml_pad(ctx, x, 1, 1, 0, 0);
+                x = sd_pad(ctx, x, 1, 1, 0, 0);
             }
             x = resample_1->forward(ctx, x);
             x = ggml_nn_cont(ctx, ggml_torch_permute(ctx, x, 0, 1, 3, 2));  // (c, t, h, w)
@@ -260,7 +260,7 @@ namespace WAN {
             int64_t pad_t = (factor_t - T % factor_t) % factor_t;
-            x = ggml_pad_ext(ctx, x, 0, 0, 0, 0, pad_t, 0, 0, 0);
+            x = sd_pad_ext(ctx, x, 0, 0, 0, 0, pad_t, 0, 0, 0);
             T = x->ne[2];
             x = ggml_reshape_4d(ctx, x, W * H, factor_t, T / factor_t, C);  // [C, T/factor_t, factor_t, H*W]
@@ -1838,7 +1838,7 @@ namespace WAN {
             int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size);
             int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size);
             int pad_w = (std::get<2>(params.patch_size) - W % std::get<2>(params.patch_size)) % std::get<2>(params.patch_size);
-            x = ggml_pad(ctx, x, pad_w, pad_h, pad_t, 0);  // [N*C, T + pad_t, H + pad_h, W + pad_w]
+            x = sd_pad(ctx, x, pad_w, pad_h, pad_t, 0);  // [N*C, T + pad_t, H + pad_h, W + pad_w]
             return x;
         }
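A note on what patch 01 wires up: "circular" padding means the padded border wraps around to the opposite edge of the tensor (padding on a torus), which is what lets the generated image tile seamlessly. The sketch below shows only the indexing rule; it is illustrative, not the ggml kernel (ggml_pad_circular comes from the bumped ggml submodule), and pad_circular_1d is a name made up for this aside.

#include <cstdio>
#include <vector>

// Wrap-around 1D padding: an index i past either edge reads src[((i % n) + n) % n].
std::vector<float> pad_circular_1d(const std::vector<float>& src, int pad) {
    int n = (int)src.size();
    std::vector<float> out(n + 2 * pad);
    for (int i = -pad; i < n + pad; ++i) {
        out[i + pad] = src[((i % n) + n) % n];
    }
    return out;
}

int main() {
    // {0,1,2,3} with pad=2 -> 2 3 0 1 2 3 0 1: each border continues from the opposite edge.
    for (float v : pad_circular_1d({0, 1, 2, 3}, 2)) printf("%g ", v);
    printf("\n");
    return 0;
}

With zero padding a convolution sees a hard border at the image edge; with the wrap-around rule the left edge sees the right edge's pixels, so the convolution output itself tiles. That is also why sd_pad exists as a wrapper: every ggml_pad call site in the model graphs has to switch behavior together.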
From 6d85b94039d8c6bfa13fa2f815bb4a32824a6de6 Mon Sep 17 00:00:00 2001
From: bepis
Date: Thu, 23 Oct 2025 18:01:28 -0700
Subject: [PATCH 02/25] reworked circular to global flag

---
 ggml_extend.hpp      | 19 ++++++++++++++-----
 stable-diffusion.cpp |  2 +-
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 7a253404b..7c43b3e19 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include <atomic>
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -60,18 +61,26 @@
 #define SD_UNUSED(x) (void)(x)
 #endif

-inline bool& sd_global_circular_padding_enabled() {
-    static bool enabled = false;
+inline std::atomic<bool>& sd_circular_padding_flag() {
+    static std::atomic<bool> enabled{false};
     return enabled;
 }

+inline void sd_set_circular_padding_enabled(bool enabled) {
+    sd_circular_padding_flag().store(enabled, std::memory_order_relaxed);
+}
+
+inline bool sd_is_circular_padding_enabled() {
+    return sd_circular_padding_flag().load(std::memory_order_relaxed);
+}
+
 __STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx,
                                              struct ggml_tensor* a,
                                              int p0,
                                              int p1,
                                              int p2,
                                              int p3) {
-    if (sd_global_circular_padding_enabled()) {
+    if (sd_is_circular_padding_enabled()) {
         return ggml_pad_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3);
     }
     return ggml_pad(ctx, a, p0, p1, p2, p3);
@@ -87,7 +96,7 @@ __STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx,
                                                  int rp2,
                                                  int lp3,
                                                  int rp3) {
-    if (sd_global_circular_padding_enabled()) {
+    if (sd_is_circular_padding_enabled()) {
         return ggml_pad_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
     }
     return ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
@@ -1019,7 +1028,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
     if (scale != 1.f) {
         x = ggml_scale(ctx, x, scale);
     }
-    const bool use_circular = sd_global_circular_padding_enabled() && (p0 != 0 || p1 != 0);
+    const bool use_circular = sd_is_circular_padding_enabled() && (p0 != 0 || p1 != 0);
     const bool is_depthwise = (w->ne[2] == 1 && x->ne[2] == w->ne[3]);
     if (direct) {
         if (use_circular) {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index e0c19f3a1..adc007be9 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -189,7 +189,7 @@ class StableDiffusionGGML {
         use_tiny_autoencoder = taesd_path.size() > 0;
         offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
         circular_pad = sd_ctx_params->circular_pad;
-        sd_global_circular_padding_enabled() = circular_pad;
+        sd_set_circular_padding_enabled(circular_pad);
         if (circular_pad) {
             LOG_INFO("Using circular padding for convolutions");
         }

From 009271189fb7e77f566012518655d5981f0b152e Mon Sep 17 00:00:00 2001
From: bepis
Date: Thu, 23 Oct 2025 18:22:26 -0700
Subject: [PATCH 03/25] cleaner implementation of tiling support in sd cpp

---
 rope.hpp | 42 +++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/rope.hpp b/rope.hpp
index 295c9a217..4b68e2ac1 100644
--- a/rope.hpp
+++ b/rope.hpp
@@ -50,24 +50,36 @@ namespace Rope {
             omega[i] = 1.0 / std::pow(theta, scale[i]);
         }

-        int pos_size = pos.size();
-        std::vector<std::vector<float>> out(pos_size, std::vector<float>(half_dim));
-        for (int i = 0; i < pos_size; ++i) {
+        for (size_t i = 0; i < pos.size(); ++i) {
+            float position = pos[i];
             for (int j = 0; j < half_dim;
++j) { - result[i][4 * j] = std::cos(out[i][j]); - result[i][4 * j + 1] = -std::sin(out[i][j]); - result[i][4 * j + 2] = std::sin(out[i][j]); - result[i][4 * j + 3] = std::cos(out[i][j]); + float omega_val = omega[j]; + float original_angle = position * omega_val; + float angle = original_angle; + if (sd_is_circular_padding_enabled()) { + constexpr float TWO_PI = 6.28318530717958647692f; + float wrap_f = static_cast(wrap); + float cycles = omega_val * wrap_f / TWO_PI; + float rounded = std::round(cycles); // closest periodic harmonic + float periodic_omega = TWO_PI * rounded / wrap_f; + float periodic_angle = position * periodic_omega; + float rel_pos = std::fmod(position, wrap_f); + if (rel_pos < 0.0f) { + rel_pos += wrap_f; + } + float t = wrap_f > 0.0f ? rel_pos / wrap_f : 0.0f; + float window = 0.5f - 0.5f * std::cos(TWO_PI * t); // 0 at edges, 1 in the middle + window = std::clamp(window, 0.0f, 1.0f); + angle = periodic_angle + window * (original_angle - periodic_angle); + } + float sin_val = std::sin(angle); + float cos_val = std::cos(angle); + result[i][4 * j] = cos_val; + result[i][4 * j + 1] = -sin_val; + result[i][4 * j + 2] = sin_val; + result[i][4 * j + 3] = cos_val; } } - return result; } From ee0e82a40bdade750b2454013058cf4aff04fc73 Mon Sep 17 00:00:00 2001 From: bepis Date: Fri, 24 Oct 2025 13:52:58 -0700 Subject: [PATCH 04/25] cleaned rope --- ggml | 2 +- ggml_extend.hpp | 84 +++++++++++++++++++++++++------------------------ rope.hpp | 63 +++++++++++++++++++++++++++++++++---- 3 files changed, 101 insertions(+), 48 deletions(-) diff --git a/ggml b/ggml index 25d358c62..6eb26b3c7 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 25d358c627186901b6506ee70faed598613eff05 +Subproject commit 6eb26b3c74ed06f600e61f48d62dc39f9c1166c0 diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 7c43b3e19..638beec0d 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -61,47 +61,6 @@ #define SD_UNUSED(x) (void)(x) #endif -inline std::atomic& sd_circular_padding_flag() { - static std::atomic enabled{false}; - return enabled; -} - -inline void sd_set_circular_padding_enabled(bool enabled) { - sd_circular_padding_flag().store(enabled, std::memory_order_relaxed); -} - -inline bool sd_is_circular_padding_enabled() { - return sd_circular_padding_flag().load(std::memory_order_relaxed); -} - -__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx, - struct ggml_tensor* a, - int p0, - int p1, - int p2, - int p3) { - if (sd_is_circular_padding_enabled()) { - return ggml_pad_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); - } - return ggml_pad(ctx, a, p0, p1, p2, p3); -} - -__STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx, - struct ggml_tensor* a, - int lp0, - int rp0, - int lp1, - int rp1, - int lp2, - int rp2, - int lp3, - int rp3) { - if (sd_is_circular_padding_enabled()) { - return ggml_pad_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); - } - return ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); -} - __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) { switch (level) { case GGML_LOG_LEVEL_DEBUG: @@ -628,6 +587,49 @@ __STATIC_INLINE__ void ggml_tensor_clamp(struct ggml_tensor* src, float min, flo } } + + +inline std::atomic& sd_circular_padding_flag() { + static std::atomic enabled{false}; + return enabled; +} + +inline void sd_set_circular_padding_enabled(bool enabled) { + sd_circular_padding_flag().store(enabled, std::memory_order_relaxed); +} + +inline bool 
sd_is_circular_padding_enabled() { + return sd_circular_padding_flag().load(std::memory_order_relaxed); +} + +__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx, + struct ggml_tensor* a, + int p0, + int p1, + int p2, + int p3) { + if (sd_is_circular_padding_enabled()) { + return ggml_pad_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); + } + return ggml_pad(ctx, a, p0, p1, p2, p3); +} + +__STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx, + struct ggml_tensor* a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3) { + if (sd_is_circular_padding_enabled()) { + return ggml_pad_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + } + return ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); +} + __STATIC_INLINE__ struct ggml_tensor* ggml_tensor_concat(struct ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b, diff --git a/rope.hpp b/rope.hpp index 4b68e2ac1..8280de528 100644 --- a/rope.hpp +++ b/rope.hpp @@ -1,6 +1,8 @@ #ifndef __ROPE_HPP__ #define __ROPE_HPP__ +#include +#include #include #include "ggml_extend.hpp" @@ -39,15 +41,20 @@ namespace Rope { return flat_vec; } - __STATIC_INLINE__ std::vector> rope(const std::vector& pos, int dim, int theta) { + __STATIC_INLINE__ std::vector> rope(const std::vector& pos, + int dim, + int theta, + const std::vector* wraps = nullptr) { assert(dim % 2 == 0); int half_dim = dim / 2; + std::vector> result(pos.size(), std::vector(half_dim * 4)); + std::vector scale = linspace(0.f, (dim * 1.f - 2) / dim, half_dim); std::vector omega(half_dim); for (int i = 0; i < half_dim; ++i) { - omega[i] = 1.0 / std::pow(theta, scale[i]); + omega[i] = 1.0f / std::pow(theta, scale[i]); } for (size_t i = 0; i < pos.size(); ++i) { @@ -56,7 +63,13 @@ namespace Rope { float omega_val = omega[j]; float original_angle = position * omega_val; float angle = original_angle; - if (sd_is_circular_padding_enabled()) { + int wrap = 0; + if (wraps != nullptr && !wraps->empty()) { + size_t wrap_size = wraps->size(); + size_t wrap_idx = wrap_size > 0 ? 
(i % wrap_size) : 0; + wrap = (*wraps)[wrap_idx]; + } + if (wrap > 0) { constexpr float TWO_PI = 6.28318530717958647692f; float wrap_f = static_cast(wrap); float cycles = omega_val * wrap_f / TWO_PI; @@ -80,6 +93,7 @@ namespace Rope { result[i][4 * j + 3] = cos_val; } } + return result; } @@ -134,7 +148,8 @@ namespace Rope { __STATIC_INLINE__ std::vector embed_nd(const std::vector>& ids, int bs, int theta, - const std::vector& axes_dim) { + const std::vector& axes_dim, + const std::vector>* axes_wraps = nullptr) { std::vector> trans_ids = transpose(ids); size_t pos_len = ids.size() / bs; int num_axes = axes_dim.size(); @@ -149,7 +164,12 @@ namespace Rope { std::vector> emb(bs * pos_len, std::vector(emb_dim * 2 * 2, 0.0)); int offset = 0; for (int i = 0; i < num_axes; ++i) { - std::vector> rope_emb = rope(trans_ids[i], axes_dim[i], theta); // [bs*pos_len, axes_dim[i]/2 * 2 * 2] + const std::vector* axis_wrap = nullptr; + if (axes_wraps != nullptr && i < (int)axes_wraps->size()) { + axis_wrap = &(*axes_wraps)[i]; + } + std::vector> rope_emb = + rope(trans_ids[i], axes_dim[i], theta, axis_wrap); // [bs*pos_len, axes_dim[i]/2 * 2 * 2] for (int b = 0; b < bs; ++b) { for (int j = 0; j < pos_len; ++j) { for (int k = 0; k < rope_emb[0].size(); ++k) { @@ -264,7 +284,38 @@ namespace Rope { int theta, const std::vector& axes_dim) { std::vector> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index); - return embed_nd(ids, bs, theta, axes_dim); + std::vector> axes_wraps; + if (sd_is_circular_padding_enabled() && bs > 0 && axes_dim.size() >= 3) { + int pad_h = (patch_size - (h % patch_size)) % patch_size; + int pad_w = (patch_size - (w % patch_size)) % patch_size; + int h_len = (h + pad_h) / patch_size; + int w_len = (w + pad_w) / patch_size; + if (h_len > 0 && w_len > 0) { + const size_t total_tokens = ids.size(); + // Track per-token wrap lengths for the row/column axes so only spatial tokens become periodic. + axes_wraps.assign(axes_dim.size(), std::vector(total_tokens / bs, 0)); + size_t cursor = 0; + for (ggml_tensor* ref : ref_latents) { + if (ref == nullptr) { + continue; + } + int ref_h = static_cast(ref->ne[1]); + int ref_w = static_cast(ref->ne[0]); + int ref_pad_h = (patch_size - (ref_h % patch_size)) % patch_size; + int ref_pad_w = (patch_size - (ref_w % patch_size)) % patch_size; + int ref_h_len = (ref_h + ref_pad_h) / patch_size; + int ref_w_len = (ref_w + ref_pad_w) / patch_size; + size_t ref_n_tokens = static_cast(ref_h_len) * static_cast(ref_w_len); + for (size_t token_i = 0; token_i < ref_n_tokens; ++token_i) { + axes_wraps[1][cursor + token_i] = ref_h_len; + axes_wraps[2][cursor + token_i] = ref_w_len; + } + cursor += ref_n_tokens; + } + } + } + const std::vector>* wraps_ptr = axes_wraps.empty() ? 
nullptr : &axes_wraps; + return embed_nd(ids, bs, theta, axes_dim, wraps_ptr); } __STATIC_INLINE__ std::vector> gen_vid_ids(int t, From cbb261dfdb6dedaa4539ef082749037c05f0cb34 Mon Sep 17 00:00:00 2001 From: bepis Date: Fri, 24 Oct 2025 14:46:28 -0700 Subject: [PATCH 05/25] working simplified but still need wraps --- ggml_extend.hpp | 7 +++++-- rope.hpp | 31 +++++++++++++++++-------------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 638beec0d..9699b12cd 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -611,7 +611,9 @@ __STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx, if (sd_is_circular_padding_enabled()) { return ggml_pad_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); } - return ggml_pad(ctx, a, p0, p1, p2, p3); + else { + return ggml_pad(ctx, a, p0, p1, p2, p3); + } } __STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx, @@ -1030,7 +1032,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx, if (scale != 1.f) { x = ggml_scale(ctx, x, scale); } - const bool use_circular = sd_is_circular_padding_enabled() && (p0 != 0 || p1 != 0); + const bool use_circular = sd_is_circular_padding_enabled(); + LOG_DEBUG("use circular conv %d", use_circular ? 1 : 0); const bool is_depthwise = (w->ne[2] == 1 && x->ne[2] == w->ne[3]); if (direct) { if (use_circular) { diff --git a/rope.hpp b/rope.hpp index 8280de528..34d119ef9 100644 --- a/rope.hpp +++ b/rope.hpp @@ -63,9 +63,10 @@ namespace Rope { float omega_val = omega[j]; float original_angle = position * omega_val; float angle = original_angle; - int wrap = 0; + float wrap = 0; if (wraps != nullptr && !wraps->empty()) { size_t wrap_size = wraps->size(); + // mod batch size since we only store this for one item in the batch size_t wrap_idx = wrap_size > 0 ? (i % wrap_size) : 0; wrap = (*wraps)[wrap_idx]; } @@ -73,17 +74,11 @@ namespace Rope { constexpr float TWO_PI = 6.28318530717958647692f; float wrap_f = static_cast(wrap); float cycles = omega_val * wrap_f / TWO_PI; - float rounded = std::round(cycles); // closest periodic harmonic - float periodic_omega = TWO_PI * rounded / wrap_f; - float periodic_angle = position * periodic_omega; - float rel_pos = std::fmod(position, wrap_f); - if (rel_pos < 0.0f) { - rel_pos += wrap_f; - } - float t = wrap_f > 0.0f ? rel_pos / wrap_f : 0.0f; - float window = 0.5f - 0.5f * std::cos(TWO_PI * t); // 0 at edges, 1 in the middle - window = std::clamp(window, 0.0f, 1.0f); - angle = periodic_angle + window * (original_angle - periodic_angle); + // closest periodic harmonic, necessary to ensure things neatly tile + // without this round, things don't tile at the boundaries and you end up + // with the model knowing what is "center" + float rounded = std::round(cycles); + angle = position * TWO_PI * rounded / wrap_f; } float sin_val = std::sin(angle); float cos_val = std::cos(angle); @@ -282,7 +277,9 @@ namespace Rope { const std::vector& ref_latents, bool increase_ref_index, int theta, - const std::vector& axes_dim) { + const std::vector& axes_dim, + bool circular = false) { + circular = true; std::vector> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index); std::vector> axes_wraps; if (sd_is_circular_padding_enabled() && bs > 0 && axes_dim.size() >= 3) { @@ -294,7 +291,13 @@ namespace Rope { const size_t total_tokens = ids.size(); // Track per-token wrap lengths for the row/column axes so only spatial tokens become periodic. 
                axes_wraps.assign(axes_dim.size(), std::vector<int>(total_tokens / bs, 0));
-               size_t cursor = 0;
+               size_t cursor = context_len;  // ignore text tokens
+               const size_t img_tokens = static_cast<size_t>(h_len) * static_cast<size_t>(w_len);
+               for (size_t token_i = 0; token_i < img_tokens; ++token_i) {
+                   axes_wraps[1][cursor + token_i] = h_len;
+                   axes_wraps[2][cursor + token_i] = w_len;
+               }
+               cursor += img_tokens;
                for (ggml_tensor* ref : ref_latents) {
                    if (ref == nullptr) {
                        continue;
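Why the std::round in patch 05 makes RoPE tileable: RoPE rotates each feature pair by position * omega, and that signal repeats with period wrap_dim (the image width or height in tokens) only if omega completes a whole number of cycles over wrap_dim, i.e. only if omega * wrap_dim / (2*pi) is an integer. Snapping omega to the nearest such frequency is exactly what the rounding does. A self-contained check of that identity (an illustrative sketch with made-up numbers, not the rope.hpp code):

#include <cmath>
#include <cstdio>

int main() {
    const float TWO_PI = 6.28318530717958647692f;
    float omega  = 0.37f;  // an arbitrary RoPE frequency
    int wrap_dim = 16;     // wrap length in tokens
    // snap omega so it completes an integer number of cycles over wrap_dim
    float snapped = TWO_PI * std::round(omega * wrap_dim / TWO_PI) / wrap_dim;
    for (int p = 0; p < 3; ++p) {
        // position p and position p + wrap_dim now yield the same rotation,
        // because the two angles differ by an exact multiple of 2*pi
        printf("p=%d: cos=%.6f wrapped cos=%.6f\n",
               p, std::cos(p * snapped), std::cos((p + wrap_dim) * snapped));
    }
    return 0;
}

Without the snap, the phase at position wrap_dim differs from position 0, the seam is detectable, and the model effectively knows where "center" is, which is what the commit comment is getting at. The next patch is a mostly mechanical rename of this machinery:

From 8d7f6793bcb478647e03ce626bafdf459042e1c9 Mon Sep 17 00:00:00 2001
From: bepis
Date: Fri, 24 Oct 2025 15:24:59 -0700
Subject: [PATCH 06/25] Further clean of rope

---
 rope.hpp | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/rope.hpp b/rope.hpp
index 34d119ef9..82084403e 100644
--- a/rope.hpp
+++ b/rope.hpp
@@ -44,7 +44,7 @@ namespace Rope {
     __STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos,
                                                            int dim,
                                                            int theta,
-                                                           const std::vector<int>* wraps = nullptr) {
+                                                           const std::vector<int>* wrap_dims = nullptr) {
         assert(dim % 2 == 0);
         int half_dim = dim / 2;
@@ -63,16 +63,16 @@ namespace Rope {
                 float omega_val = omega[j];
                 float original_angle = position * omega_val;
                 float angle = original_angle;
-                float wrap = 0;
-                if (wraps != nullptr && !wraps->empty()) {
-                    size_t wrap_size = wraps->size();
+                int wrap_dim = 0;
+                if (wrap_dims != nullptr && !wrap_dims->empty()) {
+                    size_t wrap_size = wrap_dims->size();
                     // mod batch size since we only store this for one item in the batch
                     size_t wrap_idx = wrap_size > 0 ? (i % wrap_size) : 0;
-                    wrap = (*wraps)[wrap_idx];
+                    wrap_dim = (*wrap_dims)[wrap_idx];
                 }
-                if (wrap > 0) {
+                if (wrap_dim > 0) {
                     constexpr float TWO_PI = 6.28318530717958647692f;
-                    float wrap_f = static_cast<float>(wrap);
+                    float wrap_f = static_cast<float>(wrap_dim);
                     float cycles = omega_val * wrap_f / TWO_PI;
                     // closest periodic harmonic, necessary to ensure things neatly tile
                     // without this round, things don't tile at the boundaries and you end up
                     // with the model knowing what is "center"
                     float rounded = std::round(cycles);
                     angle = position * TWO_PI * rounded / wrap_f;
                 }
@@ -144,7 +144,7 @@ namespace Rope {
                                                    int bs,
                                                    int theta,
                                                    const std::vector<int>& axes_dim,
-                                                   const std::vector<std::vector<int>>* axes_wraps = nullptr) {
+                                                   const std::vector<std::vector<int>>* wrap_dims = nullptr) {
         std::vector<std::vector<float>> trans_ids = transpose(ids);
         size_t pos_len = ids.size() / bs;
         int num_axes = axes_dim.size();
@@ -159,12 +159,12 @@ namespace Rope {
         std::vector<std::vector<float>> emb(bs * pos_len, std::vector<float>(emb_dim * 2 * 2, 0.0));
         int offset = 0;
         for (int i = 0; i < num_axes; ++i) {
-            const std::vector<int>* axis_wrap = nullptr;
-            if (axes_wraps != nullptr && i < (int)axes_wraps->size()) {
-                axis_wrap = &(*axes_wraps)[i];
+            const std::vector<int>* axis_wrap_dims = nullptr;
+            if (wrap_dims != nullptr && i < (int)wrap_dims->size()) {
+                axis_wrap_dims = &(*wrap_dims)[i];
             }
             std::vector<std::vector<float>> rope_emb =
-                rope(trans_ids[i], axes_dim[i], theta, axis_wrap);  // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
+                rope(trans_ids[i], axes_dim[i], theta, axis_wrap_dims);  // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
             for (int b = 0; b < bs; ++b) {
                 for (int j = 0; j < pos_len; ++j) {
                     for (int k = 0; k < rope_emb[0].size(); ++k) {
@@ -277,11 +277,10 @@ namespace Rope {
                                                 const std::vector<ggml_tensor*>& ref_latents,
                                                 bool increase_ref_index,
                                                 int theta,
-                                                const std::vector<int>& axes_dim,
-                                                bool circular = false) {
-        circular = true;
+                                                const std::vector<int>& axes_dim) {
        std::vector<std::vector<float>> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
-       std::vector<std::vector<int>> axes_wraps;
+       std::vector<std::vector<int>> wrap_dims;
+       // This logic simply stores the (pad and patch_adjusted) sizes of images so we can make sure rope correctly tiles
        if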
(sd_is_circular_padding_enabled() && bs > 0 && axes_dim.size() >= 3) { int pad_h = (patch_size - (h % patch_size)) % patch_size; int pad_w = (patch_size - (w % patch_size)) % patch_size; @@ -290,14 +289,15 @@ namespace Rope { if (h_len > 0 && w_len > 0) { const size_t total_tokens = ids.size(); // Track per-token wrap lengths for the row/column axes so only spatial tokens become periodic. - axes_wraps.assign(axes_dim.size(), std::vector(total_tokens / bs, 0)); + wrap_dims.assign(axes_dim.size(), std::vector(total_tokens / bs, 0)); size_t cursor = context_len; // ignore text tokens const size_t img_tokens = static_cast(h_len) * static_cast(w_len); for (size_t token_i = 0; token_i < img_tokens; ++token_i) { - axes_wraps[1][cursor + token_i] = h_len; - axes_wraps[2][cursor + token_i] = w_len; + wrap_dims[1][cursor + token_i] = h_len; + wrap_dims[2][cursor + token_i] = w_len; } cursor += img_tokens; + // For each reference image, store wrap sizes as well for (ggml_tensor* ref : ref_latents) { if (ref == nullptr) { continue; @@ -310,14 +310,14 @@ namespace Rope { int ref_w_len = (ref_w + ref_pad_w) / patch_size; size_t ref_n_tokens = static_cast(ref_h_len) * static_cast(ref_w_len); for (size_t token_i = 0; token_i < ref_n_tokens; ++token_i) { - axes_wraps[1][cursor + token_i] = ref_h_len; - axes_wraps[2][cursor + token_i] = ref_w_len; + wrap_dims[1][cursor + token_i] = ref_h_len; + wrap_dims[2][cursor + token_i] = ref_w_len; } cursor += ref_n_tokens; } } } - const std::vector>* wraps_ptr = axes_wraps.empty() ? nullptr : &axes_wraps; + const std::vector>* wraps_ptr = wrap_dims.empty() ? nullptr : &wrap_dims; return embed_nd(ids, bs, theta, axes_dim, wraps_ptr); } From 4f2db1bef6f97c0026dfba158a44eadecc707adf Mon Sep 17 00:00:00 2001 From: bepis Date: Fri, 24 Oct 2025 16:03:08 -0700 Subject: [PATCH 07/25] resolve flux conflict --- flux.hpp | 79 ++++++++++++++++++++++++++++---------------------------- ggml | 2 +- 2 files changed, 41 insertions(+), 40 deletions(-) diff --git a/flux.hpp b/flux.hpp index c03b3ce2a..355184be2 100644 --- a/flux.hpp +++ b/flux.hpp @@ -1,6 +1,7 @@ #ifndef __FLUX_HPP__ #define __FLUX_HPP__ +#include #include #include "ggml_extend.hpp" @@ -18,7 +19,7 @@ namespace Flux { blocks["out_layer"] = std::shared_ptr(new Linear(hidden_dim, hidden_dim, true)); } - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { // x: [..., in_dim] // return: [..., hidden_dim] auto in_layer = std::dynamic_pointer_cast(blocks["in_layer"]); @@ -36,7 +37,7 @@ namespace Flux { int64_t hidden_size; float eps; - void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { + void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") override { ggml_type wtype = GGML_TYPE_F32; params["scale"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -47,7 +48,7 @@ namespace Flux { : hidden_size(hidden_size), eps(eps) {} - struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { + struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) override { struct ggml_tensor* w = params["scale"]; x = ggml_rms_norm(ctx, x, eps); x = ggml_mul(ctx, x, w); @@ -136,11 +137,11 @@ namespace Flux { }; struct ModulationOut { - ggml_tensor* shift = NULL; - ggml_tensor* scale = NULL; - ggml_tensor* gate = NULL; + ggml_tensor* shift = nullptr; + ggml_tensor* scale 
= nullptr; + ggml_tensor* gate = nullptr; - ModulationOut(ggml_tensor* shift = NULL, ggml_tensor* scale = NULL, ggml_tensor* gate = NULL) + ModulationOut(ggml_tensor* shift = nullptr, ggml_tensor* scale = nullptr, ggml_tensor* gate = nullptr) : shift(shift), scale(scale), gate(gate) {} ModulationOut(struct ggml_context* ctx, ggml_tensor* vec, int64_t offset) { @@ -259,7 +260,7 @@ namespace Flux { struct ggml_tensor* txt, struct ggml_tensor* vec, struct ggml_tensor* pe, - struct ggml_tensor* mask = NULL) { + struct ggml_tensor* mask = nullptr) { // img: [N, n_img_token, hidden_size] // txt: [N, n_txt_token, hidden_size] // pe: [n_img_token + n_txt_token, d_head/2, 2, 2] @@ -398,7 +399,7 @@ namespace Flux { ModulationOut get_distil_mod(struct ggml_context* ctx, struct ggml_tensor* vec) { int64_t offset = 3 * idx; - return ModulationOut(ctx, vec, offset); + return {ctx, vec, offset}; } struct ggml_tensor* forward(struct ggml_context* ctx, @@ -406,7 +407,7 @@ namespace Flux { struct ggml_tensor* x, struct ggml_tensor* vec, struct ggml_tensor* pe, - struct ggml_tensor* mask = NULL) { + struct ggml_tensor* mask = nullptr) { // x: [N, n_token, hidden_size] // pe: [n_token, d_head/2, 2, 2] // return: [N, n_token, hidden_size] @@ -485,7 +486,7 @@ namespace Flux { auto shift = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 0)); // [N, dim] auto scale = ggml_view_2d(ctx, vec, vec->ne[0], vec->ne[1], vec->nb[1], stride * (offset + 1)); // [N, dim] // No gate - return ModulationOut(shift, scale, NULL); + return {shift, scale, nullptr}; } struct ggml_tensor* forward(struct ggml_context* ctx, @@ -664,7 +665,7 @@ namespace Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = NULL, + struct ggml_tensor* mod_index_arange = nullptr, std::vector skip_layers = {}) { auto img_in = std::dynamic_pointer_cast(blocks["img_in"]); auto txt_in = std::dynamic_pointer_cast(blocks["txt_in"]); @@ -672,7 +673,7 @@ namespace Flux { img = img_in->forward(ctx, img); struct ggml_tensor* vec; - struct ggml_tensor* txt_img_mask = NULL; + struct ggml_tensor* txt_img_mask = nullptr; if (params.is_chroma) { int64_t mod_index_length = 344; auto approx = std::dynamic_pointer_cast(blocks["distilled_guidance_layer"]); @@ -681,7 +682,7 @@ namespace Flux { // auto mod_index_arange = ggml_arange(ctx, 0, (float)mod_index_length, 1); // ggml_arange tot working on a lot of backends, precomputing it on CPU instead - GGML_ASSERT(arange != NULL); + GGML_ASSERT(arange != nullptr); auto modulation_index = ggml_nn_timestep_embedding(ctx, mod_index_arange, 32, 10000, 1000.f); // [1, 344, 32] // Batch broadcast (will it ever be useful) @@ -695,15 +696,15 @@ namespace Flux { vec = ggml_cont(ctx, ggml_permute(ctx, vec, 0, 2, 1, 3)); // [344, N, 64] vec = approx->forward(ctx, vec); // [344, N, hidden_size] - if (y != NULL) { - txt_img_mask = sd_pad(ctx, y, img->ne[1], 0, 0, 0); + if (y != nullptr) { + txt_img_mask = ggml_pad(ctx, y, img->ne[1], 0, 0, 0); } } else { auto time_in = std::dynamic_pointer_cast(blocks["time_in"]); auto vector_in = std::dynamic_pointer_cast(blocks["vector_in"]); vec = time_in->forward(ctx, ggml_nn_timestep_embedding(ctx, timesteps, 256, 10000, 1000.f)); if (params.guidance_embed) { - GGML_ASSERT(guidance != NULL); + GGML_ASSERT(guidance != nullptr); auto guidance_in = std::dynamic_pointer_cast(blocks["guidance_in"]); // bf16 and fp16 result is different auto g_in = ggml_nn_timestep_embedding(ctx, guidance, 256, 10000, 
1000.f); @@ -759,7 +760,7 @@ namespace Flux { int64_t patch_size = 2; int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - x = sd_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size) auto img = patchify(ctx, x, patch_size); // [N, h*w, C * patch_size * patch_size] @@ -775,14 +776,14 @@ namespace Flux { struct ggml_tensor* y, struct ggml_tensor* guidance, struct ggml_tensor* pe, - struct ggml_tensor* mod_index_arange = NULL, + struct ggml_tensor* mod_index_arange = nullptr, std::vector ref_latents = {}, std::vector skip_layers = {}) { // Forward pass of DiT. // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) // timestep: (N,) tensor of diffusion timesteps // context: (N, L, D) - // c_concat: NULL, or for (N,C+M, H, W) for Fill + // c_concat: nullptr, or for (N,C+M, H, W) for Fill // y: (N, adm_in_channels) tensor of class labels // guidance: (N,) // pe: (L, d_head/2, 2, 2) @@ -801,7 +802,7 @@ namespace Flux { uint64_t img_tokens = img->ne[1]; if (params.version == VERSION_FLUX_FILL) { - GGML_ASSERT(c_concat != NULL); + GGML_ASSERT(c_concat != nullptr); ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); @@ -810,14 +811,14 @@ namespace Flux { img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0); } else if (params.version == VERSION_FLEX_2) { - GGML_ASSERT(c_concat != NULL); + GGML_ASSERT(c_concat != nullptr); ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); ggml_tensor* control = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1)); - masked = sd_pad(ctx, masked, pad_w, pad_h, 0, 0); - mask = sd_pad(ctx, mask, pad_w, pad_h, 0, 0); - control = sd_pad(ctx, control, pad_w, pad_h, 0, 0); + masked = ggml_pad(ctx, masked, pad_w, pad_h, 0, 0); + mask = ggml_pad(ctx, mask, pad_w, pad_h, 0, 0); + control = ggml_pad(ctx, control, pad_w, pad_h, 0, 0); masked = patchify(ctx, masked, patch_size); mask = patchify(ctx, mask, patch_size); @@ -825,9 +826,9 @@ namespace Flux { img = ggml_concat(ctx, img, ggml_concat(ctx, ggml_concat(ctx, masked, mask, 0), control, 0), 0); } else if (params.version == VERSION_FLUX_CONTROLS) { - GGML_ASSERT(c_concat != NULL); + GGML_ASSERT(c_concat != nullptr); - ggml_tensor* control = sd_pad(ctx, c_concat, pad_w, pad_h, 0, 0); + ggml_tensor* control = ggml_pad(ctx, c_concat, pad_w, pad_h, 0, 0); control = patchify(ctx, control, patch_size); img = ggml_concat(ctx, img, control, 0); } @@ -924,7 +925,7 @@ namespace Flux { flux.init(params_ctx, tensor_types, prefix); } - std::string get_desc() { + std::string get_desc() override { return "flux"; } @@ -944,18 +945,18 @@ namespace Flux { GGML_ASSERT(x->ne[3] == 1); struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false); 
- struct ggml_tensor* mod_index_arange = NULL; + struct ggml_tensor* mod_index_arange = nullptr; x = to_backend(x); context = to_backend(context); - if (c_concat != NULL) { + if (c_concat != nullptr) { c_concat = to_backend(c_concat); } if (flux_params.is_chroma) { guidance = ggml_set_f32(guidance, 0); if (!use_mask) { - y = NULL; + y = nullptr; } // ggml_arange is not working on some backends, precompute it @@ -987,7 +988,7 @@ namespace Flux { auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, flux_params.axes_dim_sum / 2, pos_len); // pe->data = pe_vec.data(); // print_ggml_tensor(pe); - // pe->data = NULL; + // pe->data = nullptr; set_backend_tensor_data(pe, pe_vec.data()); struct ggml_tensor* out = flux.forward(compute_ctx, @@ -1017,8 +1018,8 @@ namespace Flux { struct ggml_tensor* guidance, std::vector ref_latents = {}, bool increase_ref_index = false, - struct ggml_tensor** output = NULL, - struct ggml_context* output_ctx = NULL, + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr, std::vector skip_layers = std::vector()) { // x: [N, in_channels, h, w] // timesteps: [N, ] @@ -1035,11 +1036,11 @@ namespace Flux { void test() { struct ggml_init_params params; params.mem_size = static_cast(20 * 1024 * 1024); // 20 MB - params.mem_buffer = NULL; + params.mem_buffer = nullptr; params.no_alloc = false; struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != NULL); + GGML_ASSERT(work_ctx != nullptr); { // cpu f16: @@ -1063,10 +1064,10 @@ namespace Flux { ggml_set_f32(y, 0.01f); // print_ggml_tensor(y); - struct ggml_tensor* out = NULL; + struct ggml_tensor* out = nullptr; int t0 = ggml_time_ms(); - compute(8, x, timesteps, context, NULL, y, guidance, {}, false, &out, work_ctx); + compute(8, x, timesteps, context, nullptr, y, guidance, {}, false, &out, work_ctx); int t1 = ggml_time_ms(); print_ggml_tensor(out); @@ -1078,7 +1079,7 @@ namespace Flux { // ggml_backend_t backend = ggml_backend_cuda_init(0); ggml_backend_t backend = ggml_backend_cpu_init(); ggml_type model_data_type = GGML_TYPE_Q8_0; - std::shared_ptr flux = std::shared_ptr(new FluxRunner(backend, false)); + std::shared_ptr flux = std::make_shared(backend, false); { LOG_INFO("loading from '%s'", file_path.c_str()); diff --git a/ggml b/ggml index 6eb26b3c7..55c79c624 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 6eb26b3c74ed06f600e61f48d62dc39f9c1166c0 +Subproject commit 55c79c6249dbc5e3ac8cd82556861608a6fd425e From e6fb4e82f8b6248e0e40a19473d8aa966d2460f4 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Tue, 9 Dec 2025 18:48:15 -0800 Subject: [PATCH 08/25] switch to pad op circular only --- ggml_extend.hpp | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 9699b12cd..15746487e 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -609,7 +609,7 @@ __STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx, int p2, int p3) { if (sd_is_circular_padding_enabled()) { - return ggml_pad_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); + return ggml_pad_ext_circular(ctx, a, 0, p0, 0, p1, 0, p2, 0, p3); } else { return ggml_pad(ctx, a, p0, p1, p2, p3); @@ -627,7 +627,7 @@ __STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx, int lp3, int rp3) { if (sd_is_circular_padding_enabled()) { - return ggml_pad_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + return ggml_pad_ext_circular(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); } return 
ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3);
 }
@@ -1035,22 +1035,21 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
     const bool use_circular = sd_is_circular_padding_enabled();
     LOG_DEBUG("use circular conv %d", use_circular ? 1 : 0);
     const bool is_depthwise = (w->ne[2] == 1 && x->ne[2] == w->ne[3]);
+
+    if (use_circular && (p0 != 0 || p1 != 0)) {
+        x = ggml_pad_ext_circular(ctx, x, p0, p0, p1, p1, 0, 0, 0, 0);
+        p0 = 0;
+        p1 = 0;
+    }
+
     if (direct) {
-        if (use_circular) {
-            if (is_depthwise) {
-                x = ggml_conv_2d_dw_direct_circular(ctx, w, x, s0, s1, p0, p1, d0, d1);
-            } else {
-                x = ggml_conv_2d_direct_circular(ctx, w, x, s0, s1, p0, p1, d0, d1);
-            }
+        if (is_depthwise) {
+            x = ggml_conv_2d_dw_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
         } else {
             x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1);
         }
     } else {
-        if (use_circular) {
-            x = ggml_conv_2d_circular(ctx, w, x, s0, s1, p0, p1, d0, d1);
-        } else {
-            x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
-        }
+        x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
     }
     if (scale != 1.f) {
         x = ggml_scale(ctx, x, 1.f / scale);

From 00c92ef9150efefb65c3cd2ee17020490b2108c3 Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Fri, 12 Dec 2025 13:05:29 -0800
Subject: [PATCH 09/25] Set ggml to most recent

---
 ggml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml b/ggml
index 55c79c624..d80bac55f 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 55c79c6249dbc5e3ac8cd82556861608a6fd425e
+Subproject commit d80bac55f6d0c57e57143f80cbb6e3155dec1cc7

From 144c2786aaa84b856f471bd486f1979a707a3cb3 Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Fri, 12 Dec 2025 13:07:39 -0800
Subject: [PATCH 10/25] Revert ggml temp

---
 ggml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml b/ggml
index d80bac55f..2d3876d55 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit d80bac55f6d0c57e57143f80cbb6e3155dec1cc7
+Subproject commit 2d3876d554551d35c06dccc5852be50d5fd2a275

From 247d67fd56808046aede01f1e7070b952814f2fc Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Fri, 12 Dec 2025 13:14:43 -0800
Subject: [PATCH 11/25] Update ggml to most recent

---
 ggml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml b/ggml
index 2d3876d55..d80bac55f 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 2d3876d554551d35c06dccc5852be50d5fd2a275
+Subproject commit d80bac55f6d0c57e57143f80cbb6e3155dec1cc7

From 686a208fa2a178afe2839d75998c277d39795a2f Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Fri, 12 Dec 2025 13:18:11 -0800
Subject: [PATCH 12/25] Revert unneeded flux change

---
 flux.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flux.hpp b/flux.hpp
index 7e6e52372..1df2874ae 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -457,7 +457,7 @@ namespace Flux {
     ModulationOut get_distil_mod(GGMLRunnerContext* ctx, struct ggml_tensor* vec) {
         int64_t offset = 3 * idx;
-        return {ctx, vec, offset};
+        return ModulationOut(ctx, vec, offset);
     }

     struct ggml_tensor* forward(GGMLRunnerContext* ctx,
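The next patch retires the process-wide atomic: GGMLRunnerContext already carries flash_attn_enabled and conv2d_direct_enabled down through every block's forward(), so circular_pad_enabled joins them and each runner (diffusion model, VAE, TAE, ControlNet) gets the flag set explicitly. The shape of that pattern, reduced to a sketch with hypothetical types (not the real headers):

#include <cstdio>

// Options travel with a per-runner context instead of a global,
// so two runners in one process can disagree.
struct RunnerContext {
    bool circular_pad_enabled = false;
};

struct Block {
    void forward(const RunnerContext* ctx) {
        // each block consults the context it is handed, never global state
        printf("pad mode: %s\n", ctx->circular_pad_enabled ? "circular" : "zero");
    }
};

struct Runner {
    bool circular_pad_enabled = false;
    void set_circular_pad_enabled(bool enabled) { circular_pad_enabled = enabled; }
    void run(Block& block) {
        RunnerContext ctx;
        ctx.circular_pad_enabled = circular_pad_enabled;  // copied once per graph build
        block.forward(&ctx);
    }
};

int main() {
    Block block;
    Runner vae, unet;
    vae.set_circular_pad_enabled(true);  // e.g. wrap only in the VAE
    vae.run(block);
    unet.run(block);
    return 0;
}

This is why the patch touches every place a runner is constructed in stable-diffusion.cpp: the flag now has to be propagated per model rather than set once at context init.

From 15076b089ba064268ae5149c9047729d5e017643 Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Fri, 12 Dec 2025 13:51:57 -0800
Subject: [PATCH 13/25] move circular flag to the GGMLRunnerContext

---
 clip.hpp             |  2 +-
 ggml_extend.hpp      | 40 ++++++++++++++++++++++++++++----------
 lora.hpp             |  3 +++
 stable-diffusion.cpp | 12 ++++++++----
 4 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index 24c94f1bb..cda5a3015 100644
--- a/clip.hpp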
+++ b/clip.hpp @@ -664,7 +664,7 @@ class CLIPVisionEmbeddings : public GGMLBlock { // concat(patch_embedding, class_embedding) + position_embedding struct ggml_tensor* patch_embedding; int64_t N = pixel_values->ne[3]; - patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size] + patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size, 0, 0, 1, 1, false, ctx->circular_pad_enabled); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size] patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches] patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim] patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1] diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 87a34e64d..6d71a0890 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1016,20 +1016,30 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* w, struct ggml_tensor* b, - int s0 = 1, - int s1 = 1, - int p0 = 0, - int p1 = 0, - int d0 = 1, - int d1 = 1, - bool direct = false, - float scale = 1.f) { + int s0 = 1, + int s1 = 1, + int p0 = 0, + int p1 = 0, + int d0 = 1, + int d1 = 1, + bool direct = false, + bool circular = false, + float scale = 1.f) { if (scale != 1.f) { x = ggml_scale(ctx, x, scale); } if (w->ne[2] != x->ne[2] && ggml_n_dims(w) == 2) { w = ggml_reshape_4d(ctx, w, 1, 1, w->ne[0], w->ne[1]); } + + // use circular padding (on a torus, x and y wrap around) for seamless textures + // see https://github.com/leejet/stable-diffusion.cpp/pull/914 + if (circular && (p0 != 0 || p1 != 0)) { + x = ggml_pad_ext_circular(ctx, x, p0, p0, p1, p1, 0, 0, 0, 0); + p0 = 0; + p1 = 0; + } + if (direct) { if (is_depthwise) { x = ggml_conv_2d_dw_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); @@ -1543,7 +1553,8 @@ struct WeightAdapter { int d0 = 1; int d1 = 1; bool direct = false; - float scale = 1.f; + bool circular = false; + float scale = 1.f; } conv2d; }; virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) = 0; @@ -1561,6 +1572,7 @@ struct GGMLRunnerContext { ggml_context* ggml_ctx = nullptr; bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; + bool circular_pad_enabled = false; std::shared_ptr weight_adapter = nullptr; }; @@ -1597,6 +1609,7 @@ struct GGMLRunner { bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; + bool circular_pad_enabled = false; void alloc_params_ctx() { struct ggml_init_params params; @@ -1874,6 +1887,7 @@ struct GGMLRunner { runner_ctx.backend = runtime_backend; runner_ctx.flash_attn_enabled = flash_attn_enabled; runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled; + runner_ctx.circular_pad_enabled = circular_pad_enabled; runner_ctx.weight_adapter = weight_adapter; return runner_ctx; } @@ -2018,6 +2032,10 @@ struct GGMLRunner { conv2d_direct_enabled = enabled; } + void set_circular_pad_enabled(bool enabled) { + circular_pad_enabled = enabled; + } + void set_weight_adapter(const std::shared_ptr& adapter) { weight_adapter = adapter; } @@ -2289,7 +2307,8 @@ class Conv2d : public UnaryBlock { forward_params.conv2d.d0 = dilation.second; 
forward_params.conv2d.d1 = dilation.first; forward_params.conv2d.direct = ctx->conv2d_direct_enabled; - forward_params.conv2d.scale = scale; + forward_params.conv2d.circular = ctx->circular_pad_enabled; + forward_params.conv2d.scale = scale; return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); } return ggml_ext_conv_2d(ctx->ggml_ctx, @@ -2303,6 +2322,7 @@ class Conv2d : public UnaryBlock { dilation.second, dilation.first, ctx->conv2d_direct_enabled, + ctx->circular_pad_enabled, scale); } }; diff --git a/lora.hpp b/lora.hpp index b847f044c..321e63bca 100644 --- a/lora.hpp +++ b/lora.hpp @@ -599,6 +599,7 @@ struct LoraModel : public GGMLRunner { forward_params.conv2d.d0, forward_params.conv2d.d1, forward_params.conv2d.direct, + forward_params.conv2d.circular, forward_params.conv2d.scale); if (lora_mid) { lx = ggml_ext_conv_2d(ctx, @@ -612,6 +613,7 @@ struct LoraModel : public GGMLRunner { 1, 1, forward_params.conv2d.direct, + forward_params.conv2d.circular, forward_params.conv2d.scale); } lx = ggml_ext_conv_2d(ctx, @@ -625,6 +627,7 @@ struct LoraModel : public GGMLRunner { 1, 1, forward_params.conv2d.direct, + forward_params.conv2d.circular, forward_params.conv2d.scale); } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 9c4c69349..ec0dab2b1 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -212,10 +212,6 @@ class StableDiffusionGGML { use_tiny_autoencoder = taesd_path.size() > 0; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; circular_pad = sd_ctx_params->circular_pad; - sd_set_circular_padding_enabled(circular_pad); - if (circular_pad) { - LOG_INFO("Using circular padding for convolutions"); - } rng = get_rng(sd_ctx_params->rng_type); if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT && sd_ctx_params->sampler_rng_type != sd_ctx_params->rng_type) { @@ -393,6 +389,10 @@ class StableDiffusionGGML { vae_decode_only = false; } + if (circular_pad) { + LOG_INFO("Using circular padding for convolutions"); + } + bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; { @@ -540,6 +540,7 @@ class StableDiffusionGGML { LOG_INFO("Using Conv2d direct in the diffusion model"); std::dynamic_pointer_cast(diffusion_model)->unet.set_conv2d_direct_enabled(true); } + std::dynamic_pointer_cast(diffusion_model)->unet.set_circular_pad_enabled(circular_pad); } if (sd_ctx_params->diffusion_flash_attn) { @@ -602,6 +603,7 @@ class StableDiffusionGGML { vae_conv_2d_scale); first_stage_model->set_conv2d_scale(vae_conv_2d_scale); } + first_stage_model->set_circular_pad_enabled(circular_pad); first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } @@ -616,6 +618,7 @@ class StableDiffusionGGML { LOG_INFO("Using Conv2d direct in the tae model"); tae_first_stage->set_conv2d_direct_enabled(true); } + tae_first_stage->set_circular_pad_enabled(circular_pad); } // first_stage_model->get_param_tensors(tensors, "first_stage_model."); @@ -635,6 +638,7 @@ class StableDiffusionGGML { LOG_INFO("Using Conv2d direct in the control net"); control_net->set_conv2d_direct_enabled(true); } + control_net->set_circular_pad_enabled(circular_pad); } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { From bf28347524530b13256473e30a3f95044b9642fa Mon Sep 17 00:00:00 2001 From: Phylliida Date: Fri, 12 Dec 2025 13:53:58 -0800 Subject: [PATCH 14/25] Pass through circular param in all places where conv is called --- lora.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lora.hpp b/lora.hpp index 
321e63bca..e6af66798 100644 --- a/lora.hpp +++ b/lora.hpp @@ -782,6 +782,7 @@ struct MultiLoraAdapter : public WeightAdapter { forward_params.conv2d.d0, forward_params.conv2d.d1, forward_params.conv2d.direct, + forward_params.conv2d.circular, forward_params.conv2d.scale); } for (auto& lora_model : lora_models) { From 5f2de586beba9a33f3323ff4e54945839c9219f3 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Fri, 12 Dec 2025 14:19:40 -0800 Subject: [PATCH 15/25] fix of constant and minor cleanup --- common.hpp | 2 +- ggml_extend.hpp | 16 ++++++++-------- qwen_image.hpp | 3 ++- rope.hpp | 3 ++- wan.hpp | 2 +- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/common.hpp b/common.hpp index 0ea197990..33d499fb1 100644 --- a/common.hpp +++ b/common.hpp @@ -28,7 +28,7 @@ class DownSampleBlock : public GGMLBlock { if (vae_downsample) { auto conv = std::dynamic_pointer_cast(blocks["conv"]); - x = sd_pad(ctx, x, 1, 1, 0, 0); + x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); x = conv->forward(ctx, x); } else { auto conv = std::dynamic_pointer_cast(blocks["op"]); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 6d71a0890..b76a25f04 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -21,6 +21,10 @@ #include #include +#ifndef GGML_KQ_MASK_PAD +#define GGML_KQ_MASK_PAD 1 +#endif + #include "ggml-alloc.h" #include "ggml-backend.h" #include "ggml-cpu.h" @@ -1041,11 +1045,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx, } if (direct) { - if (is_depthwise) { - x = ggml_conv_2d_dw_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); - } else { - x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); - } + x = ggml_conv_2d_direct(ctx, w, x, s0, s1, p0, p1, d0, d1); } else { x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1); } @@ -1269,7 +1269,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context auto build_kqv = [&](ggml_tensor* q_in, ggml_tensor* k_in, ggml_tensor* v_in, ggml_tensor* mask_in) -> ggml_tensor* { if (kv_pad != 0) { - k_in = sd_pad(ctx, k_in, 0, kv_pad, 0, 0); + k_in = ggml_pad(ctx, k_in, 0, kv_pad, 0, 0); } if (kv_scale != 1.0f) { k_in = ggml_scale(ctx, k_in, kv_scale); @@ -1279,7 +1279,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context v_in = ggml_ext_cont(ctx, ggml_permute(ctx, v_in, 0, 2, 1, 3)); v_in = ggml_reshape_3d(ctx, v_in, d_head, L_k, n_kv_head * N); if (kv_pad != 0) { - v_in = sd_pad(ctx, v_in, 0, kv_pad, 0, 0); + v_in = ggml_pad(ctx, v_in, 0, kv_pad, 0, 0); } if (kv_scale != 1.0f) { v_in = ggml_scale(ctx, v_in, kv_scale); @@ -1302,7 +1302,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_attention_ext(struct ggml_context mask_pad = GGML_PAD(L_q, GGML_KQ_MASK_PAD) - mask_in->ne[1]; } if (mask_pad > 0) { - mask_in = sd_pad(ctx, mask_in, 0, mask_pad, 0, 0); + mask_in = ggml_pad(ctx, mask_in, 0, mask_pad, 0, 0); } mask_in = ggml_cast(ctx, mask_in, GGML_TYPE_F16); } diff --git a/qwen_image.hpp b/qwen_image.hpp index 4dd13f8ac..6853787cd 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -361,7 +361,7 @@ namespace Qwen { int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - x = sd_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] return x; } @@ -565,6 +565,7 @@ namespace Qwen { ref_latents, increase_ref_index, qwen_image_params.theta, + circular_pad_enabled, 
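+                                                 // when enabled, the rope id generator records each image token's
+                                                 // patch-grid extent (wrap_dims in rope.hpp) so the position
+                                                 // embedding tiles: the last row/column of patches is treated as
+                                                 // adjacent to the first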
qwen_image_params.axes_dim); int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2; // LOG_DEBUG("pos_len %d", pos_len); diff --git a/rope.hpp b/rope.hpp index f9892929e..f84fe4f43 100644 --- a/rope.hpp +++ b/rope.hpp @@ -316,11 +316,12 @@ namespace Rope { const std::vector& ref_latents, bool increase_ref_index, int theta, + bool circular, const std::vector& axes_dim) { std::vector> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index); std::vector> wrap_dims; // This logic simply stores the (pad and patch_adjusted) sizes of images so we can make sure rope correctly tiles - if (sd_is_circular_padding_enabled() && bs > 0 && axes_dim.size() >= 3) { + if (circular && bs > 0 && axes_dim.size() >= 3) { int pad_h = (patch_size - (h % patch_size)) % patch_size; int pad_w = (patch_size - (w % patch_size)) % patch_size; int h_len = (h + pad_h) / patch_size; diff --git a/wan.hpp b/wan.hpp index c09c55cfd..75333bfe1 100644 --- a/wan.hpp +++ b/wan.hpp @@ -1835,7 +1835,7 @@ namespace WAN { int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size); int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size); int pad_w = (std::get<2>(params.patch_size) - W % std::get<2>(params.patch_size)) % std::get<2>(params.patch_size); - x = sd_pad(ctx, x, pad_w, pad_h, pad_t, 0); // [N*C, T + pad_t, H + pad_h, W + pad_w] + x = ggml_pad(ctx, x, pad_w, pad_h, pad_t, 0); // [N*C, T + pad_t, H + pad_h, W + pad_w] return x; } From d7d8da10998e12abff4ee508588d72a554189156 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Fri, 12 Dec 2025 14:23:03 -0800 Subject: [PATCH 16/25] Added back --circular option --- examples/cli/main.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 49b202fda..79c456271 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -518,6 +518,7 @@ struct SDContextParams { bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; bool vae_conv_direct = false; + bool circular_pad = false; bool chroma_use_dit_mask = true; bool chroma_use_t5_mask = false; @@ -671,6 +672,10 @@ struct SDContextParams { "--vae-conv-direct", "use ggml_conv2d_direct in the vae model", true, &vae_conv_direct}, + {"", + "--circular", + "enable circular padding for convolutions", + true, &circular_pad}, {"", "--chroma-disable-dit-mask", "disable dit mask for chroma", @@ -934,6 +939,7 @@ struct SDContextParams { << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n" + << " circular_pad: " << (circular_pad ? "true" : "false") << ",\n" << " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n" << " chroma_use_t5_mask: " << (chroma_use_t5_mask ? 
"true" : "false") << ",\n" << " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n" @@ -995,6 +1001,7 @@ struct SDContextParams { taesd_preview, diffusion_conv_direct, vae_conv_direct, + circular_pad, force_sdxl_vae_conv_scale, chroma_use_dit_mask, chroma_use_t5_mask, From 822f9a522aedd1a5923f8df5cb201c234116b74b Mon Sep 17 00:00:00 2001 From: Phylliida Date: Fri, 12 Dec 2025 15:40:45 -0800 Subject: [PATCH 17/25] Conv2d circular in vae and various models --- common.hpp | 8 +++++++- diffusion_model.hpp | 25 +++++++++++++++++++++++++ flux.hpp | 30 +++++++++++++++++------------- qwen_image.hpp | 16 ++++++++++------ stable-diffusion.cpp | 21 +++++++++++++++------ wan.hpp | 30 +++++++++++++++++++++++------- z_image.hpp | 16 ++++++++++------ 7 files changed, 107 insertions(+), 39 deletions(-) diff --git a/common.hpp b/common.hpp index 33d499fb1..a95e76a4f 100644 --- a/common.hpp +++ b/common.hpp @@ -28,7 +28,13 @@ class DownSampleBlock : public GGMLBlock { if (vae_downsample) { auto conv = std::dynamic_pointer_cast(blocks["conv"]); - x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); + // For VAE downsampling we manually pad by 1 before the stride-2 conv. + // Honor the global circular padding flag here to avoid seams in seamless mode. + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, 1, 1, 0, 0); + } else { + x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); + } x = conv->forward(ctx, x); } else { auto conv = std::dynamic_pointer_cast(blocks["op"]); diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 8c741fdc4..b6491291a 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -39,6 +39,7 @@ struct DiffusionModel { virtual void set_weight_adapter(const std::shared_ptr& adapter){}; virtual int64_t get_adm_in_channels() = 0; virtual void set_flash_attn_enabled(bool enabled) = 0; + virtual void set_circular_pad_enabled(bool enabled) = 0; }; struct UNetModel : public DiffusionModel { @@ -87,6 +88,10 @@ struct UNetModel : public DiffusionModel { unet.set_flash_attention_enabled(enabled); } + void set_circular_pad_enabled(bool enabled) override { + unet.set_circular_pad_enabled(enabled); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, @@ -148,6 +153,10 @@ struct MMDiTModel : public DiffusionModel { mmdit.set_flash_attention_enabled(enabled); } + void set_circular_pad_enabled(bool enabled) override { + mmdit.set_circular_pad_enabled(enabled); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, @@ -210,6 +219,10 @@ struct FluxModel : public DiffusionModel { flux.set_flash_attention_enabled(enabled); } + void set_circular_pad_enabled(bool enabled) override { + flux.set_circular_pad_enabled(enabled); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, @@ -277,6 +290,10 @@ struct WanModel : public DiffusionModel { wan.set_flash_attention_enabled(enabled); } + void set_circular_pad_enabled(bool enabled) override { + wan.set_circular_pad_enabled(enabled); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, @@ -343,6 +360,10 @@ struct QwenImageModel : public DiffusionModel { qwen_image.set_flash_attention_enabled(enabled); } + void set_circular_pad_enabled(bool enabled) override { + qwen_image.set_circular_pad_enabled(enabled); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, @@ -406,6 +427,10 @@ struct 
ZImageModel : public DiffusionModel { z_image.set_flash_attention_enabled(enabled); } + void set_circular_pad_enabled(bool enabled) override { + z_image.set_circular_pad_enabled(enabled); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, diff --git a/flux.hpp b/flux.hpp index 1df2874ae..602ab9bdb 100644 --- a/flux.hpp +++ b/flux.hpp @@ -858,14 +858,18 @@ namespace Flux { } } - struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx, + struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, struct ggml_tensor* x) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); + } else { + x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + } return x; } @@ -891,11 +895,11 @@ namespace Flux { return x; } - struct ggml_tensor* process_img(struct ggml_context* ctx, + struct ggml_tensor* process_img(GGMLRunnerContext* ctx, struct ggml_tensor* x) { // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size) x = pad_to_patch_size(ctx, x); - x = patchify(ctx, x); + x = patchify(ctx->ggml_ctx, x); return x; } @@ -1065,7 +1069,7 @@ namespace Flux { int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - auto img = pad_to_patch_size(ctx->ggml_ctx, x); + auto img = pad_to_patch_size(ctx, x); auto orig_img = img; auto img_in_patch = std::dynamic_pointer_cast(blocks["img_in_patch"]); @@ -1128,7 +1132,7 @@ namespace Flux { int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - auto img = process_img(ctx->ggml_ctx, x); + auto img = process_img(ctx, x); uint64_t img_tokens = img->ne[1]; if (params.version == VERSION_FLUX_FILL) { @@ -1136,8 +1140,8 @@ namespace Flux { ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); - masked = process_img(ctx->ggml_ctx, masked); - mask = process_img(ctx->ggml_ctx, mask); + masked = process_img(ctx, masked); + mask = process_img(ctx, mask); img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0); } else if (params.version == VERSION_FLEX_2) { @@ -1146,21 +1150,21 @@ namespace Flux { ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); ggml_tensor* control = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1)); - masked = process_img(ctx->ggml_ctx, masked); - mask = process_img(ctx->ggml_ctx, mask); - control = process_img(ctx->ggml_ctx, control); + masked = process_img(ctx, masked); + mask = process_img(ctx, mask); + control = process_img(ctx, control); img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0); } else if 
(params.version == VERSION_FLUX_CONTROLS) { GGML_ASSERT(c_concat != nullptr); - auto control = process_img(ctx->ggml_ctx, c_concat); + auto control = process_img(ctx, c_concat); img = ggml_concat(ctx->ggml_ctx, img, control, 0); } if (ref_latents.size() > 0) { for (ggml_tensor* ref : ref_latents) { - ref = process_img(ctx->ggml_ctx, ref); + ref = process_img(ctx, ref); img = ggml_concat(ctx->ggml_ctx, img, ref, 1); } } diff --git a/qwen_image.hpp b/qwen_image.hpp index 6853787cd..d35ff18dc 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -354,14 +354,18 @@ namespace Qwen { blocks["proj_out"] = std::shared_ptr(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels)); } - struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx, + struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, struct ggml_tensor* x) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); + } else { + x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + } return x; } @@ -387,10 +391,10 @@ namespace Qwen { return x; } - struct ggml_tensor* process_img(struct ggml_context* ctx, + struct ggml_tensor* process_img(GGMLRunnerContext* ctx, struct ggml_tensor* x) { x = pad_to_patch_size(ctx, x); - x = patchify(ctx, x); + x = patchify(ctx->ggml_ctx, x); return x; } @@ -466,12 +470,12 @@ namespace Qwen { int64_t C = x->ne[2]; int64_t N = x->ne[3]; - auto img = process_img(ctx->ggml_ctx, x); + auto img = process_img(ctx, x); uint64_t img_tokens = img->ne[1]; if (ref_latents.size() > 0) { for (ggml_tensor* ref : ref_latents) { - ref = process_img(ctx->ggml_ctx, ref); + ref = process_img(ctx, ref); img = ggml_concat(ctx->ggml_ctx, img, ref, 1); } } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ec0dab2b1..172ee2e13 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -408,6 +408,7 @@ class StableDiffusionGGML { diffusion_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map); + diffusion_model->set_circular_pad_enabled(circular_pad); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : tensor_storage_map) { @@ -448,6 +449,7 @@ class StableDiffusionGGML { tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); + diffusion_model->set_circular_pad_enabled(circular_pad); } else if (sd_version_is_flux2(version)) { bool is_chroma = false; cond_stage_model = std::make_shared(clip_backend, @@ -455,10 +457,11 @@ class StableDiffusionGGML { tensor_storage_map, version); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - version, - sd_ctx_params->chroma_use_dit_mask); + offload_params_to_cpu, + tensor_storage_map, + version, + sd_ctx_params->chroma_use_dit_mask); + diffusion_model->set_circular_pad_enabled(circular_pad); } else if (sd_version_is_wan(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -471,12 +474,14 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); + diffusion_model->set_circular_pad_enabled(circular_pad); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { high_noise_diffusion_model = std::make_shared(backend, 
offload_params_to_cpu, tensor_storage_map, "model.high_noise_diffusion_model", version); + high_noise_diffusion_model->set_circular_pad_enabled(circular_pad); } if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" || diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" || @@ -503,6 +508,7 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); + diffusion_model->set_circular_pad_enabled(circular_pad); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -513,6 +519,7 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); + diffusion_model->set_circular_pad_enabled(circular_pad); } else { // SD1.x SD2.x SDXL std::map embbeding_map; for (int i = 0; i < sd_ctx_params->embedding_count; i++) { @@ -538,8 +545,9 @@ class StableDiffusionGGML { version); if (sd_ctx_params->diffusion_conv_direct) { LOG_INFO("Using Conv2d direct in the diffusion model"); - std::dynamic_pointer_cast(diffusion_model)->unet.set_conv2d_direct_enabled(true); - } + std::dynamic_pointer_cast(diffusion_model)->unet.set_conv2d_direct_enabled(true); + } + diffusion_model->set_circular_pad_enabled(circular_pad); std::dynamic_pointer_cast(diffusion_model)->unet.set_circular_pad_enabled(circular_pad); } @@ -577,6 +585,7 @@ class StableDiffusionGGML { "first_stage_model", vae_decode_only, version); + first_stage_model->set_circular_pad_enabled(circular_pad); first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); } else if (version == VERSION_CHROMA_RADIANCE) { diff --git a/wan.hpp b/wan.hpp index 75333bfe1..4cab4032b 100644 --- a/wan.hpp +++ b/wan.hpp @@ -75,7 +75,11 @@ namespace WAN { lp2 -= (int)cache_x->ne[2]; } - x = ggml_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0); + if (ctx->circular_pad_enabled) { + x = ggml_pad_ext_circular(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, 0, 0, 0, 0); + } else { + x = ggml_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, 0, 0, 0, 0); + } return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels, std::get<2>(stride), std::get<1>(stride), std::get<0>(stride), 0, 0, 0, @@ -206,9 +210,17 @@ namespace WAN { } else if (mode == "upsample3d") { x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST); } else if (mode == "downsample2d") { - x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, 1, 1, 0, 0); + } else { + x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); + } } else if (mode == "downsample3d") { - x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, 1, 1, 0, 0); + } else { + x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); + } } x = resample_1->forward(ctx, x); x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2)); // (c, t, h, w) @@ -1826,7 +1838,7 @@ namespace WAN { } } - struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx, + struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, struct ggml_tensor* x) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; @@ -1835,7 +1847,11 @@ namespace WAN { int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size); int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size); int pad_w = (std::get<2>(params.patch_size) - W % std::get<2>(params.patch_size)) % 
std::get<2>(params.patch_size); - x = ggml_pad(ctx, x, pad_w, pad_h, pad_t, 0); // [N*C, T + pad_t, H + pad_h, W + pad_w] + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0); + } else { + x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0); + } return x; } @@ -1986,14 +2002,14 @@ namespace WAN { int64_t T = x->ne[2]; int64_t C = x->ne[3]; - x = pad_to_patch_size(ctx->ggml_ctx, x); + x = pad_to_patch_size(ctx, x); int64_t t_len = ((T + (std::get<0>(params.patch_size) / 2)) / std::get<0>(params.patch_size)); int64_t h_len = ((H + (std::get<1>(params.patch_size) / 2)) / std::get<1>(params.patch_size)); int64_t w_len = ((W + (std::get<2>(params.patch_size) / 2)) / std::get<2>(params.patch_size)); if (time_dim_concat != nullptr) { - time_dim_concat = pad_to_patch_size(ctx->ggml_ctx, time_dim_concat); + time_dim_concat = pad_to_patch_size(ctx, time_dim_concat); x = ggml_concat(ctx->ggml_ctx, x, time_dim_concat, 2); // [N*C, (T+pad_t) + (T2+pad_t2), H + pad_h, W + pad_w] t_len = ((x->ne[2] + (std::get<0>(params.patch_size) / 2)) / std::get<0>(params.patch_size)); } diff --git a/z_image.hpp b/z_image.hpp index bc554f177..3268e3057 100644 --- a/z_image.hpp +++ b/z_image.hpp @@ -324,14 +324,18 @@ namespace ZImage { blocks["final_layer"] = std::make_shared(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels); } - struct ggml_tensor* pad_to_patch_size(struct ggml_context* ctx, + struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, struct ggml_tensor* x) { int64_t W = x->ne[0]; int64_t H = x->ne[1]; int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size; int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size; - x = ggml_pad(ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + if (ctx->circular_pad_enabled) { + x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + } else { + x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] + } return x; } @@ -357,10 +361,10 @@ namespace ZImage { return x; } - struct ggml_tensor* process_img(struct ggml_context* ctx, + struct ggml_tensor* process_img(GGMLRunnerContext* ctx, struct ggml_tensor* x) { x = pad_to_patch_size(ctx, x); - x = patchify(ctx, x); + x = patchify(ctx->ggml_ctx, x); return x; } @@ -473,12 +477,12 @@ namespace ZImage { int64_t C = x->ne[2]; int64_t N = x->ne[3]; - auto img = process_img(ctx->ggml_ctx, x); + auto img = process_img(ctx, x); uint64_t n_img_token = img->ne[1]; if (ref_latents.size() > 0) { for (ggml_tensor* ref : ref_latents) { - ref = process_img(ctx->ggml_ctx, ref); + ref = process_img(ctx, ref); img = ggml_concat(ctx->ggml_ctx, img, ref, 1); } } From 8e829edb29731b2ca6008c9ce1bacf750ce03887 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Fri, 12 Dec 2025 15:48:57 -0800 Subject: [PATCH 18/25] Fix temporal padding for qwen image and other vaes --- wan.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wan.hpp b/wan.hpp index 4cab4032b..18b7e3d29 100644 --- a/wan.hpp +++ b/wan.hpp @@ -76,9 +76,9 @@ namespace WAN { } if (ctx->circular_pad_enabled) { - x = ggml_pad_ext_circular(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, 0, 0, 0, 0); + x = ggml_pad_ext_circular(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0); } else { - x = ggml_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, 0, 0, 0, 0); + x = ggml_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 
0); } return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels, std::get<2>(stride), std::get<1>(stride), std::get<0>(stride), From 4054e3cc325b424071551b9b59b01eb347f1c209 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Sat, 13 Dec 2025 00:17:19 -0800 Subject: [PATCH 19/25] Z Image circular tiling --- rope.hpp | 22 +++++++++++++++++++++- z_image.hpp | 1 + 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/rope.hpp b/rope.hpp index f84fe4f43..982e98469 100644 --- a/rope.hpp +++ b/rope.hpp @@ -495,9 +495,29 @@ namespace Rope { const std::vector& ref_latents, bool increase_ref_index, int theta, + bool circular, const std::vector& axes_dim) { std::vector> ids = gen_z_image_ids(h, w, patch_size, bs, context_len, seq_multi_of, ref_latents, increase_ref_index); - return embed_nd(ids, bs, theta, axes_dim); + std::vector> wrap_dims; + if (circular && bs > 0 && axes_dim.size() >= 3) { + int pad_h = (patch_size - (h % patch_size)) % patch_size; + int pad_w = (patch_size - (w % patch_size)) % patch_size; + int h_len = (h + pad_h) / patch_size; + int w_len = (w + pad_w) / patch_size; + if (h_len > 0 && w_len > 0) { + size_t pos_len = ids.size() / bs; + wrap_dims.assign(axes_dim.size(), std::vector(pos_len, 0)); + size_t cursor = context_len + bound_mod(context_len, seq_multi_of); // skip text (and its padding) + size_t img_tokens = static_cast(h_len) * static_cast(w_len); + for (size_t token_i = 0; token_i < img_tokens; ++token_i) { + wrap_dims[1][cursor + token_i] = h_len; + wrap_dims[2][cursor + token_i] = w_len; + } + } + } + + const std::vector>* wraps_ptr = wrap_dims.empty() ? nullptr : &wrap_dims; + return embed_nd(ids, bs, theta, axes_dim, wraps_ptr); } __STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx, diff --git a/z_image.hpp b/z_image.hpp index 3268e3057..0955d4e9a 100644 --- a/z_image.hpp +++ b/z_image.hpp @@ -556,6 +556,7 @@ namespace ZImage { ref_latents, increase_ref_index, z_image_params.theta, + circular_pad_enabled, z_image_params.axes_dim); int pos_len = pe_vec.size() / z_image_params.axes_dim_sum / 2; // LOG_DEBUG("pos_len %d", pos_len); From 4b87268db6fd02e9d82c6865207d04e3be873e64 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Sat, 13 Dec 2025 01:07:00 -0800 Subject: [PATCH 20/25] x and y axis seamless only --- diffusion_model.hpp | 9 +++++++++ examples/cli/main.cpp | 20 +++++++++++++++++--- ggml_extend.hpp | 11 +++++++++++ qwen_image.hpp | 3 ++- rope.hpp | 34 ++++++++++++++++++++++++---------- stable-diffusion.cpp | 12 ++++++++++++ stable-diffusion.h | 2 ++ z_image.hpp | 3 ++- 8 files changed, 79 insertions(+), 15 deletions(-) diff --git a/diffusion_model.hpp b/diffusion_model.hpp index b6491291a..968c978cd 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -40,6 +40,7 @@ struct DiffusionModel { virtual int64_t get_adm_in_channels() = 0; virtual void set_flash_attn_enabled(bool enabled) = 0; virtual void set_circular_pad_enabled(bool enabled) = 0; + virtual void set_rope_circular_axes(bool circular_x, bool circular_y) {}; }; struct UNetModel : public DiffusionModel { @@ -364,6 +365,10 @@ struct QwenImageModel : public DiffusionModel { qwen_image.set_circular_pad_enabled(enabled); } + void set_rope_circular_axes(bool circular_x, bool circular_y) override { + qwen_image.set_circular_rope_enabled(circular_x, circular_y); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, @@ -431,6 +436,10 @@ struct ZImageModel : public DiffusionModel { z_image.set_circular_pad_enabled(enabled); } 
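+    // note: circular padding only wraps convolution borders; the rope axes below
+    // additionally mark image tokens with their patch-grid extent (wrap_dims in
+    // rope.hpp) so that the position embedding itself tiles across the seam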
+ void set_rope_circular_axes(bool circular_x, bool circular_y) override { + z_image.set_circular_rope_enabled(circular_x, circular_y); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 79c456271..2aa3446ed 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -519,6 +519,8 @@ struct SDContextParams { bool diffusion_conv_direct = false; bool vae_conv_direct = false; bool circular_pad = false; + bool circular_pad_x = false; + bool circular_pad_y = false; bool chroma_use_dit_mask = true; bool chroma_use_t5_mask = false; @@ -673,9 +675,17 @@ struct SDContextParams { "use ggml_conv2d_direct in the vae model", true, &vae_conv_direct}, {"", - "--circular", - "enable circular padding for convolutions", - true, &circular_pad}, + "--circular", + "enable circular padding for convolutions", + true, &circular_pad}, + {"", + "--circularx", + "enable circular RoPE wrapping on x-axis (width) only", + true, &circular_pad_x}, + {"", + "--circulary", + "enable circular RoPE wrapping on y-axis (height) only", + true, &circular_pad_y}, {"", "--chroma-disable-dit-mask", "disable dit mask for chroma", @@ -940,6 +950,8 @@ struct SDContextParams { << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n" << " circular_pad: " << (circular_pad ? "true" : "false") << ",\n" + << " circular_pad_x: " << (circular_pad_x ? "true" : "false") << ",\n" + << " circular_pad_y: " << (circular_pad_y ? "true" : "false") << ",\n" << " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n" << " chroma_use_t5_mask: " << (chroma_use_t5_mask ? 
"true" : "false") << ",\n" << " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n" @@ -1002,6 +1014,8 @@ struct SDContextParams { diffusion_conv_direct, vae_conv_direct, circular_pad, + circular_pad || circular_pad_x, + circular_pad || circular_pad_y, force_sdxl_vae_conv_scale, chroma_use_dit_mask, chroma_use_t5_mask, diff --git a/ggml_extend.hpp b/ggml_extend.hpp index b76a25f04..a4224d3c6 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1573,6 +1573,8 @@ struct GGMLRunnerContext { bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; bool circular_pad_enabled = false; + bool rope_circular_x_enabled = false; + bool rope_circular_y_enabled = false; std::shared_ptr weight_adapter = nullptr; }; @@ -1610,6 +1612,8 @@ struct GGMLRunner { bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; bool circular_pad_enabled = false; + bool rope_circular_x_enabled = false; + bool rope_circular_y_enabled = false; void alloc_params_ctx() { struct ggml_init_params params; @@ -1888,6 +1892,8 @@ struct GGMLRunner { runner_ctx.flash_attn_enabled = flash_attn_enabled; runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled; runner_ctx.circular_pad_enabled = circular_pad_enabled; + runner_ctx.rope_circular_x_enabled = rope_circular_x_enabled; + runner_ctx.rope_circular_y_enabled = rope_circular_y_enabled; runner_ctx.weight_adapter = weight_adapter; return runner_ctx; } @@ -2036,6 +2042,11 @@ struct GGMLRunner { circular_pad_enabled = enabled; } + void set_circular_rope_enabled(bool circular_x, bool circular_y) { + rope_circular_x_enabled = circular_x; + rope_circular_y_enabled = circular_y; + } + void set_weight_adapter(const std::shared_ptr& adapter) { weight_adapter = adapter; } diff --git a/qwen_image.hpp b/qwen_image.hpp index d35ff18dc..d24c3d83e 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -569,7 +569,8 @@ namespace Qwen { ref_latents, increase_ref_index, qwen_image_params.theta, - circular_pad_enabled, + rope_circular_y_enabled, + rope_circular_x_enabled, qwen_image_params.axes_dim); int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2; // LOG_DEBUG("pos_len %d", pos_len); diff --git a/rope.hpp b/rope.hpp index 982e98469..55695f55c 100644 --- a/rope.hpp +++ b/rope.hpp @@ -316,12 +316,13 @@ namespace Rope { const std::vector& ref_latents, bool increase_ref_index, int theta, - bool circular, + bool circular_h, + bool circular_w, const std::vector& axes_dim) { std::vector> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index); std::vector> wrap_dims; // This logic simply stores the (pad and patch_adjusted) sizes of images so we can make sure rope correctly tiles - if (circular && bs > 0 && axes_dim.size() >= 3) { + if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) { int pad_h = (patch_size - (h % patch_size)) % patch_size; int pad_w = (patch_size - (w % patch_size)) % patch_size; int h_len = (h + pad_h) / patch_size; @@ -333,8 +334,12 @@ namespace Rope { size_t cursor = context_len; // ignore text tokens const size_t img_tokens = static_cast(h_len) * static_cast(w_len); for (size_t token_i = 0; token_i < img_tokens; ++token_i) { - wrap_dims[1][cursor + token_i] = h_len; - wrap_dims[2][cursor + token_i] = w_len; + if (circular_h) { + wrap_dims[1][cursor + token_i] = h_len; + } + if (circular_w) { + wrap_dims[2][cursor + token_i] = w_len; + } } cursor += img_tokens; // For each reference image, store wrap sizes as well @@ -350,8 +355,12 @@ namespace Rope { int ref_w_len = (ref_w + ref_pad_w) / 
patch_size; size_t ref_n_tokens = static_cast(ref_h_len) * static_cast(ref_w_len); for (size_t token_i = 0; token_i < ref_n_tokens; ++token_i) { - wrap_dims[1][cursor + token_i] = ref_h_len; - wrap_dims[2][cursor + token_i] = ref_w_len; + if (circular_h) { + wrap_dims[1][cursor + token_i] = ref_h_len; + } + if (circular_w) { + wrap_dims[2][cursor + token_i] = ref_w_len; + } } cursor += ref_n_tokens; } @@ -495,11 +504,12 @@ namespace Rope { const std::vector& ref_latents, bool increase_ref_index, int theta, - bool circular, + bool circular_h, + bool circular_w, const std::vector& axes_dim) { std::vector> ids = gen_z_image_ids(h, w, patch_size, bs, context_len, seq_multi_of, ref_latents, increase_ref_index); std::vector> wrap_dims; - if (circular && bs > 0 && axes_dim.size() >= 3) { + if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) { int pad_h = (patch_size - (h % patch_size)) % patch_size; int pad_w = (patch_size - (w % patch_size)) % patch_size; int h_len = (h + pad_h) / patch_size; @@ -510,8 +520,12 @@ namespace Rope { size_t cursor = context_len + bound_mod(context_len, seq_multi_of); // skip text (and its padding) size_t img_tokens = static_cast(h_len) * static_cast(w_len); for (size_t token_i = 0; token_i < img_tokens; ++token_i) { - wrap_dims[1][cursor + token_i] = h_len; - wrap_dims[2][cursor + token_i] = w_len; + if (circular_h) { + wrap_dims[1][cursor + token_i] = h_len; + } + if (circular_w) { + wrap_dims[2][cursor + token_i] = w_len; + } } } } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 172ee2e13..ffa322a64 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -128,6 +128,8 @@ class StableDiffusionGGML { sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0}; bool offload_params_to_cpu = false; bool circular_pad = false; + bool circular_pad_x = false; + bool circular_pad_y = false; bool stacked_id = false; bool is_using_v_parameterization = false; @@ -212,6 +214,8 @@ class StableDiffusionGGML { use_tiny_autoencoder = taesd_path.size() > 0; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; circular_pad = sd_ctx_params->circular_pad; + circular_pad_x = sd_ctx_params->circular_pad_x || circular_pad; + circular_pad_y = sd_ctx_params->circular_pad_y || circular_pad; rng = get_rng(sd_ctx_params->rng_type); if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT && sd_ctx_params->sampler_rng_type != sd_ctx_params->rng_type) { @@ -509,6 +513,7 @@ class StableDiffusionGGML { "model.diffusion_model", version); diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -520,6 +525,7 @@ class StableDiffusionGGML { "model.diffusion_model", version); diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y); } else { // SD1.x SD2.x SDXL std::map embbeding_map; for (int i = 0; i < sd_ctx_params->embedding_count; i++) { @@ -2531,6 +2537,8 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->keep_vae_on_cpu = false; sd_ctx_params->diffusion_flash_attn = false; sd_ctx_params->circular_pad = false; + sd_ctx_params->circular_pad_x = false; + sd_ctx_params->circular_pad_y = false; sd_ctx_params->chroma_use_dit_mask = true; sd_ctx_params->chroma_use_t5_mask = false; sd_ctx_params->chroma_t5_mask_pad = 1; @@ -2572,6 +2580,8 @@ char* 
sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "keep_vae_on_cpu: %s\n" "diffusion_flash_attn: %s\n" "circular_pad: %s\n" + "circular_pad_x: %s\n" + "circular_pad_y: %s\n" "chroma_use_dit_mask: %s\n" "chroma_use_t5_mask: %s\n" "chroma_t5_mask_pad: %d\n", @@ -2603,6 +2613,8 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { BOOL_STR(sd_ctx_params->keep_vae_on_cpu), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_pad), + BOOL_STR(sd_ctx_params->circular_pad_x), + BOOL_STR(sd_ctx_params->circular_pad_y), BOOL_STR(sd_ctx_params->chroma_use_dit_mask), BOOL_STR(sd_ctx_params->chroma_use_t5_mask), sd_ctx_params->chroma_t5_mask_pad); diff --git a/stable-diffusion.h b/stable-diffusion.h index 87b5b0485..3eb1324f5 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -190,6 +190,8 @@ typedef struct { bool diffusion_conv_direct; bool vae_conv_direct; bool circular_pad; + bool circular_pad_x; + bool circular_pad_y; bool force_sdxl_vae_conv_scale; bool chroma_use_dit_mask; bool chroma_use_t5_mask; diff --git a/z_image.hpp b/z_image.hpp index 0955d4e9a..c87f1b9d9 100644 --- a/z_image.hpp +++ b/z_image.hpp @@ -556,7 +556,8 @@ namespace ZImage { ref_latents, increase_ref_index, z_image_params.theta, - circular_pad_enabled, + rope_circular_y_enabled, + rope_circular_x_enabled, z_image_params.axes_dim); int pos_len = pe_vec.size() / z_image_params.axes_dim_sum / 2; // LOG_DEBUG("pos_len %d", pos_len); From 935f98037525fa0a3aca3f3effd5083365c9bd44 Mon Sep 17 00:00:00 2001 From: Phylliida Date: Sat, 13 Dec 2025 01:25:00 -0800 Subject: [PATCH 21/25] First attempt at chroma seamless x and y --- diffusion_model.hpp | 4 ++++ flux.hpp | 2 ++ rope.hpp | 45 +++++++++++++++++++++++++++++++++++++++++++- stable-diffusion.cpp | 2 ++ 4 files changed, 52 insertions(+), 1 deletion(-) diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 968c978cd..959d1f99a 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -224,6 +224,10 @@ struct FluxModel : public DiffusionModel { flux.set_circular_pad_enabled(enabled); } + void set_rope_circular_axes(bool circular_x, bool circular_y) override { + flux.set_circular_rope_enabled(circular_x, circular_y); + } + bool compute(int n_threads, DiffusionParams diffusion_params, struct ggml_tensor** output = nullptr, diff --git a/flux.hpp b/flux.hpp index 602ab9bdb..7d47350ca 100644 --- a/flux.hpp +++ b/flux.hpp @@ -1445,6 +1445,8 @@ namespace Flux { increase_ref_index, flux_params.ref_index_scale, flux_params.theta, + rope_circular_y_enabled, + rope_circular_x_enabled, flux_params.axes_dim); int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2; // LOG_DEBUG("pos_len %d", pos_len); diff --git a/rope.hpp b/rope.hpp index 55695f55c..682cb641a 100644 --- a/rope.hpp +++ b/rope.hpp @@ -266,6 +266,8 @@ namespace Rope { bool increase_ref_index, float ref_index_scale, int theta, + bool circular_h, + bool circular_w, const std::vector& axes_dim) { std::vector> ids = gen_flux_ids(h, w, @@ -277,7 +279,48 @@ namespace Rope { ref_latents, increase_ref_index, ref_index_scale); - return embed_nd(ids, bs, theta, axes_dim); + std::vector> wrap_dims; + if ((circular_h || circular_w) && bs > 0 && axes_dim.size() >= 3) { + int h_len = (h + (patch_size / 2)) / patch_size; + int w_len = (w + (patch_size / 2)) / patch_size; + if (h_len > 0 && w_len > 0) { + size_t pos_len = ids.size() / bs; + wrap_dims.assign(axes_dim.size(), std::vector(pos_len, 0)); + size_t cursor = context_len; // text first + const size_t img_tokens = 
static_cast<size_t>(h_len) * static_cast<size_t>(w_len);
+                for (size_t token_i = 0; token_i < img_tokens; ++token_i) {
+                    if (circular_h) {
+                        wrap_dims[1][cursor + token_i] = h_len;
+                    }
+                    if (circular_w) {
+                        wrap_dims[2][cursor + token_i] = w_len;
+                    }
+                }
+                cursor += img_tokens;
+                // reference latents
+                for (ggml_tensor* ref : ref_latents) {
+                    if (ref == nullptr) {
+                        continue;
+                    }
+                    int ref_h   = static_cast<int>(ref->ne[1]);
+                    int ref_w   = static_cast<int>(ref->ne[0]);
+                    int ref_h_l = (ref_h + (patch_size / 2)) / patch_size;
+                    int ref_w_l = (ref_w + (patch_size / 2)) / patch_size;
+                    size_t ref_tokens = static_cast<size_t>(ref_h_l) * static_cast<size_t>(ref_w_l);
+                    for (size_t token_i = 0; token_i < ref_tokens; ++token_i) {
+                        if (circular_h) {
+                            wrap_dims[1][cursor + token_i] = ref_h_l;
+                        }
+                        if (circular_w) {
+                            wrap_dims[2][cursor + token_i] = ref_w_l;
+                        }
+                    }
+                    cursor += ref_tokens;
+                }
+            }
+        }
+        const std::vector<std::vector<int>>* wraps_ptr = wrap_dims.empty() ? nullptr : &wrap_dims;
+        return embed_nd(ids, bs, theta, axes_dim, wraps_ptr);
     }
 
     __STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen_image_ids(int h,
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index ffa322a64..eabb51012 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -454,6 +454,7 @@ class StableDiffusionGGML {
                                                           version,
                                                           sd_ctx_params->chroma_use_dit_mask);
             diffusion_model->set_circular_pad_enabled(circular_pad);
+            diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y);
         } else if (sd_version_is_flux2(version)) {
             bool is_chroma = false;
             cond_stage_model = std::make_shared(clip_backend,
                                                 offload_params_to_cpu,
                                                 tensor_storage_map,
                                                 version);
             diffusion_model = std::make_shared(backend,
                                                offload_params_to_cpu,
                                                tensor_storage_map,
                                                version,
                                                sd_ctx_params->chroma_use_dit_mask);
             diffusion_model->set_circular_pad_enabled(circular_pad);
+            diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y);
         } else if (sd_version_is_wan(version)) {
             cond_stage_model = std::make_shared(clip_backend,
                                                 offload_params_to_cpu,

From 820fb6bba598a446febf0117d5fdab8bd421ae6f Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Sat, 13 Dec 2025 02:02:00 -0800
Subject: [PATCH 22/25] Refactor into pure x and y, almost there

---
 clip.hpp             |  2 +-
 common.hpp           |  8 +---
 diffusion_model.hpp  | 37 ++++++++++++++---
 flux.hpp             |  6 +--
 ggml_extend.hpp      | 95 +++++++++++++++++++++++++++++++++++++++-----
 lora.hpp             | 12 ++++--
 qwen_image.hpp       |  6 +--
 stable-diffusion.cpp | 47 +++++++++++-----------
 wan.hpp              | 25 ++----------
 z_image.hpp          |  6 +--
 10 files changed, 158 insertions(+), 86 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index cda5a3015..4b51727c4 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -664,7 +664,7 @@ class CLIPVisionEmbeddings : public GGMLBlock {
         // concat(patch_embedding, class_embedding) + position_embedding
         struct ggml_tensor* patch_embedding;
         int64_t N = pixel_values->ne[3];
-        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size, 0, 0, 1, 1, false, ctx->circular_pad_enabled);  // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
+        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size, 0, 0, 1, 1, false, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);  // [N, embed_dim, image_size // patch_size, image_size // patch_size]
         patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N);  // [N, embed_dim, num_patches]
         patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3));  // [N, num_patches, embed_dim]
         patch_embedding = ggml_reshape_4d(ctx->ggml_ctx,
patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1] diff --git a/common.hpp b/common.hpp index a95e76a4f..8d66422b1 100644 --- a/common.hpp +++ b/common.hpp @@ -29,12 +29,8 @@ class DownSampleBlock : public GGMLBlock { auto conv = std::dynamic_pointer_cast(blocks["conv"]); // For VAE downsampling we manually pad by 1 before the stride-2 conv. - // Honor the global circular padding flag here to avoid seams in seamless mode. - if (ctx->circular_pad_enabled) { - x = ggml_pad_circular(ctx->ggml_ctx, x, 1, 1, 0, 0); - } else { - x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0); - } + // Honor the global circular padding flags here to avoid seams in seamless mode. + x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled); x = conv->forward(ctx, x); } else { auto conv = std::dynamic_pointer_cast(blocks["op"]); diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 959d1f99a..c73a50106 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -40,6 +40,7 @@ struct DiffusionModel { virtual int64_t get_adm_in_channels() = 0; virtual void set_flash_attn_enabled(bool enabled) = 0; virtual void set_circular_pad_enabled(bool enabled) = 0; + virtual void set_circular_pad_axes(bool circular_x, bool circular_y) = 0; virtual void set_rope_circular_axes(bool circular_x, bool circular_y) {}; }; @@ -90,7 +91,11 @@ struct UNetModel : public DiffusionModel { } void set_circular_pad_enabled(bool enabled) override { - unet.set_circular_pad_enabled(enabled); + unet.set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) override { + unet.set_circular_pad_axes(circular_x, circular_y); } bool compute(int n_threads, @@ -155,7 +160,11 @@ struct MMDiTModel : public DiffusionModel { } void set_circular_pad_enabled(bool enabled) override { - mmdit.set_circular_pad_enabled(enabled); + mmdit.set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) override { + mmdit.set_circular_pad_axes(circular_x, circular_y); } bool compute(int n_threads, @@ -221,7 +230,11 @@ struct FluxModel : public DiffusionModel { } void set_circular_pad_enabled(bool enabled) override { - flux.set_circular_pad_enabled(enabled); + flux.set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) override { + flux.set_circular_pad_axes(circular_x, circular_y); } void set_rope_circular_axes(bool circular_x, bool circular_y) override { @@ -296,7 +309,11 @@ struct WanModel : public DiffusionModel { } void set_circular_pad_enabled(bool enabled) override { - wan.set_circular_pad_enabled(enabled); + wan.set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) override { + wan.set_circular_pad_axes(circular_x, circular_y); } bool compute(int n_threads, @@ -366,7 +383,11 @@ struct QwenImageModel : public DiffusionModel { } void set_circular_pad_enabled(bool enabled) override { - qwen_image.set_circular_pad_enabled(enabled); + qwen_image.set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) override { + qwen_image.set_circular_pad_axes(circular_x, circular_y); } void set_rope_circular_axes(bool circular_x, bool circular_y) override { @@ -437,7 +458,11 @@ struct ZImageModel : public DiffusionModel { } void set_circular_pad_enabled(bool enabled) override { - z_image.set_circular_pad_enabled(enabled); + 
z_image.set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) override { + z_image.set_circular_pad_axes(circular_x, circular_y); } void set_rope_circular_axes(bool circular_x, bool circular_y) override { diff --git a/flux.hpp b/flux.hpp index 7d47350ca..65e91e106 100644 --- a/flux.hpp +++ b/flux.hpp @@ -865,11 +865,7 @@ namespace Flux { int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - if (ctx->circular_pad_enabled) { - x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); - } else { - x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] - } + x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled); return x; } diff --git a/ggml_extend.hpp b/ggml_extend.hpp index a4224d3c6..3c3b7aa28 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1012,6 +1012,69 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx, return x; } +__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx, + struct ggml_tensor* x, + int pad_w, + int pad_h, + int pad_t = 0, + int pad_d = 0, + bool circular_x = false, + bool circular_y = false) { + if ((circular_x && circular_y) || (!circular_x && !circular_y)) { + return circular_x && circular_y ? ggml_pad_circular(ctx, x, pad_w, pad_h, pad_t, pad_d) + : ggml_pad(ctx, x, pad_w, pad_h, pad_t, pad_d); + } + + int rem_w = pad_w; + int rem_h = pad_h; + + if (circular_x && pad_w != 0) { + x = ggml_pad_circular(ctx, x, pad_w, 0, 0, 0); + rem_w = 0; + } + if (circular_y && pad_h != 0) { + x = ggml_pad_circular(ctx, x, 0, pad_h, 0, 0); + rem_h = 0; + } + + if (rem_w != 0 || rem_h != 0 || pad_t != 0 || pad_d != 0) { + x = ggml_pad(ctx, x, rem_w, rem_h, pad_t, pad_d); + } + return x; +} + +__STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx, + struct ggml_tensor* x, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3, + bool circular_x = false, + bool circular_y = false) { + if ((circular_x && circular_y) || (!circular_x && !circular_y)) { + return circular_x && circular_y ? 
ggml_pad_ext_circular(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3) + : ggml_pad_ext(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + } + + if (circular_x && (lp0 != 0 || rp0 != 0)) { + x = ggml_pad_ext_circular(ctx, x, lp0, rp0, 0, 0, 0, 0, 0, 0); + lp0 = rp0 = 0; + } + if (circular_y && (lp1 != 0 || rp1 != 0)) { + x = ggml_pad_ext_circular(ctx, x, 0, 0, lp1, rp1, 0, 0, 0, 0); + lp1 = rp1 = 0; + } + + if (lp0 != 0 || rp0 != 0 || lp1 != 0 || rp1 != 0 || lp2 != 0 || rp2 != 0 || lp3 != 0 || rp3 != 0) { + x = ggml_pad_ext(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3); + } + return x; +} + // w: [OC,IC, KH, KW] // x: [N, IC, IH, IW] // b: [OC,] @@ -1027,7 +1090,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx, int d0 = 1, int d1 = 1, bool direct = false, - bool circular = false, + bool circular_x = false, + bool circular_y = false, float scale = 1.f) { if (scale != 1.f) { x = ggml_scale(ctx, x, scale); @@ -1038,8 +1102,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_conv_2d(struct ggml_context* ctx, // use circular padding (on a torus, x and y wrap around) for seamless textures // see https://github.com/leejet/stable-diffusion.cpp/pull/914 - if (circular && (p0 != 0 || p1 != 0)) { - x = ggml_pad_ext_circular(ctx, x, p0, p0, p1, p1, 0, 0, 0, 0); + if ((p0 != 0 || p1 != 0) && (circular_x || circular_y)) { + x = sd_pad(ctx, x, p0, p1, 0, 0, circular_x, circular_y); p0 = 0; p1 = 0; } @@ -1553,7 +1617,8 @@ struct WeightAdapter { int d0 = 1; int d1 = 1; bool direct = false; - bool circular = false; + bool circular_x = false; + bool circular_y = false; float scale = 1.f; } conv2d; }; @@ -1572,7 +1637,8 @@ struct GGMLRunnerContext { ggml_context* ggml_ctx = nullptr; bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; - bool circular_pad_enabled = false; + bool circular_pad_x_enabled = false; + bool circular_pad_y_enabled = false; bool rope_circular_x_enabled = false; bool rope_circular_y_enabled = false; std::shared_ptr weight_adapter = nullptr; @@ -1611,7 +1677,8 @@ struct GGMLRunner { bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; - bool circular_pad_enabled = false; + bool circular_pad_x_enabled = false; + bool circular_pad_y_enabled = false; bool rope_circular_x_enabled = false; bool rope_circular_y_enabled = false; @@ -1891,7 +1958,8 @@ struct GGMLRunner { runner_ctx.backend = runtime_backend; runner_ctx.flash_attn_enabled = flash_attn_enabled; runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled; - runner_ctx.circular_pad_enabled = circular_pad_enabled; + runner_ctx.circular_pad_x_enabled = circular_pad_x_enabled; + runner_ctx.circular_pad_y_enabled = circular_pad_y_enabled; runner_ctx.rope_circular_x_enabled = rope_circular_x_enabled; runner_ctx.rope_circular_y_enabled = rope_circular_y_enabled; runner_ctx.weight_adapter = weight_adapter; @@ -2039,7 +2107,12 @@ struct GGMLRunner { } void set_circular_pad_enabled(bool enabled) { - circular_pad_enabled = enabled; + set_circular_pad_axes(enabled, enabled); + } + + void set_circular_pad_axes(bool circular_x, bool circular_y) { + circular_pad_x_enabled = circular_x; + circular_pad_y_enabled = circular_y; } void set_circular_rope_enabled(bool circular_x, bool circular_y) { @@ -2318,7 +2391,8 @@ class Conv2d : public UnaryBlock { forward_params.conv2d.d0 = dilation.second; forward_params.conv2d.d1 = dilation.first; forward_params.conv2d.direct = ctx->conv2d_direct_enabled; - forward_params.conv2d.circular = ctx->circular_pad_enabled; + 
forward_params.conv2d.circular_x = ctx->circular_pad_x_enabled; + forward_params.conv2d.circular_y = ctx->circular_pad_y_enabled; forward_params.conv2d.scale = scale; return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); } @@ -2333,7 +2407,8 @@ class Conv2d : public UnaryBlock { dilation.second, dilation.first, ctx->conv2d_direct_enabled, - ctx->circular_pad_enabled, + ctx->circular_pad_x_enabled, + ctx->circular_pad_y_enabled, scale); } }; diff --git a/lora.hpp b/lora.hpp index e6af66798..7d83ec5cd 100644 --- a/lora.hpp +++ b/lora.hpp @@ -599,7 +599,8 @@ struct LoraModel : public GGMLRunner { forward_params.conv2d.d0, forward_params.conv2d.d1, forward_params.conv2d.direct, - forward_params.conv2d.circular, + forward_params.conv2d.circular_x, + forward_params.conv2d.circular_y, forward_params.conv2d.scale); if (lora_mid) { lx = ggml_ext_conv_2d(ctx, @@ -613,7 +614,8 @@ struct LoraModel : public GGMLRunner { 1, 1, forward_params.conv2d.direct, - forward_params.conv2d.circular, + forward_params.conv2d.circular_x, + forward_params.conv2d.circular_y, forward_params.conv2d.scale); } lx = ggml_ext_conv_2d(ctx, @@ -627,7 +629,8 @@ struct LoraModel : public GGMLRunner { 1, 1, forward_params.conv2d.direct, - forward_params.conv2d.circular, + forward_params.conv2d.circular_x, + forward_params.conv2d.circular_y, forward_params.conv2d.scale); } @@ -782,7 +785,8 @@ struct MultiLoraAdapter : public WeightAdapter { forward_params.conv2d.d0, forward_params.conv2d.d1, forward_params.conv2d.direct, - forward_params.conv2d.circular, + forward_params.conv2d.circular_x, + forward_params.conv2d.circular_y, forward_params.conv2d.scale); } for (auto& lora_model : lora_models) { diff --git a/qwen_image.hpp b/qwen_image.hpp index d24c3d83e..169e1e325 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -361,11 +361,7 @@ namespace Qwen { int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - if (ctx->circular_pad_enabled) { - x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); - } else { - x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0); // [N, C, H + pad_h, W + pad_w] - } + x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled); return x; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index eabb51012..6a51b6b48 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -216,6 +216,7 @@ class StableDiffusionGGML { circular_pad = sd_ctx_params->circular_pad; circular_pad_x = sd_ctx_params->circular_pad_x || circular_pad; circular_pad_y = sd_ctx_params->circular_pad_y || circular_pad; + bool circular_pad_any = circular_pad || circular_pad_x || circular_pad_y; rng = get_rng(sd_ctx_params->rng_type); if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT && sd_ctx_params->sampler_rng_type != sd_ctx_params->rng_type) { @@ -393,7 +394,7 @@ class StableDiffusionGGML { vae_decode_only = false; } - if (circular_pad) { + if (circular_pad_any) { LOG_INFO("Using circular padding for convolutions"); } @@ -412,7 +413,7 @@ class StableDiffusionGGML { diffusion_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map); - diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : tensor_storage_map) { @@ -453,7 +454,7 @@ class 
StableDiffusionGGML { tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); - diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y); } else if (sd_version_is_flux2(version)) { bool is_chroma = false; @@ -466,7 +467,7 @@ class StableDiffusionGGML { tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); - diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y); } else if (sd_version_is_wan(version)) { cond_stage_model = std::make_shared(clip_backend, @@ -480,14 +481,14 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); - diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { high_noise_diffusion_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "model.high_noise_diffusion_model", version); - high_noise_diffusion_model->set_circular_pad_enabled(circular_pad); + high_noise_diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); } if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" || diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" || @@ -514,7 +515,7 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); - diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(clip_backend, @@ -526,7 +527,7 @@ class StableDiffusionGGML { tensor_storage_map, "model.diffusion_model", version); - diffusion_model->set_circular_pad_enabled(circular_pad); + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y); } else { // SD1.x SD2.x SDXL std::map embbeding_map; @@ -547,17 +548,17 @@ class StableDiffusionGGML { embbeding_map, version); } - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - version); - if (sd_ctx_params->diffusion_conv_direct) { - LOG_INFO("Using Conv2d direct in the diffusion model"); - std::dynamic_pointer_cast(diffusion_model)->unet.set_conv2d_direct_enabled(true); - } - diffusion_model->set_circular_pad_enabled(circular_pad); - std::dynamic_pointer_cast(diffusion_model)->unet.set_circular_pad_enabled(circular_pad); - } + diffusion_model = std::make_shared(backend, + offload_params_to_cpu, + tensor_storage_map, + version); + if (sd_ctx_params->diffusion_conv_direct) { + LOG_INFO("Using Conv2d direct in the diffusion model"); + std::dynamic_pointer_cast(diffusion_model)->unet.set_conv2d_direct_enabled(true); + } + diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y); + std::dynamic_pointer_cast(diffusion_model)->unet.set_circular_pad_axes(circular_pad_x, circular_pad_y); + } if (sd_ctx_params->diffusion_flash_attn) { LOG_INFO("Using flash attention in the diffusion model"); @@ -593,7 +594,7 @@ class StableDiffusionGGML { "first_stage_model", vae_decode_only, version); - first_stage_model->set_circular_pad_enabled(circular_pad); + 
+        first_stage_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
         first_stage_model->alloc_params_buffer();
         first_stage_model->get_param_tensors(tensors, "first_stage_model");
     } else if (version == VERSION_CHROMA_RADIANCE) {
@@ -620,7 +621,7 @@ class StableDiffusionGGML {
                                                          vae_conv_2d_scale);
                 first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
             }
-            first_stage_model->set_circular_pad_enabled(circular_pad);
+            first_stage_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         }
@@ -635,7 +636,7 @@ class StableDiffusionGGML {
                 LOG_INFO("Using Conv2d direct in the tae model");
                 tae_first_stage->set_conv2d_direct_enabled(true);
             }
-            tae_first_stage->set_circular_pad_enabled(circular_pad);
+            tae_first_stage->set_circular_pad_axes(circular_pad_x, circular_pad_y);
         }
         // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
@@ -655,7 +656,7 @@ class StableDiffusionGGML {
                 LOG_INFO("Using Conv2d direct in the control net");
                 control_net->set_conv2d_direct_enabled(true);
             }
-            control_net->set_circular_pad_enabled(circular_pad);
+            control_net->set_circular_pad_axes(circular_pad_x, circular_pad_y);
         }

         if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
diff --git a/wan.hpp b/wan.hpp
index 18b7e3d29..90091c70e 100644
--- a/wan.hpp
+++ b/wan.hpp
@@ -75,11 +75,7 @@ namespace WAN {
                 lp2 -= (int)cache_x->ne[2];
             }
-            if (ctx->circular_pad_enabled) {
-                x = ggml_pad_ext_circular(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0);
-            } else {
-                x = ggml_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0);
-            }
+            x = sd_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
             return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels,
                                     std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
                                     0, 0, 0,
@@ -210,17 +206,9 @@ namespace WAN {
             } else if (mode == "upsample3d") {
                 x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST);
             } else if (mode == "downsample2d") {
-                if (ctx->circular_pad_enabled) {
-                    x = ggml_pad_circular(ctx->ggml_ctx, x, 1, 1, 0, 0);
-                } else {
-                    x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
-                }
+                x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
             } else if (mode == "downsample3d") {
-                if (ctx->circular_pad_enabled) {
-                    x = ggml_pad_circular(ctx->ggml_ctx, x, 1, 1, 0, 0);
-                } else {
-                    x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
-                }
+                x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
             }
             x = resample_1->forward(ctx, x);
             x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));  // (c, t, h, w)
@@ -1847,12 +1835,7 @@ namespace WAN {
             int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size);
             int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size);
             int pad_w = (std::get<2>(params.patch_size) - W % std::get<2>(params.patch_size)) % std::get<2>(params.patch_size);
-            if (ctx->circular_pad_enabled) {
-                x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0);
-            } else {
-                x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0);
-            }
-
+            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
             return x;
         }
diff --git a/z_image.hpp b/z_image.hpp
index c87f1b9d9..cb64d7b0e 100644
--- a/z_image.hpp
+++ b/z_image.hpp
@@ -331,11 +331,7 @@ namespace ZImage {
             int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size;
             int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size;
-            if (ctx->circular_pad_enabled) {
-                x = ggml_pad_circular(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
-            } else {
-                x = ggml_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
-            }
+            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
             return x;
         }

From 32e1b7556b3d7919dcc981b3c6f1a031345b1b46 Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Sat, 13 Dec 2025 02:16:50 -0800
Subject: [PATCH 23/25] Fix crash on chroma

---
 ggml_extend.hpp | 43 ++++++++++++-------------------------------
 1 file changed, 12 insertions(+), 31 deletions(-)

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 3c3b7aa28..8c0e8518c 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1012,37 +1012,6 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx,
     return x;
 }
-__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx,
-                                             struct ggml_tensor* x,
-                                             int pad_w,
-                                             int pad_h,
-                                             int pad_t       = 0,
-                                             int pad_d       = 0,
-                                             bool circular_x = false,
-                                             bool circular_y = false) {
-    if ((circular_x && circular_y) || (!circular_x && !circular_y)) {
-        return circular_x && circular_y ? ggml_pad_circular(ctx, x, pad_w, pad_h, pad_t, pad_d)
-                                        : ggml_pad(ctx, x, pad_w, pad_h, pad_t, pad_d);
-    }
-
-    int rem_w = pad_w;
-    int rem_h = pad_h;
-
-    if (circular_x && pad_w != 0) {
-        x     = ggml_pad_circular(ctx, x, pad_w, 0, 0, 0);
-        rem_w = 0;
-    }
-    if (circular_y && pad_h != 0) {
-        x     = ggml_pad_circular(ctx, x, 0, pad_h, 0, 0);
-        rem_h = 0;
-    }
-
-    if (rem_w != 0 || rem_h != 0 || pad_t != 0 || pad_d != 0) {
-        x = ggml_pad(ctx, x, rem_w, rem_h, pad_t, pad_d);
-    }
-    return x;
-}
-
 __STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx,
                                                  struct ggml_tensor* x,
                                                  int lp0,
@@ -1075,6 +1044,18 @@ __STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx,
     return x;
 }
+__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx,
+                                             struct ggml_tensor* x,
+                                             int pad_w,
+                                             int pad_h,
+                                             int pad_t       = 0,
+                                             int pad_d       = 0,
+                                             bool circular_x = false,
+                                             bool circular_y = false) {
+    // Like ggml_pad, sd_pad pads only at the end of each dimension.
+    return sd_pad_ext(ctx, x, 0, pad_w, 0, pad_h, 0, pad_t, 0, pad_d, circular_x, circular_y);
+}
+
 // w: [OC,IC, KH, KW]
 // x: [N, IC, IH, IW]
 // b: [OC,]

From dc6e8870b222d9b15fdd0fbf3592ff5d0a5ec328 Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Sat, 13 Dec 2025 02:38:59 -0800
Subject: [PATCH 24/25] Refactor into cleaner variable choices

---
 clip.hpp              |  2 +-
 common.hpp            |  2 +-
 diffusion_model.hpp   | 65 +++++++++++++++++--------------------------
 examples/cli/main.cpp | 24 ++++++++--------
 flux.hpp              |  6 ++--
 ggml_extend.hpp       | 41 ++++++++++-----------------
 qwen_image.hpp        |  6 ++--
 stable-diffusion.cpp  | 64 ++++++++++++++++++++----------------------
 stable-diffusion.h    |  6 ++--
 wan.hpp               |  8 +++---
 z_image.hpp           |  6 ++--
 11 files changed, 101 insertions(+), 129 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index 4b51727c4..c5d7a19c6 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -664,7 +664,7 @@ class CLIPVisionEmbeddings : public GGMLBlock {
         // concat(patch_embedding, class_embedding) + position_embedding
         struct ggml_tensor* patch_embedding;
         int64_t N       = pixel_values->ne[3];
-        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size, 0, 0, 1, 1, false, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);  // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
+        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size, 0, 0, 1, 1, false, ctx->circular_x_enabled, ctx->circular_y_enabled);  // [N, embed_dim, image_size // patch_size, image_size // patch_size]
         patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N);          // [N, embed_dim, num_patches]
         patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3));  // [N, num_patches, embed_dim]
         patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N);       // [N, num_patches, embed_dim, 1]
diff --git a/common.hpp b/common.hpp
index 8d66422b1..3741e975a 100644
--- a/common.hpp
+++ b/common.hpp
@@ -30,7 +30,7 @@ class DownSampleBlock : public GGMLBlock {
             // For VAE downsampling we manually pad by 1 before the stride-2 conv.
             // Honor the global circular padding flags here to avoid seams in seamless mode.
-            x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+            x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             x = conv->forward(ctx, x);
         } else {
             auto conv = std::dynamic_pointer_cast(blocks["op"]);
diff --git a/diffusion_model.hpp b/diffusion_model.hpp
index c73a50106..0b32babf8 100644
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@@ -39,9 +39,8 @@ struct DiffusionModel {
     virtual void set_weight_adapter(const std::shared_ptr& adapter){};
     virtual int64_t get_adm_in_channels()                                 = 0;
     virtual void set_flash_attn_enabled(bool enabled)                     = 0;
-    virtual void set_circular_pad_enabled(bool enabled)                   = 0;
-    virtual void set_circular_pad_axes(bool circular_x, bool circular_y)  = 0;
-    virtual void set_rope_circular_axes(bool circular_x, bool circular_y) {};
+    virtual void set_circular_enabled(bool enabled)                       = 0;
+    virtual void set_circular_axes(bool circular_x, bool circular_y)      = 0;
 };

 struct UNetModel : public DiffusionModel {
@@ -90,12 +89,12 @@ struct UNetModel : public DiffusionModel {
         unet.set_flash_attention_enabled(enabled);
     }

-    void set_circular_pad_enabled(bool enabled) override {
-        unet.set_circular_pad_axes(enabled, enabled);
+    void set_circular_enabled(bool enabled) override {
+        unet.set_circular_axes(enabled, enabled);
     }

-    void set_circular_pad_axes(bool circular_x, bool circular_y) override {
-        unet.set_circular_pad_axes(circular_x, circular_y);
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        unet.set_circular_axes(circular_x, circular_y);
     }

     bool compute(int n_threads,
@@ -159,12 +158,12 @@ struct MMDiTModel : public DiffusionModel {
         mmdit.set_flash_attention_enabled(enabled);
     }

-    void set_circular_pad_enabled(bool enabled) override {
-        mmdit.set_circular_pad_axes(enabled, enabled);
+    void set_circular_enabled(bool enabled) override {
+        mmdit.set_circular_axes(enabled, enabled);
     }

-    void set_circular_pad_axes(bool circular_x, bool circular_y) override {
-        mmdit.set_circular_pad_axes(circular_x, circular_y);
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        mmdit.set_circular_axes(circular_x, circular_y);
     }

     bool compute(int n_threads,
@@ -229,16 +228,12 @@ struct FluxModel : public DiffusionModel {
         flux.set_flash_attention_enabled(enabled);
     }

-    void set_circular_pad_enabled(bool enabled) override {
-        flux.set_circular_pad_axes(enabled, enabled);
+    void set_circular_enabled(bool enabled) override {
+        flux.set_circular_axes(enabled, enabled);
     }

-    void set_circular_pad_axes(bool circular_x, bool circular_y) override {
-        flux.set_circular_pad_axes(circular_x, circular_y);
-    }
-
-    void set_rope_circular_axes(bool circular_x, bool circular_y) override {
-        flux.set_circular_rope_enabled(circular_x, circular_y);
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        flux.set_circular_axes(circular_x, circular_y);
     }

     bool compute(int n_threads,
@@ -308,12 +303,12 @@ struct WanModel : public DiffusionModel {
         wan.set_flash_attention_enabled(enabled);
     }

-    void set_circular_pad_enabled(bool enabled) override {
-        wan.set_circular_pad_axes(enabled, enabled);
+    void set_circular_enabled(bool enabled) override {
+        wan.set_circular_axes(enabled, enabled);
     }

-    void set_circular_pad_axes(bool circular_x, bool circular_y) override {
-        wan.set_circular_pad_axes(circular_x, circular_y);
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        wan.set_circular_axes(circular_x, circular_y);
     }

     bool compute(int n_threads,
@@ -382,16 +377,12 @@ struct QwenImageModel : public DiffusionModel {
         qwen_image.set_flash_attention_enabled(enabled);
     }

-    void set_circular_pad_enabled(bool enabled) override {
-        qwen_image.set_circular_pad_axes(enabled, enabled);
-    }
-
-    void set_circular_pad_axes(bool circular_x, bool circular_y) override {
-        qwen_image.set_circular_pad_axes(circular_x, circular_y);
+    void set_circular_enabled(bool enabled) override {
+        qwen_image.set_circular_axes(enabled, enabled);
     }

-    void set_rope_circular_axes(bool circular_x, bool circular_y) override {
-        qwen_image.set_circular_rope_enabled(circular_x, circular_y);
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        qwen_image.set_circular_axes(circular_x, circular_y);
     }

     bool compute(int n_threads,
@@ -457,16 +448,12 @@ struct ZImageModel : public DiffusionModel {
         z_image.set_flash_attention_enabled(enabled);
     }

-    void set_circular_pad_enabled(bool enabled) override {
-        z_image.set_circular_pad_axes(enabled, enabled);
-    }
-
-    void set_circular_pad_axes(bool circular_x, bool circular_y) override {
-        z_image.set_circular_pad_axes(circular_x, circular_y);
+    void set_circular_enabled(bool enabled) override {
+        z_image.set_circular_axes(enabled, enabled);
     }

-    void set_rope_circular_axes(bool circular_x, bool circular_y) override {
-        z_image.set_circular_rope_enabled(circular_x, circular_y);
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        z_image.set_circular_axes(circular_x, circular_y);
     }

     bool compute(int n_threads,
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 2aa3446ed..e472ca2e6 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -518,9 +518,9 @@ struct SDContextParams {
     bool diffusion_flash_attn  = false;
     bool diffusion_conv_direct = false;
     bool vae_conv_direct       = false;
-    bool circular_pad          = false;
-    bool circular_pad_x        = false;
-    bool circular_pad_y        = false;
+    bool circular              = false;
+    bool circular_x            = false;
+    bool circular_y            = false;
     bool chroma_use_dit_mask   = true;
     bool chroma_use_t5_mask    = false;
@@ -677,15 +677,15 @@ struct SDContextParams {
             {"",
              "--circular",
              "enable circular padding for convolutions",
-             true, &circular_pad},
+             true, &circular},
             {"",
              "--circularx",
              "enable circular RoPE wrapping on x-axis (width) only",
-             true, &circular_pad_x},
+             true, &circular_x},
             {"",
              "--circulary",
              "enable circular RoPE wrapping on y-axis (height) only",
only", - true, &circular_pad_y}, + true, &circular_y}, {"", "--chroma-disable-dit-mask", "disable dit mask for chroma", @@ -949,9 +949,9 @@ struct SDContextParams { << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" << " vae_conv_direct: " << (vae_conv_direct ? "true" : "false") << ",\n" - << " circular_pad: " << (circular_pad ? "true" : "false") << ",\n" - << " circular_pad_x: " << (circular_pad_x ? "true" : "false") << ",\n" - << " circular_pad_y: " << (circular_pad_y ? "true" : "false") << ",\n" + << " circular: " << (circular ? "true" : "false") << ",\n" + << " circular_x: " << (circular_x ? "true" : "false") << ",\n" + << " circular_y: " << (circular_y ? "true" : "false") << ",\n" << " chroma_use_dit_mask: " << (chroma_use_dit_mask ? "true" : "false") << ",\n" << " chroma_use_t5_mask: " << (chroma_use_t5_mask ? "true" : "false") << ",\n" << " chroma_t5_mask_pad: " << chroma_t5_mask_pad << ",\n" @@ -1013,9 +1013,9 @@ struct SDContextParams { taesd_preview, diffusion_conv_direct, vae_conv_direct, - circular_pad, - circular_pad || circular_pad_x, - circular_pad || circular_pad_y, + circular, + circular || circular_x, + circular || circular_y, force_sdxl_vae_conv_scale, chroma_use_dit_mask, chroma_use_t5_mask, diff --git a/flux.hpp b/flux.hpp index 65e91e106..2038fe152 100644 --- a/flux.hpp +++ b/flux.hpp @@ -865,7 +865,7 @@ namespace Flux { int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled); + x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled); return x; } @@ -1441,8 +1441,8 @@ namespace Flux { increase_ref_index, flux_params.ref_index_scale, flux_params.theta, - rope_circular_y_enabled, - rope_circular_x_enabled, + circular_y_enabled, + circular_x_enabled, flux_params.axes_dim); int pos_len = pe_vec.size() / flux_params.axes_dim_sum / 2; // LOG_DEBUG("pos_len %d", pos_len); diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 8c0e8518c..0d520fb10 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -1618,10 +1618,8 @@ struct GGMLRunnerContext { ggml_context* ggml_ctx = nullptr; bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; - bool circular_pad_x_enabled = false; - bool circular_pad_y_enabled = false; - bool rope_circular_x_enabled = false; - bool rope_circular_y_enabled = false; + bool circular_x_enabled = false; + bool circular_y_enabled = false; std::shared_ptr weight_adapter = nullptr; }; @@ -1658,10 +1656,8 @@ struct GGMLRunner { bool flash_attn_enabled = false; bool conv2d_direct_enabled = false; - bool circular_pad_x_enabled = false; - bool circular_pad_y_enabled = false; - bool rope_circular_x_enabled = false; - bool rope_circular_y_enabled = false; + bool circular_x_enabled = false; + bool circular_y_enabled = false; void alloc_params_ctx() { struct ggml_init_params params; @@ -1939,10 +1935,8 @@ struct GGMLRunner { runner_ctx.backend = runtime_backend; runner_ctx.flash_attn_enabled = flash_attn_enabled; runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled; - runner_ctx.circular_pad_x_enabled = circular_pad_x_enabled; - runner_ctx.circular_pad_y_enabled = circular_pad_y_enabled; - runner_ctx.rope_circular_x_enabled = rope_circular_x_enabled; - runner_ctx.rope_circular_y_enabled = 
+        runner_ctx.circular_x_enabled    = circular_x_enabled;
+        runner_ctx.circular_y_enabled    = circular_y_enabled;
         runner_ctx.weight_adapter        = weight_adapter;
         return runner_ctx;
     }
@@ -2087,18 +2081,13 @@ struct GGMLRunner {
         conv2d_direct_enabled = enabled;
     }

-    void set_circular_pad_enabled(bool enabled) {
-        set_circular_pad_axes(enabled, enabled);
+    void set_circular_enabled(bool enabled) {
+        set_circular_axes(enabled, enabled);
     }

-    void set_circular_pad_axes(bool circular_x, bool circular_y) {
-        circular_pad_x_enabled = circular_x;
-        circular_pad_y_enabled = circular_y;
-    }
-
-    void set_circular_rope_enabled(bool circular_x, bool circular_y) {
-        rope_circular_x_enabled = circular_x;
-        rope_circular_y_enabled = circular_y;
+    void set_circular_axes(bool circular_x, bool circular_y) {
+        circular_x_enabled = circular_x;
+        circular_y_enabled = circular_y;
     }

     void set_weight_adapter(const std::shared_ptr& adapter) {
@@ -2372,8 +2361,8 @@ class Conv2d : public UnaryBlock {
             forward_params.conv2d.d0         = dilation.second;
             forward_params.conv2d.d1         = dilation.first;
             forward_params.conv2d.direct     = ctx->conv2d_direct_enabled;
-            forward_params.conv2d.circular_x = ctx->circular_pad_x_enabled;
-            forward_params.conv2d.circular_y = ctx->circular_pad_y_enabled;
+            forward_params.conv2d.circular_x = ctx->circular_x_enabled;
+            forward_params.conv2d.circular_y = ctx->circular_y_enabled;
             forward_params.conv2d.scale      = scale;
             return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params);
         }
@@ -2388,8 +2377,8 @@ class Conv2d : public UnaryBlock {
                                 dilation.second,
                                 dilation.first,
                                 ctx->conv2d_direct_enabled,
-                                ctx->circular_pad_x_enabled,
-                                ctx->circular_pad_y_enabled,
+                                ctx->circular_x_enabled,
+                                ctx->circular_y_enabled,
                                 scale);
     }
 };
diff --git a/qwen_image.hpp b/qwen_image.hpp
index 169e1e325..847f61171 100644
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@@ -361,7 +361,7 @@ namespace Qwen {
             int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
             int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
-            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             return x;
         }
@@ -565,8 +565,8 @@ namespace Qwen {
                                                            ref_latents,
                                                            increase_ref_index,
                                                            qwen_image_params.theta,
-                                                           rope_circular_y_enabled,
-                                                           rope_circular_x_enabled,
+                                                           circular_y_enabled,
+                                                           circular_x_enabled,
                                                            qwen_image_params.axes_dim);
         int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2;
         // LOG_DEBUG("pos_len %d", pos_len);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 6a51b6b48..d94134602 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -127,9 +127,9 @@ class StableDiffusionGGML {
     bool use_tiny_autoencoder            = false;
     sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0};
     bool offload_params_to_cpu           = false;
-    bool circular_pad                    = false;
-    bool circular_pad_x                  = false;
-    bool circular_pad_y                  = false;
+    bool circular                        = false;
+    bool circular_x                      = false;
+    bool circular_y                      = false;
     bool stacked_id                      = false;
     bool is_using_v_parameterization     = false;
@@ -213,10 +213,10 @@ class StableDiffusionGGML {
         taesd_path            = SAFE_STR(sd_ctx_params->taesd_path);
         use_tiny_autoencoder  = taesd_path.size() > 0;
         offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu;
-        circular_pad   = sd_ctx_params->circular_pad;
-        circular_pad_x = sd_ctx_params->circular_pad_x || circular_pad;
-        circular_pad_y = sd_ctx_params->circular_pad_y || circular_pad;
-        bool circular_pad_any = circular_pad || circular_pad_x || circular_pad_y;
+        circular   = sd_ctx_params->circular;
+        circular_x = sd_ctx_params->circular_x || circular;
+        circular_y = sd_ctx_params->circular_y || circular;
+        bool circular_any = circular || circular_x || circular_y;
         rng = get_rng(sd_ctx_params->rng_type);
         if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT && sd_ctx_params->sampler_rng_type != sd_ctx_params->rng_type) {
@@ -394,7 +394,7 @@ class StableDiffusionGGML {
             vae_decode_only = false;
         }
-        if (circular_pad_any) {
+        if (circular_any) {
             LOG_INFO("Using circular padding for convolutions");
         }
@@ -413,7 +413,7 @@ class StableDiffusionGGML {
             diffusion_model = std::make_shared(backend,
                                                offload_params_to_cpu,
                                                tensor_storage_map);
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
         } else if (sd_version_is_flux(version)) {
             bool is_chroma = false;
             for (auto pair : tensor_storage_map) {
@@ -454,8 +454,7 @@ class StableDiffusionGGML {
                                                              tensor_storage_map,
                                                              version,
                                                              sd_ctx_params->chroma_use_dit_mask);
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
-            diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
         } else if (sd_version_is_flux2(version)) {
             bool is_chroma = false;
             cond_stage_model = std::make_shared(clip_backend,
@@ -466,8 +466,7 @@ class StableDiffusionGGML {
                                                              tensor_storage_map,
                                                              version,
                                                              sd_ctx_params->chroma_use_dit_mask);
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
-            diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
         } else if (sd_version_is_wan(version)) {
             cond_stage_model = std::make_shared(clip_backend,
                                                 offload_params_to_cpu,
@@ -479,14 +478,14 @@ class StableDiffusionGGML {
                                                            tensor_storage_map,
                                                            "model.diffusion_model",
                                                            version);
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
             if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
                 high_noise_diffusion_model = std::make_shared(backend,
                                                               offload_params_to_cpu,
                                                               tensor_storage_map,
                                                               "model.high_noise_diffusion_model",
                                                               version);
-                high_noise_diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+                high_noise_diffusion_model->set_circular_axes(circular_x, circular_y);
             }
             if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" ||
                 diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" ||
@@ -514,8 +513,7 @@ class StableDiffusionGGML {
                                                                  tensor_storage_map,
                                                                  "model.diffusion_model",
                                                                  version);
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
-            diffusion_model->set_rope_circular_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
         } else if (sd_version_is_z_image(version)) {
             cond_stage_model = std::make_shared(clip_backend,
                                                 offload_params_to_cpu,
@@ -526,7 +524,7 @@ class StableDiffusionGGML {
                                                                tensor_storage_map,
                                                                "model.diffusion_model",
                                                                version);
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
         } else {  // SD1.x SD2.x SDXL
             std::map embbeding_map;
             for (int i = 0; i < sd_ctx_params->embedding_count; i++) {
@@ -556,8 +552,8 @@ class StableDiffusionGGML {
                 LOG_INFO("Using Conv2d direct in the diffusion model");
                 std::dynamic_pointer_cast(diffusion_model)->unet.set_conv2d_direct_enabled(true);
             }
-            diffusion_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
-            std::dynamic_pointer_cast(diffusion_model)->unet.set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            diffusion_model->set_circular_axes(circular_x, circular_y);
+            std::dynamic_pointer_cast(diffusion_model)->unet.set_circular_axes(circular_x, circular_y);
         }

         if (sd_ctx_params->diffusion_flash_attn) {
             LOG_INFO("Using flash attention in the diffusion model");
@@ -593,7 +590,7 @@ class StableDiffusionGGML {
                                                            "first_stage_model",
                                                            vae_decode_only,
                                                            version);
-        first_stage_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+        first_stage_model->set_circular_axes(circular_x, circular_y);
         first_stage_model->alloc_params_buffer();
         first_stage_model->get_param_tensors(tensors, "first_stage_model");
     } else if (version == VERSION_CHROMA_RADIANCE) {
@@ -620,7 +617,7 @@ class StableDiffusionGGML {
                                                          vae_conv_2d_scale);
                 first_stage_model->set_conv2d_scale(vae_conv_2d_scale);
             }
-            first_stage_model->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            first_stage_model->set_circular_axes(circular_x, circular_y);
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors, "first_stage_model");
         }
@@ -636,7 +632,7 @@ class StableDiffusionGGML {
                 LOG_INFO("Using Conv2d direct in the tae model");
                 tae_first_stage->set_conv2d_direct_enabled(true);
             }
-            tae_first_stage->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            tae_first_stage->set_circular_axes(circular_x, circular_y);
         }
         // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
@@ -656,7 +652,7 @@ class StableDiffusionGGML {
                 LOG_INFO("Using Conv2d direct in the control net");
                 control_net->set_conv2d_direct_enabled(true);
             }
-            control_net->set_circular_pad_axes(circular_pad_x, circular_pad_y);
+            control_net->set_circular_axes(circular_x, circular_y);
         }

         if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
@@ -2539,9 +2535,9 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
    sd_ctx_params->keep_control_net_on_cpu = false;
    sd_ctx_params->keep_vae_on_cpu         = false;
    sd_ctx_params->diffusion_flash_attn    = false;
-   sd_ctx_params->circular_pad            = false;
-   sd_ctx_params->circular_pad_x          = false;
-   sd_ctx_params->circular_pad_y          = false;
+   sd_ctx_params->circular                = false;
+   sd_ctx_params->circular_x              = false;
+   sd_ctx_params->circular_y              = false;
    sd_ctx_params->chroma_use_dit_mask     = true;
    sd_ctx_params->chroma_use_t5_mask      = false;
    sd_ctx_params->chroma_t5_mask_pad      = 1;
@@ -2582,9 +2578,9 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      "keep_control_net_on_cpu: %s\n"
                      "keep_vae_on_cpu: %s\n"
                      "diffusion_flash_attn: %s\n"
-                     "circular_pad: %s\n"
-                     "circular_pad_x: %s\n"
-                     "circular_pad_y: %s\n"
+                     "circular: %s\n"
+                     "circular_x: %s\n"
+                     "circular_y: %s\n"
                      "chroma_use_dit_mask: %s\n"
                      "chroma_use_t5_mask: %s\n"
                      "chroma_t5_mask_pad: %d\n",
@@ -2615,9 +2611,9 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
                      BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
                      BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
                      BOOL_STR(sd_ctx_params->diffusion_flash_attn),
-                     BOOL_STR(sd_ctx_params->circular_pad),
-                     BOOL_STR(sd_ctx_params->circular_pad_x),
-                     BOOL_STR(sd_ctx_params->circular_pad_y),
+                     BOOL_STR(sd_ctx_params->circular),
+                     BOOL_STR(sd_ctx_params->circular_x),
+                     BOOL_STR(sd_ctx_params->circular_y),
                      BOOL_STR(sd_ctx_params->chroma_use_dit_mask),
                      BOOL_STR(sd_ctx_params->chroma_use_t5_mask),
                      sd_ctx_params->chroma_t5_mask_pad);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 3eb1324f5..4ef3799b0 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -189,9 +189,9 @@ typedef struct {
     bool tae_preview_only;
     bool diffusion_conv_direct;
     bool vae_conv_direct;
-    bool circular_pad;
-    bool circular_pad_x;
-    bool circular_pad_y;
+    bool circular;
+    bool circular_x;
+    bool circular_y;
     bool force_sdxl_vae_conv_scale;
     bool chroma_use_dit_mask;
     bool chroma_use_t5_mask;
diff --git a/wan.hpp b/wan.hpp
index 90091c70e..8e5984622 100644
--- a/wan.hpp
+++ b/wan.hpp
@@ -75,7 +75,7 @@ namespace WAN {
                 lp2 -= (int)cache_x->ne[2];
             }
-            x = sd_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+            x = sd_pad_ext(ctx->ggml_ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels,
                                     std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
                                     0, 0, 0,
@@ -206,9 +206,9 @@ namespace WAN {
             } else if (mode == "upsample3d") {
                 x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST);
             } else if (mode == "downsample2d") {
-                x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+                x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             } else if (mode == "downsample3d") {
-                x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+                x = sd_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             }
             x = resample_1->forward(ctx, x);
             x = ggml_ext_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, x, 0, 1, 3, 2));  // (c, t, h, w)
@@ -1835,7 +1835,7 @@ namespace WAN {
             int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size);
             int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size);
             int pad_w = (std::get<2>(params.patch_size) - W % std::get<2>(params.patch_size)) % std::get<2>(params.patch_size);
-            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, pad_t, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             return x;
         }
diff --git a/z_image.hpp b/z_image.hpp
index cb64d7b0e..5a53fe675 100644
--- a/z_image.hpp
+++ b/z_image.hpp
@@ -331,7 +331,7 @@ namespace ZImage {
             int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size;
             int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size;
-            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_pad_x_enabled, ctx->circular_pad_y_enabled);
+            x = sd_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
             return x;
         }
@@ -552,8 +552,8 @@ namespace ZImage {
                                                          ref_latents,
                                                          increase_ref_index,
                                                          z_image_params.theta,
-                                                         rope_circular_y_enabled,
-                                                         rope_circular_x_enabled,
+                                                         circular_y_enabled,
+                                                         circular_x_enabled,
                                                          z_image_params.axes_dim);
         int pos_len = pe_vec.size() / z_image_params.axes_dim_sum / 2;
         // LOG_DEBUG("pos_len %d", pos_len);
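Usage sketch for the padding helpers above (illustrative only, not part of the patches; it assumes ggml_extend.hpp is included, a live ggml_context* and a 4-D NCHW tensor, and pad_to_patch is a hypothetical helper written just for this example):

    // Pad H and W up to the next multiple of `patch`, wrapping on the
    // x-axis (width) only - the configuration that --circularx selects.
    static struct ggml_tensor* pad_to_patch(struct ggml_context* ctx,
                                            struct ggml_tensor* x,
                                            int patch) {
        int pad_w = (patch - (int)x->ne[0] % patch) % patch;  // ne[0] is width in ggml
        int pad_h = (patch - (int)x->ne[1] % patch) % patch;  // ne[1] is height
        // sd_pad pads at the end of each dimension, like ggml_pad; the two
        // trailing flags choose circular wrapping per axis.
        return sd_pad(ctx, x, pad_w, pad_h, 0, 0, /*circular_x=*/true, /*circular_y=*/false);
    }

Keeping the wrap decision inside sd_pad/sd_pad_ext is what lets every patchify and downsample call site collapse to a single line, as the diffs above show.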
From 665190f79bd3fe565e3b45e7e858abefb6aacb33 Mon Sep 17 00:00:00 2001
From: Phylliida
Date: Sat, 13 Dec 2025 02:43:16 -0800
Subject: [PATCH 25/25] Removed redundant set_circular_enabled

---
 diffusion_model.hpp | 25 -------------------------
 ggml_extend.hpp     |  4 ----
 2 files changed, 29 deletions(-)

diff --git a/diffusion_model.hpp b/diffusion_model.hpp
index 0b32babf8..0724cc938 100644
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@@ -39,7 +39,6 @@ struct DiffusionModel {
     virtual void set_weight_adapter(const std::shared_ptr& adapter){};
     virtual int64_t get_adm_in_channels()                            = 0;
     virtual void set_flash_attn_enabled(bool enabled)                = 0;
-    virtual void set_circular_enabled(bool enabled)                  = 0;
     virtual void set_circular_axes(bool circular_x, bool circular_y) = 0;
 };
@@ -89,10 +88,6 @@ struct UNetModel : public DiffusionModel {
         unet.set_flash_attention_enabled(enabled);
     }

-    void set_circular_enabled(bool enabled) override {
-        unet.set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         unet.set_circular_axes(circular_x, circular_y);
     }
@@ -158,10 +153,6 @@ struct MMDiTModel : public DiffusionModel {
         mmdit.set_flash_attention_enabled(enabled);
     }

-    void set_circular_enabled(bool enabled) override {
-        mmdit.set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         mmdit.set_circular_axes(circular_x, circular_y);
     }
@@ -228,10 +219,6 @@ struct FluxModel : public DiffusionModel {
         flux.set_flash_attention_enabled(enabled);
     }

-    void set_circular_enabled(bool enabled) override {
-        flux.set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         flux.set_circular_axes(circular_x, circular_y);
     }
@@ -303,10 +290,6 @@ struct WanModel : public DiffusionModel {
         wan.set_flash_attention_enabled(enabled);
     }

-    void set_circular_enabled(bool enabled) override {
-        wan.set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         wan.set_circular_axes(circular_x, circular_y);
     }
@@ -377,10 +360,6 @@ struct QwenImageModel : public DiffusionModel {
         qwen_image.set_flash_attention_enabled(enabled);
     }

-    void set_circular_enabled(bool enabled) override {
-        qwen_image.set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         qwen_image.set_circular_axes(circular_x, circular_y);
     }
@@ -448,10 +427,6 @@ struct ZImageModel : public DiffusionModel {
         z_image.set_flash_attention_enabled(enabled);
     }

-    void set_circular_enabled(bool enabled) override {
-        z_image.set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) override {
         z_image.set_circular_axes(circular_x, circular_y);
     }
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 0d520fb10..663012d5b 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -2081,10 +2081,6 @@ struct GGMLRunner {
         conv2d_direct_enabled = enabled;
     }

-    void set_circular_enabled(bool enabled) {
-        set_circular_axes(enabled, enabled);
-    }
-
     void set_circular_axes(bool circular_x, bool circular_y) {
         circular_x_enabled = circular_x;
         circular_y_enabled = circular_y;
     }
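After PATCH 24/25, an embedder using the C API flips the renamed booleans directly. A minimal sketch, assuming the stable-diffusion.h layout above and the existing new_sd_ctx(const sd_ctx_params_t*) entry point; all other fields stay at their sd_ctx_params_init defaults:

    sd_ctx_params_t params;
    sd_ctx_params_init(&params);
    params.circular_x = true;   // wrap width only: horizontally tileable output
    // params.circular = true;  // would wrap both axes, like passing --circular
    sd_ctx_t* sd_ctx = new_sd_ctx(&params);

On the CLI side, --circular, --circularx, and --circulary map onto these same booleans (see the SDContextParams wiring in PATCH 24), and any of the three switches both the convolution padding and the RoPE wrap for the chosen axis.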