From 885c7cc033fff987b83108959e089249f024ff48 Mon Sep 17 00:00:00 2001
From: Alex Gemberg <gemberg@canva.com>
Date: Tue, 30 Dec 2025 11:05:45 +1300
Subject: [PATCH 1/2] perf: eliminate overdraw for opaque image fills

---
 Cargo.lock                                    |   1 +
 sparse_strips/vello_bench/Cargo.toml          |   1 +
 sparse_strips/vello_bench/benches/main.rs     |   6 +-
 sparse_strips/vello_bench/src/lib.rs          |   1 +
 sparse_strips/vello_bench/src/scene.rs        |  97 +++++++++
 sparse_strips/vello_common/src/coarse.rs      | 202 ++++++++++++++----
 sparse_strips/vello_cpu/src/dispatch/mod.rs   |  12 +-
 .../vello_cpu/src/dispatch/multi_threaded.rs  |  24 ++-
 .../vello_cpu/src/dispatch/single_threaded.rs |  34 ++-
 sparse_strips/vello_cpu/src/render.rs         |  17 +-
 sparse_strips/vello_hybrid/src/scene.rs       |  19 +-
 sparse_strips/vello_toy/src/debug.rs          |   1 +
 12 files changed, 352 insertions(+), 63 deletions(-)
 create mode 100644 sparse_strips/vello_bench/src/scene.rs

diff --git a/Cargo.lock b/Cargo.lock
index f3e5bd346..9565b1c61 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3944,6 +3944,7 @@ name = "vello_bench"
 version = "0.0.0"
 dependencies = [
  "criterion",
+ "image",
  "parley",
  "rand",
  "smallvec",
diff --git a/sparse_strips/vello_bench/Cargo.toml b/sparse_strips/vello_bench/Cargo.toml
index d35ebe213..941ca0cae 100644
--- a/sparse_strips/vello_bench/Cargo.toml
+++ b/sparse_strips/vello_bench/Cargo.toml
@@ -14,6 +14,7 @@ vello_common = { workspace = true }
 vello_cpu = { workspace = true }
 vello_dev_macros = { workspace = true }
 criterion = { workspace = true }
+image = { workspace = true, features = ["jpeg"] }
 parley = { version = "0.5.0", default-features = true }
 rand = { workspace = true }
 smallvec = { workspace = true }
diff --git a/sparse_strips/vello_bench/benches/main.rs b/sparse_strips/vello_bench/benches/main.rs
index daf4b9dfa..bc2b6b7e0 100644
--- a/sparse_strips/vello_bench/benches/main.rs
+++ b/sparse_strips/vello_bench/benches/main.rs
@@ -5,7 +5,7 @@
 #![allow(dead_code, reason = "Might be unused on platforms not supporting SIMD")]
 
 use criterion::{criterion_group, criterion_main};
-use vello_bench::{fine, flatten, glyph, strip, tile};
+use vello_bench::{fine, flatten, glyph, scene, strip, tile};
 
 criterion_group!(fine_solid, fine::fill);
 criterion_group!(fine_strip, fine::strip);
@@ -19,6 +19,7 @@ criterion_group!(flatten, flatten::flatten);
 criterion_group!(strokes, flatten::strokes);
 criterion_group!(render_strips, strip::render_strips);
 criterion_group!(glyph, glyph::glyph);
+criterion_group!(scene_bench, scene::images);
 criterion_main!(
     tile,
     render_strips,
@@ -31,5 +32,6 @@ criterion_main!(
     fine_gradient,
     fine_rounded_blurred_rect,
     fine_blend,
-    fine_image
+    fine_image,
+    scene_bench
 );
diff --git a/sparse_strips/vello_bench/src/lib.rs b/sparse_strips/vello_bench/src/lib.rs
index a0d5f572d..1ac6e7b4c 100644
--- a/sparse_strips/vello_bench/src/lib.rs
+++ b/sparse_strips/vello_bench/src/lib.rs
@@ -11,6 +11,7 @@ pub mod data;
 pub mod fine;
 pub mod flatten;
 pub mod glyph;
+pub mod scene;
 pub mod strip;
 pub mod tile;
 
diff --git a/sparse_strips/vello_bench/src/scene.rs b/sparse_strips/vello_bench/src/scene.rs
new file mode 100644
index 000000000..09cedd144
--- /dev/null
+++ b/sparse_strips/vello_bench/src/scene.rs
@@ -0,0 +1,97 @@
+// Copyright 2025 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! Full scene rendering benchmarks.
+
+use std::sync::Arc;
+
+use criterion::Criterion;
+use vello_common::kurbo::{Affine, Rect};
+use vello_common::paint::{Image, ImageSource};
+use vello_common::peniko::ImageSampler;
+use vello_common::peniko::{Extend, ImageQuality};
+use vello_common::pixmap::Pixmap;
+use vello_cpu::RenderContext;
+
+/// Benchmarks a scene of overlapping image fills drawn on top of each other (heavy overdraw).
+pub fn images(c: &mut Criterion) {
+    let mut g = c.benchmark_group("images");
+
+    let flower_image = load_flower_image();
+
+    const VIEWPORT_WIDTH: u16 = 1280;
+    const VIEWPORT_HEIGHT: u16 = 960;
+
+    let ImageSource::Pixmap(ref image_pixmap) = flower_image else {
+        panic!("Expected Pixmap");
+    };
+    let original_width = f64::from(image_pixmap.width());
+    let original_height = f64::from(image_pixmap.height());
+    let image_count = VIEWPORT_WIDTH / 256;
+
+    g.bench_function("overlapping", |b| {
+        let mut renderer = RenderContext::new(VIEWPORT_WIDTH, VIEWPORT_HEIGHT);
+        let mut pixmap = Pixmap::new(VIEWPORT_WIDTH, VIEWPORT_HEIGHT);
+
+        b.iter(|| {
+            renderer.reset();
+            // Widest image first; each following, narrower fill is drawn over the previous ones.
+            for i in (1..=image_count).rev() {
+                let width = 256.0 * f64::from(i);
+                let scale = width / original_width;
+                let height = original_height * scale;
+
+                renderer.set_transform(Affine::IDENTITY);
+                renderer.set_paint_transform(Affine::scale(scale));
+                renderer.set_paint(Image {
+                    image: flower_image.clone(),
+                    sampler: ImageSampler {
+                        x_extend: Extend::Pad,
+                        y_extend: Extend::Pad,
+                        quality: ImageQuality::Low,
+                        alpha: 1.0,
+                    },
+                });
+                renderer.fill_rect(&Rect::new(0.0, 0.0, width, height));
+            }
+
+            renderer.flush();
+            renderer.render_to_pixmap(&mut pixmap);
+            std::hint::black_box(&pixmap);
+        });
+    });
+
+    g.finish();
+}
+
+/// Decodes the bundled `splash-flower.jpg` asset into a premultiplied [`Pixmap`] image source.
+fn load_flower_image() -> ImageSource {
+    let image_data = include_bytes!("../../../examples/assets/splash-flower.jpg");
+    let image = image::load_from_memory(image_data).expect("Failed to decode image");
+    let width = image.width();
+    let height = image.height();
+    let rgba_data = image.into_rgba8().into_vec();
+    // Premultiply every pixel; `alpha * component` is at most 255 * 255, so it fits in `u16`.
+    #[expect(
+        clippy::cast_possible_truncation,
+        reason = "Image dimensions fit in u16"
+    )]
+    let pixmap = Pixmap::from_parts(
+        rgba_data
+            .chunks_exact(4)
+            .map(|rgba| {
+                let alpha = u16::from(rgba[3]);
+                let premultiply = |component| (alpha * u16::from(component) / 255) as u8;
+                vello_common::color::PremulRgba8 {
+                    r: premultiply(rgba[0]),
+                    g: premultiply(rgba[1]),
+                    b: premultiply(rgba[2]),
+                    a: alpha as u8,
+                }
+            })
+            .collect(),
+        width as u16,
+        height as u16,
+    );
+    ImageSource::Pixmap(Arc::new(pixmap))
+}
diff --git a/sparse_strips/vello_common/src/coarse.rs b/sparse_strips/vello_common/src/coarse.rs
index d2ecfaba0..e82721805 100644
--- a/sparse_strips/vello_common/src/coarse.rs
+++ b/sparse_strips/vello_common/src/coarse.rs
@@ -4,6 +4,7 @@
 //! Generating and processing wide tiles.
 
 use crate::color::palette::css::TRANSPARENT;
+use crate::encode::EncodedPaint;
 use crate::filter_effects::Filter;
 use crate::kurbo::{Affine, Rect};
 use crate::mask::Mask;
@@ -62,6 +63,21 @@ pub const MODE_CPU: u8 = 0;
 /// generation specific for `vello_hybrid`.
 pub const MODE_HYBRID: u8 = 1;
 
+/// Optimization hint for fill operations, computed in `Wide::generate` and passed to `WideTile::fill`.
+///
+/// The hint only classifies the paint; `Wide::generate` produces a non-`None` value solely for
+/// unmasked fills. `WideTile::fill` acts on it in `MODE_CPU` only when the fill spans the whole
+/// tile width and the tile has no clip/buffer stack; otherwise the fill is emitted normally.
+#[derive(Debug, Clone, Copy)]
+pub enum FillOptimization {
+    /// No optimization possible, emit fill command normally.
+    None,
+    /// Opaque solid color: drops earlier commands and becomes the tile background.
+    OpaqueSolid(PremulColor),
+    /// Opaque image: drops earlier commands, but the fill command itself is still emitted.
+    OpaqueImage,
+}
+
 /// A container for wide tiles.
 #[derive(Debug)]
 pub struct Wide<const MODE: u8 = MODE_CPU> {
@@ -406,6 +422,7 @@ impl<const MODE: u8> Wide<MODE> {
         blend_mode: BlendMode,
         thread_idx: u8,
         mask: Option<Mask>,
+        encoded_paints: &[EncodedPaint],
     ) {
         if strip_buf.is_empty() {
             return;
@@ -516,10 +533,25 @@ impl<const MODE: u8> Wide<MODE> {
                     .min(bbox.x1())
                     .min(WideTile::MAX_WIDE_TILE_COORD);
 
+                // Compute fill optimization based on paint type
                 let fill_attrs = &self.attrs.fill[attrs_idx as usize];
-                let override_color = match &fill_attrs.paint {
-                    Paint::Solid(s) if s.is_opaque() && fill_attrs.mask.is_none() => Some(*s),
-                    _ => None,
+                let optimization = if fill_attrs.mask.is_none() {
+                    match &fill_attrs.paint {
+                        Paint::Solid(s) if s.is_opaque() => FillOptimization::OpaqueSolid(*s),
+                        Paint::Indexed(idx) => {
+                            if let Some(EncodedPaint::Image(img)) = encoded_paints.get(idx.index())
+                                && !img.has_opacities
+                                && img.sampler.alpha == 1.0
+                            {
+                                FillOptimization::OpaqueImage
+                            } else {
+                                FillOptimization::None
+                            }
+                        }
+                        _ => FillOptimization::None,
+                    }
+                } else {
+                    FillOptimization::None
                 };
 
                 // Generate fill commands for each wide tile in the fill region
@@ -537,7 +569,7 @@ impl<const MODE: u8> Wide<MODE> {
                         width,
                         attrs_idx,
                         current_layer_id,
-                        override_color,
+                        optimization,
                     );
                     // TODO: This bbox update might be redundant since filled regions are always
                     // bounded by strip regions (which already update the bbox). Consider removing
@@ -1207,49 +1239,69 @@ impl<const MODE: u8> WideTile<MODE> {
     /// For clipped filter layers, commands are always generated since filters need the full
     /// layer content rendered before applying the clip as a mask.
     ///
-    /// The `override_color` parameter is pre-computed by the caller: if the paint is a solid
-    /// opaque color with no mask, this contains that color for potential background replacement
-    /// optimization.
+    /// The `optimization` parameter is pre-computed by the caller based on paint type:
+    /// - `OpaqueSolid(color)`: Paint is an opaque solid color, can replace background
+    /// - `OpaqueImage`: Paint is an opaque image, can clear previous commands
+    /// - `None`: No optimization available
     pub(crate) fn fill(
         &mut self,
         x: u16,
         width: u16,
         attrs_idx: u32,
         current_layer_id: LayerId,
-        override_color: Option<PremulColor>,
+        optimization: FillOptimization,
     ) {
         if !self.is_zero_clip() || self.in_clipped_filter_layer {
             match MODE {
                 MODE_CPU => {
-                    // Note that we could be more aggressive in optimizing a whole-tile opaque fill
-                    // even with a clip stack. It would be valid to elide all drawing commands from
-                    // the enclosing clip push up to the fill. Further, we could extend the clip
-                    // push command to include a background color, rather than always starting with
-                    // a transparent buffer. Lastly, a sequence of push(bg); strip/fill; pop could
-                    // be replaced with strip/fill with the color (the latter is true even with a
-                    // non-opaque color).
-                    //
-                    // However, the extra cost of tracking such optimizations may outweigh the
-                    // benefit, especially in hybrid mode with GPU painting.
-                    let bg = override_color.filter(|_| {
-                        x == 0 && width == WideTile::WIDTH && self.n_clip == 0 && self.n_bufs == 0
-                    });
-
-                    if let Some(bg) = bg {
-                        self.cmds.clear();
-                        self.bg = bg;
-                        // Clear layer ranges when we clear commands
-                        if let Some(ranges) = self.layer_cmd_ranges.get_mut(&current_layer_id) {
-                            ranges.clear();
+                    // Check if we can apply overdraw elimination optimization.
+                    // This requires filling the entire tile width with no clip/buffer stack.
+                    let can_override =
+                        x == 0 && width == WideTile::WIDTH && self.n_clip == 0 && self.n_bufs == 0;
+
+                    if can_override {
+                        match optimization {
+                            FillOptimization::OpaqueSolid(color) => {
+                                // Note that we could be more aggressive in optimizing a whole-tile opaque fill
+                                // even with a clip stack. It would be valid to elide all drawing commands from
+                                // the enclosing clip push up to the fill. Further, we could extend the clip
+                                // push command to include a background color, rather than always starting with
+                                // a transparent buffer. Lastly, a sequence of push(bg); strip/fill; pop could
+                                // be replaced with strip/fill with the color (the latter is true even with a
+                                // non-opaque color).
+                                //
+                                // However, the extra cost of tracking such optimizations may outweigh the
+                                // benefit, especially in hybrid mode with GPU painting.
+                                self.cmds.clear();
+                                self.bg = color;
+                                if let Some(ranges) =
+                                    self.layer_cmd_ranges.get_mut(&current_layer_id)
+                                {
+                                    ranges.clear();
+                                }
+                                return;
+                            }
+                            FillOptimization::OpaqueImage => {
+                                // Opaque image: clear previous commands but still emit the fill.
+                                self.cmds.clear();
+                                self.bg = PremulColor::from_alpha_color(TRANSPARENT);
+                                if let Some(ranges) =
+                                    self.layer_cmd_ranges.get_mut(&current_layer_id)
+                                {
+                                    ranges.clear();
+                                }
+                                // Fall through to emit the fill command below.
+                            }
+                            FillOptimization::None => {}
                         }
-                    } else {
-                        self.record_fill_cmd(current_layer_id, self.cmds.len());
-                        self.cmds.push(Cmd::Fill(CmdFill {
-                            x,
-                            width,
-                            attrs_idx,
-                        }));
                     }
+
+                    self.record_fill_cmd(current_layer_id, self.cmds.len());
+                    self.cmds.push(Cmd::Fill(CmdFill {
+                        x,
+                        width,
+                        attrs_idx,
+                    }));
                 }
                 MODE_HYBRID => {
                     self.record_fill_cmd(current_layer_id, self.cmds.len());
@@ -1472,7 +1524,7 @@ impl<const MODE: u8> WideTile<MODE> {
     /// // 4: PopBuf
     /// ```
     #[allow(dead_code, reason = "useful for debugging")]
-    pub(crate) fn list_commands(&self) -> String {
+    pub fn list_commands(&self) -> String {
         self.cmds
             .iter()
             .enumerate()
@@ -1571,7 +1623,8 @@ impl Cmd {
     /// Returns a human-readable name for this command.
     ///
     /// This is useful for debugging, logging, and displaying command information
-    /// in a user-friendly format.
+    /// in a user-friendly format. To get detailed paint information, use `name_with_attrs`
+    /// which can look up the paint from the command attributes.
     ///
     /// **Note:** This method is only available in debug builds (`debug_assertions`).
     pub fn name(&self) -> &'static str {
@@ -1594,6 +1647,69 @@ impl Cmd {
             Self::Mask(_) => "Mask",
         }
     }
+
+    /// Returns a human-readable name for this command with detailed paint information.
+    ///
+    /// This variant looks up paint details from the command attributes for fill commands:
+    ///
+    /// - `Fill` / `AlphaFill` are rendered as `FillPath(<paint>)` / `AlphaFillPath(<paint>)`,
+    ///   where `<paint>` describes the paint stored at the command's `attrs_idx` in
+    ///   `fill_attrs` (indexed paints are resolved against `encoded_paints`).
+    /// - If `attrs_idx` is out of range for `fill_attrs`, the raw index is printed as
+    ///   `attrs_idx=<n>` instead of a paint description.
+    /// - Every other command yields the same string as `name`.
+    ///
+    /// **Note:** This method is only available in debug builds (`debug_assertions`).
+    pub fn name_with_attrs(
+        &self,
+        fill_attrs: &[FillAttrs],
+        encoded_paints: &[EncodedPaint],
+    ) -> String {
+        // Both fill variants perform the same attribute lookup and differ only in the
+        // label that is printed, so resolve the (label, index) pair once up front.
+        let (label, attrs_idx) = match self {
+            Self::Fill(cmd) => ("FillPath", cmd.attrs_idx),
+            Self::AlphaFill(cmd) => ("AlphaFillPath", cmd.attrs_idx),
+            _ => return self.name().into(),
+        };
+
+        // `get` keeps this debugging helper panic-free for an out-of-range `attrs_idx`.
+        match fill_attrs.get(attrs_idx as usize) {
+            Some(attrs) => format!("{label}({})", paint_name(&attrs.paint, encoded_paints)),
+            None => format!("{label}(attrs_idx={attrs_idx})"),
+        }
+    }
+}
+
+/// Describes a paint for debug output, e.g. `Solid(#rrggbbaa)` (premultiplied) or `Image[3]`.
+#[cfg(debug_assertions)]
+fn paint_name(paint: &Paint, encoded_paints: &[EncodedPaint]) -> String {
+    match paint {
+        Paint::Solid(color) => {
+            let rgba = color.as_premul_rgba8();
+            format!(
+                "Solid(#{:02x}{:02x}{:02x}{:02x})",
+                rgba.r, rgba.g, rgba.b, rgba.a
+            )
+        }
+        Paint::Indexed(idx) => {
+            let index = idx.index();
+            if let Some(encoded) = encoded_paints.get(index) {
+                let kind = match encoded {
+                    EncodedPaint::Gradient(g) => match &g.kind {
+                        crate::encode::EncodedKind::Linear(_) => "LinearGradient",
+                        crate::encode::EncodedKind::Radial(_) => "RadialGradient",
+                        crate::encode::EncodedKind::Sweep(_) => "SweepGradient",
+                    },
+                    EncodedPaint::Image(_) => "Image",
+                    EncodedPaint::BlurredRoundedRect(_) => "BlurredRoundedRect",
+                };
+                format!("{kind}[{index}]")
+            } else {
+                format!("Indexed({index})")
+            }
+        }
+    }
 }
 
 /// Shared attributes for alpha fill commands.
@@ -1772,7 +1888,7 @@ impl LayerCommandRanges {
 
 #[cfg(test)]
 mod tests {
-    use crate::coarse::{LayerKind, MODE_CPU, Wide, WideTile};
+    use crate::coarse::{FillOptimization, LayerKind, MODE_CPU, Wide, WideTile};
     use crate::kurbo::Affine;
     use crate::peniko::{BlendMode, Compose, Mix};
     use crate::render_graph::RenderGraph;
@@ -1792,8 +1908,8 @@ mod tests {
     fn basic_layer() {
         let mut wide = WideTile::<MODE_CPU>::new(0, 0);
         wide.push_buf(LayerKind::Regular(0));
-        wide.fill(0, 10, 0, 0, None);
-        wide.fill(10, 10, 0, 0, None);
+        wide.fill(0, 10, 0, 0, FillOptimization::None);
+        wide.fill(10, 10, 0, 0, FillOptimization::None);
         wide.pop_buf();
 
         assert_eq!(wide.cmds.len(), 4);
@@ -1805,8 +1921,8 @@ mod tests {
 
         let mut wide = WideTile::<MODE_CPU>::new(0, 0);
         wide.push_buf(LayerKind::Regular(0));
-        wide.fill(0, 10, 0, 0, None);
-        wide.fill(10, 10, 0, 0, None);
+        wide.fill(0, 10, 0, 0, FillOptimization::None);
+        wide.fill(10, 10, 0, 0, FillOptimization::None);
         wide.blend(blend_mode);
         wide.pop_buf();
 
@@ -1819,7 +1935,7 @@ mod tests {
 
         let mut wide = WideTile::<MODE_CPU>::new(0, 0);
         wide.push_buf(LayerKind::Regular(0));
-        wide.fill(0, 10, 0, 0, None);
+        wide.fill(0, 10, 0, 0, FillOptimization::None);
         wide.blend(blend_mode);
         wide.pop_buf();
 
diff --git a/sparse_strips/vello_cpu/src/dispatch/mod.rs b/sparse_strips/vello_cpu/src/dispatch/mod.rs
index 8b84ddca6..25abcd2d9 100644
--- a/sparse_strips/vello_cpu/src/dispatch/mod.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/mod.rs
@@ -19,7 +19,13 @@ use vello_common::strip_generator::StripStorage;
 
 pub(crate) trait Dispatcher: Debug + Send + Sync {
     fn wide(&self) -> &Wide;
-    fn generate_wide_cmd(&mut self, strip_buf: &[Strip], paint: Paint, blend_mode: BlendMode);
+    fn generate_wide_cmd(
+        &mut self,
+        strip_buf: &[Strip],
+        paint: Paint,
+        blend_mode: BlendMode,
+        encoded_paints: &[EncodedPaint],
+    );
     fn fill_path(
         &mut self,
         path: &BezPath,
@@ -29,6 +35,7 @@ pub(crate) trait Dispatcher: Debug + Send + Sync {
         blend_mode: BlendMode,
         aliasing_threshold: Option<u8>,
         mask: Option<Mask>,
+        encoded_paints: &[EncodedPaint],
     );
     fn stroke_path(
         &mut self,
@@ -39,6 +46,7 @@ pub(crate) trait Dispatcher: Debug + Send + Sync {
         blend_mode: BlendMode,
         aliasing_threshold: Option<u8>,
         mask: Option<Mask>,
+        encoded_paints: &[EncodedPaint],
     );
     fn push_clip_path(
         &mut self,
@@ -61,7 +69,7 @@ pub(crate) trait Dispatcher: Debug + Send + Sync {
     );
     fn pop_layer(&mut self);
     fn reset(&mut self);
-    fn flush(&mut self);
+    fn flush(&mut self, encoded_paints: &[EncodedPaint]);
     fn rasterize(
         &self,
         buffer: &mut [u8],
diff --git a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
index ab68e77bd..417de7292 100644
--- a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
@@ -270,7 +270,8 @@ impl MultiThreadedDispatcher {
             allocation_group,
         };
         task_sender.send(task).unwrap();
-        self.run_coarse(true);
+        // TODO: Support encoded_paints in multithreading.
+        self.run_coarse(true, &[]);
     }
 
     // Currently, we do coarse rasterization in two phases:
@@ -285,7 +286,7 @@ impl MultiThreadedDispatcher {
     // new strips that will be generated.
     //
     // This is why we have the `abort_empty`flag.
-    fn run_coarse(&mut self, abort_empty: bool) {
+    fn run_coarse(&mut self, abort_empty: bool, encoded_paints: &[EncodedPaint]) {
         let result_receiver = self.coarse_task_receiver.as_mut().unwrap();
 
         loop {
@@ -307,6 +308,7 @@ impl MultiThreadedDispatcher {
                                 blend_mode,
                                 thread_id,
                                 mask,
+                                encoded_paints,
                             ),
                             CoarseTaskType::RenderWideCommand {
                                 strips,
@@ -320,6 +322,7 @@ impl MultiThreadedDispatcher {
                                 blend_mode,
                                 thread_id,
                                 mask,
+                                encoded_paints,
                             ),
                             CoarseTaskType::PushLayer {
                                 thread_id,
@@ -429,6 +432,7 @@ impl Dispatcher for MultiThreadedDispatcher {
         blend_mode: BlendMode,
         aliasing_threshold: Option<u8>,
         mask: Option<Mask>,
+        _encoded_paints: &[EncodedPaint],
     ) {
         let start = self.allocation_group.path.len() as u32;
         self.allocation_group.path.extend(path);
@@ -453,6 +457,7 @@ impl Dispatcher for MultiThreadedDispatcher {
         blend_mode: BlendMode,
         aliasing_threshold: Option<u8>,
         mask: Option<Mask>,
+        _encoded_paints: &[EncodedPaint],
     ) {
         let start = self.allocation_group.path.len() as u32;
         self.allocation_group.path.extend(path);
@@ -541,7 +546,7 @@ impl Dispatcher for MultiThreadedDispatcher {
         self.init();
     }
 
-    fn flush(&mut self) {
+    fn flush(&mut self, encoded_paints: &[EncodedPaint]) {
         if self.flushed {
             return;
         }
@@ -551,7 +556,7 @@ impl Dispatcher for MultiThreadedDispatcher {
         // Note that dropping the sender will signal to the workers that no more new paths
         // can arrive.
         drop(sender);
-        self.run_coarse(false);
+        self.run_coarse(false, encoded_paints);
 
         self.alpha_storage.with_inner(|alphas| {
             // The main thread stores the alphas that are produced by playing a recording.
@@ -596,7 +601,13 @@ impl Dispatcher for MultiThreadedDispatcher {
         }
     }
 
-    fn generate_wide_cmd(&mut self, strip_buf: &[Strip], paint: Paint, blend_mode: BlendMode) {
+    fn generate_wide_cmd(
+        &mut self,
+        strip_buf: &[Strip],
+        paint: Paint,
+        blend_mode: BlendMode,
+        _encoded_paints: &[EncodedPaint],
+    ) {
         // Note that we are essentially round-tripping here: The wide container is inside of the
         // main thread, but we first send a render task to a child thread which basically just
         // forwards it back to the main thread again. We cannot apply the wide command directly
@@ -880,8 +891,9 @@ mod tests {
                 BlendMode::default(),
                 None,
                 None,
+                &[],
             );
-            dispatcher.flush();
+            dispatcher.flush(&[]);
         }
 
         assert_eq!(dispatcher.allocations.paths.entries.len(), 1);
diff --git a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
index 5cf2b8663..748e40318 100644
--- a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
@@ -414,6 +414,7 @@ impl Dispatcher for SingleThreadedDispatcher {
         blend_mode: BlendMode,
         aliasing_threshold: Option<u8>,
         mask: Option<Mask>,
+        encoded_paints: &[EncodedPaint],
     ) {
         let wide = &mut self.wide;
 
@@ -428,7 +429,14 @@ impl Dispatcher for SingleThreadedDispatcher {
         );
 
         // Generate coarse-level commands from strips (layer_id 0 = root layer).
-        wide.generate(&self.strip_storage.strips, paint, blend_mode, 0, mask);
+        wide.generate(
+            &self.strip_storage.strips,
+            paint,
+            blend_mode,
+            0,
+            mask,
+            encoded_paints,
+        );
     }
 
     fn stroke_path(
@@ -440,6 +448,7 @@ impl Dispatcher for SingleThreadedDispatcher {
         blend_mode: BlendMode,
         aliasing_threshold: Option<u8>,
         mask: Option<Mask>,
+        encoded_paints: &[EncodedPaint],
     ) {
         let wide = &mut self.wide;
 
@@ -454,7 +463,14 @@ impl Dispatcher for SingleThreadedDispatcher {
         );
 
         // Generate coarse-level commands from strips (layer_id 0 = root layer).
-        wide.generate(&self.strip_storage.strips, paint, blend_mode, 0, mask);
+        wide.generate(
+            &self.strip_storage.strips,
+            paint,
+            blend_mode,
+            0,
+            mask,
+            encoded_paints,
+        );
     }
 
     fn push_layer(
@@ -531,7 +547,7 @@ impl Dispatcher for SingleThreadedDispatcher {
         self.layer_id_next = 0;
     }
 
-    fn flush(&mut self) {
+    fn flush(&mut self, _encoded_paints: &[EncodedPaint]) {
         // No-op for single-threaded dispatcher (no work queue to flush).
     }
 
@@ -578,9 +594,16 @@ impl Dispatcher for SingleThreadedDispatcher {
         }
     }
 
-    fn generate_wide_cmd(&mut self, strip_buf: &[Strip], paint: Paint, blend_mode: BlendMode) {
+    fn generate_wide_cmd(
+        &mut self,
+        strip_buf: &[Strip],
+        paint: Paint,
+        blend_mode: BlendMode,
+        encoded_paints: &[EncodedPaint],
+    ) {
         // Generate coarse-level commands from pre-computed strips (layer_id 0 = root layer).
-        self.wide.generate(strip_buf, paint, blend_mode, 0, None);
+        self.wide
+            .generate(strip_buf, paint, blend_mode, 0, None, encoded_paints);
     }
 
     fn strip_storage_mut(&mut self) -> &mut StripStorage {
@@ -652,6 +675,7 @@ mod tests {
             BlendMode::default(),
             None,
             None,
+            &[],
         );
 
         // Ensure there is data to clear.
diff --git a/sparse_strips/vello_cpu/src/render.rs b/sparse_strips/vello_cpu/src/render.rs
index 49025a573..6de8eed93 100644
--- a/sparse_strips/vello_cpu/src/render.rs
+++ b/sparse_strips/vello_cpu/src/render.rs
@@ -203,6 +203,7 @@ impl RenderContext {
                 ctx.blend_mode,
                 ctx.aliasing_threshold,
                 ctx.mask.clone(),
+                &ctx.encoded_paints,
             );
         });
     }
@@ -219,6 +220,7 @@ impl RenderContext {
                 ctx.blend_mode,
                 ctx.aliasing_threshold,
                 ctx.mask.clone(),
+                &ctx.encoded_paints,
             );
         });
     }
@@ -236,6 +238,7 @@ impl RenderContext {
                 ctx.blend_mode,
                 ctx.aliasing_threshold,
                 ctx.mask.clone(),
+                &ctx.encoded_paints,
             );
         });
     }
@@ -253,6 +256,7 @@ impl RenderContext {
                 ctx.blend_mode,
                 ctx.aliasing_threshold,
                 ctx.mask.clone(),
+                &ctx.encoded_paints,
             );
         });
     }
@@ -307,6 +311,7 @@ impl RenderContext {
             self.blend_mode,
             self.aliasing_threshold,
             self.mask.clone(),
+            &self.encoded_paints,
         );
     }
 
@@ -546,7 +551,7 @@ impl RenderContext {
     /// For multi-threaded rendering, you _have_ to call this before rasterizing, otherwise
     /// the program will panic.
     pub fn flush(&mut self) {
-        self.dispatcher.flush();
+        self.dispatcher.flush(&self.encoded_paints);
     }
 
     /// Render the current context into a buffer.
@@ -630,6 +635,7 @@ impl GlyphRenderer for RenderContext {
                     self.blend_mode,
                     self.aliasing_threshold,
                     self.mask.clone(),
+                    &self.encoded_paints,
                 );
             }
             GlyphType::Bitmap(glyph) => {
@@ -737,6 +743,7 @@ impl GlyphRenderer for RenderContext {
                     self.blend_mode,
                     self.aliasing_threshold,
                     self.mask.clone(),
+                    &self.encoded_paints,
                 );
             }
             GlyphType::Bitmap(_) | GlyphType::Colr(_) => {
@@ -1051,8 +1058,12 @@ impl RenderContext {
             "Invalid strip range"
         );
         let paint = self.encode_current_paint();
-        self.dispatcher
-            .generate_wide_cmd(&adjusted_strips[start..end], paint, self.blend_mode);
+        self.dispatcher.generate_wide_cmd(
+            &adjusted_strips[start..end],
+            paint,
+            self.blend_mode,
+            &self.encoded_paints,
+        );
     }
 
     /// Prepare cached strips for rendering by adjusting indices.
diff --git a/sparse_strips/vello_hybrid/src/scene.rs b/sparse_strips/vello_hybrid/src/scene.rs
index f6226d796..d755ce50d 100644
--- a/sparse_strips/vello_hybrid/src/scene.rs
+++ b/sparse_strips/vello_hybrid/src/scene.rs
@@ -228,7 +228,14 @@ impl Scene {
             &mut self.strip_storage,
             self.clip_context.get(),
         );
-        wide.generate(&self.strip_storage.strips, paint, self.blend_mode, 0, None);
+        wide.generate(
+            &self.strip_storage.strips,
+            paint,
+            self.blend_mode,
+            0,
+            None,
+            &self.encoded_paints,
+        );
     }
 
     /// Push a new clip path to the clip stack.
@@ -287,7 +294,14 @@ impl Scene {
             self.clip_context.get(),
         );
 
-        wide.generate(&self.strip_storage.strips, paint, self.blend_mode, 0, None);
+        wide.generate(
+            &self.strip_storage.strips,
+            paint,
+            self.blend_mode,
+            0,
+            None,
+            &self.encoded_paints,
+        );
     }
 
     /// Set the aliasing threshold.
@@ -751,6 +765,7 @@ impl Scene {
             self.blend_mode,
             0,
             None,
+            &self.encoded_paints,
         );
     }
 
diff --git a/sparse_strips/vello_toy/src/debug.rs b/sparse_strips/vello_toy/src/debug.rs
index 9833c8c2b..ac1df2643 100644
--- a/sparse_strips/vello_toy/src/debug.rs
+++ b/sparse_strips/vello_toy/src/debug.rs
@@ -94,6 +94,7 @@ fn main() {
             BlendMode::new(Mix::Normal, Compose::SrcOver),
             0,
             None,
+            &[],
         );
     }
 

From 5e74593728b08a8f099edc4e549a4917ddf25a02 Mon Sep 17 00:00:00 2001
From: Alex Gemberg <gemberg@canva.com>
Date: Tue, 30 Dec 2025 11:22:43 +1300
Subject: [PATCH 2/2] refactor: rename FillOptimization to FillHint; tidy image bench

---
 sparse_strips/vello_bench/benches/main.rs     |  6 +-
 .../src/{scene.rs => integration.rs}          | 15 +--
 sparse_strips/vello_bench/src/lib.rs          |  2 +-
 sparse_strips/vello_common/src/coarse.rs      | 94 ++++++++++---------
 .../vello_cpu/src/dispatch/multi_threaded.rs  |  5 +-
 5 files changed, 61 insertions(+), 61 deletions(-)
 rename sparse_strips/vello_bench/src/{scene.rs => integration.rs} (85%)

diff --git a/sparse_strips/vello_bench/benches/main.rs b/sparse_strips/vello_bench/benches/main.rs
index bc2b6b7e0..b9f50d4e5 100644
--- a/sparse_strips/vello_bench/benches/main.rs
+++ b/sparse_strips/vello_bench/benches/main.rs
@@ -5,7 +5,7 @@
 #![allow(dead_code, reason = "Might be unused on platforms not supporting SIMD")]
 
 use criterion::{criterion_group, criterion_main};
-use vello_bench::{fine, flatten, glyph, scene, strip, tile};
+use vello_bench::{fine, flatten, glyph, integration, strip, tile};
 
 criterion_group!(fine_solid, fine::fill);
 criterion_group!(fine_strip, fine::strip);
@@ -19,7 +19,7 @@ criterion_group!(flatten, flatten::flatten);
 criterion_group!(strokes, flatten::strokes);
 criterion_group!(render_strips, strip::render_strips);
 criterion_group!(glyph, glyph::glyph);
-criterion_group!(scene_bench, scene::images);
+criterion_group!(integration_bench, integration::images);
 criterion_main!(
     tile,
     render_strips,
@@ -33,5 +33,5 @@ criterion_main!(
     fine_rounded_blurred_rect,
     fine_blend,
     fine_image,
-    scene_bench
+    integration_bench
 );
diff --git a/sparse_strips/vello_bench/src/scene.rs b/sparse_strips/vello_bench/src/integration.rs
similarity index 85%
rename from sparse_strips/vello_bench/src/scene.rs
rename to sparse_strips/vello_bench/src/integration.rs
index 09cedd144..1daa02785 100644
--- a/sparse_strips/vello_bench/src/scene.rs
+++ b/sparse_strips/vello_bench/src/integration.rs
@@ -1,7 +1,7 @@
 // Copyright 2025 the Vello Authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
-//! Full scene rendering benchmarks.
+//! Integration benchmarks for full rendering pipelines.
 
 use std::sync::Arc;
 
@@ -12,6 +12,7 @@ use vello_common::peniko::ImageSampler;
 use vello_common::peniko::{Extend, ImageQuality};
 use vello_common::pixmap::Pixmap;
 use vello_cpu::RenderContext;
+use vello_cpu::color::AlphaColor;
 
 /// Image scene rendering benchmark.
 pub fn images(c: &mut Criterion) {
@@ -41,7 +42,6 @@ pub fn images(c: &mut Criterion) {
                 let scale = width / original_width;
                 let height = original_height * scale;
 
-                renderer.set_transform(Affine::IDENTITY);
                 renderer.set_paint_transform(Affine::scale(scale));
                 renderer.set_paint(Image {
                     image: flower_image.clone(),
@@ -79,14 +79,9 @@ fn load_flower_image() -> ImageSource {
         rgba_data
             .chunks_exact(4)
             .map(|rgba| {
-                let alpha = u16::from(rgba[3]);
-                let premultiply = |component| (alpha * u16::from(component) / 255) as u8;
-                vello_common::color::PremulRgba8 {
-                    r: premultiply(rgba[0]),
-                    g: premultiply(rgba[1]),
-                    b: premultiply(rgba[2]),
-                    a: alpha as u8,
-                }
+                AlphaColor::from_rgba8(rgba[0], rgba[1], rgba[2], rgba[3])
+                    .premultiply()
+                    .to_rgba8()
             })
             .collect(),
         width as u16,
diff --git a/sparse_strips/vello_bench/src/lib.rs b/sparse_strips/vello_bench/src/lib.rs
index 1ac6e7b4c..dbf89fd43 100644
--- a/sparse_strips/vello_bench/src/lib.rs
+++ b/sparse_strips/vello_bench/src/lib.rs
@@ -11,7 +11,7 @@ pub mod data;
 pub mod fine;
 pub mod flatten;
 pub mod glyph;
-pub mod scene;
+pub mod integration;
 pub mod strip;
 pub mod tile;
 
diff --git a/sparse_strips/vello_common/src/coarse.rs b/sparse_strips/vello_common/src/coarse.rs
index e82721805..887a485e6 100644
--- a/sparse_strips/vello_common/src/coarse.rs
+++ b/sparse_strips/vello_common/src/coarse.rs
@@ -63,21 +63,6 @@ pub const MODE_CPU: u8 = 0;
 /// generation specific for `vello_hybrid`.
 pub const MODE_HYBRID: u8 = 1;
 
-/// Optimization hint for fill operations, computed in `Wide::generate` and passed to `WideTile::fill`.
-///
-/// This enum communicates whether a fill operation can benefit from overdraw elimination:
-/// - For opaque solid colors: we can set the background color directly and skip the fill
-/// - For opaque images: we can clear previous commands but still need to emit the fill
-#[derive(Debug, Clone, Copy)]
-pub enum FillOptimization {
-    /// No optimization possible, emit fill command normally.
-    None,
-    /// Paint is an opaque solid color - can replace background if conditions are met.
-    OpaqueSolid(PremulColor),
-    /// Paint is an opaque image - can clear previous commands if conditions are met.
-    OpaqueImage,
-}
-
 /// A container for wide tiles.
 #[derive(Debug)]
 pub struct Wide<const MODE: u8 = MODE_CPU> {
@@ -533,25 +518,25 @@ impl<const MODE: u8> Wide<MODE> {
                     .min(bbox.x1())
                     .min(WideTile::MAX_WIDE_TILE_COORD);
 
-                // Compute fill optimization based on paint type
+                // Compute fill hint based on paint type
                 let fill_attrs = &self.attrs.fill[attrs_idx as usize];
-                let optimization = if fill_attrs.mask.is_none() {
+                let fill_hint = if fill_attrs.mask.is_none() {
                     match &fill_attrs.paint {
-                        Paint::Solid(s) if s.is_opaque() => FillOptimization::OpaqueSolid(*s),
+                        Paint::Solid(s) if s.is_opaque() => FillHint::OpaqueSolid(*s),
                         Paint::Indexed(idx) => {
                             if let Some(EncodedPaint::Image(img)) = encoded_paints.get(idx.index())
                                 && !img.has_opacities
                                 && img.sampler.alpha == 1.0
                             {
-                                FillOptimization::OpaqueImage
+                                FillHint::OpaqueImage
                             } else {
-                                FillOptimization::None
+                                FillHint::None
                             }
                         }
-                        _ => FillOptimization::None,
+                        _ => FillHint::None,
                     }
                 } else {
-                    FillOptimization::None
+                    FillHint::None
                 };
 
                 // Generate fill commands for each wide tile in the fill region
@@ -569,7 +554,7 @@ impl<const MODE: u8> Wide<MODE> {
                         width,
                         attrs_idx,
                         current_layer_id,
-                        optimization,
+                        fill_hint,
                     );
                     // TODO: This bbox update might be redundant since filled regions are always
                     // bounded by strip regions (which already update the bbox). Consider removing
@@ -1239,7 +1224,7 @@ impl<const MODE: u8> WideTile<MODE> {
     /// For clipped filter layers, commands are always generated since filters need the full
     /// layer content rendered before applying the clip as a mask.
     ///
-    /// The `optimization` parameter is pre-computed by the caller based on paint type:
+    /// The `fill_hint` parameter is pre-computed by the caller based on paint type:
     /// - `OpaqueSolid(color)`: Paint is an opaque solid color, can replace background
     /// - `OpaqueImage`: Paint is an opaque image, can clear previous commands
     /// - `None`: No optimization available
@@ -1249,29 +1234,30 @@ impl<const MODE: u8> WideTile<MODE> {
         width: u16,
         attrs_idx: u32,
         current_layer_id: LayerId,
-        optimization: FillOptimization,
+        fill_hint: FillHint,
     ) {
         if !self.is_zero_clip() || self.in_clipped_filter_layer {
             match MODE {
                 MODE_CPU => {
                     // Check if we can apply overdraw elimination optimization.
                     // This requires filling the entire tile width with no clip/buffer stack.
+                    //
+                    // Note that we could be more aggressive in optimizing a whole-tile opaque fill
+                    // even with a clip stack. It would be valid to elide all drawing commands from
+                    // the enclosing clip push up to the fill. Further, we could extend the clip
+                    // push command to include a background color, rather than always starting with
+                    // a transparent buffer. Lastly, a sequence of push(bg); strip/fill; pop could
+                    // be replaced with strip/fill with the color (the latter is true even with a
+                    // non-opaque color).
+                    //
+                    // However, the extra cost of tracking such optimizations may outweigh the
+                    // benefit, especially in hybrid mode with GPU painting.
                     let can_override =
                         x == 0 && width == WideTile::WIDTH && self.n_clip == 0 && self.n_bufs == 0;
 
                     if can_override {
-                        match optimization {
-                            FillOptimization::OpaqueSolid(color) => {
-                                // Note that we could be more aggressive in optimizing a whole-tile opaque fill
-                                // even with a clip stack. It would be valid to elide all drawing commands from
-                                // the enclosing clip push up to the fill. Further, we could extend the clip
-                                // push command to include a background color, rather than always starting with
-                                // a transparent buffer. Lastly, a sequence of push(bg); strip/fill; pop could
-                                // be replaced with strip/fill with the color (the latter is true even with a
-                                // non-opaque color).
-                                //
-                                // However, the extra cost of tracking such optimizations may outweigh the
-                                // benefit, especially in hybrid mode with GPU painting.
+                        match fill_hint {
+                            FillHint::OpaqueSolid(color) => {
                                 self.cmds.clear();
                                 self.bg = color;
                                 if let Some(ranges) =
@@ -1281,7 +1267,7 @@ impl<const MODE: u8> WideTile<MODE> {
                                 }
                                 return;
                             }
-                            FillOptimization::OpaqueImage => {
+                            FillHint::OpaqueImage => {
                                 // Opaque image: clear previous commands but still emit the fill.
                                 self.cmds.clear();
                                 self.bg = PremulColor::from_alpha_color(TRANSPARENT);
@@ -1290,9 +1276,10 @@ impl<const MODE: u8> WideTile<MODE> {
                                 {
                                     ranges.clear();
                                 }
-                                // Fall through to emit the fill command below.
+                                // Fall through to emit the fill command below, as opposed to
+                                // solid paints where we have a return statement.
                             }
-                            FillOptimization::None => {}
+                            FillHint::None => {}
                         }
                     }
 
@@ -1534,6 +1521,21 @@ impl<const MODE: u8> WideTile<MODE> {
     }
 }
 
+/// Optimization hint for fill operations, computed in `Wide::generate` and passed to `WideTile::fill`.
+///
+/// This enum communicates whether a fill operation can benefit from overdraw elimination:
+/// - For opaque solid colors: we can set the background color directly and skip the fill
+/// - For opaque images: we can clear previous commands but still need to emit the fill
+#[derive(Debug, Clone, Copy)]
+pub enum FillHint {
+    /// No optimization possible, emit fill command normally.
+    None,
+    /// Paint is an opaque solid color - can replace background if conditions are met.
+    OpaqueSolid(PremulColor),
+    /// Paint is an opaque image - can clear previous commands if conditions are met.
+    OpaqueImage,
+}
+
 /// Distinguishes between different types of layers and their storage strategies.
 ///
 /// Each layer kind determines how the layer's content is stored and processed:
@@ -1888,7 +1890,7 @@ impl LayerCommandRanges {
 
 #[cfg(test)]
 mod tests {
-    use crate::coarse::{FillOptimization, LayerKind, MODE_CPU, Wide, WideTile};
+    use crate::coarse::{FillHint, LayerKind, MODE_CPU, Wide, WideTile};
     use crate::kurbo::Affine;
     use crate::peniko::{BlendMode, Compose, Mix};
     use crate::render_graph::RenderGraph;
@@ -1908,8 +1910,8 @@ mod tests {
     fn basic_layer() {
         let mut wide = WideTile::<MODE_CPU>::new(0, 0);
         wide.push_buf(LayerKind::Regular(0));
-        wide.fill(0, 10, 0, 0, FillOptimization::None);
-        wide.fill(10, 10, 0, 0, FillOptimization::None);
+        wide.fill(0, 10, 0, 0, FillHint::None);
+        wide.fill(10, 10, 0, 0, FillHint::None);
         wide.pop_buf();
 
         assert_eq!(wide.cmds.len(), 4);
@@ -1921,8 +1923,8 @@ mod tests {
 
         let mut wide = WideTile::<MODE_CPU>::new(0, 0);
         wide.push_buf(LayerKind::Regular(0));
-        wide.fill(0, 10, 0, 0, FillOptimization::None);
-        wide.fill(10, 10, 0, 0, FillOptimization::None);
+        wide.fill(0, 10, 0, 0, FillHint::None);
+        wide.fill(10, 10, 0, 0, FillHint::None);
         wide.blend(blend_mode);
         wide.pop_buf();
 
@@ -1935,7 +1937,7 @@ mod tests {
 
         let mut wide = WideTile::<MODE_CPU>::new(0, 0);
         wide.push_buf(LayerKind::Regular(0));
-        wide.fill(0, 10, 0, 0, FillOptimization::None);
+        wide.fill(0, 10, 0, 0, FillHint::None);
         wide.blend(blend_mode);
         wide.pop_buf();
 
diff --git a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
index 417de7292..c19e532a8 100644
--- a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
@@ -270,7 +270,10 @@ impl MultiThreadedDispatcher {
             allocation_group,
         };
         task_sender.send(task).unwrap();
-        // TODO: Support encoded_paints in multithreading.
+        // TODO: Pass `encoded_paints` here to enable overdraw elimination for opaque indexed
+        // paints. Currently we pass an empty slice, so indexed paints render correctly but miss
+        // the `FillHint::OpaqueImage` optimization. The challenge is that `encoded_paints` is a
+        // borrowed reference that may not stay valid until the asynchronous coarse pass runs.
         self.run_coarse(true, &[]);
     }