diff --git a/README.md b/README.md index edffdaf..4b93b03 100644 --- a/README.md +++ b/README.md @@ -2,25 +2,62 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 5** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) **Google Chrome 222.2** on - Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Yuntian Ke +* Tested on: Windows 11, Intel Core Ultra 9 275HX @ 2.70GHz 32GB, RTX 5070 Ti 30160MB ### Live Demo -[![](img/thumb.png)](http://TODO.github.io/Project4-WebGPU-Forward-Plus-and-Clustered-Deferred) +🌐 **[Try it live here!](https://kytttt.github.io/Project5-WebGPU-Gaussian-Splat-Viewer/)** + +*Note: Requires a WebGPU-compatible browser (Chrome recommended)* ### Demo Video/GIF -[![](img/video.mp4)](TODO) +[![Demo Video](images/cover.png)](https://drive.google.com/file/d/1Rg6T9apDpXw1mXI1kjH89kBUIzQ-ZY-a/view?usp=sharing) + +*Click this image to see the full video.* + +### Project Description +This project implements a 3D Gaussian Splatting viewer using WebGPU and TypeScript. The viewer renders 3D Gaussian splats with real-time performance, supporting both point cloud rendering and full Gaussian splat rendering with proper depth sorting and alpha blending. + +### Features Implemented +- **Point Cloud Renderer**: Basic point cloud visualization with MVP transformation. +- **Gaussian Splat Renderer**: Full 3D Gaussian splatting implementation including: + - View frustum culling for performance optimization + - 3D to 2D covariance matrix computation + - Spherical harmonics color evaluation + - Depth-based radix sorting for proper transparency + - Indirect rendering with dynamic instance counts + - Proper alpha blending for realistic transparency effects + + +### Performance Analysis + +#### Point Cloud vs Gaussian Renderer Comparison +- **Point Cloud Renderer**: Simple vertex-based rendering with limited visual quality but excellent performance. 
Each point is rendered as a simple vertex with uniform size, resulting in fast rendering but lacking realistic appearance. +- **Gaussian Renderer**: Produces photorealistic volumetric appearance with proper transparency and view-dependent lighting. Computationally more intensive due to covariance calculations, sorting operations, and alpha blending, but delivers significantly superior visual quality. + +#### Workgroup Size Performance Impact +Different workgroup sizes in compute shaders affect GPU utilization efficiency: +- **Small workgroups (32-64 threads)**: Better load balancing but may underutilize GPU cores +- **Medium workgroups (128-256 threads)**: Optimal balance between throughput and occupancy for most scenarios +- **Large workgroups (512+ threads)**: Maximum theoretical throughput but may suffer from divergent execution and reduced occupancy; sizes above 256 also exceed WebGPU's default `maxComputeInvocationsPerWorkgroup` limit and require requesting a higher device limit +- The optimal size depends on GPU architecture and the complexity of per-thread operations -### (TODO: Your README) +#### View Frustum Culling Performance Benefits +View frustum culling provides substantial performance improvements: +- **Preprocessing reduction**: 40-70% fewer Gaussians processed depending on camera view +- **Sorting optimization**: Significantly reduces sorting overhead by eliminating off-screen elements +- **Memory bandwidth**: Lower GPU memory traffic, since culled Gaussians are never written to the splat and sort buffers (the buffers themselves are still allocated for every point) +- **Scene dependency**: Most effective for large scenes where many Gaussians are outside the viewing frustum -*DO NOT* leave the README to the last minute! It is a crucial part of the -project, and we will not be able to grade you without a good README. 
+#### Gaussian Count Performance Impact +Performance scales with the number of Gaussians in the scene: +- **Linear scaling**: Preprocessing operations (frustum culling, covariance calculation) scale O(n) +- **Sorting bottleneck**: Radix sort is O(n) for fixed-width 32-bit keys, but its repeated full passes over the key and index buffers make it the dominant cost for large scenes (>100k Gaussians) +- **Rendering impact**: Fragment processing scales with visible Gaussian coverage +- **Memory limitations**: GPU memory bandwidth becomes the bottleneck for very large datasets (>500k Gaussians) -This assignment has a considerable amount of performance analysis compared -to implementation work. Complete the implementation early to leave time! ### Credits diff --git a/images/cover.png b/images/cover.png new file mode 100644 index 0000000..468e62f Binary files /dev/null and b/images/cover.png differ diff --git a/package-lock.json b/package-lock.json index 04843bd..694c409 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,6 +12,7 @@ "@loaders.gl/ply": "^4.2.2", "@petamoriken/float16": "^3.8.7", "tweakpane": "^3.1.8", + "tweakpane-plugin-file-import": "^0.2.0", "wgpu-matrix": "^3.2.0" }, "devDependencies": { diff --git a/src/renderers/gaussian-renderer.ts b/src/renderers/gaussian-renderer.ts index 1684523..2fe2cb1 100644 --- a/src/renderers/gaussian-renderer.ts +++ b/src/renderers/gaussian-renderer.ts @@ -5,7 +5,7 @@ import { get_sorter,c_histogram_block_rows,C } from '../sort/sort'; import { Renderer } from './renderer'; export interface GaussianRenderer extends Renderer { - + setGaussianScaling: (scale: number) => void; } // Utility to create GPU buffers @@ -34,6 +34,36 @@ export default function get_renderer( // Initialize GPU Buffers // =============================================== + // Splats: 7 packed u32 words (center, radius, color rg/ba, conic xy/z, opacity) => 28 bytes per splat + const splatStride = 28; + const splatBuffer = createBuffer( + device, + 'splats buffer', + pc.num_points * splatStride, + GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST + ); + + // 
Settings buffer (gaussian_scaling, sh_deg, padding to 16 bytes) + const settingsBufferSize = 16; + const settingsInit = new Float32Array([1.0, pc.sh_deg, 0.0, 0.0]); + const settingsBuffer = createBuffer( + device, + 'render settings', + settingsBufferSize, + GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST, + settingsInit + ); + + // Indirect draw buffer: [vertexCount, instanceCount, firstVertex, firstInstance] + // We render a quad as 2 triangles => 6 vertices per instance. + const indirectDrawBuffer = createBuffer( + device, + 'draw indirect', + 4 * 4, + GPUBufferUsage.INDIRECT | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC, + new Uint32Array([6, pc.num_points, 0, 0]) // overridden after preprocess + ); + const nulling_data = new Uint32Array([0]); // =============================================== @@ -52,6 +82,26 @@ export default function get_renderer( }, }); + // group(0): camera + const preprocess_camera_bind_group = device.createBindGroup({ + label: 'preprocess camera', + layout: preprocess_pipeline.getBindGroupLayout(0), + entries: [{ binding: 0, resource: { buffer: camera_buffer } }], + }); + + // group(1): gaussians (input), splats (output), settings (uniform), SH coefficients (input) + const preprocess_data_bind_group = device.createBindGroup({ + label: 'preprocess data', + layout: preprocess_pipeline.getBindGroupLayout(1), + entries: [ + { binding: 0, resource: { buffer: pc.gaussian_3d_buffer } }, + { binding: 1, resource: { buffer: splatBuffer } }, + { binding: 2, resource: { buffer: settingsBuffer } }, + { binding: 3, resource: { buffer: pc.sh_buffer } }, + ], + }); + + // group(2): sort buffers; preprocess writes keys_size, sort_depths, sort_indices and dispatch_x, so all four bindings are statically used and kept in the auto layout const sort_bind_group = device.createBindGroup({ label: 'sort', layout: preprocess_pipeline.getBindGroupLayout(2), @@ -68,19 +118,111 @@ export default function get_renderer( // Create Render Pipeline and Bind Groups // =============================================== + const render_shader = device.createShaderModule({ 
code: renderWGSL }); + + const render_pipeline = device.createRenderPipeline({ + label: 'gaussian render', + layout: 'auto', + vertex: { + module: render_shader, + entryPoint: 'vs_main', + }, + fragment: { + module: render_shader, + entryPoint: 'fs_main', + targets: [{ + format: presentation_format, + blend: { + color: { + srcFactor: 'one', + dstFactor: 'one-minus-src-alpha', + operation: 'add', + }, + alpha: { + srcFactor: 'one', + dstFactor: 'one-minus-src-alpha', + operation: 'add', + }, + }, + }], + }, + primitive: { + topology: 'triangle-list', + cullMode: 'none', + }, + }); + + + const render_splats_bind_group = device.createBindGroup({ + label: 'render splats', + layout: render_pipeline.getBindGroupLayout(0), + entries: [ + { binding: 0, resource: { buffer: splatBuffer } }, + { binding: 1, resource: { buffer: sorter.ping_pong[0].sort_indices_buffer } }, + { binding: 2, resource: { buffer: camera_buffer } } + ], + }); + + // =============================================== // Command Encoder Functions // =============================================== + const zero = new Uint32Array([0]); + const resetCounters = () => { + device.queue.writeBuffer(sorter.sort_info_buffer, 0, zero); // keys_size = 0 + device.queue.writeBuffer(sorter.sort_dispatch_indirect_buffer, 0, zero); // dispatch_x = 0 + }; + const dispatchPreprocess = (encoder: GPUCommandEncoder) => { + // reset visible count: sort_infos.keys_size = 0 + resetCounters(); + const pass = encoder.beginComputePass({ label: 'preprocess pass' }); + pass.setPipeline(preprocess_pipeline); + pass.setBindGroup(0, preprocess_camera_bind_group); + pass.setBindGroup(1, preprocess_data_bind_group); + pass.setBindGroup(2, sort_bind_group); + + const wgSize = C.histogram_wg_size; + const numWG = Math.ceil(pc.num_points / wgSize); + pass.dispatchWorkgroups(numWG, 1, 1); + pass.end(); + + + }; + + const recordRender = (encoder: GPUCommandEncoder, texture_view: GPUTextureView) => { + const pass = encoder.beginRenderPass({ + 
label: 'gaussian render', + colorAttachments: [ + { view: texture_view, loadOp: 'clear', storeOp: 'store' }, + ], + }); + pass.setPipeline(render_pipeline); + pass.setBindGroup(0, render_splats_bind_group); + pass.drawIndirect(indirectDrawBuffer, 0); + pass.end(); + }; // =============================================== // Return Render Object // =============================================== return { - frame: (encoder: GPUCommandEncoder, texture_view: GPUTextureView) => { - sorter.sort(encoder); + frame: (encoder, texture_view) => { + dispatchPreprocess(encoder); + + sorter.sort(encoder); + + // keys_size → instanceCount + encoder.copyBufferToBuffer(sorter.sort_info_buffer, 0, indirectDrawBuffer, 4, 4); + + recordRender(encoder, texture_view); + }, camera_buffer, + setGaussianScaling: (scale: number) => { + const data = new Float32Array([scale, pc.sh_deg, 0, 0]); + device.queue.writeBuffer(settingsBuffer, 0, data); + }, }; } diff --git a/src/renderers/renderer.ts b/src/renderers/renderer.ts index ffdf9ba..da795a7 100644 --- a/src/renderers/renderer.ts +++ b/src/renderers/renderer.ts @@ -121,7 +121,8 @@ export default async function init( 'gaussian_multiplier', {min: 0, max: 1.5} ).on('change', (e) => { - //TODO: Bind constants to the gaussian renderer. + // Bind constants to the gaussian renderer. 
+ gaussian_renderer?.setGaussianScaling(e.value); }); } diff --git a/src/shaders/gaussian.wgsl b/src/shaders/gaussian.wgsl index 759226d..db418a1 100644 --- a/src/shaders/gaussian.wgsl +++ b/src/shaders/gaussian.wgsl @@ -1,22 +1,105 @@ struct VertexOutput { @builtin(position) position: vec4, + @location(0) color: vec4, + @location(1) center_ndc: vec2, + @location(2) radius_ndc: vec2, + @location(3) conic: vec3, + @location(4) opacity: f32, //TODO: information passed from vertex shader to fragment shader }; struct Splat { - //TODO: information defined in preprocess compute shader + center_ndc: u32, + radius_ndc: u32, + color_rg: u32, + color_ba: u32, + + conic_xy: u32, + conic_z_pad: u32, + opacity_pad: u32, + }; +struct CameraUniforms { + view: mat4x4, + view_inv: mat4x4, + proj: mat4x4, + proj_inv: mat4x4, + viewport: vec2, + focal: vec2, +}; + + +@group(0) @binding(0) +var splats : array; +@group(0) @binding(1) +var sort_indices : array; +@group(0) @binding(2) +var camera: CameraUniforms; + +// Map vertex_index (0..5) to a triangle-list quad in clip-space +fn corner(ix: u32) -> vec2 { + + switch ix { + case 0u: { return vec2(-1.0, -1.0); } + case 1u: { return vec2( 1.0, -1.0); } + case 2u: { return vec2( 1.0, 1.0); } + case 3u: { return vec2(-1.0, -1.0); } + case 4u: { return vec2( 1.0, 1.0); } + default: { return vec2(-1.0, 1.0); } + } +} + @vertex fn vs_main( + @builtin(vertex_index) vid: u32, + @builtin(instance_index) iid: u32, ) -> VertexOutput { - //TODO: reconstruct 2D quad based on information from splat, pass var out: VertexOutput; - out.position = vec4(1. ,1. 
, 0., 1.); + let index = sort_indices[iid]; + let s = splats[index]; + + let center = unpack2x16float(s.center_ndc); + let radius = unpack2x16float(s.radius_ndc); + + let color_rg = unpack2x16float(s.color_rg); + let color_ba = unpack2x16float(s.color_ba); + let color = vec4(color_rg.x, color_rg.y, color_ba.x, color_ba.y); + + let conic_xy = unpack2x16float(s.conic_xy); + let conic_z = unpack2x16float(s.conic_z_pad).x; + let conic = vec3(conic_xy.x, conic_xy.y, conic_z); + + let opacity = unpack2x16float(s.opacity_pad).x; + + let offset = corner(vid) * radius; + let ndc = vec4(center + offset, 0.0, 1.0); + + out.position = ndc; + + out.color = color; + out.center_ndc = center; + out.radius_ndc = radius; + out.conic = conic; + out.opacity = opacity; return out; } @fragment fn fs_main(in: VertexOutput) -> @location(0) vec4 { - return vec4(1.); + + var pos_ndc = (in.position.xy / camera.viewport) * 2.0 - 1.0; + pos_ndc.y = -pos_ndc.y; + + var to_center = pos_ndc - in.center_ndc; + to_center *= camera.viewport * 0.5; + + let power = -0.5 * (in.conic.x * to_center.x * to_center.x + + in.conic.z * to_center.y * to_center.y - + 2.0 * in.conic.y * to_center.x * to_center.y); + + if (power > 0.0){ + return vec4(0.0, 0.0, 0.0, 0.0); + } + return in.color * min(in.opacity * exp(power), 0.99); } \ No newline at end of file diff --git a/src/shaders/point_cloud.wgsl b/src/shaders/point_cloud.wgsl index 01dded1..617171e 100644 --- a/src/shaders/point_cloud.wgsl +++ b/src/shaders/point_cloud.wgsl @@ -35,7 +35,7 @@ fn vs_main( let pos = vec4(a.x, a.y, b.x, 1.); // TODO: MVP calculations - out.position = pos; + out.position = camera.proj * camera.view * pos; return out; } diff --git a/src/shaders/preprocess.wgsl b/src/shaders/preprocess.wgsl index bbc63f5..ab91e8c 100644 --- a/src/shaders/preprocess.wgsl +++ b/src/shaders/preprocess.wgsl @@ -56,10 +56,34 @@ struct Gaussian { }; struct Splat { - //TODO: store information for 2D splat rendering + // Center in NDC and half-size 
(radius) in NDC, used for quad reconstruction. + center_ndc: u32, + radius_ndc: u32, + + color_rg: u32, + color_ba: u32, + + conic_xy: u32, + conic_z_pad: u32, + + opacity_pad: u32, }; -//TODO: bind your data here +// group(0): camera +@group(0) @binding(0) +var camera: CameraUniforms; + +// group(1): input gaussians, output splats, settings, SH coefficients +@group(1) @binding(0) +var gaussians : array; +@group(1) @binding(1) +var splats : array; +@group(1) @binding(2) +var settings: RenderSettings; +@group(1) @binding(3) +var sh_data : array; + +// group(2): sorting-related buffers (this stage counts visible splats in keys_size, writes sort_depths / sort_indices, and bumps dispatch_x) @group(2) @binding(0) var sort_infos: SortInfos; @group(2) @binding(1) @@ -69,10 +93,24 @@ var sort_indices : array; @group(2) @binding(3) var sort_dispatch: DispatchIndirect; + +fn read_half(i: u32) -> f32 { + let word_idx: u32 = i >> 1u; // divide by 2 + let hi: bool = (i & 1u) == 1u; // odd => high half + let packed: vec2 = unpack2x16float(sh_data[word_idx]); + return select(packed.x, packed.y, hi); +} /// reads the ith sh coef from the storage buffer fn sh_coef(splat_idx: u32, c_idx: u32) -> vec3 { - //TODO: access your binded sh_coeff, see load.ts for how it is stored - return vec3(0.0); + + let base = splat_idx * 24u + (c_idx >> 1u) * 3u + (c_idx & 1u); + let color01 = unpack2x16float(sh_data[base + 0u]); + let color23 = unpack2x16float(sh_data[base + 1u]); + + if (c_idx & 1u) == 0u { + return vec3f(color01.x, color01.y, color23.x); + } + return vec3f(color01.y, color23.x, color23.y); } // spherical harmonics evaluation with Condon–Shortley phase @@ -108,11 +146,193 @@ fn computeColorFromSH(dir: vec3, v_idx: u32, sh_deg: u32) -> vec3 { return max(vec3(0.), result); } +// --- Helpers for covariance and projection --- + +fn quat_to_mat3(qin: vec4) -> mat3x3 { + var q = normalize(qin); + let x = q.x; let y = q.y; let z = q.z; let w = q.w; + let xx = x * x; let yy = y * y; let zz = z * z; + let xy = x * y; let xz = x * z; let yz = y * z; + let 
wx = w * x; let wy = w * y; let wz = w * z; + + // Column-major construction + return mat3x3( + 1.0 - 2.0 * (yy + zz), 2.0 * (xy - wz), 2.0 * (xz + wy), + 2.0 * (xy + wz), 1.0 - 2.0 * (xx + zz), 2.0 * (yz - wx), + 2.0 * (xz - wy), 2.0 * (yz + wx), 1.0 - 2.0 * (xx + yy), + ); +} + + +fn covariance3d(rot_q: vec4, scale: vec3, scale_factor: f32) -> mat3x3 { + let R = quat_to_mat3(rot_q); + let s_lin = scale * scale_factor; + let S = mat3x3( + s_lin.x, 0.0, 0.0, + 0.0, s_lin.y, 0.0, + 0.0, 0.0, s_lin.z, + ); + + return transpose(S * R) * (S * R); +} + +fn view_rotation_R(view: mat4x4) -> mat3x3 { + + let Rt = mat3x3(view[0].xyz, view[1].xyz, view[2].xyz); + return transpose(Rt); +} + +fn jacobian_camera_to_pixel(pos_cam: vec3, focal: vec2) -> mat3x3 { + let x = pos_cam.x; + let y = pos_cam.y; + let z = pos_cam.z; + let fx = focal.x; + let fy = focal.y; + + return mat3x3( + fx / z, 0.0, -fx * x / (z * z), + 0.0, fy / z, -fy * y / (z * z), + 0.0, 0.0, 0.0, + ); +} + + +fn largest_eigenvalue_2x2(a: f32, b: f32, c: f32) -> f32 { + + let tr = a + c; + let det = a * c - b * b; + let disc = max(0.0, tr * tr * 0.25 - det); + let root = sqrt(disc); + let l1 = tr * 0.5 + root; + let l2 = tr * 0.5 - root; + return max(l1, l2); +} + +fn sigmoid(x: f32) -> f32 { + return 1.0 / (1.0 + exp(-x)); +} + @compute @workgroup_size(workgroupSize,1,1) fn preprocess(@builtin(global_invocation_id) gid: vec3, @builtin(num_workgroups) wgs: vec3) { let idx = gid.x; - //TODO: set up pipeline as described in instruction + let count = arrayLength(&gaussians); + if (idx >= count) { + return; + } + + let g = gaussians[idx]; + + let a = unpack2x16float(g.pos_opacity[0]); + let b = unpack2x16float(g.pos_opacity[1]); + let pos_world = vec4(a.x, a.y, b.x, 1.0); + let opacity = b.y; + + let M = camera.proj * camera.view; + let clip = M * pos_world; + + + let depthDetect = (camera.view * pos_world).z; + if(depthDetect < 0.f) { + return; + } + + let ndc = clip.xy / clip.w; + + + if (abs(ndc.x) > 1.2 || 
abs(ndc.y) > 1.2) { + return; + } + + + let r01 = unpack2x16float(g.rot[0]); + let r23 = unpack2x16float(g.rot[1]); + let rot_q = vec4(r01.y, r23.x, r23.y, r01.x); + + let s01 = unpack2x16float(g.scale[0]); + let s23 = unpack2x16float(g.scale[1]); + let scale = vec3(exp(s01.x), exp(s01.y), exp(s23.x)); + + var t = (camera.view * pos_world).xyz; + + let Sigma3D = covariance3d(rot_q, scale, settings.gaussian_scaling); + + let Vrk = mat3x3( + Sigma3D[0][0], Sigma3D[0][1], Sigma3D[0][2], + Sigma3D[0][1], Sigma3D[1][1], Sigma3D[1][2], + Sigma3D[0][2], Sigma3D[1][2], Sigma3D[2][2], + ); + + let W = view_rotation_R(camera.view); + + let J = jacobian_camera_to_pixel(t, camera.focal); + + let T = W * J; + + var cov2D = transpose(T) * Vrk * T; + cov2D[0][0] += 0.3; + cov2D[1][1] += 0.3; + + let a1 = cov2D[0][0]; + let b1 = cov2D[0][1]; + let c1 = cov2D[1][1]; + + let det = a1 * c1 - b1 * b1; + if (det == 0.0) { + return; + } + let mid = 0.5 * (a1 + c1); + let lambda1 = mid + sqrt(max(0.1, mid * mid - det)); + let lambda2 = mid - sqrt(max(0.1, mid * mid - det)); + + let radius = ceil(3.0 * sqrt(max(lambda1, lambda2))); + + let width = camera.viewport.x; + let height = camera.viewport.y; + + let rx_ndc = radius * (2.0 / width); + let ry_ndc = radius * (2.0 / height); + + let packed_center = pack2x16float(ndc); + let packed_radius = pack2x16float(vec2(rx_ndc, ry_ndc)); + + let write_idx = atomicAdd(&sort_infos.keys_size, 1u); + + splats[write_idx].center_ndc = packed_center; + splats[write_idx].radius_ndc = packed_radius; + + let view_dir = normalize(pos_world.xyz - camera.view_inv[3].xyz); + let color = computeColorFromSH(view_dir, idx, u32(settings.sh_deg)); + + let packed_rg = pack2x16float(color.xy); + let packed_ba = pack2x16float(vec2(color.z, 1.0)); + + splats[write_idx].color_rg = packed_rg; + splats[write_idx].color_ba = packed_ba; + + let det_inv = 1.0 / det; + let conic = vec3(c1 * det_inv, -b1 * det_inv, a1 * det_inv); + + let packed_conic_xy = 
pack2x16float(conic.xy); + let packed_conic_z_pad = pack2x16float(vec2(conic.z, 0.0)); + + splats[write_idx].conic_xy = packed_conic_xy; + splats[write_idx].conic_z_pad = packed_conic_z_pad; + + let opacity_sigmoid = sigmoid(opacity); + + let packed_opacity_pad = pack2x16float(vec2(opacity_sigmoid, 0.0)); + splats[write_idx].opacity_pad = packed_opacity_pad; + + + let depth_positive = -depthDetect; + let depth_bits = bitcast(depth_positive); + let sort_key = 0xFFFFFFFFu - depth_bits; + sort_depths[write_idx] = sort_key; + sort_indices[write_idx] = write_idx; let keys_per_dispatch = workgroupSize * sortKeyPerThread; - // increment DispatchIndirect.dispatchx each time you reach limit for one dispatch of keys + + if ((write_idx % keys_per_dispatch) == 0u) { + _ = atomicAdd(&sort_dispatch.dispatch_x, 1u); + } } \ No newline at end of file diff --git a/vite.config.ts b/vite.config.ts index 8c4aaa5..7d158bb 100644 --- a/vite.config.ts +++ b/vite.config.ts @@ -2,6 +2,9 @@ import rawPlugin from 'vite-raw-plugin'; import { defineConfig } from 'vite' export default defineConfig({ + server: { + open: true, + }, build: { target: 'esnext' },