From 1e1ae8ead174ffb1f222604852d11f670602c25a Mon Sep 17 00:00:00 2001 From: Ben Schmidt Date: Tue, 17 Dec 2024 11:57:38 -0500 Subject: [PATCH] webGPU resources --- src/webGPU/buffertools.ts | 127 +++++++++++++++++++++++++ src/webGPU/forests.ts | 170 +++++++++++++++++++++++++++++++++ src/webGPU/lib.ts | 194 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 491 insertions(+) create mode 100644 src/webGPU/buffertools.ts create mode 100644 src/webGPU/forests.ts create mode 100644 src/webGPU/lib.ts diff --git a/src/webGPU/buffertools.ts b/src/webGPU/buffertools.ts new file mode 100644 index 00000000..b980ad08 --- /dev/null +++ b/src/webGPU/buffertools.ts @@ -0,0 +1,127 @@ +import { isTypedArray, type TypedArray } from 'webgpu-utils'; +import { BufferSet } from '../regl_rendering'; +import { WebGPUBufferLocation } from '../types'; +// I track locations on buffers like this. +// We keep track of both size -- the number of meaningful data bytes +// and paddedSize -- the number of bytes including 256-byte padding. + +export class WebGPUBufferSet extends BufferSet { + // Copied with alterations from deepscatter + + // An abstraction creating an expandable set of buffers that can be subdivided + // to put more than one variable on the same + // block of memory. Reusing buffers this way can have performance benefits over allocating + // multiple different buffers for each small block used. + + // The general purpose here is to call 'allocate_block' that releases a block of memory + // to use in creating a new array to be passed to regl. + + public device: GPUDevice; + private stagingBuffer: GPUBuffer; + public usage: number; + + public store: Map = new Map(); + + /** + * + * @param regl the Regl context we're using. + * @param buffer_size The number of bytes on each strip of memory that we'll ask for. + */ + + constructor( + device: GPUDevice, + buffer_size: number, + usage: number = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC + ) { + super(buffer_size) + this.device = device; + // Track the ends in case we want to allocate smaller items. + this.usage = usage; + this.generate_new_buffer(); + this.stagingBuffer = device.createBuffer({ + size: buffer_size, + usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE, + mappedAtCreation: false // saves a little trouble in the passThrough function + }); + } + + private async passThroughStagingBuffer(values: Uint32Array, bufferLocation: WebGPUBufferLocation) { + // WebGPU + const { buffer, offset, paddedSize } = bufferLocation; + while (this.stagingBuffer.mapState !== 'unmapped') { + // Wait in line for a millisecond. + // Would be better to hold a queue and apply more than one of these at once. + await new Promise((resolve) => setTimeout(resolve, 1)); + } + await this.stagingBuffer.mapAsync(GPUMapMode.WRITE, 0, paddedSize); + new Uint32Array(this.stagingBuffer.getMappedRange(0, values.byteLength)).set(values); + this.stagingBuffer.unmap(); + const commandEncoder = this.device.createCommandEncoder(); + commandEncoder.copyBufferToBuffer(this.stagingBuffer, 0, buffer, offset, paddedSize); + this.device.queue.submit([commandEncoder.finish()]); + } + + register(k: string, v: WebGPUBufferLocation) { + this.store.set(k, v); + } + + async set(key: string, value: TypedArray) { + if (this.store.has(key)) { + throw new Error(`Key ${key} already exists in buffer set.`); + } + const size = value.byteLength; + const paddedSize = Math.ceil(size / 256) * 256; + + const { buffer, offset } = this.allocate_block(paddedSize); + + // If it's a typed array, we can just copy it directly. + // cast it to uint32array + const v2 = value; + const data = new Uint32Array(v2.buffer, v2.byteOffset, v2.byteLength / 4); + const description = { buffer, offset, size, paddedSize }; + await this.passThroughStagingBuffer(data, description); + this.register(key, description); + } + + _create_buffer() : GPUBuffer { + return this.device.createBuffer({ + size: this.buffer_size, + usage: this.usage, + mappedAtCreation: false + }) + } + + _create_leftover_buffer() : WebGPUBufferLocation { + return { + buffer: this.buffers[0], + offset: this.pointer, + stride: 4, // meaningless here. + byte_size: this.buffer_size - this.pointer, + paddedSize: this.buffer_size - this.pointer + } + } +} + + +export function createSingletonBuffer( + device: GPUDevice, + data: Uint32Array | Int32Array | Float32Array | ArrayBuffer, + usage: number +): GPUBuffer { + // Creates a disposable singleton buffer. + // ReadonlyBufferSet ought to provide better performance; but + // this allows more different buffer sizes and easier destruction. + const buffer = device.createBuffer({ + size: data.byteLength, + usage, + mappedAtCreation: true + }); + const mappedRange = buffer.getMappedRange(); + if (isTypedArray(data)) { + new Uint32Array(mappedRange).set(data as TypedArray); + } else { + new Uint32Array(mappedRange).set(new Uint32Array(data as ArrayBuffer)); + } + buffer.unmap(); + return buffer; +} diff --git a/src/webGPU/forests.ts b/src/webGPU/forests.ts new file mode 100644 index 00000000..2c24626f --- /dev/null +++ b/src/webGPU/forests.ts @@ -0,0 +1,170 @@ +import { createSingletonBuffer, WebGPUBufferSet } from "./buffertools"; +import { StatefulGPU } from "./lib"; + +type TinyForestParams = { + nTrees: number; + depth: number; + // The number of features to consider at each split. + maxFeatures: number; + D: number; +} + +const defaultTinyForestParams : TinyForestParams = { + nTrees: 128, + depth: 8, + maxFeatures: 32, + D: 768, +} + +export class TinyForest extends StatefulGPU { + params: TinyForestParams; + + private _bootstrapSamples?: GPUBuffer; // On the order of 100 KB + protected _forests?: GPUBuffer // On the order of 10 MB. + // private trainedThrough: number = 0; + constructor( + device: GPUDevice, + bufferSize = 1024 * 1024 * 256, + t: Partial = {}) { + super(device, bufferSize) + this.params = {...defaultTinyForestParams, ...t} + this.initializeForestsToZero() + this.bufferSet = new WebGPUBufferSet(device, bufferSize); + } + + countPipeline(): GPUComputePipeline { + const { device } = this; + // const { maxFeatures, nTrees } = this.params + // const OPTIONS = 2; + // const countBuffer = device.createBuffer({ + // size: OPTIONS * maxFeatures * nTrees * 4, + // usage: GPUBufferUsage.STORAGE & GPUBufferUsage.COPY_SRC, + // mappedAtCreation: false + // }); + + const layout = device.createBindGroupLayout({ + entries: [ + { + // features buffer; + binding: 0, + visibility: GPUShaderStage.COMPUTE, + buffer: { type: 'storage' } + }, + { + // dims to check array; + binding: 1, + visibility: GPUShaderStage.COMPUTE, + buffer: { type: 'storage' } + }, + { + // output count buffer. + binding: 2, + visibility: GPUShaderStage.COMPUTE, + buffer: { type: 'storage' } + } + ] + }) + + // const subsetsToCheck = this.chooseNextFeatures(); + const pipelineLayout = device.createPipelineLayout({ bindGroupLayouts: [layout] }); + + const shaderModule = device.createShaderModule({ code: ` + @group(0) @binding(0) var features: array; + @group(0) @binding(1) var dimsToCheck: array; + @group(0) @binding(2) var counts: array; + + @compute @workgroup_size(64) + //TODOD HERE + ` }); + + + return device.createComputePipeline({ + layout: pipelineLayout, + compute: { + module: shaderModule, + entryPoint: 'main' + } + }); + } + + //@ts-expect-error foo + private chooseNextFeatures(n = 32) { + console.log({n}) + const { maxFeatures, nTrees, D } = this.params; + const features = new Uint16Array(maxFeatures * D); + for (let i = 0; i < nTrees; i++) { + const set = new Set(); + while (set.size < maxFeatures) { + set.add(Math.floor(Math.random() * D)); + } + const arr = new Uint16Array([...set].sort()); + features.set(arr, i * maxFeatures); + } + return createSingletonBuffer( + this.device, + features, + GPUBufferUsage.STORAGE + ) + } + + + + initializeForestsToZero() { + // Each tree is a set of bits; For every possible configuration + // the first D indicating + // the desired outcome for the dimension, + // the second D indicating whether the bits in those + // positions are to be considered in checking if the tree + // fits. There are 2**depth bitmasks for each dimension--each point + // will match only one, and part of the inference task is determining which one. + + const treeSizeInBytes = + 2 * this.params.D * (2 ** this.params.depth) / 8; + + const data = new Uint8Array(treeSizeInBytes * this.params.nTrees) + this._forests = createSingletonBuffer( + this.device, + data, + GPUBufferUsage.STORAGE + ) + } + + + // Rather than actually bootstrap, we generate a single + // list of 100,000 numbers drawn from a poisson distribution. + // These serve as weights for draws with replacement; to + // bootstrap any given record batch, we take a sequence of + // numbers from the buffer with offset i. + get bootstrapSamples() { + if (this._bootstrapSamples) { + return this._bootstrapSamples + } else { + const arr = new Uint8Array(100000) + for (let i = 0; i < arr.length; i++) { + arr[i] = poissonRandomNumber() + } + this._bootstrapSamples = createSingletonBuffer( + this.device, + arr, + GPUBufferUsage.STORAGE + ) + return this._bootstrapSamples + } + } + + +} + + +function poissonRandomNumber() : number { + let p = 1.0; + let k = 0; + + do { + k++; + p *= Math.random(); + } while (p > 1/Math.E); + + return k - 1; +} + diff --git a/src/webGPU/lib.ts b/src/webGPU/lib.ts new file mode 100644 index 00000000..4ad88694 --- /dev/null +++ b/src/webGPU/lib.ts @@ -0,0 +1,194 @@ +import { makeShaderDataDefinitions, makeStructuredView } from 'webgpu-utils'; +import { WebGPUBufferSet, createSingletonBuffer } from './buffertools'; +import { Scatterplot, Tile } from '../deepscatter'; + +export class StatefulGPU { + device: GPUDevice; + bufferSet: WebGPUBufferSet; + constructor(device: GPUDevice, bufferSize = 1024 * 1024 * 256) { + this.device = device; + this.bufferSet = new WebGPUBufferSet(device, bufferSize); + } + static async initializeWebGPU(): Promise { + if (!navigator.gpu) { + throw new Error('WebGPU is not supported in this browser.'); + } + + const adapter = await navigator.gpu.requestAdapter(); + if (!adapter) { + throw new Error('Failed to get GPU adapter.'); + } + + const device = await adapter.requestDevice(); + return new StatefulGPU(device); + } +} + +const bindGroupLayout = (device: GPUDevice) => + device.createBindGroupLayout({ + entries: [ + { + binding: 0, + visibility: GPUShaderStage.COMPUTE, + buffer: { type: 'read-only-storage' } + }, + { + binding: 1, + visibility: GPUShaderStage.COMPUTE, + buffer: { type: 'read-only-storage' } + }, + { + binding: 2, + visibility: GPUShaderStage.COMPUTE, + buffer: { type: 'storage' } + }, + { + binding: 3, + visibility: GPUShaderStage.COMPUTE, + buffer: { type: 'uniform' } + } + ] + }); + + export function prepareComputeShader( + state: StatefulGPU, + comparisonArray: Uint32Array, + embeddingSize: number = 128 +): (tile, key) => Promise { + // Create buffers + const { device, bufferSet } = state; + const comparisonBuffer = createSingletonBuffer( + device, + comparisonArray, + GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC + ); + + // Shader code + const shaderCode = ` + + struct SizeEtc { + objectSize: u32, + }; + + @group(0) @binding(0) var comparisonArray : array; + @group(0) @binding(1) var matrixArray : array; + @group(0) @binding(2) var outputArray : array; + @group(0) @binding(3) var myUniforms: SizeEtc; + + @compute @workgroup_size(64) + fn main(@builtin(global_invocation_id) global_id : vec3) { + let idx = global_id.x; + let o = myUniforms.objectSize; + if (idx < arrayLength(&matrixArray)) { + var totalDistance: u32 = 0; + for (var i: u32 = 0; i < o; i = i + 1) { + for (var j: u32 = 0; j < arrayLength(&comparisonArray) / o; j = j + 1) { + totalDistance = totalDistance + countOneBits(comparisonArray[j * o + i] ^ matrixArray[idx * o + i]); + } + } + outputArray[global_id.x] = totalDistance; + } + } +`; + + const defs = makeShaderDataDefinitions(shaderCode); + const myUniformValues = makeStructuredView(defs.uniforms.myUniforms); + myUniformValues.set({ + objectSize: embeddingSize / 32 + }); + const layout = device.createPipelineLayout({ bindGroupLayouts: [bindGroupLayout(device)] }); + // Create shader module and pipeline + const shaderModule = device.createShaderModule({ code: shaderCode }); + const pipeline = device.createComputePipeline({ + layout, + compute: { + module: shaderModule, + entryPoint: 'main' + } + }); + const uniformBuffer = createSingletonBuffer( + device, + myUniformValues.arrayBuffer, + GPUBufferUsage.UNIFORM + ); + + const run = async function (tile: Tile, fieldName: string) { + const commandEncoder = device.createCommandEncoder(); + const key = `${tile.key}_${fieldName}`; + if (!bufferSet.store.has(key)) { + const values = (await tile.get_column(fieldName)).data[0].children[0].values as Uint8Array; + await bufferSet.set(key, values); + } + const { buffer, offset, byte_size: size } = bufferSet.store.get(key); + const outputSize = (size / embeddingSize) * 8; + const paddedSize = Math.ceil(outputSize / 4) * 4; + + // TODO this should be a permanent buffer. + const outputBuffer = device.createBuffer({ + // Put a ceiling on it. + size: paddedSize * 4, + usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.STORAGE + }); + + const passEncoder = commandEncoder.beginComputePass(); + passEncoder.setPipeline(pipeline); + passEncoder.setBindGroup( + 0, + device.createBindGroup({ + layout: pipeline.getBindGroupLayout(0), + entries: [ + { binding: 0, resource: { buffer: comparisonBuffer } }, + { binding: 1, resource: { buffer, offset, size } }, + { binding: 2, resource: { buffer: outputBuffer } }, + { binding: 3, resource: { buffer: uniformBuffer } } + ] + }) + ); + + passEncoder.dispatchWorkgroups(size / 4 / 64); + passEncoder.end(); + + // Submit the commands + const gpuReadBuffer = device.createBuffer({ + size: paddedSize * 4, + usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ + }); + + commandEncoder.copyBufferToBuffer(outputBuffer, 0, gpuReadBuffer, 0, paddedSize * 4); + device.queue.submit([commandEncoder.finish()]); + + // Read back the results + await gpuReadBuffer.mapAsync(GPUMapMode.READ); + const outputArray = new Uint32Array(gpuReadBuffer.getMappedRange()); + return outputArray.slice(0, outputSize); + }; + return run; +} + +// hide the state in a global variable. +const dumb: StatefulGPU[] = []; + +export async function create_hamming_transform( + scatterplot: Scatterplot, + id: string, + view: Uint8Array, + dims: number, + column: string +) { + if (dumb.length === 0) { + dumb.push(await StatefulGPU.initializeWebGPU()); + } + if (scatterplot.dataset.transformations[id] !== undefined) { + return; + } + // Cast from int8 to int32 + const comparisonArray = new Uint32Array(view.buffer); + const run = prepareComputeShader(dumb[0], comparisonArray, dims); + + scatterplot.dataset.transformations[id] = async function (tile) { + const value = await run(tile, column); + const scaled = [...value].map((d) => d / ( comparisonArray.length * 32 / dims)); + return new Float32Array(scaled) + }; + await scatterplot.dataset.root_tile.get_column(id); +}