From 1e1ae8ead174ffb1f222604852d11f670602c25a Mon Sep 17 00:00:00 2001
From: Ben Schmidt <bmschmidt@gmail.com>
Date: Tue, 17 Dec 2024 11:57:38 -0500
Subject: [PATCH] webGPU resources

---
 src/webGPU/buffertools.ts | 127 +++++++++++++++++++++++++
 src/webGPU/forests.ts     | 170 +++++++++++++++++++++++++++++++++
 src/webGPU/lib.ts         | 194 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 491 insertions(+)
 create mode 100644 src/webGPU/buffertools.ts
 create mode 100644 src/webGPU/forests.ts
 create mode 100644 src/webGPU/lib.ts

diff --git a/src/webGPU/buffertools.ts b/src/webGPU/buffertools.ts
new file mode 100644
index 00000000..b980ad08
--- /dev/null
+++ b/src/webGPU/buffertools.ts
@@ -0,0 +1,127 @@
+import { isTypedArray, type TypedArray } from 'webgpu-utils';
+import { BufferSet } from '../regl_rendering';
+import { WebGPUBufferLocation } from '../types';
+// I track locations on buffers like this.
+// We keep track of both size -- the number of meaningful data bytes
+// and paddedSize -- the number of bytes including 256-byte padding.
+
+export class WebGPUBufferSet extends BufferSet<GPUBuffer, WebGPUBufferLocation> {
+	// Copied with alterations from deepscatter
+
+	// An abstraction creating an expandable set of buffers that can be subdivided
+	// to put more than one variable on the same
+	// block of memory. Reusing buffers this way can have performance benefits over allocating
+	// multiple different buffers for each small block used.
+
+	// The general purpose here is to call 'allocate_block' that releases a block of memory
+	// to use in creating a new array to be passed to regl.
+
+	public device: GPUDevice;
+	private stagingBuffer: GPUBuffer;
+	public usage: number;
+
+	public store: Map<string, WebGPUBufferLocation> = new Map();
+
+	/**
+	 *
+	 * @param regl the Regl context we're using.
+	 * @param buffer_size The number of bytes on each strip of memory that we'll ask for.
+	 */
+
+	constructor(
+		device: GPUDevice,
+		buffer_size: number,
+		usage: number = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC
+	) {
+		super(buffer_size)
+		this.device = device;
+		// Track the ends in case we want to allocate smaller items.
+		this.usage = usage;
+		this.generate_new_buffer();
+		this.stagingBuffer = device.createBuffer({
+			size: buffer_size,
+			usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE,
+			mappedAtCreation: false // saves a little trouble in the passThrough function
+		});
+	}
+
+	private async passThroughStagingBuffer(values: Uint32Array, bufferLocation: WebGPUBufferLocation) {
+		// WebGPU 
+		const { buffer, offset, paddedSize } = bufferLocation;
+		while (this.stagingBuffer.mapState !== 'unmapped') {
+			// Wait in line for a millisecond.
+			// Would be better to hold a queue and apply more than one of these at once.
+			await new Promise((resolve) => setTimeout(resolve, 1));
+		}
+		await this.stagingBuffer.mapAsync(GPUMapMode.WRITE, 0, paddedSize);
+		new Uint32Array(this.stagingBuffer.getMappedRange(0, values.byteLength)).set(values);
+		this.stagingBuffer.unmap();
+		const commandEncoder = this.device.createCommandEncoder();
+		commandEncoder.copyBufferToBuffer(this.stagingBuffer, 0, buffer, offset, paddedSize);
+		this.device.queue.submit([commandEncoder.finish()]);
+	}
+
+	register(k: string, v: WebGPUBufferLocation) {
+		this.store.set(k, v);
+	}
+
+	async set(key: string, value: TypedArray) {
+		if (this.store.has(key)) {
+			throw new Error(`Key ${key} already exists in buffer set.`);
+		}
+		const size = value.byteLength;
+		const paddedSize = Math.ceil(size / 256) * 256;
+
+		const { buffer, offset } = this.allocate_block(paddedSize);
+
+		// If it's a typed array, we can just copy it directly.
+		// cast it to uint32array
+		const v2 = value;
+		const data = new Uint32Array(v2.buffer, v2.byteOffset, v2.byteLength / 4);
+		const description = { buffer, offset, size, paddedSize };
+		await this.passThroughStagingBuffer(data, description);
+		this.register(key, description);
+	}
+
+		_create_buffer() : GPUBuffer {
+			return this.device.createBuffer({
+				size: this.buffer_size,
+				usage: this.usage,
+				mappedAtCreation: false
+			})
+		}
+	
+		_create_leftover_buffer() : WebGPUBufferLocation {
+			return {
+					buffer: this.buffers[0],
+					offset: this.pointer,
+					stride: 4, // meaningless here.
+					byte_size: this.buffer_size - this.pointer,
+					paddedSize: this.buffer_size - this.pointer
+			}
+		}
+}
+
+
+export function createSingletonBuffer(
+	device: GPUDevice,
+	data: Uint32Array | Int32Array | Float32Array | ArrayBuffer,
+	usage: number
+): GPUBuffer {
+	// Creates a disposable singleton buffer.
+	// ReadonlyBufferSet ought to provide better performance; but
+	// this allows more different buffer sizes and easier destruction.
+	const buffer = device.createBuffer({
+		size: data.byteLength,
+		usage,
+		mappedAtCreation: true
+	});
+	const mappedRange = buffer.getMappedRange();
+	if (isTypedArray(data)) {
+		new Uint32Array(mappedRange).set(data as TypedArray);
+	} else {
+		new Uint32Array(mappedRange).set(new Uint32Array(data as ArrayBuffer));
+	}
+	buffer.unmap();
+	return buffer;
+}
diff --git a/src/webGPU/forests.ts b/src/webGPU/forests.ts
new file mode 100644
index 00000000..2c24626f
--- /dev/null
+++ b/src/webGPU/forests.ts
@@ -0,0 +1,170 @@
+import { createSingletonBuffer, WebGPUBufferSet } from "./buffertools";
+import { StatefulGPU } from "./lib";
+
+type TinyForestParams = {
+  nTrees: number;
+  depth: number;
+  // The number of features to consider at each split.
+  maxFeatures: number;
+  D: number;
+}
+
+const defaultTinyForestParams : TinyForestParams = {
+  nTrees: 128,
+  depth: 8,
+  maxFeatures: 32,
+  D: 768,
+}
+
+export class TinyForest extends StatefulGPU {
+  params: TinyForestParams;
+  
+  private _bootstrapSamples?: GPUBuffer; // On the order of 100 KB
+  protected _forests?: GPUBuffer // On the order of 10 MB.
+  // private trainedThrough: number = 0;
+  constructor(
+    device: GPUDevice, 
+    bufferSize = 1024 * 1024 * 256, 
+    t: Partial<TinyForestParams> = {}) {
+    super(device, bufferSize)
+    this.params = {...defaultTinyForestParams, ...t}
+    this.initializeForestsToZero()
+    this.bufferSet = new WebGPUBufferSet(device, bufferSize);
+  }
+
+  countPipeline(): GPUComputePipeline {
+    const { device } = this;
+    // const { maxFeatures, nTrees } = this.params
+    // const OPTIONS = 2;
+    // const countBuffer = device.createBuffer({
+    //   size: OPTIONS * maxFeatures * nTrees * 4,
+    //   usage: GPUBufferUsage.STORAGE & GPUBufferUsage.COPY_SRC,
+    //   mappedAtCreation: false
+    // });
+
+    const layout = device.createBindGroupLayout({
+      entries: [
+        {
+          // features buffer;
+          binding: 0,
+          visibility: GPUShaderStage.COMPUTE,
+          buffer: { type: 'storage' }
+        },
+        {
+          // dims to check array;
+          binding: 1,
+          visibility: GPUShaderStage.COMPUTE,
+          buffer: { type: 'storage' }
+        },
+        {
+          // output count buffer.
+          binding: 2,
+          visibility: GPUShaderStage.COMPUTE,
+          buffer: { type: 'storage' }
+        }
+      ]
+    })
+
+    // const subsetsToCheck = this.chooseNextFeatures();
+    const pipelineLayout = device.createPipelineLayout({ bindGroupLayouts: [layout] });
+
+    const shaderModule = device.createShaderModule({ code: `
+      @group(0) @binding(0) var<storage, read> features: array<u32>;
+      @group(0) @binding(1) var<storage, read> dimsToCheck: array<u16>;
+      @group(0) @binding(2) var<storage, write> counts: array<u32>;
+
+      @compute @workgroup_size(64)
+      //TODOD HERE
+      ` });
+
+
+    return device.createComputePipeline({
+      layout: pipelineLayout,
+      compute: {
+        module: shaderModule,
+        entryPoint: 'main'
+      }
+    });
+  }
+
+  //@ts-expect-error foo
+  private chooseNextFeatures(n = 32) {
+    console.log({n})
+    const { maxFeatures, nTrees, D } = this.params;
+    const features = new Uint16Array(maxFeatures * D);
+    for (let i = 0; i < nTrees; i++) {
+      const set = new Set<number>();
+      while (set.size < maxFeatures) {
+        set.add(Math.floor(Math.random() * D));
+      }
+      const arr = new Uint16Array([...set].sort());
+      features.set(arr, i * maxFeatures);
+    }
+    return createSingletonBuffer(
+      this.device,
+      features,
+      GPUBufferUsage.STORAGE
+    )
+  }
+
+
+
+  initializeForestsToZero() {
+    // Each tree is a set of bits; For every possible configuration 
+    // the first D indicating 
+    // the desired outcome for the dimension,
+    // the second D indicating whether the bits in those
+    // positions are to be considered in checking if the tree
+    // fits. There are 2**depth bitmasks for each dimension--each point
+    // will match only one, and part of the inference task is determining which one.
+
+    const treeSizeInBytes = 
+      2 * this.params.D * (2 ** this.params.depth) / 8;
+
+    const data = new Uint8Array(treeSizeInBytes * this.params.nTrees)
+    this._forests = createSingletonBuffer(
+      this.device,
+      data,
+      GPUBufferUsage.STORAGE
+    )
+  }
+  
+
+  // Rather than actually bootstrap, we generate a single
+  // list of 100,000 numbers drawn from a poisson distribution.
+  // These serve as weights for draws with replacement; to 
+  // bootstrap any given record batch, we take a sequence of
+  // numbers from the buffer with offset i. 
+  get bootstrapSamples() {
+    if (this._bootstrapSamples) {
+      return this._bootstrapSamples
+    } else {
+      const arr = new Uint8Array(100000)
+      for (let i = 0; i < arr.length; i++) {
+        arr[i] = poissonRandomNumber()
+      }
+      this._bootstrapSamples = createSingletonBuffer(
+        this.device,
+        arr,
+        GPUBufferUsage.STORAGE
+      )
+      return this._bootstrapSamples
+    }
+  }
+
+  
+}
+
+
+function poissonRandomNumber() : number {
+  let p = 1.0;
+  let k = 0;
+
+  do {
+    k++;
+    p *= Math.random();
+  } while (p > 1/Math.E);
+
+  return k - 1;
+}
+
diff --git a/src/webGPU/lib.ts b/src/webGPU/lib.ts
new file mode 100644
index 00000000..4ad88694
--- /dev/null
+++ b/src/webGPU/lib.ts
@@ -0,0 +1,194 @@
+import { makeShaderDataDefinitions, makeStructuredView } from 'webgpu-utils';
+import { WebGPUBufferSet, createSingletonBuffer } from './buffertools';
+import { Scatterplot, Tile } from '../deepscatter';
+
+export class StatefulGPU {
+	device: GPUDevice;
+	bufferSet: WebGPUBufferSet;
+	constructor(device: GPUDevice, bufferSize = 1024 * 1024 * 256) {
+		this.device = device;
+		this.bufferSet = new WebGPUBufferSet(device, bufferSize);
+	}
+	static async initializeWebGPU(): Promise<StatefulGPU> {
+		if (!navigator.gpu) {
+			throw new Error('WebGPU is not supported in this browser.');
+		}
+	
+		const adapter = await navigator.gpu.requestAdapter();
+		if (!adapter) {
+			throw new Error('Failed to get GPU adapter.');
+		}
+	
+		const device = await adapter.requestDevice();
+		return new StatefulGPU(device);
+	}
+}
+
+const bindGroupLayout = (device: GPUDevice) =>
+	device.createBindGroupLayout({
+		entries: [
+			{
+				binding: 0,
+				visibility: GPUShaderStage.COMPUTE,
+				buffer: { type: 'read-only-storage' }
+			},
+			{
+				binding: 1,
+				visibility: GPUShaderStage.COMPUTE,
+				buffer: { type: 'read-only-storage' }
+			},
+			{
+				binding: 2,
+				visibility: GPUShaderStage.COMPUTE,
+				buffer: { type: 'storage' }
+			},
+			{
+				binding: 3,
+				visibility: GPUShaderStage.COMPUTE,
+				buffer: { type: 'uniform' }
+			}
+		]
+	});
+
+  export function prepareComputeShader(
+	state: StatefulGPU,
+	comparisonArray: Uint32Array,
+	embeddingSize: number = 128
+): (tile, key) => Promise<Uint32Array> {
+	// Create buffers
+	const { device, bufferSet } = state;
+	const comparisonBuffer = createSingletonBuffer(
+		device,
+		comparisonArray,
+		GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC
+	);
+
+	// Shader code
+	const shaderCode = `
+
+  struct SizeEtc {
+    objectSize: u32,
+  };
+  
+  @group(0) @binding(0) var<storage, read> comparisonArray : array<u32>;
+  @group(0) @binding(1) var<storage, read> matrixArray : array<u32>;
+  @group(0) @binding(2) var<storage, read_write> outputArray : array<u32>;
+  @group(0) @binding(3) var<uniform> myUniforms: SizeEtc;
+  
+  @compute @workgroup_size(64)
+  fn main(@builtin(global_invocation_id) global_id : vec3<u32>) {
+      let idx = global_id.x;
+			let o = myUniforms.objectSize;
+      if (idx < arrayLength(&matrixArray)) {
+          var totalDistance: u32 = 0;
+          for (var i: u32 = 0; i < o; i = i + 1) {
+              for (var j: u32 = 0; j < arrayLength(&comparisonArray) / o; j = j + 1) {
+                totalDistance = totalDistance + countOneBits(comparisonArray[j * o + i] ^ matrixArray[idx * o + i]);
+              }
+          }
+          outputArray[global_id.x] = totalDistance;
+      }
+  }
+`;
+
+	const defs = makeShaderDataDefinitions(shaderCode);
+	const myUniformValues = makeStructuredView(defs.uniforms.myUniforms);
+	myUniformValues.set({
+		objectSize: embeddingSize / 32
+	});
+	const layout = device.createPipelineLayout({ bindGroupLayouts: [bindGroupLayout(device)] });
+	// Create shader module and pipeline
+	const shaderModule = device.createShaderModule({ code: shaderCode });
+	const pipeline = device.createComputePipeline({
+		layout,
+		compute: {
+			module: shaderModule,
+			entryPoint: 'main'
+		}
+	});
+	const uniformBuffer = createSingletonBuffer(
+		device,
+		myUniformValues.arrayBuffer,
+		GPUBufferUsage.UNIFORM
+	);
+
+	const run = async function (tile: Tile, fieldName: string) {
+		const commandEncoder = device.createCommandEncoder();
+		const key = `${tile.key}_${fieldName}`;
+		if (!bufferSet.store.has(key)) {
+			const values = (await tile.get_column(fieldName)).data[0].children[0].values as Uint8Array;
+			await bufferSet.set(key, values);
+		}
+		const { buffer, offset, byte_size: size } = bufferSet.store.get(key);
+		const outputSize = (size / embeddingSize) * 8;
+		const paddedSize = Math.ceil(outputSize / 4) * 4;
+
+		// TODO this should be a permanent buffer.
+		const outputBuffer = device.createBuffer({
+			// Put a ceiling on it.
+			size: paddedSize * 4,
+			usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.STORAGE
+		});
+
+		const passEncoder = commandEncoder.beginComputePass();
+		passEncoder.setPipeline(pipeline);
+		passEncoder.setBindGroup(
+			0,
+			device.createBindGroup({
+				layout: pipeline.getBindGroupLayout(0),
+				entries: [
+					{ binding: 0, resource: { buffer: comparisonBuffer } },
+					{ binding: 1, resource: { buffer, offset, size } },
+					{ binding: 2, resource: { buffer: outputBuffer } },
+					{ binding: 3, resource: { buffer: uniformBuffer } }
+				]
+			})
+		);
+
+		passEncoder.dispatchWorkgroups(size / 4 / 64);
+		passEncoder.end();
+
+		// Submit the commands
+		const gpuReadBuffer = device.createBuffer({
+			size: paddedSize * 4,
+			usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ
+		});
+
+		commandEncoder.copyBufferToBuffer(outputBuffer, 0, gpuReadBuffer, 0, paddedSize * 4);
+		device.queue.submit([commandEncoder.finish()]);
+
+		// Read back the results
+		await gpuReadBuffer.mapAsync(GPUMapMode.READ);
+		const outputArray = new Uint32Array(gpuReadBuffer.getMappedRange());
+		return outputArray.slice(0, outputSize);
+	};
+	return run;
+}
+
+// hide the state in a global variable.
+const dumb: StatefulGPU[] = [];
+
+export async function create_hamming_transform(
+	scatterplot: Scatterplot,
+	id: string,
+	view: Uint8Array,
+	dims: number,
+	column: string
+) {
+	if (dumb.length === 0) {
+		dumb.push(await StatefulGPU.initializeWebGPU());
+	}
+	if (scatterplot.dataset.transformations[id] !== undefined) {
+		return;
+	}
+	// Cast from int8 to int32
+	const comparisonArray = new Uint32Array(view.buffer);
+	const run = prepareComputeShader(dumb[0], comparisonArray, dims);
+
+	scatterplot.dataset.transformations[id] = async function (tile) {
+		const value = await run(tile, column);
+		const scaled = [...value].map((d) => d  / ( comparisonArray.length * 32 / dims));
+		return 	new Float32Array(scaled)
+	};
+	await scatterplot.dataset.root_tile.get_column(id);
+}