nomic-ai · bmschmidt · Dec 17, 2024 · graphite-app · Dec 18, 2024
diff --git a/src/webGPU/buffertools.ts b/src/webGPU/buffertools.ts
@@ -0,0 +1,127 @@
+import { isTypedArray, type TypedArray } from 'webgpu-utils';
+import { BufferSet } from '../regl_rendering';
+import { WebGPUBufferLocation } from '../types';
+// I track locations on buffers like this.
+// We keep track of both size -- the number of meaningful data bytes
+// and paddedSize -- the number of bytes including 256-byte padding.
+
+export class WebGPUBufferSet extends BufferSet<GPUBuffer, WebGPUBufferLocation> {
+	// Copied with alterations from deepscatter
+	// An abstraction creating an expandable set of buffers that can be subdivided to put
+	// more than one variable on the same block of memory. Reusing buffers this way can have
+	// performance benefits over allocating multiple different buffers for each small block.
+	// The general purpose here is to call 'allocate_block' that releases a block of memory
+	// on a GPU buffer; 'set' uploads a typed array into such a block and records it by key.
+
+	public device: GPUDevice;
+	private stagingBuffer: GPUBuffer;
+	public usage: number;
+	public store: Map<string, WebGPUBufferLocation> = new Map();
+
+	/**
+	 * @param device The WebGPU device that all buffers are created on.
+	 * @param buffer_size The number of bytes on each strip of memory that we'll ask for.
+	 * @param usage GPUBufferUsage flags given to each strip of memory.
+	 */
+	constructor(
+		device: GPUDevice,
+		buffer_size: number,
+		usage: number = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC
+	) {
+		super(buffer_size)
+		this.device = device;
+		this.usage = usage;
+		this.generate_new_buffer();
+		// Mappable scratch buffer that every upload passes through on its way to the GPU.
+		this.stagingBuffer = device.createBuffer({
+			size: buffer_size,
+			usage: GPUBufferUsage.COPY_SRC | GPUBufferUsage.MAP_WRITE,
+			mappedAtCreation: false // saves a little trouble in the passThrough function
+		});
+	}
+
+	// Writes `values` to the staging buffer, then copies `paddedSize` bytes to `bufferLocation`.
+	private async passThroughStagingBuffer(values: Uint32Array, bufferLocation: WebGPUBufferLocation) {
+		const { buffer, offset, paddedSize } = bufferLocation;
+		while (this.stagingBuffer.mapState !== 'unmapped') {
+			// Wait in line for a millisecond.
+			// Would be better to hold a queue and apply more than one of these at once.
+			await new Promise((resolve) => setTimeout(resolve, 1));
+		}
+		await this.stagingBuffer.mapAsync(GPUMapMode.WRITE, 0, paddedSize);
+		new Uint32Array(this.stagingBuffer.getMappedRange(0, values.byteLength)).set(values);
+		this.stagingBuffer.unmap();
+		const commandEncoder = this.device.createCommandEncoder();
+		commandEncoder.copyBufferToBuffer(this.stagingBuffer, 0, buffer, offset, paddedSize);
+		this.device.queue.submit([commandEncoder.finish()]);
+	}
+
+	register(k: string, v: WebGPUBufferLocation) {
+		this.store.set(k, v);
+	}
+
+	// Uploads `value` into a newly allocated block under `key`; throws if `key` is already set.
+	async set(key: string, value: TypedArray) {
+		if (this.store.has(key)) {
+			throw new Error(`Key ${key} already exists in buffer set.`);
+		}
+		const size = value.byteLength;
+		const paddedSize = Math.ceil(size / 256) * 256;
+		const { buffer, offset } = this.allocate_block(paddedSize);
+		// The staging copy works on 32-bit words. An in-place view needs 4-byte alignment and a
+		// whole number of words (else it throws or drops bytes); otherwise copy with zero padding.
+		let data: Uint32Array;
+		if (value.byteOffset % 4 === 0 && size % 4 === 0) {
+			data = new Uint32Array(value.buffer, value.byteOffset, size / 4);
+		} else {
+			const bytes = new Uint8Array(Math.ceil(size / 4) * 4);
+			bytes.set(new Uint8Array(value.buffer, value.byteOffset, size));
+			data = new Uint32Array(bytes.buffer);
+		}
+		const description = { buffer, offset, size, paddedSize };
+		await this.passThroughStagingBuffer(data, description);
+		this.register(key, description);
+	}
+
+	_create_buffer() : GPUBuffer {
+		return this.device.createBuffer({
+			size: this.buffer_size,
+			usage: this.usage,
+			mappedAtCreation: false
+		})
+	}
+
+	_create_leftover_buffer() : WebGPUBufferLocation {
+		return {
+			buffer: this.buffers[0],
+			offset: this.pointer,
+			stride: 4, // meaningless here.
+			byte_size: this.buffer_size - this.pointer,
+			paddedSize: this.buffer_size - this.pointer
+		}
+	}
+}
+
+
+export function createSingletonBuffer(
+	device: GPUDevice,
+	data: Uint32Array | Int32Array | Float32Array | TypedArray | ArrayBuffer,
+	usage: number
+): GPUBuffer {
+	// Creates a disposable singleton buffer holding a byte-for-byte copy of `data`.
+	// ReadonlyBufferSet ought to provide better performance; but
+	// this allows more different buffer sizes and easier destruction.
+	// The bytes are copied through Uint8Array views: Uint32Array.set() converts element
+	// values (truncating floats) and throws when a 1- or 2-byte-element source is too long.
+	const bytes = isTypedArray(data)
+		? new Uint8Array((data as TypedArray).buffer, (data as TypedArray).byteOffset, data.byteLength)
+		: new Uint8Array(data as ArrayBuffer);
+	const buffer = device.createBuffer({
+		size: Math.ceil(bytes.byteLength / 4) * 4, // mappedAtCreation needs a multiple of 4
+		usage,
+		mappedAtCreation: true
+	});
+	new Uint8Array(buffer.getMappedRange()).set(bytes);
+	buffer.unmap();
+	return buffer;
+}
diff --git a/src/webGPU/forests.ts b/src/webGPU/forests.ts
@@ -0,0 +1,170 @@
+import { createSingletonBuffer, WebGPUBufferSet } from "./buffertools";
+import { StatefulGPU } from "./lib";
+
+type TinyForestParams = {
+  nTrees: number; // Number of trees in the forest.
+  depth: number; // Each tree holds 2**depth bitmasks per dimension.
+  // The number of features to consider at each split.
+  maxFeatures: number;
+  D: number; // Number of dimensions that features are drawn from.
+}
+// Defaults for every parameter not overridden in the TinyForest constructor.
+const defaultTinyForestParams : TinyForestParams = {
+  nTrees: 128,
+  depth: 8,
+  maxFeatures: 32,
+  D: 768,
+}
+
+export class TinyForest extends StatefulGPU {
+  params: TinyForestParams;
+
+  private _bootstrapSamples?: GPUBuffer; // On the order of 100 KB
+  protected _forests?: GPUBuffer // On the order of 10 MB.
+  // private trainedThrough: number = 0;
+  // `t` overrides individual default parameters; `bufferSize` is in bytes.
+  constructor(
+    device: GPUDevice,
+    bufferSize = 1024 * 1024 * 256,
+    t: Partial<TinyForestParams> = {}) {
+    super(device, bufferSize)
+    this.params = {...defaultTinyForestParams, ...t}
+    this.initializeForestsToZero()
+    this.bufferSet = new WebGPUBufferSet(device, bufferSize);
+  }
+
+  // Builds the compute pipeline for the count pass.
+  // NOTE(review): the WGSL below is still a stub: it has no `main` entry point, and
+  // `array<u16>` and `var<storage, write>` are not valid WGSL, so it cannot compile yet.
+  countPipeline(): GPUComputePipeline {
+    const { device } = this;
+    // const { maxFeatures, nTrees } = this.params
+    // const OPTIONS = 2;
+    // const countBuffer = device.createBuffer({
+    //   size: OPTIONS * maxFeatures * nTrees * 4,
+    //   usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
+    //   mappedAtCreation: false
+    // });
+
+    const layout = device.createBindGroupLayout({
+      entries: [
+        {
+          // features buffer; `var<storage, read>` in WGSL needs 'read-only-storage' here.
+          binding: 0,
+          visibility: GPUShaderStage.COMPUTE,
+          buffer: { type: 'read-only-storage' }
+        },
+        {
+          // dims to check array; also read-only in the shader.
+          binding: 1,
+          visibility: GPUShaderStage.COMPUTE,
+          buffer: { type: 'read-only-storage' }
+        },
+        {
+          // output count buffer.
+          binding: 2,
+          visibility: GPUShaderStage.COMPUTE,
+          buffer: { type: 'storage' }
+        }
+      ]
+    })
+
+    // const subsetsToCheck = this.chooseNextFeatures();
+    const pipelineLayout = device.createPipelineLayout({ bindGroupLayouts: [layout] });
+
+    const shaderModule = device.createShaderModule({ code: `
+      @group(0) @binding(0) var<storage, read> features: array<u32>;
+      @group(0) @binding(1) var<storage, read> dimsToCheck: array<u16>;
+      @group(0) @binding(2) var<storage, write> counts: array<u32>;
+
+      @compute @workgroup_size(64)
+      //TODOD HERE
+      ` });
+
+    return device.createComputePipeline({
+      layout: pipelineLayout,
+      compute: {
+        module: shaderModule,
+        entryPoint: 'main'
+      }
+    });
+  }
+
+  // Picks `maxFeatures` distinct random dimensions below `D` for each tree and uploads
+  // them, ascending within each tree, as consecutive 16-bit values. `n` is unused so far.
+  // Throws a RangeError if `maxFeatures` exceeds `D`, since no such selection exists.
+  //@ts-expect-error foo
+  private chooseNextFeatures(n = 32) {
+    const { maxFeatures, nTrees, D } = this.params;
+    if (maxFeatures > D) {
+      throw new RangeError(`Cannot choose ${maxFeatures} distinct features from ${D} dimensions.`);
+    }
+    // One slot per (tree, feature); an even count keeps the byte length a multiple of 4.
+    const features = new Uint16Array(Math.ceil((maxFeatures * nTrees) / 2) * 2);
+    for (let i = 0; i < nTrees; i++) {
+      const set = new Set<number>();
+      while (set.size < maxFeatures) {
+        set.add(Math.floor(Math.random() * D));
+      }
+      // Sort numerically; the default comparator would order the numbers as strings.
+      const arr = new Uint16Array([...set].sort((a, b) => a - b));
+      features.set(arr, i * maxFeatures);
+    }
+    // Hand over the raw bytes so the 16-bit values are copied bit-for-bit.
+    return createSingletonBuffer(
+      this.device,
+      features.buffer,
+      GPUBufferUsage.STORAGE
+    )
+  }
+
+  initializeForestsToZero() {
+    // Each tree is a set of bits; For every possible configuration
+    // the first D indicating
+    // the desired outcome for the dimension,
+    // the second D indicating whether the bits in those
+    // positions are to be considered in checking if the tree
+    // fits. There are 2**depth bitmasks for each dimension--each point
+    // will match only one, and part of the inference task is determining which one.
+    const treeSizeInBytes =
+      2 * this.params.D * (2 ** this.params.depth) / 8;
+    // Whole 4-byte words, handed over as raw bytes so they are copied bit-for-bit.
+    const data = new Uint8Array(Math.ceil((treeSizeInBytes * this.params.nTrees) / 4) * 4)
+    this._forests = createSingletonBuffer(
+      this.device,
+      data.buffer,
+      GPUBufferUsage.STORAGE
+    )
+  }
+
+  // Rather than actually bootstrap, we generate a single
+  // list of 100,000 numbers drawn from a poisson distribution.
+  // These serve as weights for draws with replacement; to
+  // bootstrap any given record batch, we take a sequence of
+  // numbers from the buffer with offset i.
+  get bootstrapSamples() {
+    if (!this._bootstrapSamples) {
+      const arr = new Uint8Array(100000)
+      for (let i = 0; i < arr.length; i++) {
+        arr[i] = poissonRandomNumber()
+      }
+      // Hand over the raw bytes so each one-byte weight is copied bit-for-bit.
+      this._bootstrapSamples = createSingletonBuffer(this.device, arr.buffer, GPUBufferUsage.STORAGE)
+    }
+    return this._bootstrapSamples
+  }
+}
+
+
+function poissonRandomNumber() : number {
+  // Multiply uniform draws together until the running product is no longer above
+  // 1/e; the number of draws before the last one is the result (Poisson, mean 1).
+  let product = Math.random();
+  let extraDraws = 0;
+  while (product > 1/Math.E) {
+    extraDraws += 1;
+    product *= Math.random();
+  }
+  return extraDraws;
+}
+