Skip to content

Commit

Permalink
Merge branch 'main' into enable-messages-api
Browse files Browse the repository at this point in the history
  • Loading branch information
radames authored May 1, 2024
2 parents eec9f25 + 779c9f1 commit a97d7f7
Show file tree
Hide file tree
Showing 90 changed files with 2,558 additions and 981 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ jobs:
deno-version: vx.x.x
- name: E2E test - deno import from npm
working-directory: e2e/deno
run: deno run --allow-net index.ts
run: deno run --allow-net --allow-env=HF_TOKEN index.ts
env:
NPM_CONFIG_REGISTRY: http://localhost:4874/
HF_TOKEN: ${{ secrets.HF_TOKEN }}
4 changes: 3 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,7 @@
"json.format.enable": false,
"[json]": {
"editor.defaultFormatter": "esbenp.prettier-vscode"
}
},
"cSpell.words": ["huggingface"],
"deno.enablePaths": ["./e2e/deno"]
}
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ You can run our packages with vanilla JS, without any bundler, by using a CDN or

```html
<script type="module">
import { HfInference } from 'https://cdn.jsdelivr.net/npm/@huggingface/inference@2.6.6/+esm';
import { createRepo, commit, deleteRepo, listFiles } from "https://cdn.jsdelivr.net/npm/@huggingface/hub@0.14.6/+esm";
import { HfInference } from 'https://cdn.jsdelivr.net/npm/@huggingface/inference@2.6.7/+esm';
import { createRepo, commit, deleteRepo, listFiles } from "https://cdn.jsdelivr.net/npm/@huggingface/hub@0.14.10/+esm";
</script>
```

Expand Down
12 changes: 11 additions & 1 deletion e2e/deno/index.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import { HfInference } from "npm:@huggingface/inference@*";
import { whoAmI, listFiles } from "npm:@huggingface/hub@*";

const hf = new HfInference();
const token = Deno.env.get("HF_TOKEN");

if (!token) {
console.error("Please set the HF_TOKEN environment variable.");
Deno.exit(1);
}

const info = await whoAmI({ credentials: { accessToken: "hf_hub.js" }, hubUrl: "https://hub-ci.huggingface.co" });
console.log(info);
Expand All @@ -10,6 +15,11 @@ for await (const file of listFiles({ credentials: { accessToken: "hf_hub.js" },
console.log(file);
}

const hf = new HfInference(token);

const tokenInfo = await whoAmI({ credentials: { accessToken: token } });
console.log(tokenInfo);

const sum = await hf.summarization({
model: "google/pegasus-xsum",
inputs:
Expand Down
1 change: 1 addition & 0 deletions e2e/ts/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
package-lock.json
6 changes: 4 additions & 2 deletions packages/gguf/package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "@huggingface/gguf",
"packageManager": "pnpm@8.10.5",
"version": "0.0.10",
"version": "0.1.2",
"description": "a GGUF parser that works on remotely hosted files",
"repository": "https://github.com/huggingface/huggingface.js.git",
"publishConfig": {
Expand Down Expand Up @@ -47,5 +47,7 @@
],
"author": "Hugging Face",
"license": "MIT",
"devDependencies": {}
"devDependencies": {
"type-fest": "^3.9.0"
}
}
12 changes: 12 additions & 0 deletions packages/gguf/pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 22 additions & 1 deletion packages/gguf/src/gguf.spec.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { describe, expect, it } from "vitest";
import { GGMLQuantizationType, gguf } from "./gguf";
import { GGMLQuantizationType, gguf, ggufAllShards, parseGgufShardFilename } from "./gguf";

const URL_LLAMA = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/191239b/llama-2-7b-chat.Q2_K.gguf";
const URL_MISTRAL_7B =
Expand All @@ -9,6 +9,8 @@ const URL_BIG_ENDIAN =
"https://huggingface.co/ggml-org/models/resolve/1213976/bert-bge-small/ggml-model-f16-big-endian.gguf";
const URL_V1 =
"https://huggingface.co/tmadge/testing/resolve/66c078028d1ff92d7a9264a1590bc61ba6437933/tinyllamas-stories-260k-f32.gguf";
const URL_SHARDED_GROK =
"https://huggingface.co/Arki05/Grok-1-GGUF/resolve/ecafa8d8eca9b8cd75d11a0d08d3a6199dc5a068/grok-1-IQ3_XS-split-00001-of-00009.gguf";

describe("gguf", () => {
it("should parse a llama2 7b", async () => {
Expand Down Expand Up @@ -220,4 +222,23 @@ describe("gguf", () => {
dtype: GGMLQuantizationType.F32,
});
});

it("should detect sharded gguf filename", async () => {
const ggufPath = "grok-1/grok-1-q4_0-00003-of-00009.gguf"; // https://huggingface.co/ggml-org/models/blob/fcf344adb9686474c70e74dd5e55465e9e6176ef/grok-1/grok-1-q4_0-00003-of-00009.gguf
const ggufShardFileInfo = parseGgufShardFilename(ggufPath);

expect(ggufShardFileInfo?.prefix).toEqual("grok-1/grok-1-q4_0");
expect(ggufShardFileInfo?.shard).toEqual("00003");
expect(ggufShardFileInfo?.total).toEqual("00009");
});

it("should get param count for llama2 7b", async () => {
const { parameterCount } = await gguf(URL_LLAMA, { computeParametersCount: true });
expect(parameterCount).toEqual(6_738_415_616); // 7B
});

it("should get param count for sharded gguf", async () => {
const { parameterCount } = await ggufAllShards(URL_SHARDED_GROK);
expect(parameterCount).toEqual(316_490_127_360); // 316B
});
});
124 changes: 116 additions & 8 deletions packages/gguf/src/gguf.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,31 @@
import type { MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types";
import { GGUFValueType } from "./types";
import { promisesQueue } from "./utils/promisesQueue";

export type { MetadataBaseValue, MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types";
export { GGUFValueType, GGMLQuantizationType } from "./types";
export { GGUF_QUANT_DESCRIPTIONS } from "./quant-descriptions";

/** Matches any filename ending in `.gguf`. */
export const RE_GGUF_FILE = /\.gguf$/;
/** Matches sharded GGUF filenames of the form `<prefix>-<shard>-of-<total>.gguf`. */
export const RE_GGUF_SHARD_FILE = /^(?<prefix>.*?)-(?<shard>\d{5})-of-(?<total>\d{5})\.gguf$/;

/** Components of a sharded GGUF filename: `<prefix>-<shard>-of-<total>.gguf`. */
export interface GgufShardFileInfo {
	prefix: string;
	shard: string;
	total: string;
}

/**
 * Split a sharded GGUF filename into its prefix, shard index and shard total.
 *
 * @param filename - a path or URL ending in a GGUF filename
 * @returns the parsed parts, or `null` when the name does not follow the shard naming scheme
 */
export function parseGgufShardFilename(filename: string): GgufShardFileInfo | null {
	const groups = RE_GGUF_SHARD_FILE.exec(filename)?.groups;
	if (!groups) {
		return null;
	}
	return {
		prefix: groups["prefix"],
		shard: groups["shard"],
		total: groups["total"],
	};
}

const isVersion = (version: number): version is Version => version === 1 || version === 2 || version === 3;

Expand All @@ -30,8 +51,11 @@ const HTTP_TOTAL_MAX_SIZE = 50 * 10 ** 6; /// 50MB
class RangeView {
private chunk: number;
private buffer: ArrayBuffer;
private dataView: DataView;

readonly view: DataView;
get view(): DataView {
return this.dataView;
}

constructor(
public url: string,
Expand All @@ -47,7 +71,7 @@ class RangeView {
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
this.buffer = new ArrayBuffer(0, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
this.view = new DataView(this.buffer);
this.dataView = new DataView(this.buffer);
}
/**
* Fetch a new chunk from the server
Expand All @@ -63,18 +87,40 @@ class RangeView {
})
).arrayBuffer()
);
this.appendBuffer(buf);
this.chunk += 1;
}
/**
* Append new data into the buffer
*/
appendBuffer(buf: Uint8Array) {
/// TODO(fix typing)
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
this.buffer.resize((this.chunk + 1) * HTTP_CHUNK_SIZE);
new Uint8Array(this.buffer).set(buf, this.chunk * HTTP_CHUNK_SIZE);
this.chunk += 1;
if (ArrayBuffer.prototype.resize) {
/// TODO(fix typing)
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
this.buffer.resize((this.chunk + 1) * HTTP_CHUNK_SIZE);
new Uint8Array(this.buffer).set(buf, this.chunk * HTTP_CHUNK_SIZE);
} else {
// If the browser does not support ArrayBuffer.resize, we fallback to this polyfill version
/// TODO(fix typing)
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
const newBuffer = new ArrayBuffer((this.chunk + 1) * HTTP_CHUNK_SIZE, { maxByteLength: HTTP_TOTAL_MAX_SIZE });
const arrView = new Uint8Array(newBuffer);
arrView.set(new Uint8Array(this.buffer));
arrView.set(buf, this.chunk * HTTP_CHUNK_SIZE);
this.buffer = newBuffer;
this.dataView = new DataView(this.buffer);
}
}
/**
* Check whether we need to fetch a new chunk
*/
async fetchChunkIfNeeded(offset: number) {
if (this.view.byteLength - offset < HTTP_DATA_LEEWAY) {
if (this.dataView.byteLength - offset < HTTP_DATA_LEEWAY) {
await this.fetchChunk();
}
}
Expand Down Expand Up @@ -156,6 +202,16 @@ function readMetadataValue(
}
}

export async function gguf(
url: string,
params: {
/**
* Custom fetch function to use instead of the default one, for example to use a proxy or edit headers.
*/
fetch?: typeof fetch;
computeParametersCount: true;
}
): Promise<GGUFParseOutput & { parameterCount: number }>;
export async function gguf(
url: string,
params?: {
Expand All @@ -164,7 +220,17 @@ export async function gguf(
*/
fetch?: typeof fetch;
}
): Promise<GGUFParseOutput> {
): Promise<GGUFParseOutput>;
export async function gguf(
url: string,
params?: {
/**
* Custom fetch function to use instead of the default one, for example to use a proxy or edit headers.
*/
fetch?: typeof fetch;
computeParametersCount?: boolean;
}
): Promise<GGUFParseOutput & { parameterCount?: number }> {
const r = new RangeView(url, params);
await r.fetchChunk();

Expand Down Expand Up @@ -273,5 +339,47 @@ export async function gguf(
});
}

return { metadata, tensorInfos };
if (params?.computeParametersCount) {
const parameterCount = tensorInfos
.map(({ shape }) => shape.reduce((acc, val) => acc * Number(val), 1))
.reduce((acc, val) => acc + val, 0);

return { metadata, tensorInfos, parameterCount };
} else {
return { metadata, tensorInfos };
}
}

/**
 * Parse a (possibly sharded) GGUF model and return every shard's parse output
 * plus the total parameter count across shards.
 *
 * If `url` names a shard (`…-00001-of-00009.gguf`), the URLs of all sibling
 * shards are reconstructed from the parsed prefix/total and fetched with
 * bounded concurrency; otherwise the single file is parsed on its own.
 *
 * @param url - URL of a GGUF file, or of any shard of a sharded GGUF model
 * @returns the parse outputs of all shards and the summed parameter count
 */
export async function ggufAllShards(
	url: string,
	params?: {
		/**
		 * Custom fetch function to use instead of the default one, for example to use a proxy or edit headers.
		 */
		fetch?: typeof fetch;
	}
): Promise<{ shards: GGUFParseOutput[]; parameterCount: number }> {
	const ggufShardFileInfo = parseGgufShardFilename(url);
	if (ggufShardFileInfo) {
		const total = parseInt(ggufShardFileInfo.total, 10);
		const prefix = ggufShardFileInfo.prefix;

		// Rebuild the URL of every shard (shard indices are 1-based, zero-padded to 5 digits).
		const urls: string[] = [];
		for (let shardIdx = 1; shardIdx <= total; shardIdx++) {
			urls.push(`${prefix}-${shardIdx.toString().padStart(5, "0")}-of-${total.toString().padStart(5, "0")}.gguf`);
		}

		const PARALLEL_DOWNLOADS = 20;
		const shards = await promisesQueue(
			// Forward params (notably a caller-supplied custom fetch) to each per-shard parse;
			// previously only computeParametersCount was passed, silently dropping params.fetch.
			urls.map((shardUrl) => () => gguf(shardUrl, { ...params, computeParametersCount: true })),
			PARALLEL_DOWNLOADS
		);
		return {
			shards,
			parameterCount: shards.map(({ parameterCount }) => parameterCount).reduce((acc, val) => acc + val, 0),
		};
	} else {
		const { metadata, tensorInfos, parameterCount } = await gguf(url, { ...params, computeParametersCount: true });
		return { shards: [{ metadata, tensorInfos }], parameterCount };
	}
}
Loading

0 comments on commit a97d7f7

Please sign in to comment.