|
class Tokenizer {
    /**
     * Converts a list of message objects with `"role"` and `"content"` keys into a list of
     * token ids. Intended for chat models: the tokenizer's `chat_template` attribute decides
     * the format and control tokens used for the conversion, falling back to the class-level
     * `default_chat_template` when `chat_template` is None.
     *
     * See [here](https://huggingface.co/docs/transformers/chat_templating) for more information.
     *
     * **Example:** Applying a chat template to a conversation.
     *
     * ```javascript
     * import { AutoTokenizer } from "@xenova/transformers";
     *
     * const tokenizer = await AutoTokenizer.from_pretrained("Xenova/mistral-tokenizer-v1");
     *
     * const chat = [
     *   { "role": "user", "content": "Hello, how are you?" },
     *   { "role": "assistant", "content": "I'm doing great. How can I help you today?" },
     *   { "role": "user", "content": "I'd like to show off how chat templating works!" },
     * ]
     *
     * const text = tokenizer.apply_chat_template(chat, { tokenize: false });
     * // "<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]"
     *
     * const input_ids = tokenizer.apply_chat_template(chat, { tokenize: true, return_tensor: false });
     * // [1, 733, 16289, 28793, 22557, 28725, 910, 460, 368, 28804, 733, 28748, 16289, 28793, 28737, 28742, 28719, 2548, 1598, 28723, 1602, 541, 315, 1316, 368, 3154, 28804, 2, 28705, 733, 16289, 28793, 315, 28742, 28715, 737, 298, 1347, 805, 910, 10706, 5752, 1077, 3791, 28808, 733, 28748, 16289, 28793]
     * ```
     *
     * @param {Message[]} conversation A list of message objects with `"role"` and `"content"` keys.
     * @param {Object} options An optional object containing the following properties:
     * @param {string} [options.chat_template=null] A Jinja template to use for this conversion.
     *     If not passed, the model's default chat template is used instead.
     * @param {boolean} [options.add_generation_prompt=false] Whether to end the prompt with the
     *     token(s) that indicate the start of an assistant message — useful when generating a
     *     response from the model. The chat template itself must support this argument for it
     *     to have any effect.
     * @param {boolean} [options.tokenize=true] Whether to tokenize the output. If false, the output is a string.
     * @param {boolean} [options.padding=false] Whether to pad sequences to the maximum length. No effect when tokenize is false.
     * @param {boolean} [options.truncation=false] Whether to truncate sequences to the maximum length. No effect when tokenize is false.
     * @param {number} [options.max_length=null] Maximum length (in tokens) for padding/truncation.
     *     Defaults to the tokenizer's `max_length` attribute when unspecified. No effect when tokenize is false.
     * @param {boolean} [options.return_tensor=true] Whether to return a Tensor or an Array. No effect when tokenize is false.
     * @param {Object} [options.tokenizer_kwargs={}] Additional options to pass to the tokenizer.
     * @returns {string | Tensor | number[]| number[][]} The tokenized output.
     */
    apply_chat_template(conversation, {
        chat_template = null,
        add_generation_prompt = false,
        tokenize = true,
        padding = false,
        truncation = false,
        max_length = null,
        return_tensor = true,
        tokenizer_kwargs = {},
        ...kwargs
    } = {}) {
        // Run every runtime type check in declaration order. Each entry is
        // [reported parameter name, received value, expected-type descriptor];
        // `conversation` is reported under its own name, everything else under
        // the `options` bag it was destructured from.
        const validations = [
            ["conversation", conversation, { "type": "array", "elementType": "Message", "optional": false }],
            ["options", chat_template, { "type": "string", "optional": true }],
            ["options", add_generation_prompt, { "type": "boolean", "optional": true }],
            ["options", tokenize, { "type": "boolean", "optional": true }],
            ["options", padding, { "type": "boolean", "optional": true }],
            ["options", truncation, { "type": "boolean", "optional": true }],
            ["options", max_length, { "type": "number", "optional": true }],
            ["options", return_tensor, { "type": "boolean", "optional": true }],
            ["options", tokenizer_kwargs, { "type": "object", "optional": true }],
        ];
        for (const [paramName, value, expected] of validations) {
            if (!inspectType(value, expected, 'Tokenizer#apply_chat_template', paramName)) {
                // Convenient spot to pause the debugger when a check fails.
                youCanAddABreakpointHere();
            }
        }
        return kwargs;
    }
}
registerClass(Tokenizer);

// Exercise the runtime checks with deliberately invalid input: the
// conversation is a plain number array (not Message[]) and `lol` is an
// unrecognized option that should fall through into the rest-kwargs.
const tokenizer = new Tokenizer();
tokenizer.apply_chat_template([1, 2, 3], { chat_template: "nope", lol: 123 });
0 commit comments