diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index d5d3b2179..8c97f7dbc 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -1,6 +1,8 @@
 from __future__ import annotations

 import os
+import sys
+import struct
 import ctypes

 from typing import (
@@ -11,6 +13,8 @@
 )
 from dataclasses import dataclass, field
 from contextlib import ExitStack
+from io import BufferedReader
+from enum import IntEnum

 import numpy as np
 import numpy.typing as npt
@@ -234,7 +238,7 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
         )

     # Extra
-    def metadata(self) -> Dict[str, str]:
+    def _metadata_no_arrays(self) -> Dict[str, str]:
         assert self.model is not None
         metadata: Dict[str, str] = {}
         buffer_size = 1024
@@ -266,6 +270,12 @@ def metadata(self) -> Dict[str, str]:
             metadata[key] = value
         return metadata

+    def metadata(self) -> Dict[str, Union[str, int, float, bool, list]]:
+        assert self.model is not None
+        # Uncomment the next line to use the old method
+        #return self._metadata_no_arrays()
+        return QuickGGUFReader.load_metadata(self.path_model)
+
     @staticmethod
     def default_params():
         """Get the default llama_model_params."""
@@ -866,3 +876,207 @@ def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool):
         if apply_grammar and self.grammar is not None:
             ctx_main.grammar_accept_token(self.grammar, id)
         self.prev.append(id)
+
+
+class QuickGGUFReader:
+    """
+    All logic in this class is based on the GGUF format specification, which
+    can be found here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
+    """
+    # NOTE: Officially, there is no way to determine if a GGUF file is little
+    #       or big endian. The format specification directs us to assume that
+    #       a file is little endian in all cases unless additional info is
+    #       provided.
+    #
+    #       In addition to this, a GGUF file cannot be used on a host with
+    #       the opposite endianness. And, at this point in the code, the
+    #       model is already loaded. Therefore, we can assume that the
+    #       endianness of the file is the same as the endianness of the host.
+
+    # the GGUF format versions that this class supports
+    SUPPORTED_GGUF_VERSIONS = [2, 3]
+
+    # GGUF only supports execution on little or big endian machines
+    if sys.byteorder not in ['little', 'big']:
+        raise ValueError(
+            "host is not little or big endian - GGUF is unsupported"
+        )
+
+    # Occasionally check to ensure these values are consistent with
+    # the latest values in llama.cpp/gguf-py/gguf/constants.py
+    class GGUFValueType(IntEnum):
+        UINT8 = 0
+        INT8 = 1
+        UINT16 = 2
+        INT16 = 3
+        UINT32 = 4
+        INT32 = 5
+        FLOAT32 = 6
+        BOOL = 7
+        STRING = 8
+        ARRAY = 9
+        UINT64 = 10
+        INT64 = 11
+        FLOAT64 = 12
+
+    # arguments for struct.unpack() based on gguf value type
+    value_packing: dict = {
+        GGUFValueType.UINT8:   "=B",
+        GGUFValueType.INT8:    "=b",
+        GGUFValueType.UINT16:  "=H",
+        GGUFValueType.INT16:   "=h",
+        GGUFValueType.UINT32:  "=I",
+        GGUFValueType.INT32:   "=i",
+        GGUFValueType.FLOAT32: "=f",
+        GGUFValueType.UINT64:  "=Q",
+        GGUFValueType.INT64:   "=q",
+        GGUFValueType.FLOAT64: "=d",
+        GGUFValueType.BOOL:    "=?"
+    }
+
+    # length in bytes for each gguf value type
+    value_lengths: dict = {
+        GGUFValueType.UINT8:   1,
+        GGUFValueType.INT8:    1,
+        GGUFValueType.UINT16:  2,
+        GGUFValueType.INT16:   2,
+        GGUFValueType.UINT32:  4,
+        GGUFValueType.INT32:   4,
+        GGUFValueType.FLOAT32: 4,
+        GGUFValueType.UINT64:  8,
+        GGUFValueType.INT64:   8,
+        GGUFValueType.FLOAT64: 8,
+        GGUFValueType.BOOL:    1
+    }
+
+    @staticmethod
+    def unpack(value_type: GGUFValueType, file: BufferedReader):
+        return struct.unpack(
+            QuickGGUFReader.value_packing.get(value_type),
+            file.read(QuickGGUFReader.value_lengths.get(value_type))
+        )[0]
+
+    @staticmethod
+    def get_single(
+        value_type: GGUFValueType,
+        file: BufferedReader
+    ) -> Union[str, int, float, bool]:
+        """Read a single value from an open file"""
+        if value_type == QuickGGUFReader.GGUFValueType.STRING:
+            string_length = QuickGGUFReader.unpack(
+                QuickGGUFReader.GGUFValueType.UINT64,
+                file=file
+            )
+            value = file.read(string_length)
+            # officially, strings that cannot be decoded into utf-8 are invalid
+            value = value.decode("utf-8")
+        else:
+            value = QuickGGUFReader.unpack(value_type, file=file)
+        return value
+
+    @staticmethod
+    def load_metadata(
+        fn: Union[os.PathLike[str], str]
+    ) -> dict[str, Union[str, int, float, bool, list]]:
+        """
+        Given a path to a GGUF file, peek at its header for metadata
+
+        Return a dictionary where all keys are strings, and values can be
+        strings, ints, floats, bools, or lists
+        """
+
+        metadata: dict[str, Union[str, int, float, bool, list]] = {}
+        with open(fn, "rb") as file:
+            magic = file.read(4)
+
+            if magic != b"GGUF":
+                raise ValueError(
+                    "your model file is not a valid GGUF file "
+                    f"(magic number mismatch, got {magic}, "
+                    "expected b'GGUF')"
+                )
+
+            version = QuickGGUFReader.unpack(
+                QuickGGUFReader.GGUFValueType.UINT32,
+                file=file
+            )
+
+            if version not in QuickGGUFReader.SUPPORTED_GGUF_VERSIONS:
+                raise ValueError(
+                    f"your model file reports GGUF version {version}, but "
+                    f"only versions {QuickGGUFReader.SUPPORTED_GGUF_VERSIONS} "
+                    "are supported. Re-convert your model or download a newer "
+                    "version"
+                )
+
+            tensor_count = QuickGGUFReader.unpack(
+                QuickGGUFReader.GGUFValueType.UINT64,
+                file=file
+            )
+
+            if version == 3:
+                metadata_kv_count = QuickGGUFReader.unpack(
+                    QuickGGUFReader.GGUFValueType.UINT64,
+                    file=file
+                )
+            elif version == 2:
+                metadata_kv_count = QuickGGUFReader.unpack(
+                    QuickGGUFReader.GGUFValueType.UINT32,
+                    file=file
+                )
+
+            for _ in range(metadata_kv_count):
+                if version == 3:
+                    key_length = QuickGGUFReader.unpack(
+                        QuickGGUFReader.GGUFValueType.UINT64,
+                        file=file
+                    )
+                elif version == 2:
+                    key_length = 0
+                    while key_length == 0:
+                        # read until next key is found
+                        key_length = QuickGGUFReader.unpack(
+                            QuickGGUFReader.GGUFValueType.UINT32,
+                            file=file
+                        )
+                    file.read(4) # 4 byte offset for GGUFv2
+                key = file.read(key_length)
+                value_type = QuickGGUFReader.GGUFValueType(
+                    QuickGGUFReader.unpack(
+                        QuickGGUFReader.GGUFValueType.UINT32,
+                        file=file
+                    )
+                )
+                if value_type == QuickGGUFReader.GGUFValueType.ARRAY:
+                    array_value_type = QuickGGUFReader.GGUFValueType(
+                        QuickGGUFReader.unpack(
+                            QuickGGUFReader.GGUFValueType.UINT32,
+                            file=file
+                        )
+                    )
+                    # array_length is the number of items in the array
+                    if version == 3:
+                        array_length = QuickGGUFReader.unpack(
+                            QuickGGUFReader.GGUFValueType.UINT64,
+                            file=file
+                        )
+                    elif version == 2:
+                        array_length = QuickGGUFReader.unpack(
+                            QuickGGUFReader.GGUFValueType.UINT32,
+                            file=file
+                        )
+                        file.read(4) # 4 byte offset for GGUFv2
+                    array = [
+                        QuickGGUFReader.get_single(
+                            array_value_type,
+                            file=file
+                        ) for _ in range(array_length)
+                    ]
+                    metadata[key.decode()] = array
+                else:
+                    value = QuickGGUFReader.get_single(
+                        value_type,
+                        file=file
+                    )
+                    metadata[key.decode()] = value
+
+        return metadata
diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index d8c2e0cdd..aceec945f 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -456,7 +456,15 @@ def __init__(
                 print(f"Failed to load metadata: {e}", file=sys.stderr)

         if self.verbose:
-            print(f"Model metadata: {self.metadata}", file=sys.stderr)
+            print("Model metadata:", file=sys.stderr)
+            for k, v in self.metadata.items():
+                # only calculate repr() once as it may be slow for large arrays
+                repr_v = repr(v)
+                if len(repr_v) > 63:
+                    # truncate long values
+                    print(f"    {k}: {repr_v[:60]}...", file=sys.stderr)
+                else:
+                    print(f"    {k}: {repr_v}", file=sys.stderr)

         eos_token_id = self.token_eos()
         bos_token_id = self.token_bos()
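For reference, a minimal usage sketch of the new reader (not part of the patch). The import path follows the file modified above; "model.gguf" is a placeholder path, and the two keys shown are illustrative examples of standard GGUF metadata keys:

    from llama_cpp._internals import QuickGGUFReader

    # peek at the GGUF header without loading the model itself
    metadata = QuickGGUFReader.load_metadata("model.gguf")

    # scalar values come back as str/int/float/bool; ARRAY values come back
    # as lists, which the old llama_model_meta_*-based reader could not return
    print(metadata["general.architecture"])   # e.g. "llama"
    print(metadata["llama.context_length"])   # e.g. 4096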