# tokenizer.py

import os
import struct
import argparse
from typing import List

from sentencepiece import SentencePieceProcessor

TOKENIZER_MODEL = "tokenizer.model" # default path of the llama sentencepiece tokenizer model

class Tokenizer:
    """Wrapper around a SentencePiece model with a binary vocabulary export.

    Attributes set by ``__init__``:
        model_path: path of the SentencePiece model file that was loaded.
        sp_model: the underlying ``SentencePieceProcessor``.
        n_words: vocabulary size reported by the model.
        bos_id, eos_id, pad_id: special token ids reported by the model.
    """

    def __init__(self, tokenizer_model=None):
        """Load the SentencePiece model.

        Args:
            tokenizer_model: optional path to a SentencePiece model file;
                ``TOKENIZER_MODEL`` is used when this is falsy.

        Raises:
            AssertionError: if the resolved path is not an existing file.
        """
        model_path = tokenizer_model if tokenizer_model else TOKENIZER_MODEL
        assert os.path.isfile(model_path), model_path
        # Remembered so that export() can derive the output file name.
        self.model_path: str = model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # vocabulary size and BOS/EOS/PAD token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        """Encode a string into token ids.

        Args:
            s: text to encode.
            bos: when true, prepend the BOS id.
            eos: when true, append the EOS id.

        Returns:
            The token ids, with the requested BOS/EOS markers added.

        Raises:
            AssertionError: if ``s`` is not a string.
        """
        assert isinstance(s, str), type(s)
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            # Append only the EOS id; the sequence itself must not be repeated.
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        """Decode a list of token ids back into a string."""
        return self.sp_model.decode(t)

    def export(self):
        """Write the vocabulary to a ``.bin`` file next to the model file.

        File layout (``struct`` default, i.e. native, byte order):
            * one unsigned int: the maximum token length in bytes;
            * for every token id in order: a float score, an unsigned int
              byte length, then the UTF-8 bytes of the token.

        The output path is the model path with a trailing ``.model``
        extension swapped for ``.bin``; if the model path has a different
        extension, ``.bin`` is appended so the model file is never
        overwritten.
        """
        # get all the tokens (postprocessed) and their scores as floats
        tokens, scores = [], []
        for i in range(self.n_words):
            # decode the token and apply light postprocessing
            t = self.sp_model.id_to_piece(i)
            s = self.sp_model.get_score(i)
            if i == self.bos_id:
                t = '\n<s>\n'
            elif i == self.eos_id:
                t = '\n</s>\n'
            # sentencepiece marks whitespace with U+2581 (LOWER ONE EIGHTH
            # BLOCK), not the ASCII underscore
            t = t.replace('\u2581', ' ')
            b = t.encode('utf-8')  # bytes of this token, utf-8 encoded

            tokens.append(b)
            scores.append(s)

        # record the max token length (0 for an empty vocabulary)
        max_token_length = max((len(b) for b in tokens), default=0)

        # write to a binary file
        # the tokenizer.bin file is named like the .model file, but with .bin
        root, ext = os.path.splitext(self.model_path)
        tokenizer_bin = (root if ext == '.model' else self.model_path) + '.bin'
        with open(tokenizer_bin, 'wb') as f:
            f.write(struct.pack("I", max_token_length))
            for token_bytes, score in zip(tokens, scores):
                f.write(struct.pack("fI", score, len(token_bytes)))
                f.write(token_bytes)


if __name__ == "__main__":
    # Script entry point: load the tokenizer model (the default one unless a
    # custom path is given with -t/--tokenizer_model) and export it.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-t",
        "--tokenizer_model",
        type=str,
        help="optional path to custom tokenizer ",
    )
    options = cli.parse_args()

    Tokenizer(options.tokenizer_model).export()