From a7d56569febb802d1067a23b0f3e4bc5ac57a69a Mon Sep 17 00:00:00 2001
From: Thamme Gowda
Date: Thu, 23 Dec 2021 15:41:58 -0800
Subject: [PATCH 1/6] Add `byte` scheme

---
 CHANGELOG.md        |  4 +++
 docs/intro.adoc     |  2 ++
 nlcodec/__init__.py |  4 +--
 nlcodec/__main__.py |  2 +-
 nlcodec/codec.py    | 72 +++++++++++++++++++++++++++++++++++++++++++--
 nlcodec/learn.py    |  2 +-
 tests/test_codec.py | 35 +++++++++++++++++++++-
 7 files changed, 113 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a8c522..ad10486 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Changelog
 
+## 0.5 -- 2021-12-23
+
+- Add `byte` scheme
+
 ## 0.4.0 -- 2021-08-03
 
 - Add support for `class` scheme -- for multi-class classification field

diff --git a/docs/intro.adoc b/docs/intro.adoc
index 28749ec..69b4167 100644
--- a/docs/intro.adoc
+++ b/docs/intro.adoc
@@ -6,6 +6,8 @@ NLP pipeline. These codecs include encoding of sequences into one of the following
 2. Word
 3. BPE based subwords
 4. Class (for multiclass classification)
+5. Byte: a character is a Unicode codepoint (which can be larger than 255), whereas a byte is in [0-255]; a thin proxy over the `utf-8` encoding
+
 It provides python (so embed into your app) and CLI APIs (use it as stand alone tool).

diff --git a/nlcodec/__init__.py b/nlcodec/__init__.py
index b206f0c..5eca6be 100644
--- a/nlcodec/__init__.py
+++ b/nlcodec/__init__.py
@@ -3,7 +3,7 @@
 # Author: Thamme Gowda [tg (at) isi (dot) edu]
 # Created: 2019-10-25
 
-__version__ = '0.4.0'
+__version__ = '0.5'
 __description__ = """nlcodec is a collection of encoding schemes for natural language sequences.
 nlcodec.db is a efficient storage and retrieval layer for integer sequences of varying lengths."""
 PROJECT_HOME = 'https://github.com/isi-nlp/nlcodec'
@@ -26,6 +26,6 @@
     format='[%(asctime)s] p%(process)s {%(module)s:%(lineno)d} %(levelname)s - %(message)s')
 
 from nlcodec.codec import (EncoderScheme, WordScheme, CharScheme, BPEScheme, Type, Reseved,
-                           REGISTRY,
+                           REGISTRY, ByteScheme,
                            learn_vocab, load_scheme, Level, encode, decode)
 from nlcodec.dstruct import LnNode, TrNode, MaxHeap

diff --git a/nlcodec/__main__.py b/nlcodec/__main__.py
index 5f65d50..1db932b 100644
--- a/nlcodec/__main__.py
+++ b/nlcodec/__main__.py
@@ -56,7 +56,7 @@ def parse_args() -> Dict[str, Any]:
                             help='Vocabulary size. Valid only for task=learn. This is required for'
                                  ' "bpe", but optional for "word" and "char" models, specifying it'
                                  ' will trim the vocabulary at given top most frequent types.')
-    learn_args.add_argument('-l', '--level', choices=['char', 'word', 'bpe', 'class'],
+    learn_args.add_argument('-l', '--level', choices=['char', 'word', 'bpe', 'class', 'byte'],
                             help='Encoding Level; Valid only for task=learn')
     learn_args.add_argument('-mf', '--min-freq', default=None, type=int,
                             help='Minimum frequency of types for considering inclusion in vocabulary. '
diff --git a/nlcodec/codec.py b/nlcodec/codec.py
index ee7e039..411bb2c 100755
--- a/nlcodec/codec.py
+++ b/nlcodec/codec.py
@@ -64,6 +64,7 @@ class Level:
     word = 2
     phrase = 3
     clasz = 0  # 0 means dont split these tokens
+    byte = 0  # 0 means dont split these tokens
 
 
 @dataclass(frozen=True)
@@ -211,7 +212,6 @@ def __init__(self, table: List[Type], has_reserved=True, invertible=True):
     def __len__(self):
         return self.vocab_size
 
-    @abc.abstractmethod
     def encode_str(cls, line: str) -> List[str]:
         raise NotImplementedError()
 
@@ -454,7 +454,6 @@ def make_vocab_prefix_trie(cls, vocab: List[Type]):
         assert not root.has_data  # root node is not data node
         return root
 
-
     def encode(self, line: str, split_ratio: float = 0.) -> List[int]:
         pieces = self.encode_str(line, split_ratio=split_ratio)
         return [self.str_to_idx.get(piece, self.unk_idx) for piece in pieces]
@@ -533,6 +532,7 @@ def stochastic_split(self, seq, split_ratio, name=False):
         res += self.table[idx].get_stochastic_split(name=name, split_ratio=split_ratio)
         return res
 
+
 class ClassScheme(WordScheme):
     """Scheme to be used for mapping labels or classes"""
     level = Level.clasz
@@ -563,13 +563,79 @@ def get_init_vocab(cls, term_freqs, *args, **kwargs):
         return vocab
 
 
+class ByteScheme(EncoderScheme):
+    """Encodes text as bytes, using two-digit hex strings as type names:
+    [0-255] => [00-ff]; the reserved BOS and EOS tokens take indices 256 and 257.
+    """
+    level = Level.byte
+    name = "byte"
+
+    def __init__(self, table: List[Type] = None, encoding='utf-8', errors='replace'):
+        self.encoding = encoding
+        self.errors = errors  # the model will likely generate invalid byte sequences during training; replace them instead of raising
+        table = table or self.get_init_vocab()
+        super().__init__(table=table, has_reserved=False)
+
+    @staticmethod
+    def code_to_str(code: int) -> str:
+        return f'{code:02x}'  # zero-pad to two hex digits so bytes.fromhex() always gets even-length input
+
+    def compose_str(self, pieces: List[str]):
+        byte_arr = bytes.fromhex(''.join(pieces))
+        return str(byte_arr, encoding=self.encoding, errors=self.errors)
+
+    def encode_str(self, line: str) -> List[str]:
+        return [self.code_to_str(b) for b in str.encode(line, self.encoding)]
+
+    def decode_str(self, seq: List[str]) -> str:
+        builder = []  # string builder
+        buffer = []   # buffer of pending byte pieces
+        for piece in seq:
+            if piece in self.str_to_idx and self.table[self.str_to_idx[piece]].is_reserved:
+                # flush buffered bytes, then emit the reserved token as-is
+                if buffer:
+                    builder.append(self.compose_str(pieces=buffer))
+                    buffer.clear()
+                builder.append(piece)
+            else:
+                buffer.append(piece)
+        if buffer:
+            builder.append(self.compose_str(pieces=buffer))
+        return ''.join(builder)
+
+    def encode(self, line: str) -> List[int]:
+        pieces = self.encode_str(line)
+        return [self.str_to_idx[piece] for piece in pieces]
+
+    def decode(self, seq: List[int]) -> str:
+        pieces = [self.idx_to_str[idx] for idx in seq]
+        return self.decode_str(pieces)
+
+    @classmethod
+    def get_init_vocab(cls, *args, **kwargs):
+        vocab = [Type(name=f'{code:02x}', idx=code, freq=-1, level=cls.level) for code in range(256)]
+        for tok, _ in [Reseved.BOS_TOK, Reseved.EOS_TOK]:
+            vocab.append(Type(name=tok, idx=len(vocab), freq=-1, level=Level.reserved))
+        log.info(f"Total {cls} vocab size {len(vocab):,}")
+        return vocab
+
+    @classmethod
+    def learn(cls, *args, **kwargs) -> List[Type]:
+        if args or kwargs:
+            log.warning(f"Byte vocabulary does not need learning; args are ignored: {args} {kwargs}")
+        return cls.get_init_vocab()
+
+
 #########################
 REGISTRY = {
     'char': CharScheme,
     'word': WordScheme,
     'bpe': BPEScheme,
     'subword': BPEScheme,
-    'class': ClassScheme
+    'class': ClassScheme,
+    'byte': ByteScheme
 }

diff --git a/nlcodec/learn.py b/nlcodec/learn.py
index c2f468f..ccf4457 100644
--- a/nlcodec/learn.py
+++ b/nlcodec/learn.py
@@ -36,7 +36,7 @@ def parse_args() -> Dict[str, Any]:
                    help='Vocabulary size. This is required for'
                         ' "bpe", but optional for "word" and "char" models, specifying it'
                         ' will trim the vocabulary at given top most frequent types.')
-    p.add_argument('-l', '--level', choices=['char', 'word', 'bpe'], default='bpe',
+    p.add_argument('-l', '--level', choices=['char', 'word', 'bpe', 'class', 'byte'], default='bpe',
                    help='Encoding Level')
     p.add_argument('-mf', '--min-freq', default=None, type=int,
                    help='Minimum frequency of types for considering inclusion in vocabulary. '

diff --git a/tests/test_codec.py b/tests/test_codec.py
index 603d414..a81768c 100644
--- a/tests/test_codec.py
+++ b/tests/test_codec.py
@@ -5,6 +5,8 @@
 
 from pathlib import Path
 
+import nlcodec
+
 data_dir = Path(__file__).parent.parent / 'data'
 en_txt = data_dir / 'train.en.tok'
 fr_txt = data_dir / 'train.fr.tok'
@@ -74,4 +76,35 @@ def test_class_scheme():
     assert len(table2) == len(table)
     table_str = '\n'.join(x.format() for x in table)
     table2_str = '\n'.join(x.format() for x in table2)
-    assert table_str == table2_str
\ No newline at end of file
+    assert table_str == table2_str
+
+
+def test_byte_scheme():
+    args = dict(inp=IO.read_as_stream(paths=[en_txt, fr_txt]), level='byte')
+    with tempfile.TemporaryDirectory() as tmpdir:
+        model_file = Path(tmpdir) / 'model.tsv'
+        args['model'] = model_file
+        table = nlc.learn_vocab(vocab_size=-1, **args)
+        table2, meta = nlc.Type.read_vocab(model_file)
+        assert len(table2) == len(table)
+        table_str = '\n'.join(x.format() for x in table)
+        table2_str = '\n'.join(x.format() for x in table2)
+        assert table_str == table2_str
+        codec = nlc.load_scheme(model_file)
+        for s in ['hello, world!?&%^&$#@1235214"\'',
+                  "ಕನ್ನಡ ವಿಶ್ವಕೋಶವು ಮೀಡಿಯಾವಿಕಿಯನ್ನು ಬಳಸಿ ಕಟ್ಟಿರುವ ಸ್ವತಂತ್ರ ವಿಶ್ವಕೋಶ.",
+                  "维基百科,自由的百科全书"]:
+            e = codec.encode_str(s)
+            d = codec.decode_str(e)
+            assert s == d
+            e = codec.encode(s)
+            d = codec.decode(e)
+            assert s == d
+
+
+def test_byte_scheme_reserved():
+    codec = nlcodec.ByteScheme()
+    s = codec.encode_str("hello world")
+    s.insert(0, '<s>')  # prepend the reserved BOS token
+    assert codec.decode_str(s) == '<s>hello world'

From 256c8704f7a1feead3c73b2e12ace8373728520f Mon Sep 17 00:00:00 2001
From: Thamme Gowda
Date: Thu, 23 Dec 2021 15:42:06 -0800
Subject: [PATCH 2/6] Update docs

---
 docs/index.html | 105 +++++++++++++++++++++++++-----------------------
 1 file changed, 55 insertions(+), 50 deletions(-)

diff --git a/docs/index.html b/docs/index.html
index ced4500..ffacba0 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -4,28 +4,26 @@
 <title>Natural Language Encoder Decoder (NLCodec)</title>
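
To make the new scheme concrete, below is a minimal usage sketch, not part of the patch series, showing how the `ByteScheme` introduced in PATCH 1/6 round-trips text through hex-coded UTF-8 bytes. It assumes a Python environment with this patch applied (nlcodec >= 0.5); the expected hex pieces shown in the comment follow directly from the `code_to_str` definition above.

```python
# Illustrative sketch (not part of the patch): exercising the `byte` scheme
# from PATCH 1/6. Assumes nlcodec with this patch applied (>= 0.5).
from nlcodec import ByteScheme

codec = ByteScheme()  # fixed vocabulary: 256 byte types, plus BOS/EOS at indices 256/257

text = "héllo"
pieces = codec.encode_str(text)  # one two-digit hex piece per UTF-8 byte
print(pieces)                    # ['68', 'c3', 'a9', '6c', '6c', '6f']
ids = codec.encode(text)         # integer ids; each byte piece maps to its byte value
assert codec.decode_str(pieces) == text
assert codec.decode(ids) == text

# Reserved tokens pass through decode_str untouched, mirroring
# test_byte_scheme_reserved in the patch:
assert codec.decode_str(['<s>'] + pieces) == '<s>' + text
```

Because the vocabulary is fixed by construction, `ByteScheme.learn` is effectively a no-op that returns the initial 258-type table, which is why `learn_vocab(vocab_size=-1, ..., level='byte')` in the test succeeds without counting term frequencies.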