Merge pull request #40 from isi-nlp/39-byte
Add `byte` scheme
thammegowda authored Dec 24, 2021
2 parents b99d023 + b375ba1 commit 8267bff
Showing 10 changed files with 172 additions and 62 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
@@ -5,8 +5,8 @@ python:
#- "3.6" # dataclasses came in 3.7, so 3.6 isnt supported
- "3.7"
#- "3.8"
#- "3.9"

- "3.9"
#- "3.10"
before_install:
#- sudo apt-get -y update
- python3 setup.py install
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,9 @@
# Changelog

## 0.5 -- 2021-12-23

- Add `byte` scheme

## 0.4.0 -- 2021-08-03

- Add support for `class` scheme -- for multi-class classification field
105 changes: 55 additions & 50 deletions docs/index.html

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions docs/intro.adoc
@@ -6,6 +6,8 @@ NLP pipeline. These codecs include encoding of sequences into one of the following:
2. Word
3. BPE based subwords
4. Class (for multiclass classification)
5. Byte: a character is a Unicode codepoint (which can be higher than 255), whereas bytes are in [0-255]; this is a proxy over the `utf-8` scheme
It provides Python APIs (to embed into your app) and a CLI (to use it as a standalone tool).

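To make the codepoint-vs-byte distinction above concrete, here is a minimal sketch in plain Python (no nlcodec needed; the sample string is an arbitrary choice):

# one accented character = one codepoint, but two UTF-8 bytes
s = 'é'                          # codepoint U+00E9
print([ord(c) for c in s])       # [233]       -- codepoint view, can exceed 255
print(list(s.encode('utf-8')))   # [195, 169]  -- byte view, each value in [0, 255]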
4 changes: 2 additions & 2 deletions nlcodec/__init__.py
Expand Up @@ -3,7 +3,7 @@
# Author: Thamme Gowda [tg (at) isi (dot) edu]
# Created: 2019-10-25

__version__ = '0.4.0'
__version__ = '0.5'
__description__ = """nlcodec is a collection of encoding schemes for natural language sequences.
nlcodec.db is an efficient storage and retrieval layer for integer sequences of varying lengths."""
PROJECT_HOME = 'https://github.com/isi-nlp/nlcodec'
@@ -26,6 +26,6 @@
format='[%(asctime)s] p%(process)s {%(module)s:%(lineno)d} %(levelname)s - %(message)s')

from nlcodec.codec import (EncoderScheme, WordScheme, CharScheme, BPEScheme, Type, Reseved,
REGISTRY,
REGISTRY, ByteScheme,
learn_vocab, load_scheme, Level, encode, decode)
from nlcodec.dstruct import LnNode, TrNode, MaxHeap
2 changes: 1 addition & 1 deletion nlcodec/__main__.py
@@ -56,7 +56,7 @@ def parse_args() -> Dict[str, Any]:
help='Vocabulary size. Valid only for task=learn. This is required for'
' "bpe", but optional for "word" and "char" models, specifying it'
' will trim the vocabulary at given top most frequent types.')
learn_args.add_argument('-l', '--level', choices=['char', 'word', 'bpe', 'class'],
learn_args.add_argument('-l', '--level', choices=['char', 'word', 'bpe', 'class', 'byte'],
help='Encoding Level; Valid only for task=learn')
learn_args.add_argument('-mf', '--min-freq', default=None, type=int,
help='Minimum frequency of types for considering inclusion in vocabulary. '
74 changes: 70 additions & 4 deletions nlcodec/codec.py
@@ -11,7 +11,6 @@
from pathlib import Path
from typing import List, TextIO, Dict, Tuple, Union, Iterator, Optional
import multiprocessing as mp
from tqdm import tqdm
from nlcodec import __version__, log
from nlcodec.dstruct import TrNode
from nlcodec.utils import filter_types_coverage, IO
@@ -64,6 +63,7 @@ class Level:
word = 2
phrase = 3
clasz = 0 # 0 means dont split these tokens
byte = 0 # 0 means dont split these tokens


@dataclass(frozen=True)
@@ -211,7 +211,6 @@ def __init__(self, table: List[Type], has_reserved=True, invertible=True):
def __len__(self):
return self.vocab_size


@abc.abstractmethod
def encode_str(cls, line: str) -> List[str]:
raise NotImplementedError()
@@ -364,6 +363,7 @@ def decode_str(cls, seq: List[str]) -> str:

@classmethod
def term_frequencies(cls, data: Iterator[str]) -> Tuple[Dict[str, int], int]:
from tqdm import tqdm
stats = coll.Counter()
line_count = 0
for line in tqdm(data, mininterval=1):
@@ -454,7 +454,6 @@ def make_vocab_prefix_trie(cls, vocab: List[Type]):
assert not root.has_data # root node is not data node
return root


def encode(self, line: str, split_ratio: float = 0.) -> List[int]:
pieces = self.encode_str(line, split_ratio=split_ratio)
return [self.str_to_idx.get(piece, self.unk_idx) for piece in pieces]
@@ -533,6 +532,7 @@ def stochastic_split(self, seq, split_ratio, name=False):
res += self.table[idx].get_stochastic_split(name=name, split_ratio=split_ratio)
return res


class ClassScheme(WordScheme):
"""Scheme to be used for mapping labels or classes"""
level = Level.clasz
@@ -563,13 +563,79 @@ def get_init_vocab(cls, term_freqs, *args, **kwargs):
return vocab


class ByteScheme(EncoderScheme):
"""
Uses hex strings to represent bytes [0-255] => [00-ff].
<s> aka BOS is 256; </s> aka EOS is 257.
"""
level = Level.byte
name = "byte"
def __init__(self, table: List[Type] = None, encoding='utf-8', errors="replace"):
self.encoding = encoding
# a trained model is likely to emit invalid byte sequences; errors='replace'
# decodes them to U+FFFD instead of crashing
self.errors = errors
table = table or self.get_init_vocab()
super().__init__(table=table, has_reserved=False)


@staticmethod
def code_to_str(code: int) -> str:
# zero-pad to two hex digits ('0a', not 'a'): compose_str joins pieces and
# feeds them to bytes.fromhex(), which needs an even-length hex string
return f'{code:02x}'

def compose_str(self, pieces: List[str]):
byte_arr = bytes.fromhex(''.join(pieces))
return str(byte_arr, encoding=self.encoding, errors=self.errors)

def encode_str(self, line: str) -> List[str]:
return [self.code_to_str(b) for b in str.encode(line, self.encoding)]

def decode_str(self, seq: List[str]) -> str:
builder = [] # string builder
buffer = [] # buffer of past pieces
for piece in seq:
if piece in self.str_to_idx and self.table[self.str_to_idx[piece]].is_reserved:
if buffer:
builder.append(self.compose_str(pieces=buffer))
buffer.clear()
builder.append(piece)
else:
buffer.append(piece)
if buffer:
builder.append(self.compose_str(pieces=buffer))
return ''.join(builder)

def encode(self, line: str) -> List[int]:
pieces = self.encode_str(line)
return [self.str_to_idx[piece] for piece in pieces]

def decode(self, seq: List[int]) -> str:
pieces = [self.idx_to_str[idx] for idx in seq]
return self.decode_str(pieces)

@classmethod
def get_init_vocab(cls, *args, **kwargs):
# names are zero-padded to two hex digits to match code_to_str
vocab = [Type(name=f'{code:02x}', idx=code, freq=-1, level=cls.level) for code in range(256)]
for tok, _ in [Reseved.BOS_TOK, Reseved.EOS_TOK]:
vocab.append(Type(name=tok, idx=len(vocab), freq=-1, level=Level.reserved))
log.info(f"Total {cls} vocab size {len(vocab):,}")
return vocab


@classmethod
def learn(cls, *args, **kwargs) -> List[Type]:
if args or kwargs:
log.warning(f"Byte vocabulary does not need learning; args are ignored: {args} {kwargs}")
return cls.get_init_vocab()


#########################
REGISTRY = {
'char': CharScheme,
'word': WordScheme,
'bpe': BPEScheme,
'subword': BPEScheme,
'class': ClassScheme
'class': ClassScheme,
'byte': ByteScheme
}


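A minimal round-trip sketch of the new scheme (assuming nlcodec 0.5 with ByteScheme exported as in __init__.py above; the sample text is arbitrary):

from nlcodec import ByteScheme

codec = ByteScheme()                  # fixed vocab: bytes 00-ff as ids 0-255, <s>=256, </s>=257
ids = codec.encode("héllo")           # 'é' (U+00E9) encodes to two UTF-8 bytes, hence two ids
print(ids)                            # [104, 195, 169, 108, 108, 111]
assert codec.decode(ids) == "héllo"   # lossless round trip: hex pieces -> bytes -> utf-8 str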
2 changes: 1 addition & 1 deletion nlcodec/learn.py
@@ -36,7 +36,7 @@ def parse_args() -> Dict[str, Any]:
help='Vocabulary size. This is required for'
' "bpe", but optional for "word" and "char" models, specifying it'
' will trim the vocabulary at given top most frequent types.')
p.add_argument('-l', '--level', choices=['char', 'word', 'bpe'], default='bpe',
p.add_argument('-l', '--level', choices=['char', 'word', 'bpe', 'class', 'byte'], default='bpe',
help='Encoding Level')
p.add_argument('-mf', '--min-freq', default=None, type=int,
help='Minimum frequency of types for considering inclusion in vocabulary. '
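Since the byte vocabulary is fixed, the learn task merely writes out the 258-type table. A sketch through the generic API, mirroring test_byte_scheme further down (the file paths are hypothetical):

import nlcodec as nlc

# vocab_size and the input stream are effectively ignored for level='byte':
# ByteScheme.learn() returns the fixed table regardless
table = nlc.learn_vocab(inp=['hello world'], level='byte',
                        model='byte.model.tsv', vocab_size=-1)
codec = nlc.load_scheme('byte.model.tsv')    # restore the scheme from disk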
2 changes: 1 addition & 1 deletion nlcodec/utils.py
@@ -8,7 +8,6 @@
from typing import List, Any, Iterable, Dict, Tuple, Union
import collections as coll
from nlcodec import log
from tqdm import tqdm
import gzip
import time
from contextlib import contextmanager
@@ -21,6 +20,7 @@ def make_n_grams(sent: List[Any], n):


def make_n_grams_all(sents: Iterable[List[Any]], n):
from tqdm import tqdm
grams = coll.Counter()
n_sent = 0
for sent in tqdm(sents, mininterval=1, dynamic_ncols=True):
35 changes: 34 additions & 1 deletion tests/test_codec.py
@@ -5,6 +5,8 @@

from pathlib import Path

import nlcodec

data_dir = Path(__file__).parent.parent / 'data'
en_txt = data_dir / 'train.en.tok'
fr_txt = data_dir / 'train.fr.tok'
@@ -74,4 +76,35 @@ def test_class_scheme():
assert len(table2) == len(table)
table_str = '\n'.join(x.format() for x in table)
table2_str = '\n'.join(x.format() for x in table2)
assert table_str == table2_str
assert table_str == table2_str


def test_byte_scheme():
args = dict(inp=IO.read_as_stream(paths=[en_txt, fr_txt]), level='byte')
with tempfile.TemporaryDirectory() as tmpdir:
model_file = Path(tmpdir) / 'model.tsv'
args['model'] = model_file
table = nlc.learn_vocab(vocab_size=-1, **args)
table2, meta = nlc.Type.read_vocab(model_file)
assert len(table2) == len(table)
table_str = '\n'.join(x.format() for x in table)
table2_str = '\n'.join(x.format() for x in table2)
assert table_str == table2_str
codec = nlc.load_scheme(model_file)
for s in ['hello, world!?&%^&$#@1235214"\'',
"ಕನ್ನಡ ವಿಶ್ವಕೋಶವು ಮೀಡಿಯಾವಿಕಿಯನ್ನು ಬಳಸಿ ಕಟ್ಟಿರುವ ಸ್ವತಂತ್ರ ವಿಶ್ವಕೋಶ.",
"维基百科,自由的百科全书"]:
e = codec.encode_str(s)
d = codec.decode_str(e)
assert s == d
e = codec.encode(s)
d = codec.decode(e)
assert s == d


def test_byte_scheme_reserved():
codec = nlcodec.ByteScheme()
s = codec.encode_str("hello world")
s.insert(0, '<s>')
print(codec.decode_str(s))
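For reference, a sketch of what the reserved-token branch of decode_str should produce here (the printed value is inferred from the code above, not asserted by the test):

codec = nlcodec.ByteScheme()
pieces = codec.encode_str("hi")     # ['68', '69'] -- one hex string per byte
pieces.insert(0, '<s>')             # splice in a reserved token name
print(codec.decode_str(pieces))     # '<s>hi': reserved names pass through verbatim,
                                    # byte runs around them are composed via bytes.fromhex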
