Merge pull request #40 from isi-nlp/39-byte
Add `byte` scheme
thammegowda authored Dec 24, 2021
2 parents b99d023 + b375ba1 commit 8267bff
Showing 10 changed files with 172 additions and 62 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
@@ -5,8 +5,8 @@ python:
#- "3.6" # dataclasses came in 3.7, so 3.6 isnt supported
- "3.7"
#- "3.8"
#- "3.9"

- "3.9"
#- "3.10"
before_install:
#- sudo apt-get -y update
- python3 setup.py install
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,9 @@
# Changelog

## 0.5 -- 2021-12-23

- Add `byte` scheme

## 0.4.0 -- 2021-08-03

- Add support for `class` scheme -- for multi-class classification field
105 changes: 55 additions & 50 deletions docs/index.html

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions docs/intro.adoc
@@ -6,6 +6,8 @@ NLP pipeline. These codecs include encoding of sequences into one of the following:
2. Word
3. BPE based subwords
4. Class (for multiclass classification)
5. Byte: a character is a Unicode codepoint (which can be higher than 255), whereas bytes are in [0-255]; this is a proxy over the `utf-8` scheme
It provides Python APIs (to embed into your app) and a CLI (to use it as a standalone tool).

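To make the codepoint-vs-byte distinction above concrete, here is a minimal sketch in plain Python (no nlcodec needed; the sample string is an arbitrary choice):

# one accented character = one codepoint, but two UTF-8 bytes
s = 'é'                          # codepoint U+00E9
print([ord(c) for c in s])       # [233]       -- codepoint view, can exceed 255
print(list(s.encode('utf-8')))   # [195, 169]  -- byte view, each value in [0, 255]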
4 changes: 2 additions & 2 deletions nlcodec/__init__.py
Expand Up @@ -3,7 +3,7 @@
# Author: Thamme Gowda [tg (at) isi (dot) edu]
# Created: 2019-10-25

__version__ = '0.4.0'
__version__ = '0.5'
__description__ = """nlcodec is a collection of encoding schemes for natural language sequences.
nlcodec.db is an efficient storage and retrieval layer for integer sequences of varying lengths."""
PROJECT_HOME = 'https://github.com/isi-nlp/nlcodec'
@@ -26,6 +26,6 @@
format='[%(asctime)s] p%(process)s {%(module)s:%(lineno)d} %(levelname)s - %(message)s')

from nlcodec.codec import (EncoderScheme, WordScheme, CharScheme, BPEScheme, Type, Reseved,
REGISTRY,
REGISTRY, ByteScheme,
learn_vocab, load_scheme, Level, encode, decode)
from nlcodec.dstruct import LnNode, TrNode, MaxHeap
2 changes: 1 addition & 1 deletion nlcodec/__main__.py
@@ -56,7 +56,7 @@ def parse_args() -> Dict[str, Any]:
help='Vocabulary size. Valid only for task=learn. This is required for'
' "bpe", but optional for "word" and "char" models, specifying it'
' will trim the vocabulary at given top most frequent types.')
learn_args.add_argument('-l', '--level', choices=['char', 'word', 'bpe', 'class'],
learn_args.add_argument('-l', '--level', choices=['char', 'word', 'bpe', 'class', 'byte'],
help='Encoding Level; Valid only for task=learn')
learn_args.add_argument('-mf', '--min-freq', default=None, type=int,
help='Minimum frequency of types for considering inclusion in vocabulary. '
74 changes: 70 additions & 4 deletions nlcodec/codec.py
@@ -11,7 +11,6 @@
from pathlib import Path
from typing import List, TextIO, Dict, Tuple, Union, Iterator, Optional
import multiprocessing as mp
from tqdm import tqdm
from nlcodec import __version__, log
from nlcodec.dstruct import TrNode
from nlcodec.utils import filter_types_coverage, IO
@@ -64,6 +63,7 @@ class Level:
word = 2
phrase = 3
clasz = 0 # 0 means dont split these tokens
byte = 0 # 0 means dont split these tokens


@dataclass(frozen=True)
@@ -211,7 +211,6 @@ def __init__(self, table: List[Type], has_reserved=True, invertible=True):
def __len__(self):
return self.vocab_size


@abc.abstractmethod
def encode_str(cls, line: str) -> List[str]:
raise NotImplementedError()
@@ -364,6 +363,7 @@ def decode_str(cls, seq: List[str]) -> str:

@classmethod
def term_frequencies(cls, data: Iterator[str]) -> Tuple[Dict[str, int], int]:
from tqdm import tqdm
stats = coll.Counter()
line_count = 0
for line in tqdm(data, mininterval=1):
@@ -454,7 +454,6 @@ def make_vocab_prefix_trie(cls, vocab: List[Type]):
assert not root.has_data # root node is not data node
return root


def encode(self, line: str, split_ratio: float = 0.) -> List[int]:
pieces = self.encode_str(line, split_ratio=split_ratio)
return [self.str_to_idx.get(piece, self.unk_idx) for piece in pieces]
@@ -533,6 +532,7 @@ def stochastic_split(self, seq, split_ratio, name=False):
res += self.table[idx].get_stochastic_split(name=name, split_ratio=split_ratio)
return res


class ClassScheme(WordScheme):
"""Scheme to be used for mapping labels or classes"""
level = Level.clasz
@@ -563,13 +563,79 @@ def get_init_vocab(cls, term_freqs, *args, **kwargs):
return vocab


class ByteScheme(EncoderScheme):
"""
Uses hex strings to represent bytes [0-255] => [00-ff].
<s> aka BOS is 256; </s> aka EOS is 257.
"""
level = Level.byte
name = "byte"
def __init__(self, table: List[Type] = None, encoding='utf-8', errors="replace"):
self.encoding = encoding
# a trained model is likely to emit invalid byte sequences; errors='replace'
# decodes them to U+FFFD instead of crashing
self.errors = errors
table = table or self.get_init_vocab()
super().__init__(table=table, has_reserved=False)


@staticmethod
def code_to_str(code: int) -> str:
# zero-pad to two hex digits ('0a', not 'a'): compose_str joins pieces and
# feeds them to bytes.fromhex(), which needs an even-length hex string
return f'{code:02x}'

def compose_str(self, pieces: List[str]):
byte_arr = bytes.fromhex(''.join(pieces))
return str(byte_arr, encoding=self.encoding, errors=self.errors)

def encode_str(self, line: str) -> List[str]:
return [self.code_to_str(b) for b in str.encode(line, self.encoding)]

def decode_str(self, seq: List[str]) -> str:
builder = [] # string builder
buffer = [] # buffer of past pieces
for piece in seq:
if piece in self.str_to_idx and self.table[self.str_to_idx[piece]].is_reserved:
if buffer:
builder.append(self.compose_str(pieces=buffer))
buffer.clear()
builder.append(piece)
else:
buffer.append(piece)
if buffer:
builder.append(self.compose_str(pieces=buffer))
return ''.join(builder)

def encode(self, line: str) -> List[int]:
pieces = self.encode_str(line)
return [self.str_to_idx[piece] for piece in pieces]

def decode(self, seq: List[int]) -> str:
pieces = [self.idx_to_str[idx] for idx in seq]
return self.decode_str(pieces)

@classmethod
def get_init_vocab(cls, *args, **kwargs):
# names are zero-padded to two hex digits to match code_to_str
vocab = [Type(name=f'{code:02x}', idx=code, freq=-1, level=cls.level) for code in range(256)]
for tok, _ in [Reseved.BOS_TOK, Reseved.EOS_TOK]:
vocab.append(Type(name=tok, idx=len(vocab), freq=-1, level=Level.reserved))
log.info(f"Total {cls} vocab size {len(vocab):,}")
return vocab


@classmethod
def learn(cls, *args, **kwargs) -> List[Type]:
if args or kwargs:
log.warning(f"Byte vocabulary does not need learning; args are ignored: {args} {kwargs}")
return cls.get_init_vocab()


#########################
REGISTRY = {
'char': CharScheme,
'word': WordScheme,
'bpe': BPEScheme,
'subword': BPEScheme,
'class': ClassScheme
'class': ClassScheme,
'byte': ByteScheme
}


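A minimal round-trip sketch of the new scheme (assuming nlcodec 0.5 with ByteScheme exported as in __init__.py above; the sample text is arbitrary):

from nlcodec import ByteScheme

codec = ByteScheme()                  # fixed vocab: bytes 00-ff as ids 0-255, <s>=256, </s>=257
ids = codec.encode("héllo")           # 'é' (U+00E9) encodes to two UTF-8 bytes, hence two ids
print(ids)                            # [104, 195, 169, 108, 108, 111]
assert codec.decode(ids) == "héllo"   # lossless round trip: hex pieces -> bytes -> utf-8 str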
2 changes: 1 addition & 1 deletion nlcodec/learn.py
@@ -36,7 +36,7 @@ def parse_args() -> Dict[str, Any]:
help='Vocabulary size. This is required for'
' "bpe", but optional for "word" and "char" models, specifying it'
' will trim the vocabulary at given top most frequent types.')
p.add_argument('-l', '--level', choices=['char', 'word', 'bpe'], default='bpe',
p.add_argument('-l', '--level', choices=['char', 'word', 'bpe', 'class', 'byte'], default='bpe',
help='Encoding Level')
p.add_argument('-mf', '--min-freq', default=None, type=int,
help='Minimum frequency of types for considering inclusion in vocabulary. '
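Since the byte vocabulary is fixed, the learn task merely writes out the 258-type table. A sketch through the generic API, mirroring test_byte_scheme further down (the file paths are hypothetical):

import nlcodec as nlc

# vocab_size and the input stream are effectively ignored for level='byte':
# ByteScheme.learn() returns the fixed table regardless
table = nlc.learn_vocab(inp=['hello world'], level='byte',
                        model='byte.model.tsv', vocab_size=-1)
codec = nlc.load_scheme('byte.model.tsv')    # restore the scheme from disk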
2 changes: 1 addition & 1 deletion nlcodec/utils.py
@@ -8,7 +8,6 @@
from typing import List, Any, Iterable, Dict, Tuple, Union
import collections as coll
from nlcodec import log
from tqdm import tqdm
import gzip
import time
from contextlib import contextmanager
@@ -21,6 +20,7 @@ def make_n_grams(sent: List[Any], n):


def make_n_grams_all(sents: Iterable[List[Any]], n):
from tqdm import tqdm
grams = coll.Counter()
n_sent = 0
for sent in tqdm(sents, mininterval=1, dynamic_ncols=True):
35 changes: 34 additions & 1 deletion tests/test_codec.py
@@ -5,6 +5,8 @@

from pathlib import Path

import nlcodec

data_dir = Path(__file__).parent.parent / 'data'
en_txt = data_dir / 'train.en.tok'
fr_txt = data_dir / 'train.fr.tok'
@@ -74,4 +76,35 @@ def test_class_scheme():
assert len(table2) == len(table)
table_str = '\n'.join(x.format() for x in table)
table2_str = '\n'.join(x.format() for x in table2)
assert table_str == table2_str
assert table_str == table2_str


def test_byte_scheme():
args = dict(inp=IO.read_as_stream(paths=[en_txt, fr_txt]), level='byte')
with tempfile.TemporaryDirectory() as tmpdir:
model_file = Path(tmpdir) / 'model.tsv'
args['model'] = model_file
table = nlc.learn_vocab(vocab_size=-1, **args)
table2, meta = nlc.Type.read_vocab(model_file)
assert len(table2) == len(table)
table_str = '\n'.join(x.format() for x in table)
table2_str = '\n'.join(x.format() for x in table2)
assert table_str == table2_str
codec = nlc.load_scheme(model_file)
for s in ['hello, world!?&%^&$#@1235214"\'',
"ಕನ್ನಡ ವಿಶ್ವಕೋಶವು ಮೀಡಿಯಾವಿಕಿಯನ್ನು ಬಳಸಿ ಕಟ್ಟಿರುವ ಸ್ವತಂತ್ರ ವಿಶ್ವಕೋಶ.",
"维基百科,自由的百科全书"]:
e = codec.encode_str(s)
d = codec.decode_str(e)
assert s == d
e = codec.encode(s)
d = codec.decode(e)
assert s == d


def test_byte_scheme_reserved():
codec = nlcodec.ByteScheme()
s = codec.encode_str("hello world")
s.insert(0, '<s>')
print(codec.decode_str(s))
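For reference, a sketch of what the reserved-token branch of decode_str should produce here (the printed value is inferred from the code above, not asserted by the test):

codec = nlcodec.ByteScheme()
pieces = codec.encode_str("hi")     # ['68', '69'] -- one hex string per byte
pieces.insert(0, '<s>')             # splice in a reserved token name
print(codec.decode_str(pieces))     # '<s>hi': reserved names pass through verbatim,
                                    # byte runs around them are composed via bytes.fromhex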
