From a7d56569febb802d1067a23b0f3e4bc5ac57a69a Mon Sep 17 00:00:00 2001
From: Thamme Gowda
Date: Thu, 23 Dec 2021 15:41:58 -0800
Subject: [PATCH 1/6] Add `byte` scheme

---
 CHANGELOG.md        |  4 +++
 docs/intro.adoc     |  2 ++
 nlcodec/__init__.py |  4 +--
 nlcodec/__main__.py |  2 +-
 nlcodec/codec.py    | 72 +++++++++++++++++++++++++++++++++++++++++++--
 nlcodec/learn.py    |  2 +-
 tests/test_codec.py | 35 +++++++++++++++++++++-
 7 files changed, 113 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9a8c522..ad10486 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Changelog
 
+## 0.5 -- 2021-12-23
+
+- Add `byte` scheme
+
 ## 0.4.0 -- 2021-08-03
 
 - Add support for `class` scheme -- for multi-class classification field

diff --git a/docs/intro.adoc b/docs/intro.adoc
index 28749ec..69b4167 100644
--- a/docs/intro.adoc
+++ b/docs/intro.adoc
@@ -6,6 +6,8 @@ NLP pipeline. These codecs include encoding of sequences into one of the following
 2. Word
 3. BPE based subwords
 4. Class (for multiclass classification)
+5. Byte: a character is a Unicode codepoint (which can be larger than 255), whereas a byte is in [0-255]; a thin proxy over the `utf-8` encoding
+
 It provides python (so embed into your app) and CLI APIs (use it as stand alone tool).

diff --git a/nlcodec/__init__.py b/nlcodec/__init__.py
index b206f0c..5eca6be 100644
--- a/nlcodec/__init__.py
+++ b/nlcodec/__init__.py
@@ -3,7 +3,7 @@
 # Author: Thamme Gowda [tg (at) isi (dot) edu]
 # Created: 2019-10-25
 
-__version__ = '0.4.0'
+__version__ = '0.5'
 __description__ = """nlcodec is a collection of encoding schemes for natural language sequences.
 nlcodec.db is a efficient storage and retrieval layer for integer sequences of varying lengths."""
 PROJECT_HOME = 'https://github.com/isi-nlp/nlcodec'
@@ -26,6 +26,6 @@
     format='[%(asctime)s] p%(process)s {%(module)s:%(lineno)d} %(levelname)s - %(message)s')
 
 from nlcodec.codec import (EncoderScheme, WordScheme, CharScheme, BPEScheme, Type, Reseved,
-                           REGISTRY,
+                           REGISTRY, ByteScheme,
                            learn_vocab, load_scheme, Level, encode, decode)
 from nlcodec.dstruct import LnNode, TrNode, MaxHeap

diff --git a/nlcodec/__main__.py b/nlcodec/__main__.py
index 5f65d50..1db932b 100644
--- a/nlcodec/__main__.py
+++ b/nlcodec/__main__.py
@@ -56,7 +56,7 @@ def parse_args() -> Dict[str, Any]:
                             help='Vocabulary size. Valid only for task=learn. This is required for'
                                  ' "bpe", but optional for "word" and "char" models, specifying it'
                                  ' will trim the vocabulary at given top most frequent types.')
-    learn_args.add_argument('-l', '--level', choices=['char', 'word', 'bpe', 'class'],
+    learn_args.add_argument('-l', '--level', choices=['char', 'word', 'bpe', 'class', 'byte'],
                             help='Encoding Level; Valid only for task=learn')
     learn_args.add_argument('-mf', '--min-freq', default=None, type=int,
                             help='Minimum frequency of types for considering inclusion in vocabulary. '
diff --git a/nlcodec/codec.py b/nlcodec/codec.py
index ee7e039..411bb2c 100755
--- a/nlcodec/codec.py
+++ b/nlcodec/codec.py
@@ -64,6 +64,7 @@ class Level:
     word = 2
     phrase = 3
     clasz = 0  # 0 means dont split these tokens
+    byte = 0  # 0 means dont split these tokens
 
 
 @dataclass(frozen=True)
@@ -211,7 +212,6 @@ def __init__(self, table: List[Type], has_reserved=True, invertible=True):
     def __len__(self):
         return self.vocab_size
 
-    @abc.abstractmethod
     def encode_str(cls, line: str) -> List[str]:
         raise NotImplementedError()
 
@@ -454,7 +454,6 @@ def make_vocab_prefix_trie(cls, vocab: List[Type]):
         assert not root.has_data  # root node is not data node
         return root
 
-
     def encode(self, line: str, split_ratio: float = 0.) -> List[int]:
         pieces = self.encode_str(line, split_ratio=split_ratio)
         return [self.str_to_idx.get(piece, self.unk_idx) for piece in pieces]
@@ -533,6 +532,7 @@ def stochastic_split(self, seq, split_ratio, name=False):
         res += self.table[idx].get_stochastic_split(name=name, split_ratio=split_ratio)
         return res
 
+
 class ClassScheme(WordScheme):
     """Scheme to be used for mapping labels or classes"""
     level = Level.clasz
@@ -563,13 +563,79 @@ def get_init_vocab(cls, term_freqs, *args, **kwargs):
         return vocab
 
 
+class ByteScheme(EncoderScheme):
+    """Encodes text as bytes, using two-digit hex strings as type names:
+    [0-255] => [00-ff]; the reserved BOS and EOS tokens take indices 256 and 257.
+    """
+    level = Level.byte
+    name = "byte"
+
+    def __init__(self, table: List[Type] = None, encoding='utf-8', errors='replace'):
+        self.encoding = encoding
+        self.errors = errors  # the model will likely generate invalid byte sequences during training; replace them instead of raising
+        table = table or self.get_init_vocab()
+        super().__init__(table=table, has_reserved=False)
+
+    @staticmethod
+    def code_to_str(code: int) -> str:
+        return f'{code:02x}'  # zero-pad to two hex digits so bytes.fromhex() always gets even-length input
+
+    def compose_str(self, pieces: List[str]):
+        byte_arr = bytes.fromhex(''.join(pieces))
+        return str(byte_arr, encoding=self.encoding, errors=self.errors)
+
+    def encode_str(self, line: str) -> List[str]:
+        return [self.code_to_str(b) for b in str.encode(line, self.encoding)]
+
+    def decode_str(self, seq: List[str]) -> str:
+        builder = []  # string builder
+        buffer = []   # buffer of pending byte pieces
+        for piece in seq:
+            if piece in self.str_to_idx and self.table[self.str_to_idx[piece]].is_reserved:
+                # flush buffered bytes, then emit the reserved token as-is
+                if buffer:
+                    builder.append(self.compose_str(pieces=buffer))
+                    buffer.clear()
+                builder.append(piece)
+            else:
+                buffer.append(piece)
+        if buffer:
+            builder.append(self.compose_str(pieces=buffer))
+        return ''.join(builder)
+
+    def encode(self, line: str) -> List[int]:
+        pieces = self.encode_str(line)
+        return [self.str_to_idx[piece] for piece in pieces]
+
+    def decode(self, seq: List[int]) -> str:
+        pieces = [self.idx_to_str[idx] for idx in seq]
+        return self.decode_str(pieces)
+
+    @classmethod
+    def get_init_vocab(cls, *args, **kwargs):
+        vocab = [Type(name=f'{code:02x}', idx=code, freq=-1, level=cls.level) for code in range(256)]
+        for tok, _ in [Reseved.BOS_TOK, Reseved.EOS_TOK]:
+            vocab.append(Type(name=tok, idx=len(vocab), freq=-1, level=Level.reserved))
+        log.info(f"Total {cls} vocab size {len(vocab):,}")
+        return vocab
+
+    @classmethod
+    def learn(cls, *args, **kwargs) -> List[Type]:
+        if args or kwargs:
+            log.warning(f"Byte vocabulary does not need learning; args are ignored: {args} {kwargs}")
+        return cls.get_init_vocab()
+
+
 #########################
 REGISTRY = {
     'char': CharScheme,
     'word': WordScheme,
     'bpe': BPEScheme,
     'subword': BPEScheme,
-    'class': ClassScheme
+    'class': ClassScheme,
+    'byte': ByteScheme
 }

diff --git a/nlcodec/learn.py b/nlcodec/learn.py
index c2f468f..ccf4457 100644
--- a/nlcodec/learn.py
+++ b/nlcodec/learn.py
@@ -36,7 +36,7 @@ def parse_args() -> Dict[str, Any]:
                    help='Vocabulary size. This is required for'
                         ' "bpe", but optional for "word" and "char" models, specifying it'
                         ' will trim the vocabulary at given top most frequent types.')
-    p.add_argument('-l', '--level', choices=['char', 'word', 'bpe'], default='bpe',
+    p.add_argument('-l', '--level', choices=['char', 'word', 'bpe', 'class', 'byte'], default='bpe',
                    help='Encoding Level')
     p.add_argument('-mf', '--min-freq', default=None, type=int,
                    help='Minimum frequency of types for considering inclusion in vocabulary. '

diff --git a/tests/test_codec.py b/tests/test_codec.py
index 603d414..a81768c 100644
--- a/tests/test_codec.py
+++ b/tests/test_codec.py
@@ -5,6 +5,8 @@
 
 from pathlib import Path
 
+import nlcodec
+
 data_dir = Path(__file__).parent.parent / 'data'
 en_txt = data_dir / 'train.en.tok'
 fr_txt = data_dir / 'train.fr.tok'
@@ -74,4 +76,35 @@ def test_class_scheme():
     assert len(table2) == len(table)
     table_str = '\n'.join(x.format() for x in table)
     table2_str = '\n'.join(x.format() for x in table2)
-    assert table_str == table2_str
\ No newline at end of file
+    assert table_str == table2_str
+
+
+def test_byte_scheme():
+    args = dict(inp=IO.read_as_stream(paths=[en_txt, fr_txt]), level='byte')
+    with tempfile.TemporaryDirectory() as tmpdir:
+        model_file = Path(tmpdir) / 'model.tsv'
+        args['model'] = model_file
+        table = nlc.learn_vocab(vocab_size=-1, **args)
+        table2, meta = nlc.Type.read_vocab(model_file)
+        assert len(table2) == len(table)
+        table_str = '\n'.join(x.format() for x in table)
+        table2_str = '\n'.join(x.format() for x in table2)
+        assert table_str == table2_str
+        codec = nlc.load_scheme(model_file)
+        for s in ['hello, world!?&%^&$#@1235214"\'',
+                  "ಕನ್ನಡ ವಿಶ್ವಕೋಶವು ಮೀಡಿಯಾವಿಕಿಯನ್ನು ಬಳಸಿ ಕಟ್ಟಿರುವ ಸ್ವತಂತ್ರ ವಿಶ್ವಕೋಶ.",
+                  "维基百科,自由的百科全书"]:
+            e = codec.encode_str(s)
+            d = codec.decode_str(e)
+            assert s == d
+            e = codec.encode(s)
+            d = codec.decode(e)
+            assert s == d
+
+
+def test_byte_scheme_reserved():
+    codec = nlcodec.ByteScheme()
+    s = codec.encode_str("hello world")
+    s.insert(0, '<s>')  # prepend the reserved BOS token
+    assert codec.decode_str(s) == '<s>hello world'

From 256c8704f7a1feead3c73b2e12ace8373728520f Mon Sep 17 00:00:00 2001
From: Thamme Gowda
Date: Thu, 23 Dec 2021 15:42:06 -0800
Subject: [PATCH 2/6] Update docs

---
 docs/index.html | 105 +++++++++++++++++++++++++-----------------------
 1 file changed, 55 insertions(+), 50 deletions(-)

diff --git a/docs/index.html b/docs/index.html
index ced4500..ffacba0 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -4,28 +4,26 @@
 <title>Natural Language Encoder Decoder (NLCodec)</title>
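
To make the new scheme concrete, below is a minimal usage sketch, not part of the patch series, showing how the `ByteScheme` introduced in PATCH 1/6 round-trips text through hex-coded UTF-8 bytes. It assumes a Python environment with this patch applied (nlcodec >= 0.5); the expected hex pieces shown in the comment follow directly from the `code_to_str` definition above.

```python
# Illustrative sketch (not part of the patch): exercising the `byte` scheme
# from PATCH 1/6. Assumes nlcodec with this patch applied (>= 0.5).
from nlcodec import ByteScheme

codec = ByteScheme()  # fixed vocabulary: 256 byte types, plus BOS/EOS at indices 256/257

text = "héllo"
pieces = codec.encode_str(text)  # one two-digit hex piece per UTF-8 byte
print(pieces)                    # ['68', 'c3', 'a9', '6c', '6c', '6f']
ids = codec.encode(text)         # integer ids; each byte piece maps to its byte value
assert codec.decode_str(pieces) == text
assert codec.decode(ids) == text

# Reserved tokens pass through decode_str untouched, mirroring
# test_byte_scheme_reserved in the patch:
assert codec.decode_str(['<s>'] + pieces) == '<s>' + text
```

Because the vocabulary is fixed by construction, `ByteScheme.learn` is effectively a no-op that returns the initial 258-type table, which is why `learn_vocab(vocab_size=-1, ..., level='byte')` in the test succeeds without counting term frequencies.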