From 08e97a9ad41c2c63869813b9119bf30cfd792976 Mon Sep 17 00:00:00 2001
From: raynardj
Date: Mon, 1 Feb 2021 16:27:53 +0800
Subject: [PATCH] =?UTF-8?q?=F0=9F=A6=BE=20io=20doc=20page?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md             |   2 +-
 docs/loader/README.md | 142 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 docs/loader/README.md

diff --git a/README.md b/README.md
index 29d5b14..027ac98 100644
--- a/README.md
+++ b/README.md
@@ -101,7 +101,7 @@ app = NERTask.from_df(
 #### From downloaded data => pytorch dataset
 > For downloaded NER data tags, you can create a dataloader with the json file automatically:
-* [pytorch + huggingface tokenizer](https://github.com/raynardj/langhuan/tree/main/tests/loader.ipynb)
+* [pytorch + huggingface tokenizer](https://raynardj.github.io/langhuan/docs/loaders)
 * tensorflow + huggingface tokenizer, development pending
 
 #### Gunicorn support

diff --git a/docs/loader/README.md b/docs/loader/README.md
new file mode 100644
index 0000000..7f03256
--- /dev/null
+++ b/docs/loader/README.md
@@ -0,0 +1,142 @@
+# Ways to load NER dataset
+
+## For huggingface tokenizer
+> If you're using a huggingface tokenizer, most of the preprocessing can be automated as follows.
+
+First, load a tokenizer:
+
+```python
+from transformers import AutoTokenizer
+tk = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
+```
+
+Load the downloaded data with the pre-designed pipeline:
+
+```python
+from langhuan.loaders import load_ner_data_pytorch_huggingface
+```
+
+This step returns a dataset:
+
+```python
+data_ds = load_ner_data_pytorch_huggingface(
+    "ner_result_sample.json",
+    tk,
+)
+```
+
+Get a data loader; this function saves you the effort of specifying `collate_fn`:
+
+```python
+data_loader = data_ds.get_data_loader(batch_size=3, num_workers=2)
+```
+
+Split one dataset into train / valid:
+
+```python
+train_ds, val_ds = data_ds.split_train_valid(valid_ratio=.2)
+len(train_ds), len(val_ds)
+```
+
+    (7, 1)
+
+## Test a sample of x, y
+
+```python
+x, y = data_ds.one_batch(5)
+```
+
+```python
+x, y
+```
+
+    (tensor([[ 101, 2013, 1024,  ...,    0,    0,    0],
+             [ 101, 2013, 1024,  ...,    0,    0,    0],
+             [ 101, 2013, 1024,  ..., 1007, 1012,  102],
+             [ 101, 2013, 1024,  ...,    0,    0,    0],
+             [ 101, 2013, 1024,  ...,    0,    0,    0]]),
+     tensor([[0, 0, 0,  ..., 0, 0, 0],
+             [0, 0, 0,  ..., 0, 0, 0],
+             [0, 0, 0,  ..., 0, 0, 0],
+             [0, 0, 0,  ..., 0, 0, 0],
+             [0, 0, 0,  ..., 0, 0, 0]]))
+
+Here, slicing the sequences to a maximum length is left in the hands of the user:
+
+```python
+x.shape, y.shape
+```
+
+    (torch.Size([5, 838]), torch.Size([5, 838]))
+
+## Convert x, y back to NER tags
+This also works for predicted y.
+
+Make sure both x and y tensors are:
+* torch.LongTensor
+* on CPU, not CUDA
+
+```python
+data_ds.decode(x, y)
+```
+
+    [{'row_id': 1,
+      'token_id': 30,
+      'text': 'smithsonian astrophysical observatory',
+      'label': 'school'},
+     {'row_id': 2,
+      'token_id': 34,
+      'text': 'new mexico state university',
+      'label': 'school'},
+     {'row_id': 2, 'token_id': 565, 'text': 'ibm', 'label': 'company'},
+     {'row_id': 2, 'token_id': 633, 'text': 'ibm', 'label': 'company'},
+     {'row_id': 2, 'token_id': 655, 'text': 'quadra', 'label': 'company'},
+     {'row_id': 2, 'token_id': 664, 'text': 'apple', 'label': 'company'},
+     {'row_id': 2, 'token_id': 809, 'text': 'quadra', 'label': 'company'},
+     {'row_id': 2, 'token_id': 821, 'text': 'digital review', 'label': 'company'},
+     {'row_id': 3, 'token_id': 32, 'text': 'purdue university', 'label': 'school'},
+     {'row_id': 3,
+      'token_id': 35,
+      'text': 'engineering computer network',
+      'label': 'company'},
+     {'row_id': 3,
+      'token_id': 441,
+      'text': 'purdue electrical engineering',
+      'label': 'company'},
+     {'row_id': 4,
+      'token_id': 68,
+      'text': 'university of washington',
+      'label': 'school'},
+     {'row_id': 4, 'token_id': 97, 'text': 'si', 'label': 'company'}]
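+
+Since `decode` also works for predicted tags, a typical pattern is to run a
+forward pass, take the argmax over the label dimension, and move the result to
+a CPU `LongTensor` before decoding. A minimal sketch, assuming a huggingface
+token-classification model; the model choice and `num_labels` below are
+illustrative assumptions, not part of langhuan:
+
+```python
+import torch
+from transformers import AutoModelForTokenClassification
+
+# hypothetical model; num_labels must match the size of your NER tag set
+model = AutoModelForTokenClassification.from_pretrained(
+    "bert-base-uncased", num_labels=5)
+
+model.eval()
+with torch.no_grad():
+    logits = model(input_ids=x).logits  # (batch, seq_len, num_labels)
+
+# argmax over the label dimension -> predicted tag ids,
+# converted to the CPU LongTensor that decode expects
+y_pred = logits.argmax(dim=-1).long().cpu()
+
+data_ds.decode(x.cpu(), y_pred)
+```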
+
+## Tensorflow
+> Development pending; [check here](https://github.com/raynardj/langhuan) if you'd like to help.
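+
+## Minimal training loop
+For reference, here is a sketch of a training loop over the `data_loader` built
+above. The model choice, `num_labels`, and optimizer settings are illustrative
+assumptions, not langhuan APIs:
+
+```python
+import torch
+from transformers import AutoModelForTokenClassification
+
+# assumed label count; set it to the size of your NER tag set
+model = AutoModelForTokenClassification.from_pretrained(
+    "bert-base-uncased", num_labels=5)
+optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
+
+model.train()
+for x, y in data_loader:
+    optimizer.zero_grad()
+    # huggingface token-classification models compute a loss
+    # when labels are passed along with input_ids
+    out = model(input_ids=x, labels=y)
+    out.loss.backward()
+    optimizer.step()
+```
+
+In practice you would likely also pass an attention mask, so that padding
+tokens do not contribute to the loss.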