import os
import functools
import signal
import concurrent.futures

import ja_sentence_segmenter
import datasets
import jagger
from tqdm import tqdm

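# Jagger is a pattern-based Japanese morphological analyzer / POS tagger;
# "model/kwdlc/patterns" is assumed to point to a locally prepared pattern
# model built from KWDLC (Kyoto University Web Document Leads Corpus).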
model_path = "model/kwdlc/patterns"
tagger = jagger.Jagger()
tagger.load_model(model_path)


from ja_sentence_segmenter.common.pipeline import make_pipeline
from ja_sentence_segmenter.concatenate.simple_concatenator import concatenate_matching
from ja_sentence_segmenter.normalize.neologd_normalizer import normalize
from ja_sentence_segmenter.split.simple_splitter import split_newline, split_punctuation

# Assume the wiki text ends sentences with '。' (never an ASCII period '.').
split_punc = functools.partial(split_punctuation, punctuations=r"。 !?")
concat_tail_no = functools.partial(concatenate_matching, former_matching_rule=r"^(?P<result>.+)(の)$", remove_former_matched=False)
segmenter = make_pipeline(normalize, split_newline, concat_tail_no, split_punc)
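# Pipeline order: NEologd-style normalization, split on newlines, re-join a
# line ending in 'の' with the following line (so a line break after 'の'
# does not become a sentence break), then split on the punctuation above.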


interrupted = False

def handler(signum, frame):
    # Graceful shutdown: just set a flag and let the processing loop stop itself.
    print('Signal handler called with signal', signum)

    global interrupted
    interrupted = True


dss = datasets.load_dataset("range3/wiki40b-ja")
print(dss)

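# In wiki40b, each example's 'text' field is a newline-separated mix of
# structural markers and content; a "_START_PARAGRAPH_" marker is followed by
# the paragraph text, in which embedded newlines are encoded as "_NEWLINE_".
# The functions below keep only those paragraph lines.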
def senter(text):
    """Sentence-split `text` and POS-tag each sentence with jagger.

    Returns MeCab-style output: one "surface<TAB>feature" line per token,
    followed by an 'EOS' line after each sentence.
    """
    result = list(segmenter(text))

    outputs = ''
    for sent in result:

        toks = tagger.tokenize(sent)

        pos_tagged = ''
        for tok in toks:
            pos_tagged += tok.surface() + '\t' + tok.feature() + '\n'

        pos_tagged += 'EOS\n'

        # No blank line between sentences; sentences are delimited by 'EOS' only.
        outputs += pos_tagged

    return outputs


def singleprocess_proc(f):
    for example in tqdm(dss['train']):
        # The text field is newline-separated; split on '\n' (rather than any
        # whitespace) so paragraphs that contain spaces stay intact.
        lines = example['text'].split('\n')

        # Extract paragraph lines only (the line following a _START_PARAGRAPH_ marker).
        in_paragraph = False

        for line in lines:
            if in_paragraph:
                line = line.replace("_NEWLINE_", '\n')
                line = senter(line)
                f.write(line)
                in_paragraph = False

            if line == "_START_PARAGRAPH_":
                in_paragraph = True


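# Worker for the ProcessPoolExecutor path: takes a list of raw example texts
# and returns {'text': [...]} holding the POS-tagged output for each example.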
def run_task(texts: list[str]):
    out_texts = []

    #global interrupted

    for text in texts:
        #print(text)
        #if interrupted:
        #    return None

        # Newline-separated structure, same as in singleprocess_proc().
        lines = text.split('\n')

        # Extract paragraph lines only.
        in_paragraph = False

        txt_result = ''
        for line in lines:
            if in_paragraph:
                line = line.replace("_NEWLINE_", '\n')
                line = senter(line)

                txt_result += line
                in_paragraph = False

            if line == "_START_PARAGRAPH_":
                in_paragraph = True

        out_texts.append(txt_result)

    return {'text': out_texts}


def multiprocess_proc(f):

    split_name = 'train'

    nprocs = max(1, os.cpu_count() // 2)
    print("nprocs", nprocs)
    # A small batch size is slow (likely due to the overhead of creating many
    # Python future objects); 10000 or more is recommended for wiki40b-ja 'train'.
    nexamples_per_batch = 10000

    # datasets.map would be an easy solution, but it consumes a lot of disk
    # space (it materializes the mapped dataset in the on-disk cache),
    # so it is disabled for now.
    #
    # processed_ds = dss['train'].map(run_task, batched=True, batch_size=nexamples_per_batch, num_proc=nprocs)
    # for p in tqdm(processed_ds['text']):
    #     f.write(p)

    # ProcessPoolExecutor version

    chunks = []
    for i in tqdm(range(0, len(dss[split_name]['text']), nexamples_per_batch), desc="[chunking input]"):
        chunks.append(dss[split_name]['text'][i:i+nexamples_per_batch])

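    # Note: the slicing above materializes the entire 'text' column as plain
    # Python lists in the driver process, which (together with per-worker
    # copies of each chunk) is likely the main source of the 20GB-40GB memory
    # usage mentioned in __main__.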
    signal.signal(signal.SIGINT, handler)
    total_ticks = len(chunks)
    with tqdm(total=total_ticks) as pbar:
        with concurrent.futures.ProcessPoolExecutor(max_workers=nprocs) as executor:
            futures = {executor.submit(run_task, chunks[i]): i for i in range(len(chunks))}

            for future in concurrent.futures.as_completed(futures):
                result = future.result()

                # Single writer: only the main process touches the output file.
                # as_completed yields futures in completion order, so the output
                # order may differ from the input order.
                for text in result['text']:
                    f.write(text)

                del result

                pbar.update(1)

                del future
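                # A minimal sketch for honoring the SIGINT flag set by handler()
                # in the main process (the per-worker checks in run_task are
                # commented out). cancel_futures requires Python 3.9+.
                if interrupted:
                    print("Interrupted: cancelling pending chunks")
                    executor.shutdown(wait=False, cancel_futures=True)
                    break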


if __name__ == '__main__':

    # The multiprocessing path finishes in a few minutes but can use roughly
    # 20GB-40GB of memory. Use singleprocess_proc() instead if you run out of memory.
    with open("output-wiki-postagged.txt", 'w', encoding='utf-8') as f:
        # singleprocess_proc(f)
        multiprocess_proc(f)