# 2019 Chinese Wikipedia Corpus with Word-Segmentation Annotations

<font size="9">[Download the corpus](/releases/latest)</font>

Starting from the cleaned and split 2019 Chinese Wikipedia corpus [wiki2019zh.zip](https://github.com/brightmart/nlp_chinese_corpus#1%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91json%E7%89%88wiki2019zh), word segmentation was performed with the [COARSE_ELECTRA_SMALL_ZH](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/tok.html) model from [hanlp](https://github.com/hankcs/HanLP).
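
For reference, loading and applying the same tokenizer can look roughly like this (a minimal sketch; the sample sentence is only an illustration):

```
import hanlp

# Load the same coarse-grained tokenizer used to build this corpus.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)

# Tokenizing one sentence returns a list of words,
# e.g. ['你好', 'Tom', '。']
print(tok('你好Tom。'))
```

Passing a list of sentences instead of a single string tokenizes them as one batch, which is what the script below does.
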
The segmentation results are sequence-labeled with the 4-tag BMES scheme (B = beginning of a word, M = middle, E = end, S = single-character word), in the following format.

Suppose the text to be segmented is `你好Tom。我喜欢吃羊肉串。`; the tagged result is:
```
你 B
好 E
T B
o M
m E
。 S
SENTENCE END
我 S
喜 B
欢 E
吃 S
羊 B
肉 M
串 E
。 S
SENTENCE END
TEXT END
```
When using the corpus, pay attention to how embeddings and punctuation are handled, as well as to the sentence and text terminators `SENTENCE END` and `TEXT END`.
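
As a minimal sketch, the tagged format can be read back into sentences of words like this, assuming the fixed `char TAG` line layout shown above (the function name `read_tagged` and the file path are hypothetical):

```
def read_tagged(path):
    # Yields each text as a list of sentences,
    # where each sentence is a list of words.
    text, sentence, word = [], [], ''
    with open(path) as f:
        for line in f:
            line = line.rstrip('\n')
            if line == 'TEXT END':
                yield text
                text = []
            elif line == 'SENTENCE END':
                text.append(sentence)
                sentence = []
            else:
                # Lines have the fixed layout "<char> <tag>".
                char, tag = line[0], line[2]
                word += char
                if tag in ('S', 'E'):  # both S and E close a word
                    sentence.append(word)
                    word = ''

for text in read_tagged('corpus_part.txt'):
    print(text)
```
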
The script used for segmentation is [process_wiki_data.py](process_wiki_data.py).
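
The script walks every file under an input directory and writes tagged files to a mirrored output directory. A hypothetical invocation (directory names are placeholders):

```
python process_wiki_data.py wiki_zh/ wiki_zh_tagged/
```
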
Running this script takes a long time. For reference:

* CPU model: Intel Xeon (Cascade Lake) Platinum 8269CY
* CPU clock: 2.5 GHz base / 3.2 GHz turbo
* Time taken: 7 days, 11 hours, 2 minutes
The contents of [process_wiki_data.py](process_wiki_data.py) follow:

```
import hanlp
import json
import argparse
import os
from itertools import chain
from tqdm import tqdm
from multiprocessing import Pool, current_process


def split_para(text):
    # Split raw text into non-empty paragraphs, one per newline.
    return list(filter(None, text.split('\n')))


def to_4tag(text, split_sent, tok):
    # Yield (character, tag) pairs in the 4-tag BMES scheme,
    # with None marking the end of each sentence.
    full_split = list(chain(*split_sent(split_para(text))))
    for sentence in tok(full_split):
        for word in sentence:
            if len(word) == 1:
                yield (word, 'S', )
            else:
                yield (word[0], 'B', )
                for char in word[1:-1]:
                    yield (char, 'M', )
                yield (word[-1], 'E', )
        yield None


def get_texts_from_wiki_json_file(filename):
    # Each line of a wiki2019zh file is a JSON object with a 'text' field.
    with open(filename, 'r') as file:
        return [json.loads(line)['text'] for line in file]


def write_corpus_to(corpus, writable, pbar, split_sent, tok):
    # Write one "char TAG" line per character, with SENTENCE END and
    # TEXT END markers separating sentences and articles.
    for text in corpus:
        for item in to_4tag(text, split_sent, tok):
            if item is None:
                writable.write('SENTENCE END\n')
            else:
                writable.write('%s %s\n' % (item[0], item[1]))
        writable.write('TEXT END\n')
        pbar.update(1)


def process_task(filename, dest_filename):
    # Each worker loads its own copies of the tokenizer and the
    # sentence splitter, since the models are not shared across processes.
    tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH, verbose=False)
    split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL, verbose=False)
    os.makedirs(os.path.dirname(dest_filename), exist_ok=True)
    corpus = get_texts_from_wiki_json_file(filename)
    # corpus = corpus[:3]  # uncomment to test on a small sample
    current = current_process()
    # Give each worker its own progress-bar row.
    with tqdm(total=len(corpus), desc=filename, leave=False, position=current._identity[0] - 1) as pbar:
        with open(dest_filename, 'w') as ofile:
            write_corpus_to(corpus, ofile, pbar, split_sent, tok)
    return dest_filename


def task_wrapper(task):
    # Unpack the (input, output) pair for Pool.imap_unordered.
    return process_task(task[0], task[1])


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input_dir')
    parser.add_argument('output_dir')
    args = parser.parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    # Output files recorded here were finished in a previous run,
    # so an interrupted run can be resumed.
    completed_list_file = os.path.join(args.output_dir, ".completed_list")
    completed_list = []
    if os.path.exists(completed_list_file):
        with open(completed_list_file) as f:
            completed_list = [l.strip() for l in f.readlines()]

    # Mirror the input directory tree into the output directory.
    all_tasks = []
    for (dirpath, _, files) in os.walk(args.input_dir):
        for file in files:
            all_tasks.append((os.path.join(dirpath, file),
                              os.path.join(args.output_dir, os.path.relpath(dirpath, args.input_dir), file), ))

    # Skip files already recorded as completed.
    tasks = []
    for (ifile, ofile) in all_tasks:
        if ofile in completed_list:
            print('%s already completed, skipping ..' % ofile)
            continue
        tasks.append((ifile, ofile, ))

    print('Tasks:')
    for (ifile, ofile) in tasks:
        print(' %s --> %s' % (ifile, ofile))

    # Confirm before starting what may be a multi-day run.
    proceed = False
    while True:
        s = input('is this ok? [y/n] : ')
        if s == 'y':
            proceed = True
            break
        if s == 'n':
            proceed = False
            break

    if proceed:
        processes = os.cpu_count()
        with Pool(processes) as pool:
            with tqdm(total=len(tasks), desc='Total', leave=False, position=processes + 1) as pbar:
                for completed_file in pool.imap_unordered(task_wrapper, tasks):
                    # Record each finished file immediately so progress
                    # survives an interruption.
                    with open(completed_list_file, 'a') as f:
                        f.write('%s\n' % completed_file)
                    pbar.update(1)
```