# 2019 Chinese Wikipedia Corpus with Word-Segmentation Annotations

<font size="9">[Download the corpus](/releases/latest)</font>

Starting from the cleaned and split 2019 Chinese Wikipedia corpus [wiki2019zh.zip](https://github.com/brightmart/nlp_chinese_corpus#1%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91json%E7%89%88wiki2019zh), word segmentation was performed with the [COARSE_ELECTRA_SMALL_ZH](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/tok.html) model from [hanlp](https://github.com/hankcs/HanLP).
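
For reference, loading and applying the same tokenizer can look roughly like this (a minimal sketch; the sample sentence is only an illustration):

```
import hanlp

# Load the same coarse-grained tokenizer used to build this corpus.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)

# Tokenizing one sentence returns a list of words,
# e.g. ['你好', 'Tom', '。']
print(tok('你好Tom。'))
```

Passing a list of sentences instead of a single string tokenizes them as one batch, which is what the script below does.
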
The segmentation results are sequence-labeled with the 4-tag BMES scheme (B = beginning of a word, M = middle, E = end, S = single-character word), in the following format.

Suppose the text to be segmented is `你好Tom。我喜欢吃羊肉串。`; the tagged result is:
```
你 B
好 E
T B
o M
m E
。 S
SENTENCE END
我 S
喜 B
欢 E
吃 S
羊 B
肉 M
串 E
。 S
SENTENCE END
TEXT END
```
When using the corpus, pay attention to how embeddings and punctuation are handled, as well as to the sentence and text terminators `SENTENCE END` and `TEXT END`.
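
As a minimal sketch, the tagged format can be read back into sentences of words like this, assuming the fixed `char TAG` line layout shown above (the function name `read_tagged` and the file path are hypothetical):

```
def read_tagged(path):
    # Yields each text as a list of sentences,
    # where each sentence is a list of words.
    text, sentence, word = [], [], ''
    with open(path) as f:
        for line in f:
            line = line.rstrip('\n')
            if line == 'TEXT END':
                yield text
                text = []
            elif line == 'SENTENCE END':
                text.append(sentence)
                sentence = []
            else:
                # Lines have the fixed layout "<char> <tag>".
                char, tag = line[0], line[2]
                word += char
                if tag in ('S', 'E'):  # both S and E close a word
                    sentence.append(word)
                    word = ''

for text in read_tagged('corpus_part.txt'):
    print(text)
```
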
The script used for segmentation is [process_wiki_data.py](process_wiki_data.py).
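
The script walks every file under an input directory and writes tagged files to a mirrored output directory. A hypothetical invocation (directory names are placeholders):

```
python process_wiki_data.py wiki_zh/ wiki_zh_tagged/
```
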
Running this script takes a long time. For reference:

* CPU model: Intel Xeon (Cascade Lake) Platinum 8269CY
* CPU clock: 2.5 GHz base / 3.2 GHz turbo
* Time taken: 7 days, 11 hours, 2 minutes
The contents of [process_wiki_data.py](process_wiki_data.py) follow:

```
import hanlp
import json
import argparse
import os
from itertools import chain
from tqdm import tqdm
from multiprocessing import Pool, current_process


def split_para(text):
    # Split raw text into non-empty paragraphs, one per newline.
    return list(filter(None, text.split('\n')))


def to_4tag(text, split_sent, tok):
    # Yield (character, tag) pairs in the 4-tag BMES scheme,
    # with None marking the end of each sentence.
    full_split = list(chain(*split_sent(split_para(text))))
    for sentence in tok(full_split):
        for word in sentence:
            if len(word) == 1:
                yield (word, 'S', )
            else:
                yield (word[0], 'B', )
                for char in word[1:-1]:
                    yield (char, 'M', )
                yield (word[-1], 'E', )
        yield None


def get_texts_from_wiki_json_file(filename):
    # Each line of a wiki2019zh file is a JSON object with a 'text' field.
    with open(filename, 'r') as file:
        return [json.loads(line)['text'] for line in file]


def write_corpus_to(corpus, writable, pbar, split_sent, tok):
    # Write one "char TAG" line per character, with SENTENCE END and
    # TEXT END markers separating sentences and articles.
    for text in corpus:
        for item in to_4tag(text, split_sent, tok):
            if item is None:
                writable.write('SENTENCE END\n')
            else:
                writable.write('%s %s\n' % (item[0], item[1]))
        writable.write('TEXT END\n')
        pbar.update(1)


def process_task(filename, dest_filename):
    # Each worker loads its own copies of the tokenizer and the
    # sentence splitter, since the models are not shared across processes.
    tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH, verbose=False)
    split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL, verbose=False)
    os.makedirs(os.path.dirname(dest_filename), exist_ok=True)
    corpus = get_texts_from_wiki_json_file(filename)
    # corpus = corpus[:3]  # uncomment to test on a small sample
    current = current_process()
    # Give each worker its own progress-bar row.
    with tqdm(total=len(corpus), desc=filename, leave=False, position=current._identity[0] - 1) as pbar:
        with open(dest_filename, 'w') as ofile:
            write_corpus_to(corpus, ofile, pbar, split_sent, tok)
    return dest_filename


def task_wrapper(task):
    # Unpack the (input, output) pair for Pool.imap_unordered.
    return process_task(task[0], task[1])


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input_dir')
    parser.add_argument('output_dir')
    args = parser.parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    # Output files recorded here were finished in a previous run,
    # so an interrupted run can be resumed.
    completed_list_file = os.path.join(args.output_dir, ".completed_list")
    completed_list = []
    if os.path.exists(completed_list_file):
        with open(completed_list_file) as f:
            completed_list = [l.strip() for l in f.readlines()]

    # Mirror the input directory tree into the output directory.
    all_tasks = []
    for (dirpath, _, files) in os.walk(args.input_dir):
        for file in files:
            all_tasks.append((os.path.join(dirpath, file),
                              os.path.join(args.output_dir, os.path.relpath(dirpath, args.input_dir), file), ))

    # Skip files already recorded as completed.
    tasks = []
    for (ifile, ofile) in all_tasks:
        if ofile in completed_list:
            print('%s already completed, skipping ..' % ofile)
            continue
        tasks.append((ifile, ofile, ))

    print('Tasks:')
    for (ifile, ofile) in tasks:
        print(' %s --> %s' % (ifile, ofile))

    # Confirm before starting what may be a multi-day run.
    proceed = False
    while True:
        s = input('is this ok? [y/n] : ')
        if s == 'y':
            proceed = True
            break
        if s == 'n':
            proceed = False
            break

    if proceed:
        processes = os.cpu_count()
        with Pool(processes) as pool:
            with tqdm(total=len(tasks), desc='Total', leave=False, position=processes + 1) as pbar:
                for completed_file in pool.imap_unordered(task_wrapper, tasks):
                    # Record each finished file immediately so progress
                    # survives an interruption.
                    with open(completed_list_file, 'a') as f:
                        f.write('%s\n' % completed_file)
                    pbar.update(1)
```