Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
omegacoleman committed Jan 17, 2023
0 parents commit 65f83a8
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 0 deletions.
39 changes: 39 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# 进行了分词标注的2019中文维基语料库

<font size="9">[下载语料库](/releases/latest)</font>

基于经过清洗和切分的2019年中文wiki语料库[wiki2019zh.zip](https://github.com/brightmart/nlp_chinese_corpus#1%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91json%E7%89%88wiki2019zh),使用[hanlp](https://github.com/hankcs/HanLP)中的[COARSE_ELECTRA_SMALL_ZH](https://hanlp.hankcs.com/docs/api/hanlp/pretrained/tok.html)模型进行了分词。

分词结果采用4-tag BMES标注法进行了序列标注,格式如下:

假设被分词的语料是:`你好Tom。我喜欢吃羊肉串。`,标注结果为:

```
你 B
好 E
T B
o M
m E
。 S
SENTENCE END
我 S
喜 B
欢 E
吃 S
羊 B
肉 M
串 E
。 S
SENTENCE END
TEXT END
```
使用中可能需要注意嵌入(embeddings)和标点符号的处理方式,以及语句和语料结束的标志 `SENTENCE END` 和 `TEXT END`。

分词使用的脚本是[process_wiki_data.py](process_wiki_data.py)

运行此脚本需要花费大量的时间:

* CPU型号:Intel Xeon(Cascade Lake) Platinum 8269CY
* CPU主频:2.5Ghz/3.2Ghz
* 花费时间:7天11小时2分钟

102 changes: 102 additions & 0 deletions process_wiki_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import hanlp
import numpy as np
from itertools import chain
from tqdm import tqdm
from enum import IntEnum
from itertools import chain
import json
import argparse
import os
from multiprocessing import Pool, current_process

def split_para(text):
    """Split *text* on newlines and drop empty paragraphs."""
    return [paragraph for paragraph in text.split('\n') if paragraph]

def to_4tag(text, split_sent, tok):
    """Yield (char, tag) pairs for *text* using the 4-tag BMES scheme.

    *split_sent* maps a list of paragraphs to a list of sentence lists;
    *tok* maps a list of sentences to a list of word lists.  Single-char
    words are tagged 'S'; longer words get 'B' / 'M'... / 'E'.  A bare
    ``None`` is yielded after each sentence as a boundary marker.
    """
    sentences = list(chain.from_iterable(split_sent(split_para(text))))
    for words in tok(sentences):
        for word in words:
            if len(word) == 1:
                yield (word, 'S')
            else:
                yield (word[0], 'B')
                for middle_char in word[1:-1]:
                    yield (middle_char, 'M')
                yield (word[-1], 'E')
        yield None

def get_texts_from_wiki_json_file(filename):
    """Read a wiki2019zh dump file (one JSON object per line) and return
    the 'text' field of every line as a list of strings.

    Fixes vs. the original: the file handle was opened and never closed
    (resource leak), and the platform default encoding was used — the
    wiki2019zh dump is UTF-8, so decoding could fail e.g. on Windows.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        # Iterating the file directly avoids materializing readlines().
        return [json.loads(line)['text'] for line in file]

def write_corpus_to(corpus, writable, pbar, split_sent, tok):
    """Tag every text in *corpus* with to_4tag and stream the result to
    *writable*.

    Each (char, tag) pair becomes one "char tag" line; a ``None`` item
    becomes a 'SENTENCE END' line; after each text a 'TEXT END' line is
    written and *pbar* is advanced by one.
    """
    for document in corpus:
        for tagged in to_4tag(document, split_sent, tok):
            if tagged is None:
                writable.write('SENTENCE END\n')
                continue
            writable.write('%s %s\n' % tagged)
        writable.write('TEXT END\n')
        pbar.update(1)

def process_task(filename, dest_filename):
    """Worker entry point: tag one dump file and write it to *dest_filename*.

    Loads a fresh copy of the hanlp tokenizer and sentence splitter in
    this process, creates the destination directory if needed, and
    returns *dest_filename* on success so the parent can record it.
    """
    tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH, verbose=False)
    split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL, verbose=False)
    os.makedirs(os.path.dirname(dest_filename), exist_ok=True)
    corpus = get_texts_from_wiki_json_file(filename)
    # tqdm 'position' gives each pool worker its own progress-bar row;
    # _identity[0] is the worker's 1-based index within the pool.
    worker_row = current_process()._identity[0] - 1
    with tqdm(total=len(corpus), desc=filename, leave=False, position=worker_row) as progress, \
         open(dest_filename, 'w') as out_file:
        write_corpus_to(corpus, out_file, progress, split_sent, tok)
    return dest_filename

def task_wrapper(task):
    """Unpack an (input_path, output_path) pair for Pool.imap_unordered,
    which passes a single argument to its callable."""
    input_path, output_path = task
    return process_task(input_path, output_path)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input_dir')
    parser.add_argument('output_dir')
    args = parser.parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    # Resume support: output paths recorded in this file were fully
    # written by a previous run and must not be processed again.
    completed_list_file = os.path.join(args.output_dir, ".completed_list")
    completed_list = []
    if os.path.exists(completed_list_file):
        with open(completed_list_file) as f:
            completed_list = [l.strip() for l in f.readlines()]

    # Mirror the input tree: each input file maps to an output file at
    # the same relative path under output_dir.
    all_tasks = []
    for (dirpath, _, files) in os.walk(args.input_dir):
        for file in files:
            all_tasks.append((os.path.join(dirpath, file),
                              os.path.join(args.output_dir,
                                           os.path.relpath(dirpath, args.input_dir),
                                           file), ))

    tasks = []
    for (ifile, ofile) in all_tasks:
        if ofile in completed_list:
            # BUG FIX: the original printed "skipping" but still appended
            # the task, so completed files were reprocessed anyway.
            print('%s already completed, skipping ..' % ofile)
            continue
        tasks.append((ifile, ofile, ))

    print('Tasks:')
    for (ifile, ofile) in tasks:
        print('  %s --> %s' % (ifile, ofile))

    # Confirm before committing to days of CPU time (accepts Y/y, N/n).
    proceed = False
    while True:
        s = input('is this ok? [y/n] : ').strip().lower()
        if s in ('y', 'n'):
            proceed = (s == 'y')
            break

    if proceed:
        processes = os.cpu_count()
        with Pool(processes) as pool:
            # position=processes + 1 keeps the total bar below the
            # per-worker bars drawn inside process_task.
            with tqdm(total=len(tasks), desc='Total', leave=False, position=processes + 1) as pbar:
                for completed_file in pool.imap_unordered(task_wrapper, tasks):
                    # Append immediately so an interrupted run can resume
                    # without redoing already-finished files.
                    with open(completed_list_file, 'a') as f:
                        f.write('%s\n' % completed_file)
                    pbar.update(1)

0 comments on commit 65f83a8

Please sign in to comment.