make_corpus.py (forked from cl-tohoku/bert-japanese)
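# NOTE: the example command in the docstring is illustrative; the paths are placeholders.
"""Build a plain-text corpus from a Wikipedia dump extract.

Reads a bz2-compressed JSON Lines file (one article per line, e.g. the output
of an extractor such as WikiExtractor), strips <a href="..."> link markup,
applies NFKC normalization, splits paragraphs into sentences with MeCab,
filters sentences by length, and writes one sentence per line, with a blank
line between documents.

Example invocation (a sketch; file names and the MeCab dictionary path are
placeholders, and --mecab_dict_path is optional):

    python make_corpus.py --input_file jawiki_extracted.json.bz2 --output_file corpus.txt
"""
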
import re
import bz2
import json
import unicodedata
import argparse
import MeCab
from logzero import logger


class MeCabSentenceSplitter(object):
    """Split Japanese text into sentences using MeCab's 記号,句点 (period) tags."""

    def __init__(self, mecab_dict_path=None):
        if mecab_dict_path is not None:
            self.mecab = MeCab.Tagger('-d {}'.format(mecab_dict_path))
        else:
            self.mecab = MeCab.Tagger()

    def __call__(self, text):
        sentences = []
        start = 0
        end = 0
        for line in self.mecab.parse(text).split('\n'):
            if line == 'EOS':
                if text[start:]:
                    sentences.append(text[start:])
                break

            token, token_info = line.split('\t')
            end = text.index(token, end) + len(token)
            if token_info.startswith('記号,句点,'):
                sentences.append(text[start:end])
                start = end

        return sentences
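
# Illustrative sketch of MeCabSentenceSplitter (assumes a dictionary such as
# IPAdic that tags '。' with the feature prefix '記号,句点,'; other dictionaries
# may use different feature strings):
#
#   splitter = MeCabSentenceSplitter()
#   splitter('今日は晴れです。明日は雨です')
#   # -> ['今日は晴れです。', '明日は雨です']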


def preprocess_text(text):
    """Collapse repeated '、', drop '、' stranded next to parentheses,
    remove empty '()', and squeeze whitespace."""
    text = re.sub(r'、+', '、', text)
    text = text.replace('(、', '(')
    text = text.replace('、)', ')')
    text = text.replace('()', '')
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def filter_text(text, min_length, max_length):
    """Return False for table-like text (consecutive '|' cells) or text
    outside the [min_length, max_length] character range."""
    if re.search(r'\| *\|+', text):
        return False
    if len(text) < min_length or len(text) > max_length:
        return False
    return True
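
# regex_link (below) keeps only the anchor text of link markup that remains
# when links are preserved during extraction, e.g.:
#   regex_link.sub(r'\2', '<a href="東京都">東京</a>へ行く')  ->  '東京へ行く'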
regex_link = re.compile(r'\<a href="(.*?)"\>(.*?)\</a\>')


def main(args):
    sent_splitter = MeCabSentenceSplitter(args.mecab_dict_path)

    num_processed_docs = 0
    with bz2.open(args.input_file, 'rt') as input_file, \
         open(args.output_file, 'w') as output_file:
        for line in input_file:
            page_item = json.loads(line)
            text = page_item['text']

            # replace links
            text = regex_link.sub(r'\2', text)

            # normalize text
            text = unicodedata.normalize('NFKC', text)

            # drop the first block (usually the article title) and split the
            # remaining paragraphs into sentences
            paragraphs = re.split(r'\n\n+', text)[1:]
            sentences = [preprocess_text(s) for p in paragraphs
                         for s in sent_splitter(p)]

            # ignore too short/long sentences
            sentences = [s for s in sentences
                         if filter_text(s, args.min_length, args.max_length)]
            if sentences:
                # write document to a file
                for s in sentences:
                    assert '\n' not in s, s
                    assert s, s
                    output_file.write(s + '\n')

                output_file.write('\n')

                num_processed_docs += 1
                if args.debug and num_processed_docs == 1000:
                    logger.info('processed: {}'.format(num_processed_docs))
                    break

                # logging
                if num_processed_docs % 10000 == 0:
                    logger.info('processed: {}'.format(num_processed_docs))

    if num_processed_docs % 10000 != 0:
        logger.info('processed: {}'.format(num_processed_docs))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', type=str, required=True,
                        help='preprocessed Wikipedia articles file (.bz2)')
    parser.add_argument('--output_file', type=str, required=True,
                        help='output corpus file')
    parser.add_argument('--min_length', type=int, default=16,
                        help='only extract sentences with no less than N characters [16]')
    parser.add_argument('--max_length', type=int, default=1024,
                        help='only extract sentences with no more than N characters [1024]')
    parser.add_argument('--mecab_dict_path', type=str,
                        help='path to MeCab dictionary')
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args()
    main(args)