From 644523eb675906d2cbdf453a3dff703d01b85800 Mon Sep 17 00:00:00 2001
From: Xiaoquan Kong
Date: Tue, 10 Jul 2018 23:46:50 +0800
Subject: [PATCH] Add multiprocess wiki-extractor argument

---
 extract_wikipedia_json_corpus.bash | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/extract_wikipedia_json_corpus.bash b/extract_wikipedia_json_corpus.bash
index b7a13af..a2fe104 100755
--- a/extract_wikipedia_json_corpus.bash
+++ b/extract_wikipedia_json_corpus.bash
@@ -1,3 +1,7 @@
 #!/bin/bash
 
-WikiExtractor.py --json raw_data/zhwiki-latest-pages-articles.xml.bz2 -o extracted_json_data
+# Worker count: all CPUs reported by nproc minus one, but never below 1
+# (a single-core host, or a failed nproc, would otherwise yield 0 or empty).
+cpu_count=$(nproc --all)
+process_count=$(( cpu_count > 1 ? cpu_count - 1 : 1 ))
+WikiExtractor.py --json raw_data/zhwiki-latest-pages-articles.xml.bz2 -o extracted_json_data --processes "${process_count}"