From 37c07f1cb1bd89d795950a38d87bcac5131366f2 Mon Sep 17 00:00:00 2001 From: Santiago Goycoechea Date: Tue, 19 Apr 2022 13:38:05 -0300 Subject: [PATCH 1/3] Add scripts to preprocess europarl corpus and generate alignments --- README.md | 244 ++++++++---------- .../generate_alignments.sh | 12 + .../generate-alignments/remove_bad_rows.py | 60 +++-- .../train-mask-align/preprocess_europarl.sh | 23 ++ scripts/train-mask-align/split_corpus.py | 42 +++ 5 files changed, 214 insertions(+), 167 deletions(-) create mode 100644 scripts/generate-alignments/generate_alignments.sh create mode 100644 scripts/train-mask-align/preprocess_europarl.sh create mode 100644 scripts/train-mask-align/split_corpus.py diff --git a/README.md b/README.md index 56b47b6..1c52b54 100644 --- a/README.md +++ b/README.md @@ -1,137 +1,109 @@ -# Mask-Align for NewsQA-es - -This repo forks [THUNLP-MT/Mask-Align](https://github.com/THUNLP-MT/Mask-Align) to adapt it to translate -the [NewsQA](https://www.microsoft.com/en-us/research/project/newsqa-dataset/) reading comprehension dataset to Spanish. -Mask-Align is an algorithm that aligns translations to a token level, which allows us to find the English answer span -inside the translated Spanish text. - -## Setup - -First, clone this repo: - -```bash -git clone https://github.com/pln-fing-udelar/Mask-Align -cd Mask-Align/ -``` - -Then, using [Conda](https://docs.conda.io/en/latest/index.html), run: - -```bash -conda env update -conda activate mask-align -``` - -## Training Mask-Align in English-Spanish - -We need a trained Mask-Align model to align translations between English and Spanish. To download the pretrained model, run the following commands: - -```bash -mkdir -p spanish-output/output -curl -o spanish-output/output/model-1.pt https://www.fing.edu.uy/owncloud/index.php/s/siRkUqxnwmdtfaJ/download -``` - -Alternatively, follow these steps to train it yourself. - -### Prepare the Corpus - -1. Download the [Europarl Spanish-English parallel corpus](https://www.statmt.org/europarl/v7/es-en.tgz). -2. Remove the sentences that don't form a pair (the sentences that correspond with an empty line). -3. Remove sentences of length 1. -4. Remove sentences that contain tags (characters "<" and ">"). -5. Split the corpus into test, train and val. A good size can be 2000 sentences for test and 2000 for val. Name the - files `corpus.es`, `validation.es`, `test.es`, `corpus.en`, `validation.en` and `test.en`. -7. 
Run the following commands, to learn the vocabulary, tokenize the sentences, and shuffle the corpus: - - ```bash - spm_train --input=corpus.en --model_prefix=en --vocab_size=32000 --character_coverage=1.0 --model_type=unigram - spm_train --input=corpus.es --model_prefix=es --vocab_size=32000 --character_coverage=1.0 --model_type=unigram - spm_encode --model=en.model --output_format=piece < corpus.en > corpus.32k.en - spm_encode --model=en.model --output_format=piece < validation.en > validation.32k.en - spm_encode --model=en.model --output_format=piece < test.en > test.32k.en - spm_encode --model=es.model --output_format=piece < corpus.es > corpus.32k.es - spm_encode --model=es.model --output_format=piece < validation.es > validation.32k.es - spm_encode --model=es.model --output_format=piece < test.es > test.32k.es - python thualign/scripts/shuffle_corpus.py --corpus corpus.32k.es corpus.32k.en - sed -i -e 's///' -e 's///' corpus.32k.en - sed -i -e 's///' -e 's///' validation.32k.en - sed -i -e 's///' -e 's///' test.32k.en - sed -i -e 's///' -e 's///' corpus.32k.es - sed -i -e 's///' -e 's///' validation.32k.es - sed -i -e 's///' -e 's///' test.32k.es - ``` - -### Train the Model - -Note you need a computer with a CUDA-capable GPU to train the model. - -1. In the config file [`thualign/configs/user/spanish.config`](thualign/configs/user/spanish.config), specify the - location of the following files: - - * `corpus.32k.es.shuf` - * `corpus.32k.en.shuf` - * `validation.32k.es` - * `validation.32k.en` - * `test.32k.es` - * `test.32k.en` - * `es.vocab` - * `en.vocab` - -2. In `device_list` specify the number of GPUs. -3. In `batch_size` choose the highest value that doesn't make the training stop due to lack of memory (try different - numbers). -4. The value of `update_cycle` must be 36000 / batch_size. -5. Run: - - ```bash - ./thualign/bin/train.sh -s spanish - ``` - -6. The model is saved in a folder created in the root directory of the repository. - -### Test the alignments - -1. Run: - - ```bash - ./thualign/bin/test.sh -s spanish -gvt - ``` - -2. The alignments are generated in `test/alignments.txt`, where the model was saved. -3. To see the alignments in an interactive way, run: - - ```bash - ./thualign/scripts/visualize.py spanish/output/test/alignment_vizdata.pt - ``` - -## Generating the Answer Alignments for NewsQA-es - -Run these commands to generate the answer alignments for the NewsQA-es dataset. You should have a trained Mask-Align -model and have the `newsqa.csv` file. - -```bash -./scripts/generate-alignments/remove_bad_rows.py -./scripts/generate-alignments/generate_files.py -spm_encode --model=en.model --output_format=piece < test.en > test.32k.en -spm_encode --model=es.model --output_format=piece < test.es > test.32k.es -./scripts/generate-alignments/process_answer_indexes.py -mkdir corpus-es -mv test.32k.en test.32k.es answers.en corpus-es/ -./thualign/bin/generate.sh -s spanish -o output.txt -spm_decode --model=es.model --input_format=piece < output.txt > output-plain.txt -./scripts/generate-alignments/output_brackets_to_indexes.py -``` - -The following three files are generated: - -* `output-indexes.txt`: the indexes of the answers in Spanish. -* `output-answers.txt`: the answers in Spanish (in plain text). -* `output-sentences.txt`: the sentences in Spanish (not tokenized). - -### Generate the final merged CSV file - -Finally, run these commands to generate the `newsqa-es.csv` file, a new version of `newsqa_filtered.csv` which has the columns with the answers in Spanish. 
- -```bash -sed -i '1ianswer_index_esp' output-indexes.txt -csvjoin -y 0 newsqa_filtered.csv output-indexes.txt > newsqa-es.csv +# Mask-Align for NewsQA-es + +This repo forks [THUNLP-MT/Mask-Align](https://github.com/THUNLP-MT/Mask-Align) to adapt it to translate +the [NewsQA](https://www.microsoft.com/en-us/research/project/newsqa-dataset/) reading comprehension dataset to Spanish. +Mask-Align is an algorithm that aligns translations to a token level, which allows us to find the English answer span +inside the translated Spanish text. + +## Setup + +First, clone this repo: + +```bash +git clone https://github.com/pln-fing-udelar/Mask-Align +cd Mask-Align/ +``` + +Then, using [Conda](https://docs.conda.io/en/latest/index.html), run: + +```bash +conda env update +conda activate mask-align +``` + +## Training Mask-Align in English-Spanish + +We need a trained Mask-Align model to align translations between English and Spanish. To download the pretrained model, run the following commands: + +```bash +mkdir -p spanish-output/output +curl -o spanish-output/output/model-1.pt https://www.fing.edu.uy/owncloud/index.php/s/siRkUqxnwmdtfaJ/download +``` + +Alternatively, follow these steps to train it yourself. + +### Prepare the Corpus + +1. Download the [Europarl Spanish-English parallel corpus](https://www.statmt.org/europarl/v7/es-en.tgz). +2. Run the following script to do some preprocessing, learn the vocabulary, tokenize the sentences, and split the corpus: + + ```bash +./scripts/train-mask-align/preprocess_europarl.sh + ``` + +### Train the Model + +Note you need a computer with a CUDA-capable GPU to train the model. + +1. In the config file [`thualign/configs/user/spanish.config`](thualign/configs/user/spanish.config), specify the + location of the following files: + + * `corpus.32k.es.shuf` + * `corpus.32k.en.shuf` + * `validation.32k.es` + * `validation.32k.en` + * `test.32k.es` + * `test.32k.en` + * `es.vocab` + * `en.vocab` + +2. In `device_list` specify the number of GPUs. +3. In `batch_size` choose the highest value that doesn't make the training stop due to lack of memory (try different + numbers). +4. The value of `update_cycle` must be 36000 / batch_size. +5. Run: + + ```bash + ./thualign/bin/train.sh -s spanish + ``` + +6. The model is saved in a folder created in the root directory of the repository. + +### Test the alignments + +1. Run: + + ```bash + ./thualign/bin/test.sh -s spanish -gvt + ``` + +2. The alignments are generated in `test/alignments.txt`, where the model was saved. +3. To see the alignments in an interactive way, run: + + ```bash + ./thualign/scripts/visualize.py spanish/output/test/alignment_vizdata.pt + ``` + +## Generating the Answer Alignments for NewsQA-es + +Run the following script to generate the answer alignments for the NewsQA-es dataset. You should have a trained Mask-Align +model and have the `newsqa.csv` file. + +```bash +./scripts/generate-alignments/generate_alignments.sh +``` + +The following three files are generated: + +* `output-indexes.txt`: the indexes of the answers in Spanish. +* `output-answers.txt`: the answers in Spanish (in plain text). +* `output-sentences.txt`: the sentences in Spanish (not tokenized). + +### Generate the final merged CSV file + +Finally, run these commands to generate the `newsqa-es.csv` file, a new version of `newsqa_filtered.csv` which has the columns with the answers in Spanish. 
+ +```bash +sed -i '1ianswer_index_esp' output-indexes.txt +csvjoin -y 0 newsqa_filtered.csv output-indexes.txt > newsqa-es.csv ``` \ No newline at end of file diff --git a/scripts/generate-alignments/generate_alignments.sh b/scripts/generate-alignments/generate_alignments.sh new file mode 100644 index 0000000..4619b61 --- /dev/null +++ b/scripts/generate-alignments/generate_alignments.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -ex +python ./scripts/generate-alignments/remove_bad_rows.py +python ./scripts/generate-alignments/generate_files.py +spm_encode --model=en.model --output_format=piece < test.en > test.32k.en +spm_encode --model=es.model --output_format=piece < test.es > test.32k.es +python ./scripts/generate-alignments/process_answer_indexes.py +mkdir -p corpus-es +mv test.32k.en test.32k.es answers.en vocab.32k.es.txt vocab.32k.en.txt corpus-es/ +./thualign/bin/generate.sh -s spanish -o output.txt +spm_decode --model=es.model --input_format=piece < output.txt > output-plain.txt +python ./scripts/generate-alignments/output_brackets_to_indexes.py \ No newline at end of file diff --git a/scripts/generate-alignments/remove_bad_rows.py b/scripts/generate-alignments/remove_bad_rows.py index 2991423..0281ed0 100755 --- a/scripts/generate-alignments/remove_bad_rows.py +++ b/scripts/generate-alignments/remove_bad_rows.py @@ -1,37 +1,35 @@ -#!/usr/bin/env python -# This script is used to filter the rows with bad data in newsqa.csv, creating the newsqa_filtered.csv file -import csv +import os +import random import re +import csv + +### This script is used to filter the rows with bad data in newsqa.csv, creating the newsqa_filtered.csv file + +csv_input = open("newsqa.csv", "r", encoding="utf8") +csv_input2 = open("newsqa.csv", "r", encoding="utf8") +csv_data = list(csv.reader(csv_input, delimiter=',')) +csv_lines = csv_input2.readlines() +csv_output = open("newsqa_filtered.csv", "w", encoding="utf8") +csv_output2 = open("newsqa_bad_rows.csv", "w", encoding="utf8") -def main() -> None: - with open("newsqa.csv", encoding="utf8") as csv_input, \ - open("newsqa.csv", encoding="utf8") as csv_input2, \ - open("newsqa_filtered.csv", "w", encoding="utf8") as csv_output, \ - open("newsqa_bad_rows.csv", "w", encoding="utf8") as csv_output2: - for i, (row, csv_line) in enumerate(zip(csv.reader(csv_input), csv_input2)): - if i == 0: - csv_output.write(csv_line) - csv_output2.write(csv_line) - else: - ans_start = -1 - ans_end = -1 - if indexes := re.search(r"\d+:\d+", row[5]): - start_end_str = indexes.group(0).split(":") - ans_start = int(start_end_str[0]) - ans_end = int(start_end_str[1]) +csv_output.write(csv_lines[0]) +csv_output2.write(csv_lines[0]) - if any(re.match(r"^(\s|\t|\n|\r)*$", str(row[j])) for j in range(6)) is None \ - and "*" not in str(row[2]) \ - and "Ã" not in str(row[3]) \ - and i not in {33677, 33676, 116925, 116926} \ - and ans_start > -1 \ - and ans_end > -1: - output_file = csv_output - else: - output_file = csv_output2 - output_file.write(csv_line) +for idx, entry in enumerate(csv_data[1:]): + ans_start = -1 + ans_end = -1 + indexes = re.search(r"\d+:\d+", entry[5]) + if indexes: + ans_start = int(indexes.group(0).split(":")[0]) + ans_end = int(indexes.group(0).split(":")[1]) + if re.match(r"^(\s|\t|\n|\r)*$", str(entry[0])) == None and re.match(r"^(\s|\t|\n|\r)*$", str(entry[1])) == None and re.match(r"^(\s|\t|\n|\r)*$", str(entry[2])) == None and re.match(r"^(\s|\t|\n|\r)*$", str(entry[3])) == None and re.match(r"^(\s|\t|\n|\r)*$", str(entry[4])) == None and 
re.match(r"^(\s|\t|\n|\r)*$", str(entry[5])) == None and "*" not in str(entry[2]) and "Ã" not in str(entry[3]) and idx != 33676 and idx != 33677 and idx != 116925 and idx != 116926 and ans_start > -1 and ans_end > -1:
+        csv_output.write(csv_lines[idx + 1])
+    else:
+        csv_output2.write(csv_lines[idx + 1])
 
-if __name__ == "__main__":
-    main()
+csv_input.close()
+csv_input2.close()
+csv_output.close()
+csv_output2.close()
\ No newline at end of file
diff --git a/scripts/train-mask-align/preprocess_europarl.sh b/scripts/train-mask-align/preprocess_europarl.sh
new file mode 100644
index 0000000..848dde4
--- /dev/null
+++ b/scripts/train-mask-align/preprocess_europarl.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+set -ex
+python scripts/train-mask-align/split_corpus.py
+spm_train --input=corpus.en --model_prefix=en --vocab_size=32000 --character_coverage=1.0 --model_type=unigram
+spm_train --input=corpus.es --model_prefix=es --vocab_size=32000 --character_coverage=1.0 --model_type=unigram
+python scripts/train-mask-align/process_vocab.py
+sed -i 's/<s>//g' vocab.32k.es.txt
+sed -i 's/<\/s>//g' vocab.32k.es.txt
+sed -i 's/<s>//g' vocab.32k.en.txt
+sed -i 's/<\/s>//g' vocab.32k.en.txt
+spm_encode --model=en.model --output_format=piece < corpus.en > corpus.32k.en
+spm_encode --model=en.model --output_format=piece < validation.en > validation.32k.en
+spm_encode --model=en.model --output_format=piece < test.en > test.32k.en
+spm_encode --model=es.model --output_format=piece < corpus.es > corpus.32k.es
+spm_encode --model=es.model --output_format=piece < validation.es > validation.32k.es
+spm_encode --model=es.model --output_format=piece < test.es > test.32k.es
+python thualign/scripts/shuffle_corpus.py --corpus corpus.32k.es corpus.32k.en
+sed -i -e 's///' -e 's///' corpus.32k.en
+sed -i -e 's///' -e 's///' validation.32k.en
+sed -i -e 's///' -e 's///' test.32k.en
+sed -i -e 's///' -e 's///' corpus.32k.es
+sed -i -e 's///' -e 's///' validation.32k.es
+sed -i -e 's///' -e 's///' test.32k.es
diff --git a/scripts/train-mask-align/split_corpus.py b/scripts/train-mask-align/split_corpus.py
new file mode 100644
index 0000000..b5c81de
--- /dev/null
+++ b/scripts/train-mask-align/split_corpus.py
@@ -0,0 +1,42 @@
+import os
+import re
+import random
+
+# Splits the Europarl Spanish-English corpus (europarl-v7.es-en.*) into train, validation and test sets
+
+corpus_es = open("./corpus.es", "w", encoding="utf-8")
+corpus_en = open("./corpus.en", "w", encoding="utf-8")
+validation_es = open("./validation.es", "w", encoding="utf-8")
+validation_en = open("./validation.en", "w", encoding="utf-8")
+test_es = open("./test.es", "w", encoding="utf-8")
+test_en = open("./test.en", "w", encoding="utf-8")
+
+file1 = open("./europarl-v7.es-en.es", "r", encoding="utf-8")
+file2 = open("./europarl-v7.es-en.en", "r", encoding="utf-8")
+data1 = file1.readlines()
+data2 = file2.readlines()
+num_sentences = min(len(data1), len(data2))
+
+for i in range(0, num_sentences):
+    num_words1 = len(data1[i].split(' '))
+    num_words2 = len(data2[i].split(' '))
+    if re.search(r'\w+', data1[i]) and re.search(r'\w+', data2[i]) and "<" not in data1[i] and "<" not in data2[i] and num_words1 > 1 and num_words1 < 120 and num_words2 > 1 and num_words2 < 120:
+        random_number = random.uniform(0, 1)
+        if random_number < 0.9978:
+            corpus_es.write(data1[i].lower())
+            corpus_en.write(data2[i].lower())
+        elif random_number < 0.9989:
+            validation_es.write(data1[i].lower())
+            validation_en.write(data2[i].lower())
+        else:
+            test_es.write(data1[i].lower())
+            
test_en.write(data2[i].lower()) + +file1.close() +file2.close() +corpus_es.close() +corpus_en.close() +validation_es.close() +validation_en.close() +test_es.close() +test_en.close() \ No newline at end of file From 4cf0e59e1769fd4922ae3fe0a3cfb0892ab928f3 Mon Sep 17 00:00:00 2001 From: Santiago Goycoechea Date: Fri, 22 Apr 2022 12:46:12 -0300 Subject: [PATCH 2/3] Fix line endings --- README.md | 216 +++++++++++++++++++++++++++--------------------------- 1 file changed, 108 insertions(+), 108 deletions(-) diff --git a/README.md b/README.md index 1c52b54..e5db640 100644 --- a/README.md +++ b/README.md @@ -1,109 +1,109 @@ -# Mask-Align for NewsQA-es - -This repo forks [THUNLP-MT/Mask-Align](https://github.com/THUNLP-MT/Mask-Align) to adapt it to translate -the [NewsQA](https://www.microsoft.com/en-us/research/project/newsqa-dataset/) reading comprehension dataset to Spanish. -Mask-Align is an algorithm that aligns translations to a token level, which allows us to find the English answer span -inside the translated Spanish text. - -## Setup - -First, clone this repo: - -```bash -git clone https://github.com/pln-fing-udelar/Mask-Align -cd Mask-Align/ -``` - -Then, using [Conda](https://docs.conda.io/en/latest/index.html), run: - -```bash -conda env update -conda activate mask-align -``` - -## Training Mask-Align in English-Spanish - -We need a trained Mask-Align model to align translations between English and Spanish. To download the pretrained model, run the following commands: - -```bash -mkdir -p spanish-output/output -curl -o spanish-output/output/model-1.pt https://www.fing.edu.uy/owncloud/index.php/s/siRkUqxnwmdtfaJ/download -``` - -Alternatively, follow these steps to train it yourself. - -### Prepare the Corpus - -1. Download the [Europarl Spanish-English parallel corpus](https://www.statmt.org/europarl/v7/es-en.tgz). -2. Run the following script to do some preprocessing, learn the vocabulary, tokenize the sentences, and split the corpus: - - ```bash -./scripts/train-mask-align/preprocess_europarl.sh - ``` - -### Train the Model - -Note you need a computer with a CUDA-capable GPU to train the model. - -1. In the config file [`thualign/configs/user/spanish.config`](thualign/configs/user/spanish.config), specify the - location of the following files: - - * `corpus.32k.es.shuf` - * `corpus.32k.en.shuf` - * `validation.32k.es` - * `validation.32k.en` - * `test.32k.es` - * `test.32k.en` - * `es.vocab` - * `en.vocab` - -2. In `device_list` specify the number of GPUs. -3. In `batch_size` choose the highest value that doesn't make the training stop due to lack of memory (try different - numbers). -4. The value of `update_cycle` must be 36000 / batch_size. -5. Run: - - ```bash - ./thualign/bin/train.sh -s spanish - ``` - -6. The model is saved in a folder created in the root directory of the repository. - -### Test the alignments - -1. Run: - - ```bash - ./thualign/bin/test.sh -s spanish -gvt - ``` - -2. The alignments are generated in `test/alignments.txt`, where the model was saved. -3. To see the alignments in an interactive way, run: - - ```bash - ./thualign/scripts/visualize.py spanish/output/test/alignment_vizdata.pt - ``` - -## Generating the Answer Alignments for NewsQA-es - -Run the following script to generate the answer alignments for the NewsQA-es dataset. You should have a trained Mask-Align -model and have the `newsqa.csv` file. 
- -```bash -./scripts/generate-alignments/generate_alignments.sh -``` - -The following three files are generated: - -* `output-indexes.txt`: the indexes of the answers in Spanish. -* `output-answers.txt`: the answers in Spanish (in plain text). -* `output-sentences.txt`: the sentences in Spanish (not tokenized). - -### Generate the final merged CSV file - -Finally, run these commands to generate the `newsqa-es.csv` file, a new version of `newsqa_filtered.csv` which has the columns with the answers in Spanish. - -```bash -sed -i '1ianswer_index_esp' output-indexes.txt -csvjoin -y 0 newsqa_filtered.csv output-indexes.txt > newsqa-es.csv +# Mask-Align for NewsQA-es + +This repo forks [THUNLP-MT/Mask-Align](https://github.com/THUNLP-MT/Mask-Align) to adapt it to translate +the [NewsQA](https://www.microsoft.com/en-us/research/project/newsqa-dataset/) reading comprehension dataset to Spanish. +Mask-Align is an algorithm that aligns translations to a token level, which allows us to find the English answer span +inside the translated Spanish text. + +## Setup + +First, clone this repo: + +```bash +git clone https://github.com/pln-fing-udelar/Mask-Align +cd Mask-Align/ +``` + +Then, using [Conda](https://docs.conda.io/en/latest/index.html), run: + +```bash +conda env update +conda activate mask-align +``` + +## Training Mask-Align in English-Spanish + +We need a trained Mask-Align model to align translations between English and Spanish. To download the pretrained model, run the following commands: + +```bash +mkdir -p spanish-output/output +curl -o spanish-output/output/model-1.pt https://www.fing.edu.uy/owncloud/index.php/s/siRkUqxnwmdtfaJ/download +``` + +Alternatively, follow these steps to train it yourself. + +### Prepare the Corpus + +1. Download the [Europarl Spanish-English parallel corpus](https://www.statmt.org/europarl/v7/es-en.tgz). +2. Run the following script to do some preprocessing, learn the vocabulary, tokenize the sentences, and split the corpus: + + ```bash +./scripts/train-mask-align/preprocess_europarl.sh + ``` + +### Train the Model + +Note you need a computer with a CUDA-capable GPU to train the model. + +1. In the config file [`thualign/configs/user/spanish.config`](thualign/configs/user/spanish.config), specify the + location of the following files: + + * `corpus.32k.es.shuf` + * `corpus.32k.en.shuf` + * `validation.32k.es` + * `validation.32k.en` + * `test.32k.es` + * `test.32k.en` + * `es.vocab` + * `en.vocab` + +2. In `device_list` specify the number of GPUs. +3. In `batch_size` choose the highest value that doesn't make the training stop due to lack of memory (try different + numbers). +4. The value of `update_cycle` must be 36000 / batch_size. +5. Run: + + ```bash + ./thualign/bin/train.sh -s spanish + ``` + +6. The model is saved in a folder created in the root directory of the repository. + +### Test the alignments + +1. Run: + + ```bash + ./thualign/bin/test.sh -s spanish -gvt + ``` + +2. The alignments are generated in `test/alignments.txt`, where the model was saved. +3. To see the alignments in an interactive way, run: + + ```bash + ./thualign/scripts/visualize.py spanish/output/test/alignment_vizdata.pt + ``` + +## Generating the Answer Alignments for NewsQA-es + +Run the following script to generate the answer alignments for the NewsQA-es dataset. You should have a trained Mask-Align +model and have the `newsqa.csv` file. 
+ +```bash +./scripts/generate-alignments/generate_alignments.sh +``` + +The following three files are generated: + +* `output-indexes.txt`: the indexes of the answers in Spanish. +* `output-answers.txt`: the answers in Spanish (in plain text). +* `output-sentences.txt`: the sentences in Spanish (not tokenized). + +### Generate the final merged CSV file + +Finally, run these commands to generate the `newsqa-es.csv` file, a new version of `newsqa_filtered.csv` which has the columns with the answers in Spanish. + +```bash +sed -i '1ianswer_index_esp' output-indexes.txt +csvjoin -y 0 newsqa_filtered.csv output-indexes.txt > newsqa-es.csv ``` \ No newline at end of file From 20a985a9f05d5256dca9a1da19de3fa7ab8cf12b Mon Sep 17 00:00:00 2001 From: Santiago Goycoechea Date: Tue, 26 Apr 2022 09:16:18 -0300 Subject: [PATCH 3/3] Add more commands to the scripts --- README.md | 15 +++------------ .../generate-alignments/generate_alignments.sh | 4 +++- scripts/train-mask-align/preprocess_europarl.sh | 2 ++ 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index e5db640..7128a96 100644 --- a/README.md +++ b/README.md @@ -34,8 +34,7 @@ Alternatively, follow these steps to train it yourself. ### Prepare the Corpus -1. Download the [Europarl Spanish-English parallel corpus](https://www.statmt.org/europarl/v7/es-en.tgz). -2. Run the following script to do some preprocessing, learn the vocabulary, tokenize the sentences, and split the corpus: +Run the following script to download the [Europarl Spanish-English parallel corpus](https://www.statmt.org/europarl/v7/es-en.tgz), do some preprocessing, learn the vocabulary, tokenize the sentences, and split the corpus: ```bash ./scripts/train-mask-align/preprocess_europarl.sh @@ -93,17 +92,9 @@ model and have the `newsqa.csv` file. ./scripts/generate-alignments/generate_alignments.sh ``` -The following three files are generated: +The following four files are generated: * `output-indexes.txt`: the indexes of the answers in Spanish. * `output-answers.txt`: the answers in Spanish (in plain text). * `output-sentences.txt`: the sentences in Spanish (not tokenized). - -### Generate the final merged CSV file - -Finally, run these commands to generate the `newsqa-es.csv` file, a new version of `newsqa_filtered.csv` which has the columns with the answers in Spanish. - -```bash -sed -i '1ianswer_index_esp' output-indexes.txt -csvjoin -y 0 newsqa_filtered.csv output-indexes.txt > newsqa-es.csv -``` \ No newline at end of file +* `newsqa-es.csv`: a new version of `newsqa_filtered.csv` which has the columns with the answers in Spanish. 
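For orientation, the merged file can be inspected with csvkit, the same package that provides the `csvjoin` command used by the pipeline. This is an illustrative sketch, not part of the committed scripts; it only assumes that `generate_alignments.sh` has already written `newsqa-es.csv` with the appended `answer_index_esp` column:

```bash
# Quick look at the merged CSV (requires csvkit, already needed for csvjoin).
csvcut -n newsqa-es.csv        # list column names; answer_index_esp should appear last
csvstat --count newsqa-es.csv  # count the merged rows
```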
diff --git a/scripts/generate-alignments/generate_alignments.sh b/scripts/generate-alignments/generate_alignments.sh
index 4619b61..086525b 100644
--- a/scripts/generate-alignments/generate_alignments.sh
+++ b/scripts/generate-alignments/generate_alignments.sh
@@ -9,4 +9,6 @@ mkdir -p corpus-es
 mv test.32k.en test.32k.es answers.en vocab.32k.es.txt vocab.32k.en.txt corpus-es/
 ./thualign/bin/generate.sh -s spanish -o output.txt
 spm_decode --model=es.model --input_format=piece < output.txt > output-plain.txt
-python ./scripts/generate-alignments/output_brackets_to_indexes.py
\ No newline at end of file
+python ./scripts/generate-alignments/output_brackets_to_indexes.py
+sed -i '1ianswer_index_esp' output-indexes.txt
+csvjoin -y 0 newsqa_filtered.csv output-indexes.txt > newsqa-es.csv
\ No newline at end of file
diff --git a/scripts/train-mask-align/preprocess_europarl.sh b/scripts/train-mask-align/preprocess_europarl.sh
index 848dde4..e51b587 100644
--- a/scripts/train-mask-align/preprocess_europarl.sh
+++ b/scripts/train-mask-align/preprocess_europarl.sh
@@ -1,5 +1,7 @@
 #!/usr/bin/env bash
 set -ex
+wget -q https://www.statmt.org/europarl/v7/es-en.tgz
+tar zxvf es-en.tgz
 python scripts/train-mask-align/split_corpus.py
 spm_train --input=corpus.en --model_prefix=en --vocab_size=32000 --character_coverage=1.0 --model_type=unigram
 spm_train --input=corpus.es --model_prefix=es --vocab_size=32000 --character_coverage=1.0 --model_type=unigram
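After `preprocess_europarl.sh` finishes, a quick sanity check can confirm that the English and Spanish sides of each split stayed parallel. This is only a sketch and assumes nothing beyond the file names written by the scripts above:

```bash
# Each split's English and Spanish files should have the same number of lines.
for split in corpus.32k validation.32k test.32k; do
  en=$(wc -l < "${split}.en")
  es=$(wc -l < "${split}.es")
  echo "${split}: en=${en} es=${es}"
  [ "${en}" -eq "${es}" ] || echo "WARNING: ${split} sides differ"
done
```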