-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathtest_token_is_equal_to_lemma.py
32 lines (26 loc) · 1.18 KB
/
test_token_is_equal_to_lemma.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import argparse
from pathlib import Path
import csv
from typing import Set
import json
def _iter_mismatched_rows(lexicon_data):
    """Yield a report dict for every row whose `token` and `lemma` differ.

    `lexicon_data` is an iterable of text lines (e.g. an open file) holding
    tab-separated values whose header row names a `token` and a `lemma`
    column. Each yielded dict has the keys `token`, `lemma` and `row index`,
    where `row index` counts data rows from 0 (the header row is excluded).

    Raises `KeyError` if the header lacks a `token` or `lemma` column.
    """
    csv_reader = csv.DictReader(lexicon_data, delimiter='\t')
    for row_index, row in enumerate(csv_reader):
        if row['token'] != row['lemma']:
            yield {'token': row['token'], 'lemma': row['lemma'],
                   'row index': row_index}


def main(argv=None) -> None:
    """Report lexicon rows whose `token` and `lemma` values are not equal.

    `argv` is the list of command line arguments to parse; when it is `None`
    argparse falls back to `sys.argv[1:]`. One JSON object per mismatching
    row is printed to stdout.
    """
    description = '''
    Tests for single word lexicon files if the `token` and `lemma` values per
    row/line are equal. Will output to stdout a JSON object for each
    line that contains a different `token` and `lemma` value. An example of the
    JSON object is shown below:
    {"token": "A.E", "lemma": "A.E.", "row index": 0}
    '''
    # The description has to be handed to the parser, otherwise `--help`
    # never displays it.
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('lexicon_file_path', type=Path,
                        help='File path to the single word lexicon file to check')
    args = parser.parse_args(argv)

    # newline='' lets the csv module do its own newline handling.
    with args.lexicon_file_path.open('r', newline='') as lexicon_data:
        for mismatch in _iter_mismatched_rows(lexicon_data):
            print(json.dumps(mismatch))


if __name__ == '__main__':
    main()