-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path07.adjectives.py
69 lines (56 loc) · 2.6 KB
/
07.adjectives.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import csv
import spacy
import pyinflect # Ensure pyinflect is installed: pip install pyinflect
def generate_comparative_and_superlative_spacy(adjective, nlp):
    """
    Generate the comparative and superlative forms of an adjective using spaCy and pyinflect.

    Args:
        adjective (str): The base (positive) form of the adjective.
        nlp: A spaCy language model instance (any callable that turns a string
            into an indexable, sized sequence of tokens exposing ``._.inflect``).

    Returns:
        tuple: (comparative, superlative) forms, or (None, None) when the input
        is empty/whitespace-only or tokenizes to nothing.
    """
    # Surrounding whitespace carries no meaning here; an empty string would
    # produce an empty Doc and make doc[0] raise IndexError.
    text = adjective.strip() if adjective else ""
    if not text:
        return None, None

    doc = nlp(text)
    if len(doc) == 0:
        return None, None

    # NOTE(review): only the first token is inflected, so an input that
    # tokenizes into several tokens yields forms of its first token only.
    token = doc[0]
    comparative = token._.inflect("JJR")  # Penn Treebank tag: comparative adjective
    superlative = token._.inflect("JJS")  # Penn Treebank tag: superlative adjective
    return comparative, superlative
def process_file(input_path, csv_output_path):
    """
    Process the input file to generate a CSV of adjectives with their comparative and superlative forms
    using spaCy and pyinflect.

    Each input line is split on its first tab: the left part holds one or more
    "|"-separated keys, the right part is searched for the adjective marker.
    For every key of an adjective line the comparative and superlative are
    generated; a row ``[adjective, comparative, superlative]`` is kept only if
    both forms are truthy and that (comparative, superlative) pair has not
    been emitted before. The CSV is written without a header row.

    Args:
        input_path (str): Path to the input file.
        csv_output_path (str): Path to the output CSV file.
    """
    # Markup the input uses to tag an entry as an adjective.
    adjective_marker = '<i class="p"><font color="green">adj</font></i>'

    # Load spaCy model
    nlp = spacy.load("en_core_web_sm")

    adjective_forms = []
    seen_forms = set()  # Track seen (comparative, superlative) pairs

    # Stream the input line by line instead of loading it all into memory.
    with open(input_path, "r", encoding="utf-8") as file:
        for line in file:
            key_field, _, second_column = line.strip().partition("\t")

            # Only entries tagged as adjectives are of interest.
            if adjective_marker not in second_column:
                continue

            for raw_key in key_field.split("|"):
                # A trailing/doubled "|" or padding around a key would hand an
                # empty or space-prefixed string to the tokenizer; skip/clean it.
                adjective = raw_key.strip()
                if not adjective:
                    continue

                comparative, superlative = generate_comparative_and_superlative_spacy(adjective, nlp)
                if not (comparative and superlative):
                    continue

                # Skip if the (comparative, superlative) pair has been seen before
                pair = (comparative, superlative)
                if pair in seen_forms:
                    continue
                seen_forms.add(pair)
                adjective_forms.append([adjective, comparative, superlative])

    # Write the adjectives to the CSV file
    with open(csv_output_path, "w", encoding="utf-8", newline="") as csvfile:
        csv.writer(csvfile).writerows(adjective_forms)

    print(f"Adjective forms saved to {csv_output_path}")
if __name__ == "__main__":
    # Script entry point: read the filtered word list and write the
    # adjective-forms CSV, both under the temp/ directory.
    process_file(
        input_path="temp/05.filter-irregular-nouns.txt",
        csv_output_path="temp/adjectives.csv",
    )