-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWord List Generator.py
91 lines (83 loc) · 3.06 KB
/
Word List Generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#Issues: spaCy categorizes many things incorrectly. For example, sometimes putting our
#https://stackoverflow.com/questions/4456446/dictionary-text-file
from contextlib import ExitStack

import oxford
import spacy
# Load spaCy's small English pipeline (en_core_web_sm).
nlp = spacy.load("en_core_web_sm")

# Read the entire word list into memory. The `with` block closes the file on
# exit, so no explicit close() is needed (the previous bare `words_alpha.close`
# never called the method and did nothing).
with open('Input/words_alpha.txt') as words_alpha:
    wordsAlpha = words_alpha.read()
# Identify lexical categories (a.k.a. parts of speech) and sort every token of
# the word list into one output file per category.

# spaCy coarse-grained part-of-speech tag -> file that collects its tokens.
POS_OUTPUT_PATHS = {
    "ADJ": "Output/adjectives.txt",
    # Adpositions (prepositions/postpositions). This branch previously tested
    # "ADJ" a second time, so it was unreachable and the file stayed empty.
    "ADP": "Output/prepositions.txt",
    "ADV": "Output/adverbs.txt",
    "AUX": "Output/auxiliarys.txt",
    "CCONJ": "Output/coordinating_conjunctions.txt",
    "DET": "Output/determiners.txt",
    "INTJ": "Output/interjections.txt",
    "NOUN": "Output/nouns.txt",
    "NUM": "Output/numerals.txt",
    "PART": "Output/particles.txt",
    "PRON": "Output/pronouns.txt",
    "PROPN": "Output/proper_nouns.txt",
    "PUNCT": "Output/punctuations.txt",
    "SCONJ": "Output/subordinating_conjunctions.txt",
    "SYM": "Output/symbols.txt",
    "VERB": "Output/verbs.txt",
    "X": "Output/others.txt",
}

# spaCy raises an error for texts longer than nlp.max_length characters, so the
# limit is set before the full word list is parsed in a single call.
nlp.max_length = 4000000
doc = nlp(wordsAlpha)

# ExitStack closes (and therefore flushes) every output file when the block
# exits, including when an exception interrupts the loop.
with ExitStack() as stack:
    # Append mode: every file is created if missing and repeated runs
    # accumulate output rather than overwrite it.
    writers = {
        pos: stack.enter_context(open(path, "a"))
        for pos, path in POS_OUTPUT_PATHS.items()
    }
    for token in doc:
        print(token.text)
        writer = writers.get(token.pos_)
        # Tags with no entry in POS_OUTPUT_PATHS are printed but not written
        # to any file.
        if writer is not None:
            # Tokens within a file are separated by a single space.
            writer.write(token.text + " ")