Skip to content

Commit

Permalink
Download NLTK corpus data if not found
Browse files Browse the repository at this point in the history
  • Loading branch information
brandonlim-hs committed Oct 27, 2019
1 parent fb22c9c commit 9438442
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 5 deletions.
15 changes: 15 additions & 0 deletions minpair/corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from nltk import download
from nltk.data import find


def require(corpora: list = None):
    """Download the required NLTK corpus if not found.

    Each identifier is first looked up with ``nltk.data.find``; only when
    that lookup raises ``LookupError`` is ``nltk.download`` called for it.

    Keyword Arguments:
        corpora {list} -- The identifiers or names of the NLTK corpora to
            make available. A single string is treated as one identifier.
            (default: {None}, meaning nothing is required)
    """
    # ``None`` replaces the former mutable ``[]`` default; both it and an
    # empty collection mean there is nothing to check.
    if not corpora:
        return
    # A bare string would otherwise be iterated character by character.
    if isinstance(corpora, str):
        corpora = [corpora]
    for corpus in corpora:
        try:
            # NOTE(review): nltk.data.find appears to expect a
            # category-qualified path such as 'corpora/brown'; with a bare
            # package id like 'brown' this lookup presumably always fails and
            # falls through to download() -- confirm against the NLTK docs.
            find(corpus)
        except LookupError:
            download(corpus)
7 changes: 2 additions & 5 deletions minpair/minpair.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from . import arpabet
from .corpus import require as corpus_require
from collections import defaultdict
from nltk import download as nltk_download
from nltk.corpus import brown
from nltk.corpus import cmudict
from nltk.corpus import words
Expand Down Expand Up @@ -32,12 +32,9 @@ def vowel_minpair(vowels: list, pos: list = []):
raise Exception('At least a pair of unique vowels required.')
if any(not arpabet.is_vowel(vowel) for vowel in vowels):
raise Exception('Only vowels are accepted.')
corpus_require(['brown', 'cmudict', 'universal_tagset', 'words'])
possible_pairs = defaultdict(lambda: {})
vowels_regex = re.compile(r'^(?:%s)' % '|'.join(vowels))
nltk_download('brown')
nltk_download('cmudict')
nltk_download('universal_tagset')
nltk_download('words')
pos = pos or ['ADJ', 'NOUN', 'VERB']
tagged_words = {word
for word, tag in brown.tagged_words(tagset='universal')
Expand Down

0 comments on commit 9438442

Please sign in to comment.