diff --git a/minpair/corpus.py b/minpair/corpus.py
new file mode 100644
index 0000000..07e6fa0
--- /dev/null
+++ b/minpair/corpus.py
@@ -0,0 +1,36 @@
+from nltk import download
+from nltk.data import find
+
+# ``find`` expects a path relative to an nltk_data directory (for example
+# ``corpora/brown``), whereas ``download`` expects a bare package identifier
+# (``brown``). These nltk_data sub-directories are probed to bridge the two.
+_CATEGORIES = (
+    'chunkers', 'corpora', 'grammars', 'help', 'misc', 'models',
+    'sentiment', 'stemmers', 'taggers', 'tokenizers',
+)
+
+
+def _is_installed(corpus: str) -> bool:
+    """Return whether ``find`` can locate the given NLTK package locally."""
+    # Try the name as given first so full resource paths keep working.
+    candidates = [corpus]
+    candidates += ['%s/%s' % (category, corpus) for category in _CATEGORIES]
+    for candidate in candidates:
+        try:
+            find(candidate)
+        except LookupError:
+            continue
+        return True
+    return False
+
+
+def require(corpora: list = None):
+    """Download the required NLTK corpora that are not found locally.
+
+    Keyword Arguments:
+        corpora {list} -- The identifiers of the NLTK packages, as accepted
+            by ``nltk.download`` (default: {None}, treated as an empty list)
+    """
+    for corpus in corpora or []:
+        if not _is_installed(corpus):
+            download(corpus)
diff --git a/minpair/minpair.py b/minpair/minpair.py
index 8fd0982..29d4978 100644
--- a/minpair/minpair.py
+++ b/minpair/minpair.py
@@ -1,6 +1,6 @@
 from . import arpabet
+from .corpus import require as corpus_require
 from collections import defaultdict
-from nltk import download as nltk_download
 from nltk.corpus import brown
 from nltk.corpus import cmudict
 from nltk.corpus import words
@@ -32,12 +32,9 @@ def vowel_minpair(vowels: list, pos: list = []):
         raise Exception('At least a pair of unique vowels required.')
     if any(not arpabet.is_vowel(vowel) for vowel in vowels):
         raise Exception('Only vowels are accepted.')
+    corpus_require(['brown', 'cmudict', 'universal_tagset', 'words'])
     possible_pairs = defaultdict(lambda: {})
     vowels_regex = re.compile(r'^(?:%s)' % '|'.join(vowels))
-    nltk_download('brown')
-    nltk_download('cmudict')
-    nltk_download('universal_tagset')
-    nltk_download('words')
     pos = pos or ['ADJ', 'NOUN', 'VERB']
     tagged_words = {word for word, tag in brown.tagged_words(tagset='universal')