Download NLTK corpus data if not found

brandonlim-hs · Oct 27, 2019 · 9438442 · 9438442
1 parent fb22c9c
commit 9438442
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 5 deletions.
diff --git a/minpair/corpus.py b/minpair/corpus.py
@@ -0,0 +1,15 @@
+from nltk import download
+from nltk.data import find
+
+
+def require(corpora: list = None):
+    """Download each required NLTK corpus that `find` cannot locate.
+
+    Keyword Arguments:
+        corpora {list} -- Identifiers or names of NLTK corpora; None means none (default: {None})
+    """
+    for corpus in corpora or []:  # None default avoids a shared mutable default list
+        try:
+            find(corpus)  # NOTE(review): find() may need 'corpora/<id>'-style paths -- confirm
+        except LookupError:
+            download(corpus)
diff --git a/minpair/minpair.py b/minpair/minpair.py
@@ -1,6 +1,6 @@
 from . import arpabet
+from .corpus import require as corpus_require
 from collections import defaultdict
-from nltk import download as nltk_download
 from nltk.corpus import brown
 from nltk.corpus import cmudict
 from nltk.corpus import words
@@ -32,12 +32,9 @@ def vowel_minpair(vowels: list, pos: list = []):
         raise Exception('At least a pair of unique vowels required.')
     if any(not arpabet.is_vowel(vowel) for vowel in vowels):
         raise Exception('Only vowels are accepted.')
+    corpus_require(['brown', 'cmudict', 'universal_tagset', 'words'])
     possible_pairs = defaultdict(lambda: {})
     vowels_regex = re.compile(r'^(?:%s)' % '|'.join(vowels))
-    nltk_download('brown')
-    nltk_download('cmudict')
-    nltk_download('universal_tagset')
-    nltk_download('words')
     pos = pos or ['ADJ', 'NOUN', 'VERB']
     tagged_words = {word
                     for word, tag in brown.tagged_words(tagset='universal')