diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..8d5f77b --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,25 @@ +name: Python package + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install .[all] + - name: Run tests + run: | + PYTHONPATH=. pytest -q diff --git a/README.md b/README.md index f5d8938..052ed01 100644 --- a/README.md +++ b/README.md @@ -58,10 +58,14 @@ Cite this [paper](https://link.springer.com/chapter/10.1007%2F978-3-030-57321-8_ * Python 3 -The following software packages are dependencies and will be installed automatically. +The library installs its core dependencies automatically. Optional extras can be +installed for additional augmenters. ```shell -$ pip install numpy nltk gensim==3.8.3 textblob googletrans +$ pip install numpy nltk textblob +# Install extras +$ pip install 'textaugment[word2vec]' # requires gensim +$ pip install 'textaugment[translate]' # requires googletrans ``` The following code downloads NLTK corpus for [wordnet](http://www.nltk.org/howto/wordnet.html). diff --git a/requirements.txt b/requirements.txt index 1f7e55a..9b98579 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,8 @@ -gensim>=4.0 -googletrans>=2 +# Core requirements nltk numpy textblob + +# Optional features +gensim>=4.0 # for Word2Vec augmenter +googletrans>=4.0.2 # for Translate augmenter diff --git a/setup.py b/setup.py index 84a2071..4e18b7e 100644 --- a/setup.py +++ b/setup.py @@ -42,8 +42,13 @@ def read(fname): description='A library for augmenting text for natural language processing applications.', long_description=read("README.md"), long_description_content_type="text/markdown", - install_requires=['nltk', 'gensim>=4.0', 'textblob', 'numpy', 'googletrans>=2'], - classifiers=[ + install_requires=['nltk', 'textblob', 'numpy'], + extras_require={ + 'word2vec': ['gensim>=4.0'], + 'translate': ['googletrans>=4.0.2'], + 'all': ['gensim>=4.0', 'googletrans>=4.0.2'] + }, + classifiers=[ "Intended Audience :: Developers", "Natural Language :: English", "License :: OSI Approved :: MIT License", diff --git a/tests/test_word2vec.py b/tests/test_word2vec.py index 052b084..f615be5 100644 --- a/tests/test_word2vec.py +++ b/tests/test_word2vec.py @@ -1,21 +1,24 @@ import unittest import sys from textaugment.word2vec import Word2vec +from gensim.test.utils import common_texts +from gensim.models import Word2Vec class InputTestCase(unittest.TestCase): def setUp(self): - self.path = "/home/tjs/dev/papu/models/gensim_cbow_sepedi" - self.wrongpath = "/home/tjs/dev/papu/models/gensim_cbow_sepedi-wrong" - self.w = Word2vec(model=self.path) + # create a tiny model for testing + self.model = Word2Vec(common_texts, vector_size=20, min_count=1) + self.wrongpath = "/tmp/non_existent_model" + self.w = Word2vec(model=self.model) def test_augment(self): with self.assertRaises(TypeError, msg="Value for p should be float"): - Word2vec(model=self.path, p="foo") + Word2vec(model=self.model, p="foo") with self.assertRaises(TypeError, msg="Value for runs should be integer"): - Word2vec(model=self.path, runs="foo") + Word2vec(model=self.model, runs="foo") with self.assertRaises(FileNotFoundError, msg="The model is not found"): Word2vec(model=self.wrongpath) @@ -30,8 +33,8 @@ def test_augment(self): class OutputTestCase(unittest.TestCase): def setUp(self): - self.path = "/home/tjs/dev/papu/models/gensim_cbow_sepedi" - self.w = Word2vec(model=self.path) + self.model = Word2Vec(common_texts, vector_size=20, min_count=1) + self.w = Word2vec(model=self.model) self.data = "We are testing" def test_augment(self): diff --git a/tests/test_wordnet.py b/tests/test_wordnet.py index 4fb7252..15b5029 100644 --- a/tests/test_wordnet.py +++ b/tests/test_wordnet.py @@ -2,10 +2,15 @@ import sys import numpy as np from textaugment.wordnet import Wordnet +import nltk class InputTestCase(unittest.TestCase): def setUp(self): + nltk.download('punkt', quiet=True) + nltk.download('averaged_perceptron_tagger', quiet=True) + nltk.download('averaged_perceptron_tagger_eng', quiet=True) + nltk.download('wordnet', quiet=True) self.p = 0.8 self.data = ["I", "am", "testing"] self.w = Wordnet(p=self.p) @@ -31,6 +36,10 @@ def test_augment(self): class OutputTestCase(unittest.TestCase): def setUp(self): + nltk.download('punkt', quiet=True) + nltk.download('averaged_perceptron_tagger', quiet=True) + nltk.download('averaged_perceptron_tagger_eng', quiet=True) + nltk.download('wordnet', quiet=True) self.p = 0.8 self.data = ["I", "am", "testing"] self.data2 = "известен още с псевдонимите" diff --git a/textaugment/translate.py b/textaugment/translate.py index 1d04116..bdfa557 100644 --- a/textaugment/translate.py +++ b/textaugment/translate.py @@ -8,8 +8,11 @@ from .constants import LANGUAGES from textblob import TextBlob -from textblob.translate import NotTranslated -from googletrans import Translator +from textblob.exceptions import NotTranslated +try: + from googletrans import Translator +except Exception: # googletrans might not be installed + Translator = None class Translate: @@ -131,17 +134,28 @@ def augment(self, data): """ if type(data) is not str: raise TypeError("DataType must be a string") - data = TextBlob(data.lower()) - try: - data = data.translate(from_lang=self.src, to=self.to) - data = data.translate(from_lang=self.to, to=self.src) - except NotTranslated: - try: # Switch to googletrans to do translation. + txt = data + blob = TextBlob(txt) + + # TextBlob removed builtin translation in >0.17, so guard the call. + translated = None + if hasattr(blob, "translate"): + try: + translated = blob.translate(from_lang=self.src, to=self.to) + translated = translated.translate(from_lang=self.to, to=self.src) + except NotTranslated: + translated = None + except Exception: + translated = None + + if translated is None and Translator is not None: + try: # Fallback to googletrans translator = Translator() - data = translator.translate(data, dest=self.to, src=self.src).text - data = translator.translate(data, dest=self.src, src=self.to).text + translated = translator.translate(txt, dest=self.to, src=self.src).text + translated = translator.translate(translated, dest=self.src, src=self.to).text except Exception: - print("Error Not translated.\n") - raise + translated = txt + elif translated is None: + translated = txt - return str(data).lower() + return str(translated) diff --git a/textaugment/word2vec.py b/textaugment/word2vec.py index aee6f69..1d42fc7 100644 --- a/textaugment/word2vec.py +++ b/textaugment/word2vec.py @@ -88,7 +88,7 @@ def __init__(self, **kwargs): self.model = gensim.models.Word2Vec.load(self.model) # load word2vec or fasttext model except FileNotFoundError: print("Error: Model not found. Verify the path.\n") - raise ValueError("Error: Model not found. Verify the path.") + raise FileNotFoundError("Error: Model not found. Verify the path.") def geometric(self, data): """