Merge pull request #9 from johnsonandjohnson/8-pip-install-broken
8 pip install broken
sara-von-hein-shaw authored Jan 11, 2023
2 parents a68fbe1 + 3f7324a commit 3fbf41f
Showing 7 changed files with 14 additions and 14 deletions.
dev_environment.yml (4 changes: 2 additions & 2 deletions)

@@ -7,8 +7,8 @@ dependencies:
   - pip
   - pycodestyle=2.5.0
   - python=3.7.*
-  - spacy=2.1.*
-  - spacy-model-en_core_web_sm=2.1.0
+  - spacy=3.4.*
+  - spacy-model-en_core_web_sm=3.4.*
   - pytest=4.4.*
   - pytest-xdist>=1.28.0
   - pytest-sugar=0.9.*
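With spacy and the model both pinned to the 3.4 line, a quick interpreter check confirms the rebuilt environment resolved consistently. A throwaway sketch, not part of the repo:

    import spacy
    import en_core_web_sm

    # The conda pins keep spacy and en_core_web_sm on the same 3.4.* minor
    # line; spaCy also validates this at load time against the model's
    # declared spacy_version constraint.
    nlp = en_core_web_sm.load()
    print("spaCy:", spacy.__version__)
    print("model:", nlp.meta["version"], "requires spaCy", nlp.meta["spacy_version"])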
nlprov/__init__.py (4 changes: 2 additions & 2 deletions)

@@ -2,12 +2,12 @@
 Copyright © 2020 Johnson & Johnson
 """

-import spacy
+import en_core_web_sm


 def get_spacy_nlp():
     try:
-        spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'textcat'])
+        spacy_nlp = en_core_web_sm.load(disable=['parser', 'ner', 'textcat'])
     except OSError:
         # We should tell the user explicitly what they need to do.
         raise Exception("Please run `python -m spacy download en_core_web_sm` locally.")
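Importing the model package and calling its load() avoids spaCy's name-based lookup, which is what raised OSError when the model arrived as a plain pip dependency rather than via spacy download. A minimal sketch of the two equivalent paths under spaCy 3 (the disable list mirrors the one above):

    import spacy
    import en_core_web_sm

    # Path taken by this change: the pip-installed model package exposes
    # load(), which forwards keyword overrides to spacy.load() internally.
    nlp = en_core_web_sm.load(disable=['parser', 'ner', 'textcat'])

    # Name-based path: equivalent when the package is importable, but this
    # is the call that fails with OSError when lookup cannot find a model.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'textcat'])

    print(nlp.pipe_names)  # components left enabled after the disable list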
nlprov/preprocessing.py (2 changes: 1 addition & 1 deletion)

@@ -70,7 +70,7 @@ def preprocess_text(text: pd.Series,
     text = text.apply(lambda s: re.sub(regex, ' ', s))

     text = text.str.strip()
-    text = text.str.replace(r'\s+', ' ')
+    text = text.str.replace(r'\s+', ' ', regex=True)

     if eng_lang:
         text = text[text.apply(lambda t: langid.classify(t)[0]) == 'en']
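The added regex=True matters because pandas deprecated the implicit-regex default of Series.str.replace in 1.2 and flipped it to literal matching in 2.0, so without the flag newer pandas would search for the literal text '\s+'. A quick illustration:

    import pandas as pd

    s = pd.Series(['too   many    spaces'])

    # Explicit regex: runs of whitespace collapse to a single space.
    print(s.str.replace(r'\s+', ' ', regex=True)[0])   # 'too many spaces'

    # Literal matching (the pandas 2.x default): nothing matches and the
    # string comes back unchanged.
    print(s.str.replace(r'\s+', ' ', regex=False)[0])  # 'too   many    spaces'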
nlprov/test/test_preprocess.py (10 changes: 5 additions & 5 deletions)

@@ -224,7 +224,7 @@ def lemma_actual():

 @pytest.fixture
 def lemma_expected():
-    return pd.Series(data=["lemma need",
+    return pd.Series(data=["lemmas need",
                            "duck and cat and pony be not similar",
                            "normal string"])

@@ -238,10 +238,10 @@ def test_lemma(lemma_actual, lemma_expected):
 # Test data for the list of token list output (lemmas)
 @pytest.fixture
 def token_list_expected():
-    return pd.Series(data=[["lemma", "need"],
-                           ["duck", "and", "cat", "and", "pony", "be",
-                            "not", "similar"],
-                           ["normal", "string"]])
+    return pd.Series(data=[['lemmas', 'need'],
+                           ['duck', 'and', 'cat', 'and', 'pony', 'be',
+                            'not', 'similar'],
+                           ['normal', 'string']])


 # Test data for the list of token list output (no lemmas)
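The fixture strings change because spaCy 3 replaced the 2.x lookup lemmatizer with a rule-based one driven by the tagger and attribute ruler, and it lemmatizes some tokens differently. When fixtures like these drift, regenerating them from the pipeline's actual output is the simplest route; a sketch (the input string here is illustrative, not taken from the test file):

    import en_core_web_sm

    # parser/ner/textcat are not needed for lemmas; the tagger and
    # attribute ruler that feed the rule-based lemmatizer stay enabled.
    nlp = en_core_web_sm.load(disable=['parser', 'ner', 'textcat'])

    doc = nlp("lemmas needed")
    print([tok.lemma_ for tok in doc])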
nlprov/test/test_vectorize.py (4 changes: 2 additions & 2 deletions)

@@ -42,7 +42,7 @@ def test_count_vectorizer(vectorize_actual, count_dfm_expected,
     assert type(vec_obj) is CountVectorizer

     # Check original terms are included
-    vocab_set = set(vec_obj.get_feature_names())
+    vocab_set = set(vec_obj.get_feature_names_out())
     assert vocab_set == vocab_set_expected


@@ -64,7 +64,7 @@ def test_tfidf_vectorizer(vectorize_actual, tfidf_dfm_expected,
     assert type(vec_obj) is TfidfVectorizer

     # Check original terms are included
-    vocab_set = set(vec_obj.get_feature_names())
+    vocab_set = set(vec_obj.get_feature_names_out())
     assert vocab_set == vocab_set_expected
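get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; its replacement get_feature_names_out() returns an ndarray of strings rather than a list, a difference the tests' set() wrapper absorbs. A minimal sketch:

    from sklearn.feature_extraction.text import CountVectorizer

    vec = CountVectorizer()
    vec.fit(['the cat sat', 'the dog sat'])

    # Replacement API: an ndarray of str, sorted by term.
    print(vec.get_feature_names_out())       # ['cat' 'dog' 'sat' 'the']
    print(set(vec.get_feature_names_out()))  # order-insensitive comparison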
requirements.txt (2 changes: 1 addition & 1 deletion)

@@ -1,5 +1,5 @@
 pandas>=1.0.0
-spacy>=2.1,<2.1.7
+spacy>=3.4
 nltk>=3.4.3
 langid>=1.1.6
 scikit-learn>=0.21.3
setup.py (2 changes: 1 addition & 1 deletion)

@@ -18,7 +18,7 @@
     packages=pkgs,
     install_requires=[
         'pandas>=1.0.0',
-        'spacy>=2.1.0,<2.1.7',
+        'spacy>=3.4',
         'nltk>=3.4.3',
         'langid>=1.1.6',
         'scikit-learn>=0.21.3'
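requirements.txt and setup.py now carry the same relaxed pin, and they have to be edited in tandem; a throwaway script (hypothetical, not part of this repo) can flag drift between the two:

    # Hypothetical helper: warn when install_requires in setup.py drifts
    # from requirements.txt. Run from the repository root.
    import re
    from pathlib import Path

    reqs = set(Path('requirements.txt').read_text().split())
    setup_src = Path('setup.py').read_text()
    for pin in re.findall(r"'([A-Za-z0-9_.-]+[<>=!][^']*)'", setup_src):
        if pin not in reqs:
            print('drift:', pin)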
