Tests #39

Merged
merged 18 commits on Feb 13, 2024
50 changes: 50 additions & 0 deletions .github/workflows/python-package.yml
@@ -0,0 +1,50 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.9", "3.10", "3.11"]

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install flake8 pytest
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Generate Report
      run: |
        pip install coverage
        coverage run -m unittest
    - name: Upload Coverage to Codecov
      uses: codecov/codecov-action@v4
      with:
        fail_ci_if_error: true # optional (default = false)
        flags: unittests # optional
        name: codecov-umbrella # optional
        token: ${{ secrets.CODECOV_TOKEN }} # required
        slug: SupervisedStylometry/SuperStyl
        verbose: true # optional (default = false)
8 changes: 2 additions & 6 deletions README.md
@@ -1,21 +1,17 @@
# SUPERvised STYLometry

[![codecov](https://codecov.io/github/SupervisedStylometry/SuperStyl/graph/badge.svg?token=TY5HCBOOKL)](https://codecov.io/github/SupervisedStylometry/SuperStyl)

## Installing

You will need python3.9 or later, the corresponding `-dev` package, `virtualenv` and `pip`

```bash
# Only if you don't have it
sudo apt install python3.9-dev
# then
git clone https://github.com/SupervisedStylometry/SuperStyl.git
cd SuperStyl
virtualenv -p python3.9 env
source env/bin/activate
pip install -r requirements.txt
# And get the model for language prediction
mkdir superstyl/preproc/models
wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -P ./superstyl/preproc/models/
```

## Workflow
2 changes: 2 additions & 0 deletions codecov.yml
@@ -0,0 +1,2 @@
ignore:
- "*/tests/*"
26 changes: 6 additions & 20 deletions main.py
@@ -26,7 +26,6 @@
parser.add_argument('-t', action='store', help="types of features (words or chars)", type=str)
parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
parser.add_argument('-p', action='store', help="Processes to use (default 1)", default=1, type=int)
parser.add_argument('-c', action='store', help="Path to file with metadata corrections", default=None, type=str)
parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False)
parser.add_argument('--z_scores', action='store_true', help="Use z-scores?", default=False) # TODO: remove this as already covered in model training?
@@ -44,32 +43,19 @@
help="if true, same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False)",
default=False)
parser.add_argument('--identify_lang', action='store_true',
help="if true, should the language of each text be guessed, using a fasttext model (default is False) -- Necessitates downloading the model",
help="if true, should the language of each text be guessed, using langdetect (default is False)",
default=False)
args = parser.parse_args()

if args.identify_lang:
model = fasttext.load_model("superstyl/preproc/models/lid.176.bin")
else:
model=None

print(".......loading texts.......")

if args.c:
# "debug_authors.csv"
correct_aut = pandas.read_csv(args.c)
# a bit hacky. Improve later
correct_aut.index = list(correct_aut.loc[:, "Original"])
myTexts = tuy.load_texts(args.s, identify_lang=model, format=args.x, correct_aut=correct_aut, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
if args.sampling:
myTexts = tuy.docs_to_samples(args.s, identify_lang=args.identify_lang, size=args.sample_size, step=args.sample_step,
units=args.sample_units, feature="tokens", format=args.x,
keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples)

else:
if args.sampling:
myTexts = tuy.docs_to_samples(args.s, identify_lang=model, size=args.sample_size, step=args.sample_step,
units=args.sample_units, feature="tokens", format=args.x,
keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples)

else:
myTexts = tuy.load_texts(args.s, identify_lang=model, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
myTexts = tuy.load_texts(args.s, identify_lang=args.identify_lang, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym)

print(".......getting features.......")

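For context, a minimal sketch of the updated call path in main.py after this change: language identification is now toggled by a boolean flag passed straight through to tuyau, instead of a preloaded fasttext model object. The file paths and sampling option values below are hypothetical, not taken from this PR.

```python
# Hedged sketch of the new main.py behaviour; paths and option values are made up.
from superstyl.preproc import tuyau as tuy

paths = ["data/Author1_sometext.txt", "data/Author2_othertext.txt"]  # hypothetical
sampling = False  # mirrors args.sampling

if sampling:
    # fixed-size samples instead of whole documents
    myTexts = tuy.docs_to_samples(paths, identify_lang=True, size=1000, step=500,
                                  units="verses", feature="tokens", format="txt",
                                  keep_punct=False, keep_sym=False, max_samples=None)
else:
    # whole documents, with language detection delegated to langdetect
    myTexts = tuy.load_texts(paths, identify_lang=True, format="txt",
                             keep_punct=False, keep_sym=False)
```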
7 changes: 4 additions & 3 deletions requirements.txt
@@ -1,15 +1,16 @@
fasttext==0.9.2
langdetect==1.0.9
joblib==1.2.0
lxml==4.9.1
nltk==3.6.6
numpy==1.22.0
numpy==1.26.4
pybind11==2.8.1
scikit-learn==1.2.1
scipy==1.10.0
six==1.16.0
tqdm==4.64.1
unidecode==1.3.2
pandas==1.3.4
pandas==2.2.0
pyarrow==15.0.0
argparse==1.4.0
regex==2022.10.31
matplotlib==3.6.2
43 changes: 18 additions & 25 deletions superstyl/preproc/tuyau.py
Expand Up @@ -3,12 +3,12 @@
import unidecode
import nltk.tokenize
import random
import langdetect

def XML_to_text(path, correct_aut=None):
def XML_to_text(path):
"""
Get main text from xml file
:param path: path to the file to transform
:param correct_aut: optional data frame of metadata correction (authors)
:return: a tuple with auts, and string (the text).
"""

@@ -45,18 +45,14 @@ def XML_to_text(path, correct_aut=None):

else:
aut = auts[0]
if correct_aut is not None and aut in list(correct_aut.loc[:, "Original"]):
print("correcting " + aut + " to " + correct_aut.loc[aut, "Actual"])
aut = correct_aut.loc[aut, "Actual"]

return aut, re.sub(r"\s+", " ", str(myxsl(my_doc)))


def TXT_to_text(path, correct_aut=None):
def TXT_to_text(path):
"""
Get main text from xml file
:param path: path to the file to transform
:param correct_aut: optional data frame of metadata correction (authors)
:return: a tuple with auts, and string (the text).
"""

@@ -70,15 +66,14 @@ def TXT_to_text(path, correct_aut=None):
return aut, re.sub(r"\s+", " ", str(' '.join(txt)))


def identify_lang(string, model):
def detect_lang(string):
"""
Get the language from a string
:param string: a string, duh
:param model, the fasttext model
:return: the language
"""

return model.predict(string) # , k = 3)
return langdetect.detect(string) # , k = 3)


def normalise(text, keep_punct=False, keep_sym=False):
@@ -98,14 +93,13 @@ def normalise(text, keep_punct=False, keep_sym=False):
return out


def load_texts(paths, identify_lang=None, format="txt", correct_aut=None, keep_punct=False, keep_sym=False):
def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_sym=False):
"""
Loads a collection of documents into a 'myTexts' object for further processing.
TODO: a proper class
:param paths: path to docs
:param identify_lang: what model to use for language guessing of the texts (default: None)
:param identify_lang: whether or not try to identify lang (default: False)
:param format: format of the source files (implemented values: txt [default], xml)
:param correct_aut: optional data frame of metadata correction (authors)
:param keep_punct: whether or not to keep punctuation and caps.
:param keep_sym: whether or not to keep punctuation, caps, letter variants and numbers (no unidecode).
:return: a myTexts object
@@ -118,14 +112,13 @@ def load_texts(paths, identify_lang=None, format="txt", correct_aut=None, keep_p
name = path.split('/')[-1]

if format=='xml':
aut, text = XML_to_text(path, correct_aut=correct_aut)
aut, text = XML_to_text(path)

else:
aut, text = TXT_to_text(path) # implement correct_aut
aut, text = TXT_to_text(path)

if identify_lang is not None:
lang, cert = identify_lang(text, identify_lang)
lang = lang[0].replace("__label__", "")
if identify_lang:
lang = detect_lang(text)
else:
lang = "NA"

@@ -215,7 +208,7 @@ def get_samples(path, size, step=None, units="verses", feature="tokens", format=


def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", format="tei", keep_punct=False,
keep_sym=False, max_samples=None, identify_lang=None):
keep_sym=False, max_samples=None, identify_lang=False):
"""
Loads a collection of documents into a 'myTexts' object for further processing BUT with samples !
:param paths: path to docs
@@ -227,20 +220,20 @@ def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", fo
:param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
:param keep_punct: whether or not to keep punctuation and caps.
:param max_samples: maximum number of samples per author.
:param identify_lang: what model to use for language guessing of the texts (default: None)
:param identify_lang: whether or not try to identify lang (default: False)
"""
myTexts = []
for path in paths:
aut = path.split('/')[-1].split('_')[0]
if identify_lang is not None:
if identify_lang:
if format == 'xml':
aut, text = XML_to_text(path, correct_aut=correct_aut)
aut, text = XML_to_text(path)

else:
aut, text = TXT_to_text(path) # implement correct_aut
aut, text = TXT_to_text(path)

lang = detect_lang(text)

lang, cert = identify_lang(text, identify_lang)
lang = lang[0].replace("__label__", "")
else:
lang = 'NA'

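One practical difference worth noting: fasttext's `predict` returned `__label__xx` labels plus confidence scores (hence the post-processing removed above), while `langdetect.detect` returns a bare ISO 639-1 code. A minimal sketch of the behaviour the new `detect_lang` relies on, assuming `langdetect` is installed as pinned in requirements.txt:

```python
# Sketch of the langdetect behaviour used by detect_lang(); outputs are typical,
# not guaranteed, since langdetect is probabilistic on short or ambiguous inputs.
import langdetect
from langdetect import DetectorFactory

DetectorFactory.seed = 0  # optional: makes repeated runs reproducible

print(langdetect.detect("Longtemps, je me suis couché de bonne heure."))  # typically 'fr'
print(langdetect.detect("It was a bright cold day in April."))            # typically 'en'

try:
    langdetect.detect("   ")  # nothing usable to detect from
except Exception:             # langdetect raises a LangDetectException here
    print("could not detect language")
```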
Empty file added tests/__init__.py
Empty file.
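The empty `tests/__init__.py` makes `tests/` an importable package, so that `coverage run -m unittest` in the workflow above can discover test modules in it (while `codecov.yml` excludes them from coverage reporting). The actual test modules are not visible in this excerpt; below is a purely hypothetical minimal example of a test file that unittest discovery would pick up, not code from this PR.

```python
# tests/test_tuyau.py -- hypothetical illustration only.
# `python -m unittest` (and hence `coverage run -m unittest`) discovers
# modules named test*.py inside importable packages such as tests/.
import unittest

from superstyl.preproc import tuyau


class TestNormalise(unittest.TestCase):
    def test_normalise_returns_a_string(self):
        # deliberately weak assertion; real tests would pin down exact behaviour
        self.assertIsInstance(tuyau.normalise("Hello, World!"), str)


if __name__ == "__main__":
    unittest.main()
```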