Tests #39

Merged
merged 18 commits on Feb 13, 2024
50 changes: 50 additions & 0 deletions .github/workflows/python-package.yml
@@ -0,0 +1,50 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.9", "3.10", "3.11"]

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install flake8 pytest
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Generate Report
      run: |
        pip install coverage
        coverage run -m unittest
    - name: Upload Coverage to Codecov
      uses: codecov/codecov-action@v4
      with:
        fail_ci_if_error: true # optional (default = false)
        flags: unittests # optional
        name: codecov-umbrella # optional
        token: ${{ secrets.CODECOV_TOKEN }} # required
        slug: SupervisedStylometry/SuperStyl
        verbose: true # optional (default = false)
8 changes: 2 additions & 6 deletions README.md
@@ -1,21 +1,17 @@
# SUPERvised STYLometry

[![codecov](https://codecov.io/github/SupervisedStylometry/SuperStyl/graph/badge.svg?token=TY5HCBOOKL)](https://codecov.io/github/SupervisedStylometry/SuperStyl)

## Installing

You will need python3.9 or later, the corresponding `-dev` package, `virtualenv` and `pip`

```bash
# Only if you don't have it
sudo apt install python3.9-dev
# then
git clone https://github.com/SupervisedStylometry/SuperStyl.git
cd SuperStyl
virtualenv -p python3.9 env
source env/bin/activate
pip install -r requirements.txt
# And get the model for language prediction
mkdir superstyl/preproc/models
wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -P ./superstyl/preproc/models/
```

## Workflow
2 changes: 2 additions & 0 deletions codecov.yml
@@ -0,0 +1,2 @@
ignore:
- "*/tests/*"
26 changes: 6 additions & 20 deletions main.py
@@ -26,7 +26,6 @@
parser.add_argument('-t', action='store', help="types of features (words or chars)", type=str)
parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
parser.add_argument('-p', action='store', help="Processes to use (default 1)", default=1, type=int)
parser.add_argument('-c', action='store', help="Path to file with metadata corrections", default=None, type=str)
parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False)
parser.add_argument('--z_scores', action='store_true', help="Use z-scores?", default=False) # TODO: remove this as already covered in model training?
@@ -44,32 +43,19 @@
help="if true, same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False)",
default=False)
parser.add_argument('--identify_lang', action='store_true',
help="if true, should the language of each text be guessed, using a fasttext model (default is False) -- Necessitates downloading the model",
help="if true, should the language of each text be guessed, using langdetect (default is False)",
default=False)
args = parser.parse_args()

if args.identify_lang:
model = fasttext.load_model("superstyl/preproc/models/lid.176.bin")
else:
model=None

print(".......loading texts.......")

if args.c:
# "debug_authors.csv"
correct_aut = pandas.read_csv(args.c)
# a bit hacky. Improve later
correct_aut.index = list(correct_aut.loc[:, "Original"])
myTexts = tuy.load_texts(args.s, identify_lang=model, format=args.x, correct_aut=correct_aut, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
if args.sampling:
myTexts = tuy.docs_to_samples(args.s, identify_lang=args.identify_lang, size=args.sample_size, step=args.sample_step,
units=args.sample_units, feature="tokens", format=args.x,
keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples)

else:
if args.sampling:
myTexts = tuy.docs_to_samples(args.s, identify_lang=model, size=args.sample_size, step=args.sample_step,
units=args.sample_units, feature="tokens", format=args.x,
keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples)

else:
myTexts = tuy.load_texts(args.s, identify_lang=model, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
myTexts = tuy.load_texts(args.s, identify_lang=args.identify_lang, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym)

print(".......getting features.......")

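For context, a minimal sketch of the updated call path in main.py after this change: language identification is now toggled by a boolean flag passed straight through to tuyau, instead of a preloaded fasttext model object. The file paths and sampling option values below are hypothetical, not taken from this PR.

```python
# Hedged sketch of the new main.py behaviour; paths and option values are made up.
from superstyl.preproc import tuyau as tuy

paths = ["data/Author1_sometext.txt", "data/Author2_othertext.txt"]  # hypothetical
sampling = False  # mirrors args.sampling

if sampling:
    # fixed-size samples instead of whole documents
    myTexts = tuy.docs_to_samples(paths, identify_lang=True, size=1000, step=500,
                                  units="verses", feature="tokens", format="txt",
                                  keep_punct=False, keep_sym=False, max_samples=None)
else:
    # whole documents, with language detection delegated to langdetect
    myTexts = tuy.load_texts(paths, identify_lang=True, format="txt",
                             keep_punct=False, keep_sym=False)
```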
7 changes: 4 additions & 3 deletions requirements.txt
@@ -1,15 +1,16 @@
fasttext==0.9.2
langdetect==1.0.9
joblib==1.2.0
lxml==4.9.1
nltk==3.6.6
numpy==1.22.0
numpy==1.26.4
pybind11==2.8.1
scikit-learn==1.2.1
scipy==1.10.0
six==1.16.0
tqdm==4.64.1
unidecode==1.3.2
pandas==1.3.4
pandas==2.2.0
pyarrow==15.0.0
argparse==1.4.0
regex==2022.10.31
matplotlib==3.6.2
43 changes: 18 additions & 25 deletions superstyl/preproc/tuyau.py
Expand Up @@ -3,12 +3,12 @@
import unidecode
import nltk.tokenize
import random
import langdetect

def XML_to_text(path, correct_aut=None):
def XML_to_text(path):
"""
Get main text from xml file
:param path: path to the file to transform
:param correct_aut: optional data frame of metadata correction (authors)
:return: a tuple with auts, and string (the text).
"""

@@ -45,18 +45,14 @@ def XML_to_text(path, correct_aut=None):

else:
aut = auts[0]
if correct_aut is not None and aut in list(correct_aut.loc[:, "Original"]):
print("correcting " + aut + " to " + correct_aut.loc[aut, "Actual"])
aut = correct_aut.loc[aut, "Actual"]

return aut, re.sub(r"\s+", " ", str(myxsl(my_doc)))


def TXT_to_text(path, correct_aut=None):
def TXT_to_text(path):
"""
Get main text from xml file
:param path: path to the file to transform
:param correct_aut: optional data frame of metadata correction (authors)
:return: a tuple with auts, and string (the text).
"""

@@ -70,15 +66,14 @@ def TXT_to_text(path, correct_aut=None):
return aut, re.sub(r"\s+", " ", str(' '.join(txt)))


def identify_lang(string, model):
def detect_lang(string):
"""
Get the language from a string
:param string: a string, duh
:param model, the fasttext model
:return: the language
"""

return model.predict(string) # , k = 3)
return langdetect.detect(string) # , k = 3)


def normalise(text, keep_punct=False, keep_sym=False):
@@ -98,14 +93,13 @@ def normalise(text, keep_punct=False, keep_sym=False):
return out


def load_texts(paths, identify_lang=None, format="txt", correct_aut=None, keep_punct=False, keep_sym=False):
def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_sym=False):
"""
Loads a collection of documents into a 'myTexts' object for further processing.
TODO: a proper class
:param paths: path to docs
:param identify_lang: what model to use for language guessing of the texts (default: None)
:param identify_lang: whether or not try to identify lang (default: False)
:param format: format of the source files (implemented values: txt [default], xml)
:param correct_aut: optional data frame of metadata correction (authors)
:param keep_punct: whether or not to keep punctuation and caps.
:param keep_sym: whether or not to keep punctuation, caps, letter variants and numbers (no unidecode).
:return: a myTexts object
@@ -118,14 +112,13 @@ def load_texts(paths, identify_lang=None, format="txt", correct_aut=None, keep_p
name = path.split('/')[-1]

if format=='xml':
aut, text = XML_to_text(path, correct_aut=correct_aut)
aut, text = XML_to_text(path)

else:
aut, text = TXT_to_text(path) # implement correct_aut
aut, text = TXT_to_text(path)

if identify_lang is not None:
lang, cert = identify_lang(text, identify_lang)
lang = lang[0].replace("__label__", "")
if identify_lang:
lang = detect_lang(text)
else:
lang = "NA"

@@ -215,7 +208,7 @@ def get_samples(path, size, step=None, units="verses", feature="tokens", format=


def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", format="tei", keep_punct=False,
keep_sym=False, max_samples=None, identify_lang=None):
keep_sym=False, max_samples=None, identify_lang=False):
"""
Loads a collection of documents into a 'myTexts' object for further processing BUT with samples !
:param paths: path to docs
@@ -227,20 +220,20 @@ def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", fo
:param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
:param keep_punct: whether or not to keep punctuation and caps.
:param max_samples: maximum number of samples per author.
:param identify_lang: what model to use for language guessing of the texts (default: None)
:param identify_lang: whether or not try to identify lang (default: False)
"""
myTexts = []
for path in paths:
aut = path.split('/')[-1].split('_')[0]
if identify_lang is not None:
if identify_lang:
if format == 'xml':
aut, text = XML_to_text(path, correct_aut=correct_aut)
aut, text = XML_to_text(path)

else:
aut, text = TXT_to_text(path) # implement correct_aut
aut, text = TXT_to_text(path)

lang = detect_lang(text)

lang, cert = identify_lang(text, identify_lang)
lang = lang[0].replace("__label__", "")
else:
lang = 'NA'

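One practical difference worth noting: fasttext's `predict` returned `__label__xx` labels plus confidence scores (hence the post-processing removed above), while `langdetect.detect` returns a bare ISO 639-1 code. A minimal sketch of the behaviour the new `detect_lang` relies on, assuming `langdetect` is installed as pinned in requirements.txt:

```python
# Sketch of the langdetect behaviour used by detect_lang(); outputs are typical,
# not guaranteed, since langdetect is probabilistic on short or ambiguous inputs.
import langdetect
from langdetect import DetectorFactory

DetectorFactory.seed = 0  # optional: makes repeated runs reproducible

print(langdetect.detect("Longtemps, je me suis couché de bonne heure."))  # typically 'fr'
print(langdetect.detect("It was a bright cold day in April."))            # typically 'en'

try:
    langdetect.detect("   ")  # nothing usable to detect from
except Exception:             # langdetect raises a LangDetectException here
    print("could not detect language")
```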
Empty file added tests/__init__.py
Empty file.
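The empty `tests/__init__.py` makes `tests/` an importable package, so that `coverage run -m unittest` in the workflow above can discover test modules in it (while `codecov.yml` excludes them from coverage reporting). The actual test modules are not visible in this excerpt; below is a purely hypothetical minimal example of a test file that unittest discovery would pick up, not code from this PR.

```python
# tests/test_tuyau.py -- hypothetical illustration only.
# `python -m unittest` (and hence `coverage run -m unittest`) discovers
# modules named test*.py inside importable packages such as tests/.
import unittest

from superstyl.preproc import tuyau


class TestNormalise(unittest.TestCase):
    def test_normalise_returns_a_string(self):
        # deliberately weak assertion; real tests would pin down exact behaviour
        self.assertIsInstance(tuyau.normalise("Hello, World!"), str)


if __name__ == "__main__":
    unittest.main()
```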