From f8c0d826d604b1052acfac31d67cb638a68b6e57 Mon Sep 17 00:00:00 2001 From: Lise Brinck Date: Tue, 7 Nov 2023 13:11:27 +0100 Subject: [PATCH 01/15] add language extensions for norwegian nynorsk and faroese --- spacy/lang/fo/__init__.py | 26 +++ spacy/lang/fo/tokenizer_exceptions.py | 91 +++++++++++ spacy/lang/nn/__init__.py | 22 +++ spacy/lang/nn/examples.py | 14 ++ spacy/lang/nn/punctuation.py | 74 +++++++++ spacy/lang/nn/tokenizer_exceptions.py | 227 ++++++++++++++++++++++++++ 6 files changed, 454 insertions(+) create mode 100644 spacy/lang/fo/__init__.py create mode 100644 spacy/lang/fo/tokenizer_exceptions.py create mode 100644 spacy/lang/nn/__init__.py create mode 100644 spacy/lang/nn/examples.py create mode 100644 spacy/lang/nn/punctuation.py create mode 100644 spacy/lang/nn/tokenizer_exceptions.py diff --git a/spacy/lang/fo/__init__.py b/spacy/lang/fo/__init__.py new file mode 100644 index 00000000000..9f4b92dba37 --- /dev/null +++ b/spacy/lang/fo/__init__.py @@ -0,0 +1,26 @@ +"""Module for creating a faroese language class.""" +import spacy +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from spacy.lang.punctuation import ( + TOKENIZER_SUFFIXES, + TOKENIZER_PREFIXES, + TOKENIZER_INFIXES, +) + +from spacy.language import Language, BaseDefaults + + +class FaroeseDefaults(BaseDefaults): + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES + prefixes = TOKENIZER_PREFIXES + + +@spacy.registry.languages("fo") +class Faroese(Language): + lang = "fo" + Defaults = FaroeseDefaults + + +__all__ = ["Faroese"] diff --git a/spacy/lang/fo/tokenizer_exceptions.py b/spacy/lang/fo/tokenizer_exceptions.py new file mode 100644 index 00000000000..d3d24a47a7a --- /dev/null +++ b/spacy/lang/fo/tokenizer_exceptions.py @@ -0,0 +1,91 @@ +"""Exceptions for the faroese tokenizer - mainly abbreviations copied from CD-ORD ressources""" +from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS +from spacy.util import 
update_exc +from spacy.symbols import ORTH + +_exc = {} + +for orth in [ + "apr.", + "aug.", + "avgr.", + "árg.", + "ávís.", + "beinl.", + "blkv.", + "blaðkv.", + "blm.", + "blaðm.", + "bls.", + "blstj.", + "blaðstj.", + "des.", + "eint.", + "febr.", + "fyrrv.", + "góðk.", + "h.m.", + "innt.", + "jan.", + "kl.", + "m.a.", + "mðr.", + "mió.", + "nr.", + "nto.", + "nov.", + "nút.", + "o.a.", + "o.a.m.", + "o.a.tíl.", + "o.fl.", + "ff.", + "o.m.a.", + "o.o.", + "o.s.fr.", + "o.tíl.", + "o.ø.", + "okt.", + "omf.", + "pst.", + "ritstj.", + "sbr.", + "sms.", + "smst.", + "smb.", + "sb.", + "sbrt.", + "sp.", + "sept.", + "spf.", + "spsk.", + "t.e.", + "t.s.", + "t.s.s.", + "tlf.", + "tel.", + "tsk.", + "t.o.v.", + "t.d.", + "uml.", + "ums.", + "uppl.", + "upprfr.", + "uppr.", + "útg.", + "útl.", + "útr.", + "vanl.", + "v.", + "v.h.", + "v.ø.o.", + "viðm.", + "viðv.", + "vm.", + "v.m.", +]: + _exc[orth] = [{ORTH: orth}] + capitalized = orth.capitalize() + _exc[capitalized] = [{ORTH: capitalized}] + +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/nn/__init__.py b/spacy/lang/nn/__init__.py new file mode 100644 index 00000000000..18a8b76cd67 --- /dev/null +++ b/spacy/lang/nn/__init__.py @@ -0,0 +1,22 @@ +import spacy +from spacy.language import BaseDefaults, Language +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES +from spacy.lang.nb import SYNTAX_ITERATORS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS + + +class NorwegianNynorskDefaults(BaseDefaults): + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + prefixes = TOKENIZER_PREFIXES + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES + syntax_iterators = SYNTAX_ITERATORS + + +@spacy.registry.languages("nn") +class NorwegianNynorsk(Language): + lang = "nn" + Defaults = NorwegianNynorskDefaults + + +__all__ = ["Norwegian"] diff --git a/spacy/lang/nn/examples.py b/spacy/lang/nn/examples.py new file mode 100644 index 00000000000..1a8303aef2d --- 
/dev/null +++ b/spacy/lang/nn/examples.py @@ -0,0 +1,14 @@ +""" +Example sentences to test spaCy and its language models. + +>>> from mv_spacy_lang.nn.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.", + "Det er ein meir enn i same periode i fjor.", + "Det har lava ned enorme snømengder i store delar av Europa den siste tida.", + "Akhtar Chaudhry er ikkje innstilt på Oslo-lista til SV, men utfordrar Heikki Holmås om førsteplassen.", +] diff --git a/spacy/lang/nn/punctuation.py b/spacy/lang/nn/punctuation.py new file mode 100644 index 00000000000..8e5de07c9f5 --- /dev/null +++ b/spacy/lang/nn/punctuation.py @@ -0,0 +1,74 @@ +from spacy.lang.char_classes import ( + ALPHA, + ALPHA_LOWER, + ALPHA_UPPER, + CONCAT_QUOTES, + CURRENCY, + LIST_CURRENCY, + LIST_ELLIPSES, + LIST_ICONS, + LIST_PUNCT, + LIST_QUOTES, + PUNCT, + UNITS, +) +from spacy.lang.punctuation import TOKENIZER_SUFFIXES + +_quotes = CONCAT_QUOTES.replace("'", "") +_list_punct = [x for x in LIST_PUNCT if x != "#"] +_list_icons = [x for x in LIST_ICONS if x != "°"] +_list_icons = [x.replace("\\u00B0", "") for x in _list_icons] +_list_quotes = [x for x in LIST_QUOTES if x != "\\'"] + + +_prefixes = ( + ["§", "%", "=", "—", "–", r"\+(?![0-9])"] + + _list_punct + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_CURRENCY + + LIST_ICONS +) + + +_infixes = ( + LIST_ELLIPSES + + _list_icons + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + ] +) + +_suffixes = ( + LIST_PUNCT + + LIST_ELLIPSES + + _list_quotes + + _list_icons + + ["—", "–"] + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + 
r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] + + [r"(?<=[^sSxXzZ])'"] +) +_suffixes += [ + suffix + for suffix in TOKENIZER_SUFFIXES + if suffix not in ["'s", "'S", "’s", "’S", r"\'"] +] + + +TOKENIZER_PREFIXES = _prefixes +TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/nn/tokenizer_exceptions.py b/spacy/lang/nn/tokenizer_exceptions.py new file mode 100644 index 00000000000..4c9e1eace8c --- /dev/null +++ b/spacy/lang/nn/tokenizer_exceptions.py @@ -0,0 +1,227 @@ +from spacy.symbols import NORM, ORTH +from spacy.util import update_exc +from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS + +_exc = {} + + +for exc_data in [ + {ORTH: "jan.", NORM: "januar"}, + {ORTH: "feb.", NORM: "februar"}, + {ORTH: "mar.", NORM: "mars"}, + {ORTH: "apr.", NORM: "april"}, + {ORTH: "jun.", NORM: "juni"}, + {ORTH: "aug.", NORM: "august"}, + {ORTH: "sep.", NORM: "september"}, + {ORTH: "okt.", NORM: "oktober"}, + {ORTH: "nov.", NORM: "november"}, + {ORTH: "des.", NORM: "desember"}, +]: + _exc[exc_data[ORTH]] = [exc_data] + + +for orth in [ + "Ap.", + "Aq.", + "Ca.", + "Chr.", + "Co.", + "Dr.", + "F.eks.", + "Fr.p.", + "Frp.", + "Grl.", + "Kr.", + "Kr.F.", + "Kr.F.s", + "Mr.", + "Mrs.", + "Pb.", + "Pr.", + "Sp.", + "St.", + "a.m.", + "ad.", + "adm.dir.", + "adr.", + "b.c.", + "bl.a.", + "bla.", + "bm.", + "bnr.", + "bto.", + "c.c.", + "ca.", + "cand.mag.", + "co.", + "d.d.", + "d.m.", + "d.y.", + "dept.", + "dr.", + "dr.med.", + "dr.philos.", + "dr.psychol.", + "dss.", + "dvs.", + "e.Kr.", + "e.l.", + "eg.", + "eig.", + "ekskl.", + "el.", + "et.", + "etc.", + "etg.", + "ev.", + "evt.", + "f.", + "f.Kr.", + "f.eks.", + "f.o.m.", + "fhv.", + "fk.", + "foreg.", + "fork.", + "fv.", + "fvt.", + "g.", + "gl.", + "gno.", + "gnr.", + "grl.", + "gt.", + "h.r.adv.", + 
"hhv.", + "hoh.", + "hr.", + "ifb.", + "ifm.", + "iht.", + "inkl.", + "istf.", + "jf.", + "jr.", + "jun.", + "juris.", + "kfr.", + "kgl.", + "kgl.res.", + "kl.", + "komm.", + "kr.", + "kst.", + "lat.", + "lø.", + "m.a.", + "m.a.o.", + "m.fl.", + "m.m.", + "m.v.", + "ma.", + "mag.art.", + "md.", + "mfl.", + "mht.", + "mill.", + "min.", + "mnd.", + "moh.", + "mrd.", + "muh.", + "mv.", + "mva.", + "n.å.", + "ndf.", + "nr.", + "nto.", + "nyno.", + "o.a.", + "o.l.", + "obl.", + "off.", + "ofl.", + "on.", + "op.", + "org.", + "osv.", + "ovf.", + "p.", + "p.a.", + "p.g.a.", + "p.m.", + "p.t.", + "pga.", + "ph.d.", + "pkt.", + "pr.", + "pst.", + "pt.", + "red.anm.", + "ref.", + "res.", + "res.kap.", + "resp.", + "rv.", + "s.", + "s.d.", + "s.k.", + "s.u.", + "s.å.", + "sen.", + "sep.", + "siviling.", + "sms.", + "snr.", + "spm.", + "sr.", + "sst.", + "st.", + "st.meld.", + "st.prp.", + "stip.", + "stk.", + "stud.", + "sv.", + "såk.", + "sø.", + "t.d.", + "t.h.", + "t.o.m.", + "t.v.", + "temp.", + "ti.", + "tils.", + "tilsv.", + "tl;dr", + "tlf.", + "to.", + "ult.", + "utg.", + "v.", + "vedk.", + "vedr.", + "vg.", + "vgs.", + "vha.", + "vit.ass.", + "vn.", + "vol.", + "vs.", + "vsa.", + "§§", + "©NTB", + "årg.", + "årh.", +]: + _exc[orth] = [{ORTH: orth}] + +# Dates +for h in range(1, 31 + 1): + for period in ["."]: + _exc[f"{h}{period}"] = [{ORTH: f"{h}."}] + +_custom_base_exc = {"i.": [{ORTH: "i", NORM: "i"}, {ORTH: "."}]} +_exc.update(_custom_base_exc) + +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) From 15da3da8b476ad8cad5acf881cade3babcb6d750 Mon Sep 17 00:00:00 2001 From: Lise Brinck Date: Tue, 7 Nov 2023 13:14:04 +0100 Subject: [PATCH 02/15] update docstring for nn/examples.py --- spacy/lang/nn/examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/nn/examples.py b/spacy/lang/nn/examples.py index 1a8303aef2d..3a78f4d3ac0 100644 --- a/spacy/lang/nn/examples.py +++ b/spacy/lang/nn/examples.py @@ -1,7 +1,7 @@ """ Example 
sentences to test spaCy and its language models. ->>> from mv_spacy_lang.nn.examples import sentences +>>> from spacy.lang.nn.examples import sentences >>> docs = nlp.pipe(sentences) """ From 1582656175e7b19b92679d4ba8576d003e033a0c Mon Sep 17 00:00:00 2001 From: Lise Brinck Date: Tue, 7 Nov 2023 13:17:11 +0100 Subject: [PATCH 03/15] use relative imports --- spacy/lang/fo/tokenizer_exceptions.py | 7 +++---- spacy/lang/nn/punctuation.py | 4 ++-- spacy/lang/nn/tokenizer_exceptions.py | 6 +++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/spacy/lang/fo/tokenizer_exceptions.py b/spacy/lang/fo/tokenizer_exceptions.py index d3d24a47a7a..856b72200bd 100644 --- a/spacy/lang/fo/tokenizer_exceptions.py +++ b/spacy/lang/fo/tokenizer_exceptions.py @@ -1,7 +1,6 @@ -"""Exceptions for the faroese tokenizer - mainly abbreviations copied from CD-ORD ressources""" -from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS -from spacy.util import update_exc -from spacy.symbols import ORTH +from ...symbols import ORTH +from ...util import update_exc +from ..tokenizer_exceptions import BASE_EXCEPTIONS _exc = {} diff --git a/spacy/lang/nn/punctuation.py b/spacy/lang/nn/punctuation.py index 8e5de07c9f5..7b50b58d37f 100644 --- a/spacy/lang/nn/punctuation.py +++ b/spacy/lang/nn/punctuation.py @@ -1,4 +1,4 @@ -from spacy.lang.char_classes import ( +from ..char_classes import ( ALPHA, ALPHA_LOWER, ALPHA_UPPER, @@ -12,7 +12,7 @@ PUNCT, UNITS, ) -from spacy.lang.punctuation import TOKENIZER_SUFFIXES +from ..punctuation import TOKENIZER_SUFFIXES _quotes = CONCAT_QUOTES.replace("'", "") _list_punct = [x for x in LIST_PUNCT if x != "#"] diff --git a/spacy/lang/nn/tokenizer_exceptions.py b/spacy/lang/nn/tokenizer_exceptions.py index 4c9e1eace8c..08552dd9004 100644 --- a/spacy/lang/nn/tokenizer_exceptions.py +++ b/spacy/lang/nn/tokenizer_exceptions.py @@ -1,6 +1,6 @@ -from spacy.symbols import NORM, ORTH -from spacy.util import update_exc -from spacy.lang.tokenizer_exceptions 
import BASE_EXCEPTIONS +from ...symbols import NORM, ORTH +from ...util import update_exc +from ..tokenizer_exceptions import BASE_EXCEPTIONS _exc = {} From 64bb825a0c758460dd95f0e44fbaa864ccf01c2f Mon Sep 17 00:00:00 2001 From: Lise Brinck Date: Tue, 7 Nov 2023 13:26:48 +0100 Subject: [PATCH 04/15] add fo and nn tokenizers to pytest fixtures --- spacy/tests/conftest.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 4ca741dfc13..7db986ab9e7 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -162,6 +162,11 @@ def fi_tokenizer(): return get_lang_class("fi")().tokenizer +@pytest.fixture(scope="session") +def fo_tokenizer(): + return get_lang_class("fo")().tokenizer + + @pytest.fixture(scope="session") def fr_tokenizer(): return get_lang_class("fr")().tokenizer @@ -317,6 +322,11 @@ def nl_tokenizer(): return get_lang_class("nl")().tokenizer +@pytest.fixture(scope="session") +def nn_tokenizer(): + return get_lang_class("nn")().tokenizer + + @pytest.fixture(scope="session") def pl_tokenizer(): return get_lang_class("pl")().tokenizer From 6e280657275f37bcb4cfeb4ab09809332ebf2f07 Mon Sep 17 00:00:00 2001 From: Lise Brinck Date: Tue, 7 Nov 2023 14:33:29 +0100 Subject: [PATCH 05/15] add unittests for fo and nn and fix bug in nn --- spacy/lang/nn/__init__.py | 2 +- spacy/tests/lang/fo/test_tokenizer.py | 19 +++++++++++++++++++ spacy/tests/lang/nn/test_tokenizer.py | 27 +++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/lang/fo/test_tokenizer.py create mode 100644 spacy/tests/lang/nn/test_tokenizer.py diff --git a/spacy/lang/nn/__init__.py b/spacy/lang/nn/__init__.py index 18a8b76cd67..eb1258b0ddf 100644 --- a/spacy/lang/nn/__init__.py +++ b/spacy/lang/nn/__init__.py @@ -19,4 +19,4 @@ class NorwegianNynorsk(Language): Defaults = NorwegianNynorskDefaults -__all__ = ["Norwegian"] +__all__ = ["NorwegianNynorsk"] diff --git 
a/spacy/tests/lang/fo/test_tokenizer.py b/spacy/tests/lang/fo/test_tokenizer.py new file mode 100644 index 00000000000..d1041c07299 --- /dev/null +++ b/spacy/tests/lang/fo/test_tokenizer.py @@ -0,0 +1,19 @@ +import pytest + +FO_TOKEN_EXCEPTION_TESTS = [ + ( + "Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. ", + ["Eftir", "løgtingslóg", "um", "samsýning", "og", "eftirløn", "landsstýrismanna", "v.m.", ",", "skulu", "løgmaður", "og", "landsstýrismenn", "vanliga", "siga", "frá", "sær", "størv", "í", "almennari", "tænastu", "ella", "privatum", "virkjum", ",", "samtøkum", "ella", "stovnum", "."], + ), + ( + "Sambandsflokkurin gongur aftur við 2,7 prosentum í mun til valið í 1994, tá flokkurin fekk undirtøku frá 23,4 prosent av veljarunum.", + ["Sambandsflokkurin", "gongur", "aftur", "við", "2,7", "prosentum", "í", "mun", "til", "valið", "í", "1994", ",", "tá", "flokkurin", "fekk", "undirtøku", "frá", "23,4", "prosent", "av", "veljarunum", "."], + ), +] + + +@pytest.mark.parametrize("text,expected_tokens", FO_TOKEN_EXCEPTION_TESTS) +def test_fo_tokenizer_handles_exception_cases(fo_tokenizer, text, expected_tokens): + tokens = fo_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list diff --git a/spacy/tests/lang/nn/test_tokenizer.py b/spacy/tests/lang/nn/test_tokenizer.py new file mode 100644 index 00000000000..e68bf0f8148 --- /dev/null +++ b/spacy/tests/lang/nn/test_tokenizer.py @@ -0,0 +1,27 @@ +import pytest + +NN_TOKEN_EXCEPTION_TESTS = [ + ( + "Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.", + ["Målet", "til", "direktoratet", "er", "at", "alle", "skal", "bli", "tilbydd", "jobb", "i", "politiet", "så", "raskt", "som", "mogleg", "i", "2014", "."], + ), + ( + "Han ønskjer ikkje at staten skal vere med på å 
finansiere slik undervisning, men dette er rektor på skulen ueinig i.", + ["Han", "ønskjer", "ikkje", "at", "staten", "skal", "vere", "med", "på", "å", "finansiere", "slik", "undervisning", ",", "men", "dette", "er", "rektor", "på", "skulen", "ueinig", "i", "."], + ), + ( + "Ifølgje China Daily vart det 8.848 meter høge fjellet flytta 3 centimeter sørvestover under jordskjelvet, som vart målt til 7,8.", + ["Ifølgje", "China", "Daily", "vart", "det", "8.848", "meter", "høge", "fjellet", "flytta", "3", "centimeter", "sørvestover", "under", "jordskjelvet", ",", "som", "vart", "målt", "til", "7,8", "."], + ), + ( + "Brukssesongen er frå nov. til mai, med ein topp i mars.", + ["Brukssesongen", "er", "frå", "nov.", "til", "mai", ",", "med", "ein", "topp", "i", "mars", "."], + ) +] + + +@pytest.mark.parametrize("text,expected_tokens", NN_TOKEN_EXCEPTION_TESTS) +def test_nn_tokenizer_handles_exception_cases(nn_tokenizer, text, expected_tokens): + tokens = nn_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list From 9e0a0c38d65376c2491cd32a7b371c88ec058fdb Mon Sep 17 00:00:00 2001 From: Lise Brinck Date: Wed, 8 Nov 2023 09:57:23 +0100 Subject: [PATCH 06/15] remove module docstring from fo/__init__.py --- spacy/lang/fo/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/lang/fo/__init__.py b/spacy/lang/fo/__init__.py index 9f4b92dba37..f815bb185e4 100644 --- a/spacy/lang/fo/__init__.py +++ b/spacy/lang/fo/__init__.py @@ -1,4 +1,3 @@ -"""Module for creating a faroese language class.""" import spacy from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from spacy.lang.punctuation import ( From 7dfa86d8dee4cd86a3c7c91dac0a83401dba71f7 Mon Sep 17 00:00:00 2001 From: Lise Brinck Date: Wed, 8 Nov 2023 10:04:28 +0100 Subject: [PATCH 07/15] add comments about example sentences' origin --- spacy/lang/nn/examples.py | 1 + spacy/tests/lang/fo/test_tokenizer.py | 1 + 
spacy/tests/lang/nn/test_tokenizer.py | 1 + 3 files changed, 3 insertions(+) diff --git a/spacy/lang/nn/examples.py b/spacy/lang/nn/examples.py index 3a78f4d3ac0..95ec0aaddd0 100644 --- a/spacy/lang/nn/examples.py +++ b/spacy/lang/nn/examples.py @@ -6,6 +6,7 @@ """ +# sentences taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/) sentences = [ "Konseptet går ut på at alle tre omgangar tel, alle hopparar må stille i kvalifiseringa og poengsummen skal telje.", "Det er ein meir enn i same periode i fjor.", diff --git a/spacy/tests/lang/fo/test_tokenizer.py b/spacy/tests/lang/fo/test_tokenizer.py index d1041c07299..794bde6824c 100644 --- a/spacy/tests/lang/fo/test_tokenizer.py +++ b/spacy/tests/lang/fo/test_tokenizer.py @@ -1,5 +1,6 @@ import pytest +# examples taken from Basic LAnguage Resource Kit 1.0 for Faroese (https://www.openslr.org/125/) FO_TOKEN_EXCEPTION_TESTS = [ ( "Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. 
", diff --git a/spacy/tests/lang/nn/test_tokenizer.py b/spacy/tests/lang/nn/test_tokenizer.py index e68bf0f8148..777be7f1567 100644 --- a/spacy/tests/lang/nn/test_tokenizer.py +++ b/spacy/tests/lang/nn/test_tokenizer.py @@ -1,5 +1,6 @@ import pytest +# examples taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/) NN_TOKEN_EXCEPTION_TESTS = [ ( "Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.", From 34dbf8f9830565706b55738e997d542e801e807d Mon Sep 17 00:00:00 2001 From: Lise Brinck Date: Wed, 8 Nov 2023 10:07:32 +0100 Subject: [PATCH 08/15] add license information to faroese data credit --- spacy/tests/lang/fo/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/lang/fo/test_tokenizer.py b/spacy/tests/lang/fo/test_tokenizer.py index 794bde6824c..37294f4bd52 100644 --- a/spacy/tests/lang/fo/test_tokenizer.py +++ b/spacy/tests/lang/fo/test_tokenizer.py @@ -1,6 +1,6 @@ import pytest -# examples taken from Basic LAnguage Resource Kit 1.0 for Faroese (https://www.openslr.org/125/) +# examples taken from Basic LAnguage Resource Kit 1.0 for Faroese (https://maltokni.fo/en/resources) licensed with CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/) FO_TOKEN_EXCEPTION_TESTS = [ ( "Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. 
", From 210b79d619449e8d1745b5705a452ba8775671d1 Mon Sep 17 00:00:00 2001 From: Lise Brinck Date: Wed, 8 Nov 2023 10:09:30 +0100 Subject: [PATCH 09/15] format unittests using black --- spacy/tests/lang/fo/test_tokenizer.py | 58 ++++++++++++++++- spacy/tests/lang/nn/test_tokenizer.py | 91 +++++++++++++++++++++++++-- 2 files changed, 142 insertions(+), 7 deletions(-) diff --git a/spacy/tests/lang/fo/test_tokenizer.py b/spacy/tests/lang/fo/test_tokenizer.py index 37294f4bd52..d31aa100fe3 100644 --- a/spacy/tests/lang/fo/test_tokenizer.py +++ b/spacy/tests/lang/fo/test_tokenizer.py @@ -4,11 +4,65 @@ FO_TOKEN_EXCEPTION_TESTS = [ ( "Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. ", - ["Eftir", "løgtingslóg", "um", "samsýning", "og", "eftirløn", "landsstýrismanna", "v.m.", ",", "skulu", "løgmaður", "og", "landsstýrismenn", "vanliga", "siga", "frá", "sær", "størv", "í", "almennari", "tænastu", "ella", "privatum", "virkjum", ",", "samtøkum", "ella", "stovnum", "."], + [ + "Eftir", + "løgtingslóg", + "um", + "samsýning", + "og", + "eftirløn", + "landsstýrismanna", + "v.m.", + ",", + "skulu", + "løgmaður", + "og", + "landsstýrismenn", + "vanliga", + "siga", + "frá", + "sær", + "størv", + "í", + "almennari", + "tænastu", + "ella", + "privatum", + "virkjum", + ",", + "samtøkum", + "ella", + "stovnum", + ".", + ], ), ( "Sambandsflokkurin gongur aftur við 2,7 prosentum í mun til valið í 1994, tá flokkurin fekk undirtøku frá 23,4 prosent av veljarunum.", - ["Sambandsflokkurin", "gongur", "aftur", "við", "2,7", "prosentum", "í", "mun", "til", "valið", "í", "1994", ",", "tá", "flokkurin", "fekk", "undirtøku", "frá", "23,4", "prosent", "av", "veljarunum", "."], + [ + "Sambandsflokkurin", + "gongur", + "aftur", + "við", + "2,7", + "prosentum", + "í", + "mun", + "til", + "valið", + "í", + "1994", + ",", + "tá", + "flokkurin", + "fekk", + 
"undirtøku", + "frá", + "23,4", + "prosent", + "av", + "veljarunum", + ".", + ], ), ] diff --git a/spacy/tests/lang/nn/test_tokenizer.py b/spacy/tests/lang/nn/test_tokenizer.py index 777be7f1567..9d607072eb0 100644 --- a/spacy/tests/lang/nn/test_tokenizer.py +++ b/spacy/tests/lang/nn/test_tokenizer.py @@ -4,20 +4,101 @@ NN_TOKEN_EXCEPTION_TESTS = [ ( "Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.", - ["Målet", "til", "direktoratet", "er", "at", "alle", "skal", "bli", "tilbydd", "jobb", "i", "politiet", "så", "raskt", "som", "mogleg", "i", "2014", "."], + [ + "Målet", + "til", + "direktoratet", + "er", + "at", + "alle", + "skal", + "bli", + "tilbydd", + "jobb", + "i", + "politiet", + "så", + "raskt", + "som", + "mogleg", + "i", + "2014", + ".", + ], ), ( "Han ønskjer ikkje at staten skal vere med på å finansiere slik undervisning, men dette er rektor på skulen ueinig i.", - ["Han", "ønskjer", "ikkje", "at", "staten", "skal", "vere", "med", "på", "å", "finansiere", "slik", "undervisning", ",", "men", "dette", "er", "rektor", "på", "skulen", "ueinig", "i", "."], + [ + "Han", + "ønskjer", + "ikkje", + "at", + "staten", + "skal", + "vere", + "med", + "på", + "å", + "finansiere", + "slik", + "undervisning", + ",", + "men", + "dette", + "er", + "rektor", + "på", + "skulen", + "ueinig", + "i", + ".", + ], ), ( "Ifølgje China Daily vart det 8.848 meter høge fjellet flytta 3 centimeter sørvestover under jordskjelvet, som vart målt til 7,8.", - ["Ifølgje", "China", "Daily", "vart", "det", "8.848", "meter", "høge", "fjellet", "flytta", "3", "centimeter", "sørvestover", "under", "jordskjelvet", ",", "som", "vart", "målt", "til", "7,8", "."], + [ + "Ifølgje", + "China", + "Daily", + "vart", + "det", + "8.848", + "meter", + "høge", + "fjellet", + "flytta", + "3", + "centimeter", + "sørvestover", + "under", + "jordskjelvet", + ",", + "som", + "vart", + "målt", + "til", + "7,8", + ".", + ], ), ( "Brukssesongen er frå nov. 
til mai, med ein topp i mars.", - ["Brukssesongen", "er", "frå", "nov.", "til", "mai", ",", "med", "ein", "topp", "i", "mars", "."], - ) + [ + "Brukssesongen", + "er", + "frå", + "nov.", + "til", + "mai", + ",", + "med", + "ein", + "topp", + "i", + "mars", + ".", + ], + ), ] From 5a27662d327eca9a030dfd6bfb4e0aebc5f10f5d Mon Sep 17 00:00:00 2001 From: Lise Brinck Date: Wed, 8 Nov 2023 10:34:04 +0100 Subject: [PATCH 10/15] add __init__ files to test/lang/nn and tests/lang/fo --- spacy/tests/lang/fo/__init__.py | 0 spacy/tests/lang/nn/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 spacy/tests/lang/fo/__init__.py create mode 100644 spacy/tests/lang/nn/__init__.py diff --git a/spacy/tests/lang/fo/__init__.py b/spacy/tests/lang/fo/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/spacy/tests/lang/nn/__init__.py b/spacy/tests/lang/nn/__init__.py new file mode 100644 index 00000000000..e69de29bb2d From 7281c73ad295ea250c201f5ef29c70a7fc8b34c0 Mon Sep 17 00:00:00 2001 From: Lise Brinck Date: Wed, 8 Nov 2023 10:44:25 +0100 Subject: [PATCH 11/15] fix import order and use relative imports in fo/__init__.py and nn/__init__.py --- spacy/lang/fo/__init__.py | 11 ++--------- spacy/lang/nn/__init__.py | 6 ++---- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/spacy/lang/fo/__init__.py b/spacy/lang/fo/__init__.py index f815bb185e4..db18f1a5d97 100644 --- a/spacy/lang/fo/__init__.py +++ b/spacy/lang/fo/__init__.py @@ -1,12 +1,6 @@ -import spacy +from ...language import BaseDefaults, Language +from ..punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from spacy.lang.punctuation import ( - TOKENIZER_SUFFIXES, - TOKENIZER_PREFIXES, - TOKENIZER_INFIXES, -) - -from spacy.language import Language, BaseDefaults class FaroeseDefaults(BaseDefaults): @@ -16,7 +10,6 @@ class FaroeseDefaults(BaseDefaults): prefixes = TOKENIZER_PREFIXES 
-@spacy.registry.languages("fo") class Faroese(Language): lang = "fo" Defaults = FaroeseDefaults diff --git a/spacy/lang/nn/__init__.py b/spacy/lang/nn/__init__.py index eb1258b0ddf..ebbf0709089 100644 --- a/spacy/lang/nn/__init__.py +++ b/spacy/lang/nn/__init__.py @@ -1,7 +1,6 @@ -import spacy -from spacy.language import BaseDefaults, Language +from ...language import BaseDefaults, Language +from ..nb import SYNTAX_ITERATORS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES -from spacy.lang.nb import SYNTAX_ITERATORS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS @@ -13,7 +12,6 @@ class NorwegianNynorskDefaults(BaseDefaults): syntax_iterators = SYNTAX_ITERATORS -@spacy.registry.languages("nn") class NorwegianNynorsk(Language): lang = "nn" Defaults = NorwegianNynorskDefaults From a526b01e900dfaec83453b5310f13e0ae1e5351b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 15 Nov 2023 09:43:42 +0100 Subject: [PATCH 12/15] Make the tests a bit more compact --- spacy/tests/lang/fo/test_tokenizer.py | 56 ++---------------- spacy/tests/lang/nn/test_tokenizer.py | 83 ++------------------------- 2 files changed, 10 insertions(+), 129 deletions(-) diff --git a/spacy/tests/lang/fo/test_tokenizer.py b/spacy/tests/lang/fo/test_tokenizer.py index d31aa100fe3..e61a62be58c 100644 --- a/spacy/tests/lang/fo/test_tokenizer.py +++ b/spacy/tests/lang/fo/test_tokenizer.py @@ -1,70 +1,22 @@ import pytest # examples taken from Basic LAnguage Resource Kit 1.0 for Faroese (https://maltokni.fo/en/resources) licensed with CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/) +# fmt: off FO_TOKEN_EXCEPTION_TESTS = [ ( "Eftir løgtingslóg um samsýning og eftirløn landsstýrismanna v.m., skulu løgmaður og landsstýrismenn vanliga siga frá sær størv í almennari tænastu ella privatum virkjum, samtøkum ella stovnum. 
", [ - "Eftir", - "løgtingslóg", - "um", - "samsýning", - "og", - "eftirløn", - "landsstýrismanna", - "v.m.", - ",", - "skulu", - "løgmaður", - "og", - "landsstýrismenn", - "vanliga", - "siga", - "frá", - "sær", - "størv", - "í", - "almennari", - "tænastu", - "ella", - "privatum", - "virkjum", - ",", - "samtøkum", - "ella", - "stovnum", - ".", + "Eftir", "løgtingslóg", "um", "samsýning", "og", "eftirløn", "landsstýrismanna", "v.m.", ",", "skulu", "løgmaður", "og", "landsstýrismenn", "vanliga", "siga", "frá", "sær", "størv", "í", "almennari", "tænastu", "ella", "privatum", "virkjum", ",", "samtøkum", "ella", "stovnum", ".", ], ), ( "Sambandsflokkurin gongur aftur við 2,7 prosentum í mun til valið í 1994, tá flokkurin fekk undirtøku frá 23,4 prosent av veljarunum.", [ - "Sambandsflokkurin", - "gongur", - "aftur", - "við", - "2,7", - "prosentum", - "í", - "mun", - "til", - "valið", - "í", - "1994", - ",", - "tá", - "flokkurin", - "fekk", - "undirtøku", - "frá", - "23,4", - "prosent", - "av", - "veljarunum", - ".", + "Sambandsflokkurin", "gongur", "aftur", "við", "2,7", "prosentum", "í", "mun", "til", "valið", "í", "1994", ",", "tá", "flokkurin", "fekk", "undirtøku", "frá", "23,4", "prosent", "av", "veljarunum", ".", ], ), ] +# fmt: on @pytest.mark.parametrize("text,expected_tokens", FO_TOKEN_EXCEPTION_TESTS) diff --git a/spacy/tests/lang/nn/test_tokenizer.py b/spacy/tests/lang/nn/test_tokenizer.py index 9d607072eb0..74a6937bdce 100644 --- a/spacy/tests/lang/nn/test_tokenizer.py +++ b/spacy/tests/lang/nn/test_tokenizer.py @@ -1,105 +1,34 @@ import pytest # examples taken from Omsetjingsminne frå Nynorsk pressekontor 2022 (https://www.nb.no/sprakbanken/en/resource-catalogue/oai-nb-no-sbr-80/) +# fmt: off NN_TOKEN_EXCEPTION_TESTS = [ ( "Målet til direktoratet er at alle skal bli tilbydd jobb i politiet så raskt som mogleg i 2014.", [ - "Målet", - "til", - "direktoratet", - "er", - "at", - "alle", - "skal", - "bli", - "tilbydd", - "jobb", - "i", - "politiet", - "så", - 
"raskt", - "som", - "mogleg", - "i", - "2014", - ".", + "Målet", "til", "direktoratet", "er", "at", "alle", "skal", "bli", "tilbydd", "jobb", "i", "politiet", "så", "raskt", "som", "mogleg", "i", "2014", ".", ], ), ( "Han ønskjer ikkje at staten skal vere med på å finansiere slik undervisning, men dette er rektor på skulen ueinig i.", [ - "Han", - "ønskjer", - "ikkje", - "at", - "staten", - "skal", - "vere", - "med", - "på", - "å", - "finansiere", - "slik", - "undervisning", - ",", - "men", - "dette", - "er", - "rektor", - "på", - "skulen", - "ueinig", - "i", - ".", + "Han", "ønskjer", "ikkje", "at", "staten", "skal", "vere", "med", "på", "å", "finansiere", "slik", "undervisning", ",", "men", "dette", "er", "rektor", "på", "skulen", "ueinig", "i", ".", ], ), ( "Ifølgje China Daily vart det 8.848 meter høge fjellet flytta 3 centimeter sørvestover under jordskjelvet, som vart målt til 7,8.", [ - "Ifølgje", - "China", - "Daily", - "vart", - "det", - "8.848", - "meter", - "høge", - "fjellet", - "flytta", - "3", - "centimeter", - "sørvestover", - "under", - "jordskjelvet", - ",", - "som", - "vart", - "målt", - "til", - "7,8", - ".", + "Ifølgje", "China", "Daily", "vart", "det", "8.848", "meter", "høge", "fjellet", "flytta", "3", "centimeter", "sørvestover", "under", "jordskjelvet", ",", "som", "vart", "målt", "til", "7,8", ".", ], ), ( "Brukssesongen er frå nov. 
til mai, med ein topp i mars.", [ - "Brukssesongen", - "er", - "frå", - "nov.", - "til", - "mai", - ",", - "med", - "ein", - "topp", - "i", - "mars", - ".", + "Brukssesongen", "er", "frå", "nov.", "til", "mai", ",", "med", "ein", "topp", "i", "mars", ".", ], ), ] +# fmt: on @pytest.mark.parametrize("text,expected_tokens", NN_TOKEN_EXCEPTION_TESTS) From 2441803222cdc7779bb9e41fb625f5df8cdd0293 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 15 Nov 2023 09:44:11 +0100 Subject: [PATCH 13/15] Add fo and nn to website languages --- website/meta/languages.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/website/meta/languages.json b/website/meta/languages.json index 3305b840b58..d6a07809795 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -103,6 +103,10 @@ "has_examples": true, "models": ["fi_core_news_sm", "fi_core_news_md", "fi_core_news_lg"] }, + { + "code": "fo", + "name": "Faroese" + }, { "code": "fr", "name": "French", @@ -290,6 +294,12 @@ "example": "Dit is een zin.", "has_examples": true }, + { + "code": "nn", + "name": "Norwegian Nynorsk", + "example": "Det er ein meir enn i same periode i fjor.", + "has_examples": true + }, { "code": "pl", "name": "Polish", From ae67abd2703f338dd427f1764e997775544db51c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 17 Nov 2023 09:51:24 +0100 Subject: [PATCH 14/15] Add note about jul. --- spacy/lang/nn/tokenizer_exceptions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/lang/nn/tokenizer_exceptions.py b/spacy/lang/nn/tokenizer_exceptions.py index 08552dd9004..16fc817d269 100644 --- a/spacy/lang/nn/tokenizer_exceptions.py +++ b/spacy/lang/nn/tokenizer_exceptions.py @@ -11,6 +11,7 @@ {ORTH: "mar.", NORM: "mars"}, {ORTH: "apr.", NORM: "april"}, {ORTH: "jun.", NORM: "juni"}, + # note: "jul." 
is in the simple list below without a NORM exception {ORTH: "aug.", NORM: "august"}, {ORTH: "sep.", NORM: "september"}, {ORTH: "okt.", NORM: "oktober"}, From 654e1e3b16085a329d185789cc515fce1ca44b4f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 17 Nov 2023 09:53:14 +0100 Subject: [PATCH 15/15] Add "jul." as exception --- spacy/lang/nn/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/nn/tokenizer_exceptions.py b/spacy/lang/nn/tokenizer_exceptions.py index 16fc817d269..4bfcb26d833 100644 --- a/spacy/lang/nn/tokenizer_exceptions.py +++ b/spacy/lang/nn/tokenizer_exceptions.py @@ -103,7 +103,7 @@ "istf.", "jf.", "jr.", - "jun.", + "jul.", "juris.", "kfr.", "kgl.",