From 13d2e8d06a72aa65a27c9427b49893410bb33657 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20P=C3=A9rez?= Date: Thu, 5 Dec 2019 13:15:10 +0100 Subject: [PATCH 1/6] Fix/scansion (#62) * Fixed syllabification exceptions, support for disabling/enabling spacy_affixes * Fixed multiline break * Fixed splitted verb stresses and secondary stress on '-mente' adverbs * Fixed reviewed issues * Fixed reviewed issues 2nd wave * Added minimum length for '-mente' adverbs --- src/rantanplan/core.py | 156 ++++++++++++++----- src/rantanplan/pipeline.py | 14 +- tests/fixtures/phonological_groups.json | 180 +++++++++++----------- tests/fixtures/rhyme_analysis_sonnet.json | 30 +--- tests/test_pipeline.py | 18 +++ 5 files changed, 242 insertions(+), 156 deletions(-) diff --git a/src/rantanplan/core.py b/src/rantanplan/core.py index 1f3a659..e1c9cd2 100644 --- a/src/rantanplan/core.py +++ b/src/rantanplan/core.py @@ -71,18 +71,21 @@ WEAK_VOWELS = set("iuüíúIÍUÜÚ") LIAISON_FIRST_PART = set("aeiouáéíóúAEIOUÁÉÍÓÚyY") LIAISON_SECOND_PART = set("aeiouáéíóúAEIOUÁÉÍÓÚhyYH") + STRESSED_UNACCENTED_MONOSYLLABLES = {"yo", "vio", "dio", "fe", "sol", "ti", "un"} + UNSTRESSED_UNACCENTED_MONOSYLLABLES = {'de', 'el', 'la', 'las', 'le', 'les', 'lo', 'los', 'mas', 'me', 'mi', 'nos', 'os', 'que', 'se', 'si', 'su', 'tan', 'te', 'tu', "tus", "oh"} -UNSTRESSED_FORMS = {"que", "cual", "quien", "donde", "cuando", "cuanto", - "como"} -POSSESSIVE_PRON = {"mío", "mía", "míos", "mías", "tuyo", "tuya", "tuyos", - "tuyas", "suyo", "suya", "suyos", "suyas"} +UNSTRESSED_FORMS = {"ay", "don", "doña", "aun", "que", "cual", "quien", "donde", + "cuando", "cuanto", "como", "cuantas", "cuantos"} + +STRESSED_PRON = {"mío", "mía", "míos", "mías", "tuyo", "tuya", "tuyos", + "tuyas", "suyo", "suya", "suyos", "suyas", "todo"} POSSESSIVE_PRON_UNSTRESSED = {"nuestro", "nuestra", "nuestros", "nuestras", "vuestro", "vuestra", "vuestros", "vuestras"} @@ -282,14 +285,29 @@ def get_stresses(phonological_groups): :return: List of boolean values indicating whether a group is stressed (True) or not (False) """ - stresses = [group["is_stressed"] for group in phonological_groups] + # stresses = [group["is_stressed"] for group in phonological_groups] + stresses = [] + last_word_syllables = [] + for group in phonological_groups: + stresses.append(group["is_stressed"]) + for group in phonological_groups: + last_word_syllables.append(group.get("is_word_end", False)) + # Get position for the last syllable of the penultimate word + if last_word_syllables.count(True) > 1: + penultimate_word = -( + [i for i, n in enumerate(last_word_syllables[::-1]) if n][1] + 1) + else: + penultimate_word = None last_stress = -(stresses[::-1].index(True) + 1) # Oxytone (Aguda) if last_stress == -1: stresses.append(False) # Paroxytone (Esdrújula) or Proparoxytone (Sobreesdrújula) elif last_stress <= -3: - stresses.pop() + if penultimate_word is None: + stresses.pop() + elif last_stress > penultimate_word: + stresses.pop() return stresses @@ -469,7 +487,7 @@ def get_word_stress(word, pos, tag, alternative_syllabification=False): """ Gets a list of syllables from a word and creates a list with syllabified word and stressed syllable index - :param word: List of str representing syllables + :param word: Word string :param alternative_syllabification: Wether or not the alternative syllabification is used :param pos: PoS tag from spacy ("DET") @@ -481,7 +499,19 @@ def get_word_stress(word, pos, tag, alternative_syllabification=False): :rtype: dict """ syllable_list, _ = syllabify(word, alternative_syllabification) - word_lower = "".join(word).lower() + word_lower = word.lower() + # Handle secondary stress on adverbs ending in -mente + if pos == "ADV" and word_lower[-5:] == "mente" and len(word) > 5: + root = word[:-5] + mente = word[-5:] + stress_root = get_word_stress(root, "ADJ", "") + stress_mente = get_word_stress(mente, "NOUN", "") + return { + 'word': stress_root['word'] + stress_mente['word'], + "stress_position": stress_root['stress_position'] - len( + stress_mente['word']), + "secondary_stress_positions": [stress_mente['stress_position']], + } if len(syllable_list) == 1: first_monosyllable = syllable_list[0].lower() if ((first_monosyllable not in UNSTRESSED_UNACCENTED_MONOSYLLABLES) @@ -489,7 +519,7 @@ def get_word_stress(word, pos, tag, alternative_syllabification=False): or pos not in ("SCONJ", "CCONJ", "DET", "PRON", "ADP") or (pos == "PRON" and tag.get("Case") == "Nom") or (pos == "DET" and tag.get("Definite") in ( - "Dem", "Ind")) + "Dem", "Ind")) or pos in ("PROPN", "NUM", "NOUN", "VERB", "AUX", "ADV") or (pos == "ADJ" and tag.get("Poss", None) != "Yes") or (pos == "PRON" @@ -500,37 +530,50 @@ def get_word_stress(word, pos, tag, alternative_syllabification=False): or (pos in ("PRON", "DET") and tag.get("PronType", None) in ( "Exc", "Int", "Dem")) - or "".join(word).lower() in POSSESSIVE_PRON)): + or "".join(word).lower() in STRESSED_PRON) and ( + word_lower not in UNSTRESSED_FORMS)): stressed_position = -1 else: stressed_position = 0 # unstressed monosyllable - elif (pos in ("INTJ", "PROPN", "NUM", "NOUN", "VERB", "AUX", "ADV") - or pos == "ADJ" and word_lower not in POSSESSIVE_PRON_UNSTRESSED - or (pos == "PRON" and tag.get("PronType", None) in ("Prs", "Ind")) - or (pos == "DET" and tag.get("PronType", None) in ("Dem", "Ind")) - or (pos == "DET" and tag.get("Definite", None) == "Ind") - or (pos == "PRON" and tag.get("Poss", None) == "Yes") - or (pos in ("PRON", "DET") - and tag.get("PronType", None) in ("Exc", "Int", "Dem")) - or (word_lower in POSSESSIVE_PRON)): + else: tilde = get_orthographic_accent(syllable_list) - # If an orthographic accent exists, the syllable negative index is saved if tilde is not None: - stressed_position = -(len(syllable_list) - tilde) - # Elif the word is paroxytone (llana) we save the penultimate syllable. - elif is_paroxytone(syllable_list): - stressed_position = -2 - # If the word does not meet the above criteria that means that it's an - # oxytone word (aguda). + stressed_position = tilde - len(syllable_list) + elif (pos in ("INTJ", "PROPN", "NUM", "NOUN", "VERB", "AUX", "ADV") + or pos == "ADJ" + or (pos == "PRON" and tag.get("PronType", None) in ( + "Prs", "Ind")) + or (pos == "DET" and tag.get("PronType", None) in ( + "Dem", "Ind")) + or (pos == "DET" and tag.get("Definite", None) == "Ind") + or (pos == "PRON" and tag.get("Poss", None) == "Yes") + or (pos in ("PRON", "DET") + and tag.get("PronType", None) in ("Exc", "Int", "Dem")) + or (word_lower in STRESSED_PRON)) and ( + word_lower not in UNSTRESSED_FORMS) and ( + word_lower not in POSSESSIVE_PRON_UNSTRESSED): + tilde = get_orthographic_accent(syllable_list) + # If an orthographic accent exists, + # the syllable negative index is saved + if tilde is not None: + stressed_position = -(len(syllable_list) - tilde) + # Elif the word is paroxytone (llana) + # we save the penultimate syllable. + elif is_paroxytone(syllable_list): + stressed_position = -2 + # If the word does not meet the above criteria that means + # that it's an oxytone word (aguda). + else: + stressed_position = -1 else: - stressed_position = -1 - else: - stressed_position = 0 # unstressed + stressed_position = 0 # unstressed out_syllable_list = [] for index, syllable in enumerate(syllable_list): out_syllable_list.append( - {"syllable": syllable, - "is_stressed": len(syllable_list) - index == -stressed_position}) + { + "syllable": syllable, + "is_stressed": len(syllable_list) - index == -stressed_position + }) if index < 1: continue # Sinaeresis @@ -583,18 +626,53 @@ def get_words(word_list, alternative_syllabification=False): tags = spacy_tag_to_dict(tag) stressed_word = get_word_stress(word.text, pos, tags, alternative_syllabification) - first_syllable = get_last_syllable(syllabified_words) - second_syllable = stressed_word['word'][0] - # Synalepha - if first_syllable and second_syllable and have_prosodic_liaison( - first_syllable, second_syllable): - first_syllable.update({'has_synalepha': True}) + if word.pos_ in ("AUX", "VERB") and word._.affixes_length: + stressed_word.update( + {'affixes_length': word._.affixes_length}) + stressed_word.update({'pos': word.pos_, 'tag': word.tag_}) syllabified_words.append(stressed_word) else: syllabified_words.append({"symbol": word.text}) + syllabified_words = join_affixes(syllabified_words) + clean_word_list = [syll for syll in syllabified_words if "word" in syll] + # Synalepha + for index, word in enumerate(clean_word_list): + if len(clean_word_list) != index + 1: + first_syllable = clean_word_list[index]['word'][-1] + second_syllable = clean_word_list[index + 1]['word'][0] + if first_syllable and second_syllable and have_prosodic_liaison( + first_syllable, second_syllable): + first_syllable.update({'has_synalepha': True}) return syllabified_words +def join_affixes(line): + """ + Join affixes of split words and recalculates stress + :param line: List of syllabified words (dict) + :return: List of syllabified words (dict) with joined affixes + """ + syllabified_words = [] + indices_to_ignore = [] + for index, word in enumerate(line): + affixes_length = word.get('affixes_length', None) + if index in indices_to_ignore: + continue + elif affixes_length is None: + syllabified_words.append(word) + else: + indices_to_ignore = range(index, index + affixes_length + 1) + join_word = [] + for affix_index in indices_to_ignore: + affix = line[affix_index]['word'] + join_word += [syll["syllable"] for syll in affix] + word_stress = get_word_stress("".join(join_word), word["pos"], + word["tag"]) + word_stress["word"][-1]["is_word_end"] = True + syllabified_words.append(word_stress) + return syllabified_words if syllabified_words else line + + def get_scansion(text, rhyme_analysis=False, rhythm_format="pattern", rhythmical_lengths=None): """ @@ -692,9 +770,9 @@ def generate_phonological_groups(tokens): syllables = get_syllables_word_end(words) for liaison in ( ("synalepha",), + ("synalepha", "sinaeresis"), ("sinaeresis",), ("sinaeresis", "synalepha"), - ("synalepha", "sinaeresis"), ): for ignore_synalepha_h in (break_on_h, None): for liaison_positions_1 in generate_liaison_positions( @@ -710,7 +788,7 @@ def generate_phonological_groups(tokens): yield groups else: for liaison_positions_2 in generate_liaison_positions( - syllables, liaison[1] + syllables, liaison[1] ): yield get_phonological_groups( groups, diff --git a/src/rantanplan/pipeline.py b/src/rantanplan/pipeline.py index de5602b..2e659f1 100644 --- a/src/rantanplan/pipeline.py +++ b/src/rantanplan/pipeline.py @@ -29,10 +29,11 @@ def custom_tokenizer(nlp): _load_pipeline = {} -def load_pipeline(lang=None): +def load_pipeline(lang=None, split_affixes=True): """ Loads the new pipeline with the custom tokenizer :param lang: Spacy language model + :param split_affixes: Whether or not to use spacy_affixes to split words :return: New custom language model """ global _load_pipeline @@ -41,9 +42,12 @@ def load_pipeline(lang=None): if lang not in _load_pipeline: nlp = spacy.load(lang) nlp.tokenizer = custom_tokenizer(nlp) - nlp.remove_pipe("affixes") if nlp.has_pipe("affixes") else None - suffixes = {k: v for k, v in load_affixes().items() if k.startswith(AFFIXES_SUFFIX)} - affixes_matcher = AffixesMatcher(nlp, split_on=["VERB"], rules=suffixes) - nlp.add_pipe(affixes_matcher, name="affixes", first=True) + if split_affixes: + nlp.remove_pipe("affixes") if nlp.has_pipe("affixes") else None + suffixes = {k: v for k, v in load_affixes().items() if + k.startswith(AFFIXES_SUFFIX)} + affixes_matcher = AffixesMatcher(nlp, split_on=["VERB"], + rules=suffixes) + nlp.add_pipe(affixes_matcher, name="affixes", first=True) _load_pipeline[lang] = nlp return _load_pipeline[lang] diff --git a/tests/fixtures/phonological_groups.json b/tests/fixtures/phonological_groups.json index 32ad5e6..dda6e55 100644 --- a/tests/fixtures/phonological_groups.json +++ b/tests/fixtures/phonological_groups.json @@ -282,7 +282,7 @@ { "syllable": "rro", "is_stressed": false, - "has_synalepha": true, + "has_synalepha": false, "is_word_end": true }, { @@ -290,14 +290,11 @@ "is_stressed": true }, { - "syllable": "ce", - "is_stressed": false, - "has_synalepha": true, - "is_word_end": true - }, - { - "syllable": "a", - "is_stressed": true + "syllable": "cea", + "is_stressed": true, + "synalepha_index": [ + 1 + ] }, { "syllable": "guas", @@ -318,7 +315,7 @@ { "syllable": "rro", "is_stressed": false, - "has_synalepha": true, + "has_synalepha": false, "is_word_end": true }, { @@ -328,7 +325,7 @@ { "syllable": "ce", "is_stressed": false, - "has_synalepha": true, + "has_synalepha": false, "is_word_end": true }, { @@ -421,14 +418,11 @@ "is_stressed": true }, { - "syllable": "rro", - "is_stressed": false, - "has_synalepha": false, - "is_word_end": true - }, - { - "syllable": "ha", - "is_stressed": true + "syllable": "rroha", + "is_stressed": true, + "synalepha_index": [ + 2 + ] }, { "syllable": "cea", @@ -454,14 +448,11 @@ "is_stressed": true }, { - "syllable": "rro", - "is_stressed": false, - "has_synalepha": false, - "is_word_end": true - }, - { - "syllable": "ha", - "is_stressed": true + "syllable": "rroha", + "is_stressed": true, + "synalepha_index": [ + 2 + ] }, { "syllable": "ce", @@ -490,11 +481,14 @@ "is_stressed": true }, { - "syllable": "rroha", - "is_stressed": true, - "synalepha_index": [ - 2 - ] + "syllable": "rro", + "is_stressed": false, + "has_synalepha": false, + "is_word_end": true + }, + { + "syllable": "ha", + "is_stressed": true }, { "syllable": "cea", @@ -520,11 +514,14 @@ "is_stressed": true }, { - "syllable": "rroha", - "is_stressed": true, - "synalepha_index": [ - 2 - ] + "syllable": "rro", + "is_stressed": false, + "has_synalepha": false, + "is_word_end": true + }, + { + "syllable": "ha", + "is_stressed": true }, { "syllable": "ce", @@ -555,7 +552,7 @@ { "syllable": "rro", "is_stressed": false, - "has_synalepha": false, + "has_synalepha": true, "is_word_end": true }, { @@ -563,11 +560,14 @@ "is_stressed": true }, { - "syllable": "cea", - "is_stressed": true, - "synalepha_index": [ - 1 - ] + "syllable": "ce", + "is_stressed": false, + "has_synalepha": true, + "is_word_end": true + }, + { + "syllable": "a", + "is_stressed": true }, { "syllable": "guas", @@ -588,7 +588,7 @@ { "syllable": "rro", "is_stressed": false, - "has_synalepha": false, + "has_synalepha": true, "is_word_end": true }, { @@ -598,7 +598,7 @@ { "syllable": "ce", "is_stressed": false, - "has_synalepha": false, + "has_synalepha": true, "is_word_end": true }, { @@ -1164,7 +1164,7 @@ { "syllable": "rro", "is_stressed": false, - "has_synalepha": true, + "has_synalepha": false, "is_word_end": true }, { @@ -1172,14 +1172,11 @@ "is_stressed": true }, { - "syllable": "ce", - "is_stressed": false, - "has_synalepha": true, - "is_word_end": true - }, - { - "syllable": "a", - "is_stressed": true + "syllable": "cea", + "is_stressed": true, + "synalepha_index": [ + 1 + ] }, { "syllable": "guas", @@ -1200,7 +1197,7 @@ { "syllable": "rro", "is_stressed": false, - "has_synalepha": true, + "has_synalepha": false, "is_word_end": true }, { @@ -1210,7 +1207,7 @@ { "syllable": "ce", "is_stressed": false, - "has_synalepha": true, + "has_synalepha": false, "is_word_end": true }, { @@ -1303,14 +1300,11 @@ "is_stressed": true }, { - "syllable": "rro", - "is_stressed": false, - "has_synalepha": false, - "is_word_end": true - }, - { - "syllable": "ha", - "is_stressed": true + "syllable": "rroha", + "is_stressed": true, + "synalepha_index": [ + 2 + ] }, { "syllable": "cea", @@ -1336,14 +1330,11 @@ "is_stressed": true }, { - "syllable": "rro", - "is_stressed": false, - "has_synalepha": false, - "is_word_end": true - }, - { - "syllable": "ha", - "is_stressed": true + "syllable": "rroha", + "is_stressed": true, + "synalepha_index": [ + 2 + ] }, { "syllable": "ce", @@ -1372,11 +1363,14 @@ "is_stressed": true }, { - "syllable": "rroha", - "is_stressed": true, - "synalepha_index": [ - 2 - ] + "syllable": "rro", + "is_stressed": false, + "has_synalepha": false, + "is_word_end": true + }, + { + "syllable": "ha", + "is_stressed": true }, { "syllable": "cea", @@ -1402,11 +1396,14 @@ "is_stressed": true }, { - "syllable": "rroha", - "is_stressed": true, - "synalepha_index": [ - 2 - ] + "syllable": "rro", + "is_stressed": false, + "has_synalepha": false, + "is_word_end": true + }, + { + "syllable": "ha", + "is_stressed": true }, { "syllable": "ce", @@ -1437,7 +1434,7 @@ { "syllable": "rro", "is_stressed": false, - "has_synalepha": false, + "has_synalepha": true, "is_word_end": true }, { @@ -1445,11 +1442,14 @@ "is_stressed": true }, { - "syllable": "cea", - "is_stressed": true, - "synalepha_index": [ - 1 - ] + "syllable": "ce", + "is_stressed": false, + "has_synalepha": true, + "is_word_end": true + }, + { + "syllable": "a", + "is_stressed": true }, { "syllable": "guas", @@ -1470,7 +1470,7 @@ { "syllable": "rro", "is_stressed": false, - "has_synalepha": false, + "has_synalepha": true, "is_word_end": true }, { @@ -1480,7 +1480,7 @@ { "syllable": "ce", "is_stressed": false, - "has_synalepha": false, + "has_synalepha": true, "is_word_end": true }, { diff --git a/tests/fixtures/rhyme_analysis_sonnet.json b/tests/fixtures/rhyme_analysis_sonnet.json index f17b740..bfbb64a 100644 --- a/tests/fixtures/rhyme_analysis_sonnet.json +++ b/tests/fixtures/rhyme_analysis_sonnet.json @@ -1828,21 +1828,15 @@ }, { "syllable": "dir", - "is_stressed": true, - "is_word_end": true - } - ], - "stress_position": -1 - }, - { - "word": [ + "is_stressed": true + }, { "syllable": "me", "is_stressed": false, "is_word_end": true } ], - "stress_position": 0 + "stress_position": -2 } ], "phonological_groups": [ @@ -1891,8 +1885,7 @@ }, { "syllable": "dir", - "is_stressed": true, - "is_word_end": true + "is_stressed": true }, { "syllable": "me", @@ -2143,21 +2136,15 @@ }, { "syllable": "tir", - "is_stressed": true, - "is_word_end": true - } - ], - "stress_position": -1 - }, - { - "word": [ + "is_stressed": true + }, { "syllable": "me", "is_stressed": false, "is_word_end": true } ], - "stress_position": 0 + "stress_position": -2 }, { "symbol": "." @@ -2209,8 +2196,7 @@ }, { "syllable": "tir", - "is_stressed": true, - "is_word_end": true + "is_stressed": true }, { "syllable": "me", diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index bafab6a..c35092d 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -29,3 +29,21 @@ def mockreturn(lang=None): {"text": token.text, "pos_": token.pos_, "tag_": token.tag_, "n_rights": token.n_rights}) # noqa assert token_dict == test_dict_list + + +def test_load_pipeline_affixes(monkeypatch): + def mockreturn(lang=None): + nlp = spacy.blank('es') # noqa + nlp.vocab.lookups.get_table = lambda *_: {} + return nlp + + monkeypatch.setattr(spacy, 'load', mockreturn) + # lang doesn't matter as long as it hasn't been used in the test session + nlp = load_pipeline("blank", split_affixes=False) + doc = nlp("prue-\nba") + token_dict = [] + for token in doc: + token_dict.append( + {"text": token.text, "pos_": token.pos_, "tag_": token.tag_, + "n_rights": token.n_rights}) # noqa + assert token_dict == test_dict_list From a94af95b98d9af7a8707b9c82b53b93503c81252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20P=C3=A9rez?= Date: Thu, 5 Dec 2019 16:38:42 +0100 Subject: [PATCH 2/6] Added 'AUX' to the split_on list for spacy affixes (#64) --- src/rantanplan/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rantanplan/pipeline.py b/src/rantanplan/pipeline.py index 2e659f1..09c6a01 100644 --- a/src/rantanplan/pipeline.py +++ b/src/rantanplan/pipeline.py @@ -46,7 +46,7 @@ def load_pipeline(lang=None, split_affixes=True): nlp.remove_pipe("affixes") if nlp.has_pipe("affixes") else None suffixes = {k: v for k, v in load_affixes().items() if k.startswith(AFFIXES_SUFFIX)} - affixes_matcher = AffixesMatcher(nlp, split_on=["VERB"], + affixes_matcher = AffixesMatcher(nlp, split_on=["VERB", "AUX"], rules=suffixes) nlp.add_pipe(affixes_matcher, name="affixes", first=True) _load_pipeline[lang] = nlp From 9251220f9c14dfaec1f31df522d7235fc94148f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20P=C3=A9rez?= Date: Wed, 18 Dec 2019 13:49:26 +0100 Subject: [PATCH 3/6] =?UTF-8?q?Bump=20version:=200.4.0=20=E2=86=92=200.4.1?= =?UTF-8?q?=20(#66)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.cfg | 2 +- setup.py | 2 +- src/rantanplan/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 205706e..fc34ee2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.0 +current_version = 0.4.1 commit = True tag = True diff --git a/setup.py b/setup.py index 3063eb9..e865b96 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ def read(*names, **kwargs): setup( name='rantanplan', - version='0.4.0', + version='0.4.1', license='Apache Software License 2.0', description='Scansion tool for Spanish texts', long_description='%s\n%s' % ( diff --git a/src/rantanplan/__init__.py b/src/rantanplan/__init__.py index ca16eef..0694a51 100644 --- a/src/rantanplan/__init__.py +++ b/src/rantanplan/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.4.0' +__version__ = '0.4.1' from .core import get_scansion # noqa From 6547aa5b90bcf86eb0a81af3ea9fd890e37a046c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20P=C3=A9rez?= Date: Thu, 12 Mar 2020 12:11:13 +0100 Subject: [PATCH 4/6] Add documentation (#68) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added documentation * Changed imports * Bump version: 0.4.1 → 0.4.2 * Added usage * Fixed requirements * Unified syllabification variables and functions * Added output example --- CHANGELOG.rst | 51 +++ README.rst | 101 +++++- docs/conf.py | 2 +- docs/reference/rantanplan.rst | 2 +- docs/usage.rst | 93 +++++- requirements.txt | 4 +- setup.cfg | 2 +- setup.py | 2 +- src/rantanplan/__init__.py | 2 +- src/rantanplan/core.py | 297 +++++++----------- ..._syllabification.py => syllabification.py} | 120 +++++++ 11 files changed, 473 insertions(+), 203 deletions(-) rename src/rantanplan/{alternative_syllabification.py => syllabification.py} (94%) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 37a7f2c..49ce68f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,57 @@ Changelog ========= +0.4.2 (2020-03-11) +------------------ + +* Added documentation + +0.4.1 (2019-12-19) +------------------ + +* Added 'AUX' to the split_on list for spacy affixes +* Fixed syllabification exceptions, support for disabling/enabling spacy_affixes +* Fixed multiline break +* Fixed splitted verb stresses and secondary stress on '-mente' adverbs +* Fixed some issues +* Added minimum length for '-mente' adverbs + +0.4.0 (2019-11-21) +------------------ + +* Added SpaCy Doc input support +* Add umlaut hyatus +* Added new hyatus and fixed init +* Refactoring code +* Feat/new syllabification +* Naming conventions +* Adding rhyme analaysis to scansion output +* Adding 'singleton' behaviour to load_pipeline +* Metre analysis w/ sinaeresis and synalephas +* Added new workflow for syllabification, with tests +* Post syllabification rules regexes +* Added unit tests for all functions + +0.3.0 (2019-06-18) +------------------ + +* Added SpaCy Doc input support +* Add umlaut hyatus +* Fixed syllabyfication errors, affixes and the pipeline +* Fixed hyphenator for diphthongs with u umlaut +* Added hyphenation for explicit hyatus with umlaut vowels +* Added new hyatus and fixed __init__ + +0.2.0 (2019-06-14) +------------------ + +* Better hyphenator, and affixes and pipeline fixes + +0.1.2 (2019-06-10) +------------------ + +* Republishing on Pypi + 0.1.0 (2019-07-03) ------------------ diff --git a/README.rst b/README.rst index dd4c9d5..bf40230 100644 --- a/README.rst +++ b/README.rst @@ -39,9 +39,9 @@ Overview :alt: PyPI Package latest release :target: https://pypi.org/project/rantanplan -.. |commits-since| image:: https://img.shields.io/github/commits-since/linhd-postdata/rantanplan/v0.1.0.svg +.. |commits-since| image:: https://img.shields.io/github/commits-since/linhd-postdata/rantanplan/0.4.2.svg :alt: Commits since latest release - :target: https://github.com/linhd-postdata/rantanplan/compare/v0.1.0...master + :target: https://github.com/linhd-postdata/rantanplan/compare/0.4.2...master .. |wheel| image:: https://img.shields.io/pypi/wheel/rantanplan.svg :alt: PyPI Wheel @@ -69,6 +69,103 @@ Installation pip install rantanplan +Usage +===== + +Install required resources +-------------------------- + +#. Install spaCy model language for Spanish:: + + python -m spacy download es_core_news_md + +#. Install Freeling rules for affixes:: + + python -m spacy_affixes download es + + +Import rantanplan +----------------- + +To use rantanplan in a project:: + + import rantanplan + +Usage example +------------- +.. code-block:: python + + from rantanplan.core import get_scansion + + poem = """Me gustas cuando callas porque estás como ausente, + y me oyes desde lejos, y mi voz no te toca. + Parece que los ojos se te hubieran volado + y parece que un beso te cerrara la boca. + + Como todas las cosas están llenas de mi alma + emerges de las cosas, llena del alma mía. + Mariposa de sueño, te pareces a mi alma, + y te pareces a la palabra melancolía.""" + + get_scansion(poem) + +Output example +-------------- + +.. code-block:: python + + [{'tokens': [{'word': [{'syllable': 'Me', + 'is_stressed': False, + 'is_word_end': True}], + 'stress_position': 0}, + {'word': [{'syllable': 'gus', 'is_stressed': True}, + {'syllable': 'tas', 'is_stressed': False, 'is_word_end': True}], + 'stress_position': -2}, + {'word': [{'syllable': 'cuan', 'is_stressed': False}, + {'syllable': 'do', 'is_stressed': False, 'is_word_end': True}], + 'stress_position': 0}, + {'word': [{'syllable': 'ca', 'is_stressed': True}, + {'syllable': 'llas', 'is_stressed': False, 'is_word_end': True}], + 'stress_position': -2}, + {'word': [{'syllable': 'por', 'is_stressed': False}, + {'syllable': 'que', + 'is_stressed': False, + 'has_synalepha': True, + 'is_word_end': True}], + 'stress_position': 0}, + {'word': [{'syllable': 'es', 'is_stressed': False}, + {'syllable': 'tás', 'is_stressed': True, 'is_word_end': True}], + 'stress_position': -1}, + {'word': [{'syllable': 'co', 'is_stressed': False}, + {'syllable': 'mo', + 'is_stressed': False, + 'has_synalepha': True, + 'is_word_end': True}], + 'stress_position': 0}, + {'word': [{'syllable': 'au', 'is_stressed': False}, + {'syllable': 'sen', 'is_stressed': True}, + {'syllable': 'te', 'is_stressed': False, 'is_word_end': True}], + 'stress_position': -2}, + {'symbol': ','}], + 'phonological_groups': [{'syllable': 'Me', + 'is_stressed': False, + 'is_word_end': True}, + {'syllable': 'gus', 'is_stressed': True}, + {'syllable': 'tas', 'is_stressed': False, 'is_word_end': True}, + {'syllable': 'cuan', 'is_stressed': False}, + {'syllable': 'do', 'is_stressed': False, 'is_word_end': True}, + {'syllable': 'ca', 'is_stressed': True}, + {'syllable': 'llas', 'is_stressed': False, 'is_word_end': True}, + {'syllable': 'por', 'is_stressed': False}, + {'syllable': 'quees', 'is_stressed': False, 'synalepha_index': [2]}, + {'syllable': 'tás', 'is_stressed': True, 'is_word_end': True}, + {'syllable': 'co', 'is_stressed': False}, + {'syllable': 'moau', 'is_stressed': False, 'synalepha_index': [1]}, + {'syllable': 'sen', 'is_stressed': True}, + {'syllable': 'te', 'is_stressed': False, 'is_word_end': True}], + 'rhythm': {'stress': '-+---+---+--+-', 'type': 'pattern', 'length': 14}}, + ... + Documentation ============= diff --git a/docs/conf.py b/docs/conf.py index 43d5926..d26561d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -26,7 +26,7 @@ year = '2019' author = 'LINHD POSTDATA Project' copyright = '{0}, {1}'.format(year, author) -version = release = '0.1.0' +version = release = '0.4.2' pygments_style = 'trac' templates_path = ['.'] diff --git a/docs/reference/rantanplan.rst b/docs/reference/rantanplan.rst index e63ebb4..f583606 100644 --- a/docs/reference/rantanplan.rst +++ b/docs/reference/rantanplan.rst @@ -5,5 +5,5 @@ rantanplan from rantanplan import * -.. automodule:: rantanplan +.. automodule:: rantanplan.core :members: diff --git a/docs/usage.rst b/docs/usage.rst index 9673afb..f7a6aae 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -1,7 +1,96 @@ -===== Usage ===== +Install required resources +-------------------------- + +#. Install spaCy model language for Spanish:: + + python -m spacy download es_core_news_md + +#. Install Freeling rules for affixes:: + + python -m spacy_affixes download es + + +Import rantanplan +----------------- + To use rantanplan in a project:: - import rantanplan + import rantanplan + +Usage example +------------- +.. code-block:: python + + from rantanplan.core import get_scansion + + poem = """Me gustas cuando callas porque estás como ausente, + y me oyes desde lejos, y mi voz no te toca. + Parece que los ojos se te hubieran volado + y parece que un beso te cerrara la boca. + + Como todas las cosas están llenas de mi alma + emerges de las cosas, llena del alma mía. + Mariposa de sueño, te pareces a mi alma, + y te pareces a la palabra melancolía.""" + + get_scansion(poem) + +Output example +-------------- + +.. code-block:: python + + [{'tokens': [{'word': [{'syllable': 'Me', + 'is_stressed': False, + 'is_word_end': True}], + 'stress_position': 0}, + {'word': [{'syllable': 'gus', 'is_stressed': True}, + {'syllable': 'tas', 'is_stressed': False, 'is_word_end': True}], + 'stress_position': -2}, + {'word': [{'syllable': 'cuan', 'is_stressed': False}, + {'syllable': 'do', 'is_stressed': False, 'is_word_end': True}], + 'stress_position': 0}, + {'word': [{'syllable': 'ca', 'is_stressed': True}, + {'syllable': 'llas', 'is_stressed': False, 'is_word_end': True}], + 'stress_position': -2}, + {'word': [{'syllable': 'por', 'is_stressed': False}, + {'syllable': 'que', + 'is_stressed': False, + 'has_synalepha': True, + 'is_word_end': True}], + 'stress_position': 0}, + {'word': [{'syllable': 'es', 'is_stressed': False}, + {'syllable': 'tás', 'is_stressed': True, 'is_word_end': True}], + 'stress_position': -1}, + {'word': [{'syllable': 'co', 'is_stressed': False}, + {'syllable': 'mo', + 'is_stressed': False, + 'has_synalepha': True, + 'is_word_end': True}], + 'stress_position': 0}, + {'word': [{'syllable': 'au', 'is_stressed': False}, + {'syllable': 'sen', 'is_stressed': True}, + {'syllable': 'te', 'is_stressed': False, 'is_word_end': True}], + 'stress_position': -2}, + {'symbol': ','}], + 'phonological_groups': [{'syllable': 'Me', + 'is_stressed': False, + 'is_word_end': True}, + {'syllable': 'gus', 'is_stressed': True}, + {'syllable': 'tas', 'is_stressed': False, 'is_word_end': True}, + {'syllable': 'cuan', 'is_stressed': False}, + {'syllable': 'do', 'is_stressed': False, 'is_word_end': True}, + {'syllable': 'ca', 'is_stressed': True}, + {'syllable': 'llas', 'is_stressed': False, 'is_word_end': True}, + {'syllable': 'por', 'is_stressed': False}, + {'syllable': 'quees', 'is_stressed': False, 'synalepha_index': [2]}, + {'syllable': 'tás', 'is_stressed': True, 'is_word_end': True}, + {'syllable': 'co', 'is_stressed': False}, + {'syllable': 'moau', 'is_stressed': False, 'synalepha_index': [1]}, + {'syllable': 'sen', 'is_stressed': True}, + {'syllable': 'te', 'is_stressed': False, 'is_word_end': True}], + 'rhythm': {'stress': '-+---+---+--+-', 'type': 'pattern', 'length': 14}}, + ... diff --git a/requirements.txt b/requirements.txt index ce477bc..64e65ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ Click>=7.0 -spacy>=2.1 -spacy_affixes \ No newline at end of file +spacy>=2.2 +spacy_affixes diff --git a/setup.cfg b/setup.cfg index fc34ee2..95ca53b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.1 +current_version = 0.4.2 commit = True tag = True diff --git a/setup.py b/setup.py index e865b96..ac04a19 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ def read(*names, **kwargs): setup( name='rantanplan', - version='0.4.1', + version='0.4.2', license='Apache Software License 2.0', description='Scansion tool for Spanish texts', long_description='%s\n%s' % ( diff --git a/src/rantanplan/__init__.py b/src/rantanplan/__init__.py index 0694a51..3907fa6 100644 --- a/src/rantanplan/__init__.py +++ b/src/rantanplan/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.4.1' +__version__ = '0.4.2' from .core import get_scansion # noqa diff --git a/src/rantanplan/core.py b/src/rantanplan/core.py index e1c9cd2..becc165 100644 --- a/src/rantanplan/core.py +++ b/src/rantanplan/core.py @@ -13,143 +13,44 @@ from spacy.tokens import Doc -from .alternative_syllabification import ALTERNATIVE_SYLLABIFICATION -from .alternative_syllabification import SYLLABIFICATOR_FOREIGN_WORDS_DICT from .pipeline import load_pipeline from .rhymes import STRUCTURES_LENGTH from .rhymes import analyze_rhyme - -""" -Syllabification -""" -accents_re = re.compile("[áéíóú]", re.I | re.U) -paroxytone_re = re.compile("([aeiou]|n|[aeiou]s)$", - # checks if a str ends in unaccented vowel/N/S - re.I | re.U) - -""" -Regular expressions for spanish syllabification. -For the 'tl' cluster we have decided to join the two letters -because is the most common syllabification and the same that -Perkins (http://sadowsky.cl/perkins.html), DIRAE (https://dirae.es/), -and Educalingo (https://educalingo.com/es/dic-es) use. -""" -letter_clusters_re = re.compile(r""" - # 1: weak vowels diphthong with h - ([iuü]h[iuü])| - # 2: open vowels - ([aáeéíoóú]h[iuü])| - # 3: closed vowels - ([iuü]h[aáeéíoóú])| - # 4: liquid and mute consonants (adds hyphen) - ([a-záéíóúñ](?:(?:[bcdfghjklmnñpqstvy][hlr])| - (?:[bcdfghjklmnñpqrstvy][hr])| - (?:[bcdfghjklmnñpqrstvyz][h]))[aáeéiíoóuúü])| - # 5: any char followed by liquid and mute consonant, - # exceptions for 'r+l' and 't+l' - ((?:(?:[bcdfghjklmnñpqstvy][hlr])| - (?:[bcdfghjklmnñpqrstvy][hr])| - (?:[bcdfghjklmnñpqrstvyz][h]))[aáeéiíoóuúü])| - # 6: non-liquid consonant (adds hyphen) - ([a-záéíóúñ][bcdfghjklmnñpqrstvxyz][aáeéiíoóuúüï])| - # 7: vowel group (adds hyphen) - ([aáeéíoóú][aáeéíoóú])| - # 8: umlaut 'u' diphthongs - (ü[iíaeo])| - # 9: Explicit hiatus with umlaut vowels, first part - ([aeiou][äëïöü])| - #10: Explicit hiatus with umlaut vowels, second part - ([üäëïö][a-z])| - #11: any char - ([a-záéíóúñ])""", re.I | re.U | re.VERBOSE) # VERBOSE to catch the group - -""" -Rhythmical Analysis -""" -SPACE = "SPACE" -STRONG_VOWELS = set("aeoáéóÁÉÓAEO") -WEAK_VOWELS = set("iuüíúIÍUÜÚ") -LIAISON_FIRST_PART = set("aeiouáéíóúAEIOUÁÉÍÓÚyY") -LIAISON_SECOND_PART = set("aeiouáéíóúAEIOUÁÉÍÓÚhyYH") - -STRESSED_UNACCENTED_MONOSYLLABLES = {"yo", "vio", "dio", "fe", "sol", "ti", - "un"} - -UNSTRESSED_UNACCENTED_MONOSYLLABLES = {'de', 'el', 'la', 'las', 'le', 'les', - 'lo', 'los', - 'mas', 'me', 'mi', 'nos', 'os', 'que', - 'se', 'si', - 'su', 'tan', 'te', 'tu', "tus", "oh"} - -UNSTRESSED_FORMS = {"ay", "don", "doña", "aun", "que", "cual", "quien", "donde", - "cuando", "cuanto", "como", "cuantas", "cuantos"} - -STRESSED_PRON = {"mío", "mía", "míos", "mías", "tuyo", "tuya", "tuyos", - "tuyas", "suyo", "suya", "suyos", "suyas", "todo"} - -POSSESSIVE_PRON_UNSTRESSED = {"nuestro", "nuestra", "nuestros", "nuestras", - "vuestro", "vuestra", "vuestros", "vuestras"} - -""" -Regular expressions and rules for syllabification exceptions -""" - -# Words starting with prefixes SIN-/DES- followed by consonant "destituir" -PREFIX_DES_WITH_CONSONANT_RE = ( - re.compile("^(des)([bcdfgjklmhnñpqrstvxyz].*)", re.I | re.U)) - -# Words starting with prefixes SIN-/DES- followed by consonant "sinhueso" -PREFIX_SIN_WITH_CONSONANT_RE = ( - re.compile("^(sin)([bcdfgjklmhnñpqrstvxyz].*)", re.I | re.U)) - -# Group consonant+[hlr] with exceptions for ll -CONSONANT_GROUP = (re.compile("(.*[hmnqsw])([hlr][aeiouáéíóú].*)", re.I | re.U)) -CONSONANT_GROUP_EXCEPTION_LL = ( - re.compile("(.*[hlmnqsw])([hr][aeiouáéíóú].*)", re.I | re.U)) -CONSONANT_GROUP_EXCEPTION_DL = ( - re.compile("(.*[d])([l][aeiouáéíóú].*)", re.I | re.U)) - -# Group vowel+ w + vowel -W_VOWEL_GROUP = (re.compile("(.*[aeiouáéíóú])(w[aeiouáéíóú].*)", re.I | re.U)) - -# Post-syllabification exceptions for consonant clusters and diphthongs -# Explicitit hiatus on first vowel -HIATUS_FIRST_VOWEL_RE = (re.compile( - "(?:(.*-)|^)([äëïö]|[^g]ü)([aeiouúáéíó].*)", - re.I | re.U | re.VERBOSE)) - -# Consonant cluster. Example: 'cneorácea' -CONSONANT_CLUSTER_RE = (re.compile( - "(?:(.*-)|^)([mpgc])-([bcdfghjklmñnpqrstvwxyz][aeioáéíó].*)", - re.I | re.U | re.VERBOSE)) - -# Lowering diphthong. Example: 'ahijador' -LOWERING_DIPHTHONGS_WITH_H = ( - re.compile( - """((?:.*-|^)(?:qu|[bcdfghjklmñnpqrstvwxyz]+)?) - ([aeo])-(h[iu](?![aeoiuíúáéó]).*)""", - re.I | re.U | re.VERBOSE)) - -# Lowering diphthong. Example: 'buhitiho' -RAISING_DIPHTHONGS_WITH_H = ( - re.compile( - """((?:.*-|^)(?:qu|[bcdfghjklmñnpqrstvwxyz]+)?) - ([iu])-(h[aeiouáéó](?![aeoáéiuíú]).*)""", - re.I | re.U | re.VERBOSE)) - -""" -Rhythmical Analysis functions -""" +from .syllabification import ALTERNATIVE_SYLLABIFICATION +from .syllabification import CONSONANT_CLUSTER_RE +from .syllabification import CONSONANT_GROUP +from .syllabification import CONSONANT_GROUP_EXCEPTION_DL +from .syllabification import CONSONANT_GROUP_EXCEPTION_LL +from .syllabification import HIATUS_FIRST_VOWEL_RE +from .syllabification import LIAISON_FIRST_PART +from .syllabification import LIAISON_SECOND_PART +from .syllabification import LOWERING_DIPHTHONGS_WITH_H +from .syllabification import POSSESSIVE_PRON_UNSTRESSED +from .syllabification import PREFIX_DES_WITH_CONSONANT_RE +from .syllabification import PREFIX_SIN_WITH_CONSONANT_RE +from .syllabification import RAISING_DIPHTHONGS_WITH_H +from .syllabification import SPACE +from .syllabification import STRESSED_PRON +from .syllabification import STRESSED_UNACCENTED_MONOSYLLABLES +from .syllabification import STRONG_VOWELS +from .syllabification import SYLLABIFICATOR_FOREIGN_WORDS_DICT +from .syllabification import UNSTRESSED_FORMS +from .syllabification import UNSTRESSED_UNACCENTED_MONOSYLLABLES +from .syllabification import W_VOWEL_GROUP +from .syllabification import WEAK_VOWELS +from .syllabification import accents_re +from .syllabification import letter_clusters_re +from .syllabification import paroxytone_re def have_prosodic_liaison(first_syllable, second_syllable): - """ - Checkfor prosodic liaison between two syllables - :param first_syllable: dic with key syllable (str) and is_stressed (bool) - representing the first syllable - :param second_syllable: dic with key syllable (str) and is_stressed (bool) - representing the second syllable - :return: True if there is prosodic liaison and False otherwise + """Checks for prosodic liaison between two syllables + + :param first_syllable: Dictionary with key syllable (str) and is_stressed (bool) representing + the first syllable + :param second_syllable: Dictionary with key syllable (str) and is_stressed (bool) + representing the second syllable + :return: `True` if there is prosodic liaison and `False` otherwise :rtype: bool """ if second_syllable['syllable'][0].lower() == 'y' and ( @@ -162,10 +63,11 @@ def have_prosodic_liaison(first_syllable, second_syllable): def get_syllables_word_end(words): - """ - Get a list of syllables from a list of words extracting word boundaries + """Get a list of syllables from a list of words extracting word boundaries + :param words: List of dictonaries of syllables for each word in a line :return: List of dictionaries of syllables with an extra is_word_end key + :rtype: list """ syllables = [] for word in words: @@ -180,17 +82,18 @@ def get_syllables_word_end(words): def get_phonological_groups(word_syllables, liaison_type="synalepha", breakage_func=None, liaison_positions=None): - """ - Get a list of dictionaries for each phonological group on a line + """Get a list of dictionaries for each phonological group on a line and joins the syllables to create phonological groups (pronounced together) according to a type of liaison, either synaloepha or sinaeresis + :param word_syllables: List of dictionaries for each word of the line :param liaison_type: Which liaison is going to be performed synalepha or - sinaeresis + sinaeresis :param breakage_func: Function to decide when not to break a liaison that is - specified in liaison_positions + specified in liaison_positions :param liaison_positions: Positions of the liaisons :return: A list of conjoined syllables + :rtype: list """ syllables = word_syllables[:] liaison_property = f"has_{liaison_type}" @@ -240,13 +143,14 @@ def get_phonological_groups(word_syllables, liaison_type="synalepha", def clean_phonological_groups(groups, liaison_positions, liaison_property): - """ - Clean phonological groups so their liaison property is consistently set + """Clean phonological groups so their liaison property is consistently set according to the the liaison positions + :param groups: Phonological groups to be cleaned :param liaison_positions: Positions of the liaisons :param liaison_property: The liaison type (synaeresis or synalepha) - :return: + :return: Cleaned phonological groups + :rtype: dict """ clean_groups = [] for idx, group in enumerate(groups): @@ -260,12 +164,13 @@ def clean_phonological_groups(groups, liaison_positions, liaison_property): def get_rhythmical_pattern(phonological_groups, rhythm_format="pattern"): - """ - Gets a rhythm pattern for a poem in either "pattern": "-++-+-+-" + """Gets a rhythm pattern for a poem in either "pattern": "-++-+-+-" "binary": "01101010" or "indexed": [1,2,4,6] format + :param phonological_groups: a dictionary with the syllables of the line :param rhythm_format: The output format for the rhythm :return: Dictionary with with rhythm and phonologic groups + :rtype: dict """ stresses = get_stresses(phonological_groups) stress = format_stress(stresses, rhythm_format) @@ -277,15 +182,16 @@ def get_rhythmical_pattern(phonological_groups, rhythm_format="pattern"): def get_stresses(phonological_groups): - """ - Gets a list of stress marks (True for stressed, False for unstressed) from a - list of phonological groups applying rules depending on the ending stress. + """Gets a list of stress marks (`True` for stressed, `False` for unstressed) + from a list of phonological groups applying rules depending on the ending + stress. + :param phonological_groups: a dictionary with the phonological groups - (syllables) of the line + (syllables) of the line :return: List of boolean values indicating whether a group is - stressed (True) or not (False) + stressed (`True`) or not (`False`) + :rtype: list """ - # stresses = [group["is_stressed"] for group in phonological_groups] stresses = [] last_word_syllables = [] for group in phonological_groups: @@ -312,16 +218,17 @@ def get_stresses(phonological_groups): def format_stress(stresses, rhythm_format="pattern", indexed_separator="-"): - """ - Converts a list of boolean elements into a string that matches the chosen - rhythm format: - "indexed": 2,5,8 - "pattern": -++--+-+- - "binary": 01101001 + """Converts a list of boolean elements into a string that matches the chosen + rhythm format: + "indexed": 2,5,8 + "pattern": -++--+-+- + "binary": 01101001 + :param stresses: List of boolean elements representing stressed syllables :param rhythm_format: Format to be used: indexed, pattern, or binary :param indexed_separator: String to use as a separator for indexed pattern :return: String with the stress pattern + :rtype: str """ separator = "" if rhythm_format == 'indexed': @@ -342,11 +249,12 @@ def format_stress(stresses, rhythm_format="pattern", indexed_separator="-"): def apply_exception_rules(word): - """ - Applies presyllabification rules to a word, + """Applies presyllabification rules to a word, based on Antonio Ríos Mestre's work + :param word: A string to be checked for exceptions :return: A string with the presyllabified word + :rtype: str """ # Vowel + w + vowel group if W_VOWEL_GROUP.match(word): @@ -380,11 +288,12 @@ def apply_exception_rules(word): def apply_exception_rules_post(word): - """ - Applies presyllabification rules to a word, + """Applies presyllabification rules to a word, based on Antonio Ríos Mestre's work + :param word: A string to be checked for exceptions :return: A string with the presyllabified word with hyphens + :rtype: str """ # We make one pass for every match found so we can perform # several substitutions @@ -403,12 +312,12 @@ def apply_exception_rules_post(word): def syllabify(word, alternative_syllabification=False): - """ - Syllabifies a word. + """Syllabifies a word. + :param word: The word to be syllabified. :param alternative_syllabification: Wether or not the alternative - syllabification is used - :return: list of syllables and exceptions where appropriate. + syllabification is used + :return: List of syllables and exceptions where appropriate. :rtype: list """ output = "" @@ -438,10 +347,10 @@ def syllabify(word, alternative_syllabification=False): def get_orthographic_accent(syllable_list): - """ - Given a list of str representing syllables, + """Given a list of str representing syllables, return position in the list of a syllable bearing orthographic stress (with the acute accent mark in Spanish) + :param syllable_list: list of syllables as str or unicode each :return: Position or None if no orthographic stress :rtype: int @@ -456,11 +365,11 @@ def get_orthographic_accent(syllable_list): def is_paroxytone(syllables): - """ - Given a list of str representing syllables from a single word, + """Given a list of str representing syllables from a single word, check if it is paroxytonic (llana) or not + :param syllables: List of syllables as str - :return: True if paroxytone, False if not + :return: `True` if paroxytone, `False` if not :rtype: bool """ if not get_orthographic_accent("".join(syllables)): @@ -469,12 +378,13 @@ def is_paroxytone(syllables): def spacy_tag_to_dict(tag): - """ - Creater a dict from spacy pos tags + """Creates a dict from spacy pos tags + :param tag: Extended spacy pos tag - ("Definite=Ind|Gender=Masc|Number=Sing|PronType=Art") + ("Definite=Ind|Gender=Masc|Number=Sing|PronType=Art") :return: A dictionary in the form of - "{'Definite': 'Ind', 'Gender': 'Masc', 'Number': 'Sing', 'PronType': 'Art'}" + "{'Definite': 'Ind', 'Gender': 'Masc', 'Number': 'Sing', + 'PronType': 'Art'}" :rtype: dict """ if tag and '=' in tag: @@ -484,18 +394,17 @@ def spacy_tag_to_dict(tag): def get_word_stress(word, pos, tag, alternative_syllabification=False): - """ - Gets a list of syllables from a word and creates a list with syllabified + """Gets a list of syllables from a word and creates a list with syllabified word and stressed syllable index + :param word: Word string :param alternative_syllabification: Wether or not the alternative - syllabification is used + syllabification is used :param pos: PoS tag from spacy ("DET") :param tag: Extended PoS tag info from spacy - ("Definite=Ind|Gender=Masc|Number=Sing|PronType=Art") + ("Definite=Ind|Gender=Masc|Number=Sing|PronType=Art") :return: Dict with [original syllab word, stressed syllabified word, - negative index position of stressed syllable or 0 - if not stressed] + negative index position of stressed syllable or 0 if not stressed] :rtype: dict """ syllable_list, _ = syllabify(word, alternative_syllabification) @@ -593,10 +502,11 @@ def get_word_stress(word, pos, tag, alternative_syllabification=False): def get_last_syllable(token_list): - """ - Gets last syllable from a word in a dictionary + """Gets last syllable from a word in a dictionary + :param token_list: list of dictionaries with line tokens :return: Last syllable + :rtype: str """ if len(token_list) > 0: for token in token_list[::-1]: @@ -605,14 +515,14 @@ def get_last_syllable(token_list): def get_words(word_list, alternative_syllabification=False): - """ - Gets a list of syllables from a word and creates a list with syllabified + """Gets a list of syllables from a word and creates a list with syllabified word and stressed syllabe index + :param word_list: List of spacy objects representing a word or sentence :param alternative_syllabification: Wether or not the alternative - syllabification is used + syllabification is used :return: List with [original syllab. word, stressed syllab. word, negative - index position of stressed syllable] + index position of stressed syllable] :rtype: list """ syllabified_words = [] @@ -647,10 +557,11 @@ def get_words(word_list, alternative_syllabification=False): def join_affixes(line): - """ - Join affixes of split words and recalculates stress + """Join affixes of split words and recalculates stress + :param line: List of syllabified words (dict) :return: List of syllabified words (dict) with joined affixes + :rtype: list """ syllabified_words = [] indices_to_ignore = [] @@ -675,13 +586,13 @@ def join_affixes(line): def get_scansion(text, rhyme_analysis=False, rhythm_format="pattern", rhythmical_lengths=None): - """ - Generates a list of dictionaries for each line + """Generates a list of dictionaries for each line + :param text: Full text to be analyzed :param rhyme_analysis: Specify if rhyme analysis is to be performed :param rhythm_format: output format for rhythm analysis :param rhythmical_lengths: List with explicit rhythmical lengths per line - that the analysed lines has to meet + that the analysed lines has to meet :return: list of dictionaries per line :rtype: list """ @@ -760,10 +671,11 @@ def break_on_h(liaison_type, syllable_left, syllable_right): def generate_phonological_groups(tokens): - """ - Generates phonological groups from a list of tokens + """Generates phonological groups from a list of tokens + :param tokens: list of spaCy tokens :return: Generator with a list of phonological groups + :rtype: generator """ for alternative_syllabification in (True, False): words = get_words(tokens, alternative_syllabification) @@ -799,11 +711,12 @@ def generate_phonological_groups(tokens): def generate_liaison_positions(syllables, liaison): - """ - Generates all possible combinations for the liaisons on a list of syllables + """Generates all possible combinations for the liaisons on a list of syllables + :param syllables: List of syllables with :param liaison: Type of liaison combination to be generated :return: Generator with a list of possible combinations + :rtype: generator """ positions = [int(syllable.get(f"has_{liaison}", 0)) for syllable in syllables] diff --git a/src/rantanplan/alternative_syllabification.py b/src/rantanplan/syllabification.py similarity index 94% rename from src/rantanplan/alternative_syllabification.py rename to src/rantanplan/syllabification.py index f1d0bde..38a3281 100644 --- a/src/rantanplan/alternative_syllabification.py +++ b/src/rantanplan/syllabification.py @@ -1,3 +1,123 @@ +import re + +""" +Syllabification +""" +accents_re = re.compile("[áéíóú]", re.I | re.U) +paroxytone_re = re.compile("([aeiou]|n|[aeiou]s)$", + # checks if a str ends in unaccented vowel/N/S + re.I | re.U) + +""" +Regular expressions for spanish syllabification. +For the 'tl' cluster we have decided to join the two letters +because is the most common syllabification and the same that +Perkins (http://sadowsky.cl/perkins.html), DIRAE (https://dirae.es/), +and Educalingo (https://educalingo.com/es/dic-es) use. +""" +letter_clusters_re = re.compile(r""" + # 1: weak vowels diphthong with h + ([iuü]h[iuü])| + # 2: open vowels + ([aáeéíoóú]h[iuü])| + # 3: closed vowels + ([iuü]h[aáeéíoóú])| + # 4: liquid and mute consonants (adds hyphen) + ([a-záéíóúñ](?:(?:[bcdfghjklmnñpqstvy][hlr])| + (?:[bcdfghjklmnñpqrstvy][hr])| + (?:[bcdfghjklmnñpqrstvyz][h]))[aáeéiíoóuúü])| + # 5: any char followed by liquid and mute consonant, + # exceptions for 'r+l' and 't+l' + ((?:(?:[bcdfghjklmnñpqstvy][hlr])| + (?:[bcdfghjklmnñpqrstvy][hr])| + (?:[bcdfghjklmnñpqrstvyz][h]))[aáeéiíoóuúü])| + # 6: non-liquid consonant (adds hyphen) + ([a-záéíóúñ][bcdfghjklmnñpqrstvxyz][aáeéiíoóuúüï])| + # 7: vowel group (adds hyphen) + ([aáeéíoóú][aáeéíoóú])| + # 8: umlaut 'u' diphthongs + (ü[iíaeo])| + # 9: Explicit hiatus with umlaut vowels, first part + ([aeiou][äëïöü])| + #10: Explicit hiatus with umlaut vowels, second part + ([üäëïö][a-z])| + #11: any char + ([a-záéíóúñ])""", re.I | re.U | re.VERBOSE) # VERBOSE to catch the group + +""" +Rhythmical Analysis +""" +SPACE = "SPACE" +STRONG_VOWELS = set("aeoáéóÁÉÓAEO") +WEAK_VOWELS = set("iuüíúIÍUÜÚ") +LIAISON_FIRST_PART = set("aeiouáéíóúAEIOUÁÉÍÓÚyY") +LIAISON_SECOND_PART = set("aeiouáéíóúAEIOUÁÉÍÓÚhyYH") + +STRESSED_UNACCENTED_MONOSYLLABLES = {"yo", "vio", "dio", "fe", "sol", "ti", + "un"} + +UNSTRESSED_UNACCENTED_MONOSYLLABLES = {'de', 'el', 'la', 'las', 'le', 'les', + 'lo', 'los', + 'mas', 'me', 'mi', 'nos', 'os', 'que', + 'se', 'si', + 'su', 'tan', 'te', 'tu', "tus", "oh"} + +UNSTRESSED_FORMS = {"ay", "don", "doña", "aun", "que", "cual", "quien", "donde", + "cuando", "cuanto", "como", "cuantas", "cuantos"} + +STRESSED_PRON = {"mío", "mía", "míos", "mías", "tuyo", "tuya", "tuyos", + "tuyas", "suyo", "suya", "suyos", "suyas", "todo"} + +POSSESSIVE_PRON_UNSTRESSED = {"nuestro", "nuestra", "nuestros", "nuestras", + "vuestro", "vuestra", "vuestros", "vuestras"} + +""" +Regular expressions and rules for syllabification exceptions +""" + +# Words starting with prefixes SIN-/DES- followed by consonant "destituir" +PREFIX_DES_WITH_CONSONANT_RE = ( + re.compile("^(des)([bcdfgjklmhnñpqrstvxyz].*)", re.I | re.U)) + +# Words starting with prefixes SIN-/DES- followed by consonant "sinhueso" +PREFIX_SIN_WITH_CONSONANT_RE = ( + re.compile("^(sin)([bcdfgjklmhnñpqrstvxyz].*)", re.I | re.U)) + +# Group consonant+[hlr] with exceptions for ll +CONSONANT_GROUP = (re.compile("(.*[hmnqsw])([hlr][aeiouáéíóú].*)", re.I | re.U)) +CONSONANT_GROUP_EXCEPTION_LL = ( + re.compile("(.*[hlmnqsw])([hr][aeiouáéíóú].*)", re.I | re.U)) +CONSONANT_GROUP_EXCEPTION_DL = ( + re.compile("(.*[d])([l][aeiouáéíóú].*)", re.I | re.U)) + +# Group vowel+ w + vowel +W_VOWEL_GROUP = (re.compile("(.*[aeiouáéíóú])(w[aeiouáéíóú].*)", re.I | re.U)) + +# Post-syllabification exceptions for consonant clusters and diphthongs +# Explicitit hiatus on first vowel +HIATUS_FIRST_VOWEL_RE = (re.compile( + "(?:(.*-)|^)([äëïö]|[^g]ü)([aeiouúáéíó].*)", + re.I | re.U | re.VERBOSE)) + +# Consonant cluster. Example: 'cneorácea' +CONSONANT_CLUSTER_RE = (re.compile( + "(?:(.*-)|^)([mpgc])-([bcdfghjklmñnpqrstvwxyz][aeioáéíó].*)", + re.I | re.U | re.VERBOSE)) + +# Lowering diphthong. Example: 'ahijador' +LOWERING_DIPHTHONGS_WITH_H = ( + re.compile( + """((?:.*-|^)(?:qu|[bcdfghjklmñnpqrstvwxyz]+)?) + ([aeo])-(h[iu](?![aeoiuíúáéó]).*)""", + re.I | re.U | re.VERBOSE)) + +# Lowering diphthong. Example: 'buhitiho' +RAISING_DIPHTHONGS_WITH_H = ( + re.compile( + """((?:.*-|^)(?:qu|[bcdfghjklmñnpqrstvwxyz]+)?) + ([iu])-(h[aeiouáéó](?![aeoáéiuíú]).*)""", + re.I | re.U | re.VERBOSE)) + """ Exceptions for foreign words in Spanish that do not follow standard Spanish syllabification rules From fa6e630b342b47647417e3d41182f11489a782e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20P=C3=A9rez?= Date: Tue, 24 Mar 2020 10:04:52 +0100 Subject: [PATCH 5/6] Added support for filtering consecutive liaisons and syllabification exceptions (#70) * Added support for filtering consecutive liaisons and syllabification exceptions * Added missing documentation --- src/rantanplan/core.py | 20 +++++++++++++++++++- src/rantanplan/syllabification.py | 13 +++++++++++-- tests/test_core.py | 13 +++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/src/rantanplan/core.py b/src/rantanplan/core.py index becc165..f6d99dc 100644 --- a/src/rantanplan/core.py +++ b/src/rantanplan/core.py @@ -725,8 +725,26 @@ def generate_liaison_positions(syllables, liaison): liaison_indices = [ index for index, position in enumerate(positions) if position ] + # Prioritize single liaisons + non_single_liaisons = [] for combination in combinations: liaison_positions = [0] * len(positions) for index, liaison_index in enumerate(liaison_indices): liaison_positions[liaison_index] = combination[index] - yield liaison_positions + if has_single_liaisons(liaison_positions): + yield liaison_positions + else: + non_single_liaisons.append(liaison_positions) + for liaison_position in non_single_liaisons: + yield liaison_position + + +def has_single_liaisons(liaisons): + """Checks whether liaisons (a list of 1's and 0's) has consecutive liaisons + (1's) or not + + :param liaisons: List of possible liaisons to apply per phonological group + :return: True if no consecutive liaisons, False otherwise + :rtype: bool + """ + return not any(i == j == 1 for i, j in zip(liaisons, liaisons[1:])) diff --git a/src/rantanplan/syllabification.py b/src/rantanplan/syllabification.py index 38a3281..d43d523 100644 --- a/src/rantanplan/syllabification.py +++ b/src/rantanplan/syllabification.py @@ -60,7 +60,7 @@ 'lo', 'los', 'mas', 'me', 'mi', 'nos', 'os', 'que', 'se', 'si', - 'su', 'tan', 'te', 'tu', "tus", "oh"} + 'su', 'tan', 'te', 'tu', "tus", "oh", "pues"} UNSTRESSED_FORMS = {"ay", "don", "doña", "aun", "que", "cual", "quien", "donde", "cuando", "cuanto", "como", "cuantas", "cuantos"} @@ -1327,6 +1327,10 @@ 'sexuados': (['se', 'xua', 'dos'], [(['se', 'xu', 'a', 'dos'], (1, 2))]), 'sexual': (['se', 'xual'], [(['se', 'xu', 'al'], (1, 2))]), 'suave': (['sua', 've'], [(['su', 'a', 've'], (0, 1))]), + 'suntuoso': (['sun', 'tuo', 'so'], [(['sun', 'tu', 'o', 'so'], (2, 3))]), + 'suntuosa': (['sun', 'tuo', 'sa'], [(['sun', 'tu', 'o', 'sa'], (2, 3))]), + 'suntuosos': (['sun', 'tuo', 'sos'], [(['sun', 'tu', 'o', 'sos'], (2, 3))]), + 'suntuosas': (['sun', 'tuo', 'sas'], [(['sun', 'tu', 'o', 'sas'], (2, 3))]), 'televisual': (['te', 'le', 'vi', 'sual'], [(['te', 'le', 'vi', 'su', 'al'], (3, 4))]), 'textual': (['tex', 'tual'], [(['tex', 'tu', 'al'], (1, 2))]), @@ -1451,8 +1455,13 @@ 'viajares': (['via', 'ja', 'res'], [(['vi', 'a', 'ja', 'res'], (0, 1))]), 'viaje': (['via', 'je'], [(['vi', 'a', 'je'], (0, 1))]), 'viajes': (['via', 'jes'], [(['vi', 'a', 'jes'], (0, 1))]), - 'virtual': (['vir', 'tual'], [(['vir', 'tu', 'al'], (2, 2))]), + 'virtual': (['vir', 'tual'], [(['vir', 'tu', 'al'], (2, 3))]), + 'virtuoso': (['vir', 'tuo', 'so'], [(['vir', 'tu', 'o', 'so'], (2, 3))]), + 'virtuosa': (['vir', 'tuo', 'sa'], [(['vir', 'tu', 'o', 'sa'], (2, 3))]), + 'virtuosos': (['vir', 'tuo', 'sos'], [(['vir', 'tu', 'o', 'sos'], (2, 3))]), + 'virtuosas': (['vir', 'tuo', 'sas'], [(['vir', 'tu', 'o', 'sas'], (2, 3))]), 'visual': (['vi', 'sual'], [(['vi', 'su', 'al'], (1, 2))]), 'visuales': (['vi', 'sua', 'les'], [(['vi', 'su', 'a', 'les'], (1, 2))]), 'viudo': (['viu', 'do'], [(['vi', 'u', 'do'], (0, 1))]), + 'ilión': (['i', 'lión'], [(['i', 'li', 'ón'], (0, 1))]), 'viudos': (['viu', 'dos'], [(['vi', 'u', 'dos'], (0, 1))])} diff --git a/tests/test_core.py b/tests/test_core.py index 3c75309..e646c6e 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -21,6 +21,7 @@ from rantanplan.core import get_syllables_word_end from rantanplan.core import get_word_stress from rantanplan.core import get_words +from rantanplan.core import has_single_liaisons from rantanplan.core import have_prosodic_liaison from rantanplan.core import is_paroxytone from rantanplan.core import spacy_tag_to_dict @@ -1121,3 +1122,15 @@ def test_apply_exception_rules_consonan_w_vowel(): word = "kiwi" output = "ki-wi" assert apply_exception_rules(word) == output + + +def test_has_single_liaisons_false(): + liaisons = [0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0] + output = has_single_liaisons(liaisons) + assert not output + + +def test_has_single_liaisons_true(): + liaisons = [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0] + output = has_single_liaisons(liaisons) + assert output From 5c706c3e31e03c5042a7511301635314e48acc8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20P=C3=A9rez?= Date: Tue, 24 Mar 2020 10:33:52 +0100 Subject: [PATCH 6/6] =?UTF-8?q?Bump=20version:=200.4.2=20=E2=86=92=200.4.3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.cfg | 2 +- setup.py | 2 +- src/rantanplan/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 95ca53b..0281be1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.2 +current_version = 0.4.3 commit = True tag = True diff --git a/setup.py b/setup.py index ac04a19..8ea5d71 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ def read(*names, **kwargs): setup( name='rantanplan', - version='0.4.2', + version='0.4.3', license='Apache Software License 2.0', description='Scansion tool for Spanish texts', long_description='%s\n%s' % ( diff --git a/src/rantanplan/__init__.py b/src/rantanplan/__init__.py index 3907fa6..24a1450 100644 --- a/src/rantanplan/__init__.py +++ b/src/rantanplan/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.4.2' +__version__ = '0.4.3' from .core import get_scansion # noqa