From 3c12348e5cf6f0f6e0de381669a0bb568ea2f840 Mon Sep 17 00:00:00 2001 From: ahoimarie Date: Sat, 4 Sep 2021 20:35:39 +0200 Subject: [PATCH 01/12] re-added submission, after having cleaned up the tree --- filters/alliteration/README.md | 16 +++++ filters/alliteration/__init__.py | 1 + filters/alliteration/filter.py | 48 +++++++++++++++ filters/alliteration/requirements.txt | 1 + filters/alliteration/test.json | 85 +++++++++++++++++++++++++++ 5 files changed, 151 insertions(+) create mode 100644 filters/alliteration/README.md create mode 100644 filters/alliteration/__init__.py create mode 100644 filters/alliteration/filter.py create mode 100644 filters/alliteration/requirements.txt create mode 100644 filters/alliteration/test.json diff --git a/filters/alliteration/README.md b/filters/alliteration/README.md new file mode 100644 index 000000000..39e1b88e2 --- /dev/null +++ b/filters/alliteration/README.md @@ -0,0 +1,16 @@ +## keywords filter + +**Author: Marie Tolkiehn**\ +Center for Data and Computing in Natural Sciences, Universität Hamburg\ +marie.tolkiehn@desy.de + +## What type of a filter is this? + +This filter returns True if a sentence is an alliteration and False otherwise. +There is an option to remove stopwords from the sentences. + +## Related Work + +## What are the limitations of this filter? +There may be phonetic alliterations that are not captured by a graphematic approach. For example, `Phonetic` and `Fine` are phonetic alliterations but not graphematic ones. +This could be ameliorated e.g. by using Carnegie Mellon's pronouncing dictionary to compare each word. 
\ No newline at end of file diff --git a/filters/alliteration/__init__.py b/filters/alliteration/__init__.py new file mode 100644 index 000000000..1e78c9bed --- /dev/null +++ b/filters/alliteration/__init__.py @@ -0,0 +1 @@ +from .filter import * diff --git a/filters/alliteration/filter.py b/filters/alliteration/filter.py new file mode 100644 index 000000000..abee69ac8 --- /dev/null +++ b/filters/alliteration/filter.py @@ -0,0 +1,48 @@ +from interfaces.SentenceOperation import SentenceOperation +from tasks.TaskTypes import TaskType +from initialize import spacy_nlp +import spacy +import string + +class Alliteration(SentenceOperation): + tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION] + languages = ["en"] + + def __init__(self, stopwords: bool=False): + super().__init__() + self.stopwords = stopwords + self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") + + def filter(self, sentence: str = None) -> bool: + def get_phonemes(word: str): + # We are adding some digraphs to avoid 'sand' and 'shady' to alliterate. 
+ # Then we check for these digraphs first + digraphs = ["ch", "ph", "sh", "th"] + if word[:2] in digraphs: + return word[:2] + else: + return word[:1] + + # Convert to lower, remove punctuation, tokenize into words + sentenceS = sentence.lower().translate(str.maketrans("", "", string.punctuation)).split() + + # if self.stopwords: # This somehow does not work, it always returns interfaces.SentenceOperation.SentenceOperation, even with a getter method + if self.stopwords: + all_stopwords = self.nlp.Defaults.stop_words + # Remove all stopwords from our sentence + sentenceS = [word for word in sentenceS if word not in all_stopwords] + + # tokenized = self.nlp(sentence, disable=["parser", "tagger", "ner"]) + # tokenizedB = [token.text for token in tokenized if token.text.isalpha()] + # tokened_text = [w.lower() for w in tokenizedB] # make it lowercase + + first_phon = get_phonemes(sentenceS[0]) + start_phon = [get_phonemes(word)==first_phon for word in sentenceS] + + return all(start_phon) + +# Alliteration(SentenceOperation).filter("It is I in it.") +# Alliteration(SentenceOperation).filter("It is not my fault.") +# print(Alliteration(SentenceOperation).filter("She showed Shawn shady shandy.")) +# print(Alliteration(SentenceOperation).filter("She showed Shawn some shady shandy.")) +# print(Alliteration(SentenceOperation).filter("Peter Piper picked a peck of pickled peppers.")) \ No newline at end of file diff --git a/filters/alliteration/requirements.txt b/filters/alliteration/requirements.txt new file mode 100644 index 000000000..88c873737 --- /dev/null +++ b/filters/alliteration/requirements.txt @@ -0,0 +1 @@ +spacytextblob==3.0.1 \ No newline at end of file diff --git a/filters/alliteration/test.json b/filters/alliteration/test.json new file mode 100644 index 000000000..bdfad698f --- /dev/null +++ b/filters/alliteration/test.json @@ -0,0 +1,85 @@ +{ + "type": "alliteration", + "test_cases": [ + { + "class": "Alliteration", + "args": { + "stopwords": true + }, + 
"inputs": { + "sentence": "Andrew always asks Anne about anchovies." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": true + }, + "inputs": { + "sentence": "She showed Shawn shady shandy." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": true + }, + "inputs": { + "sentence": "She showed Shawn some shady shandy." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": true + }, + "inputs": { + "sentence": "Peter Piper picked a peck of pickled peppers." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": false + }, + "inputs": { + "sentence": "Andrew always asks Anne about anchovies." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": false + }, + "inputs": { + "sentence": "She showed Shawn shady shandy." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": false + }, + "inputs": { + "sentence": "She showed Shawn some shady shandy." + }, + "outputs": false + }, + { + "class": "Alliteration", + "args": { + "stopwords": false + }, + "inputs": { + "sentence": "Peter Piper picked a peck of pickled peppers." 
+ }, + "outputs": false + } + ] +} \ No newline at end of file From e09da5256c9afcae9357a127a816e16dcd5d71a3 Mon Sep 17 00:00:00 2001 From: ahoimarie Date: Tue, 7 Sep 2021 21:38:04 +0200 Subject: [PATCH 02/12] added keyword --- Makefile | 3 +-- filters/alliteration/filter.py | 27 +++++++++++++++++++-------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index 66418d375..fa03e701c 100644 --- a/Makefile +++ b/Makefile @@ -15,5 +15,4 @@ clean: style # Test .PHONY: test -test: - pytest -v --cov-config=pyproject.toml + diff --git a/filters/alliteration/filter.py b/filters/alliteration/filter.py index abee69ac8..dd3ddf031 100644 --- a/filters/alliteration/filter.py +++ b/filters/alliteration/filter.py @@ -1,14 +1,18 @@ +import string + +import spacy + +from initialize import spacy_nlp from interfaces.SentenceOperation import SentenceOperation from tasks.TaskTypes import TaskType -from initialize import spacy_nlp -import spacy -import string + class Alliteration(SentenceOperation): tasks = [TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION] languages = ["en"] + keywords = ["morphological"] - def __init__(self, stopwords: bool=False): + def __init__(self, stopwords: bool = False): super().__init__() self.stopwords = stopwords self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") @@ -24,25 +28,32 @@ def get_phonemes(word: str): return word[:1] # Convert to lower, remove punctuation, tokenize into words - sentenceS = sentence.lower().translate(str.maketrans("", "", string.punctuation)).split() + sentenceS = ( + sentence.lower() + .translate(str.maketrans("", "", string.punctuation)) + .split() + ) # if self.stopwords: # This somehow does not work, it always returns interfaces.SentenceOperation.SentenceOperation, even with a getter method if self.stopwords: all_stopwords = self.nlp.Defaults.stop_words # Remove all stopwords from our sentence - sentenceS = [word for word in sentenceS if word not in all_stopwords] 
+ sentenceS = [ + word for word in sentenceS if word not in all_stopwords + ] # tokenized = self.nlp(sentence, disable=["parser", "tagger", "ner"]) # tokenizedB = [token.text for token in tokenized if token.text.isalpha()] # tokened_text = [w.lower() for w in tokenizedB] # make it lowercase first_phon = get_phonemes(sentenceS[0]) - start_phon = [get_phonemes(word)==first_phon for word in sentenceS] + start_phon = [get_phonemes(word) == first_phon for word in sentenceS] return all(start_phon) + # Alliteration(SentenceOperation).filter("It is I in it.") # Alliteration(SentenceOperation).filter("It is not my fault.") # print(Alliteration(SentenceOperation).filter("She showed Shawn shady shandy.")) # print(Alliteration(SentenceOperation).filter("She showed Shawn some shady shandy.")) -# print(Alliteration(SentenceOperation).filter("Peter Piper picked a peck of pickled peppers.")) \ No newline at end of file +# print(Alliteration(SentenceOperation).filter("Peter Piper picked a peck of pickled peppers.")) From 527ffb05c5034790f90da2f81ddb8b3c2e1dda00 Mon Sep 17 00:00:00 2001 From: ahoimarie Date: Fri, 10 Sep 2021 17:13:30 +0200 Subject: [PATCH 03/12] added evaluation results and tweaked code to make evaluations work. 
--- Makefile | 3 ++- filters/alliteration/README.md | 44 ++++++++++++++++++++++++++++++---- filters/alliteration/filter.py | 28 +++++++++++++++------- 3 files changed, 60 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index fa03e701c..8b7314430 100644 --- a/Makefile +++ b/Makefile @@ -15,4 +15,5 @@ clean: style # Test .PHONY: test - +# test: +# pytest -v --cov-config=pyproject.toml diff --git a/filters/alliteration/README.md b/filters/alliteration/README.md index 39e1b88e2..ab1459de5 100644 --- a/filters/alliteration/README.md +++ b/filters/alliteration/README.md @@ -1,16 +1,50 @@ -## keywords filter +## Alliteration filter **Author: Marie Tolkiehn**\ Center for Data and Computing in Natural Sciences, Universität Hamburg\ marie.tolkiehn@desy.de + ## What type of a filter is this? -This filter returns True if a sentence is an alliteration and False otherwise. -There is an option to remove stopwords from the sentences. +This filter returns True if a sentence is an alliteration and False otherwise. +There is an option to remove stopwords from the sentences, and the default is True (remove stopwords). However, should the sentence solely consist of stop words, will they not be removed + +If the input contains more than one sentence, only the first sentence is used and filtered. 
+ + +## Robustness Evaluation +### Removing Stopwords (True) +Here is the performance of the model on the filtered set: +* **IMDB**\ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-imdb" -d "imdb" -p 20`\ + The accuracy on this subset which has 24 examples = 100.0 +* **SST-2**\ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-SST-2" -d "sst2" -p 20`\ + The accuracy on this subset which has 4 examples = 100.0 +* **QQP** \ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/bert-base-uncased-QQP" -d "qqp" -p 20`\ + The accuracy on this subset which has 28 examples = 96.0 +* **MNLI**\ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "roberta-large-mnli" -d "multi_nli" -p 20`\ + The accuracy on this subset which has 77 examples = 91.0 + +### Not removing stopwords (False) +* **IMDB**\ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-imdb" -d "imdb" -p 20`\ + The accuracy on this subset which has 8 examples = 100.0 +* **SST-2**\ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-SST-2" -d "sst2" -p 20`\ + The accuracy on this subset which has 1 examples = 100.0 +* **QQP** \ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/bert-base-uncased-QQP" -d "qqp" -p 20`\ + The accuracy on this subset which has 1 examples = 100.0 +* **MNLI**\ + `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "roberta-large-mnli" -d "multi_nli" -p 20`\ + The accuracy on this subset which has 22 examples = 91.0 ## Related Work ## What are the limitations of this filter? -There may be phonetic alliterations that are not captured by a graphematic approach. For example, `Phonetic` and `Fine` are phonetic alliterations but not graphematic ones. -This could be ameliorated e.g. 
by using Carnegie Mellon's pronouncing dictionary to compare each word. \ No newline at end of file +There may be phonetic alliterations that are not captured by a graphematic approach. For example, `Phonetic` and `Fine` are phonetic alliterations but not graphematic ones. +This could be ameliorated e.g. by using more sophisticated methods such as Carnegie Mellon's pronouncing dictionary to compare each word. \ No newline at end of file diff --git a/filters/alliteration/filter.py b/filters/alliteration/filter.py index dd3ddf031..9131d400a 100644 --- a/filters/alliteration/filter.py +++ b/filters/alliteration/filter.py @@ -12,12 +12,12 @@ class Alliteration(SentenceOperation): languages = ["en"] keywords = ["morphological"] - def __init__(self, stopwords: bool = False): + def __init__(self, stopwords: bool = True): super().__init__() self.stopwords = stopwords self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") - def filter(self, sentence: str = None) -> bool: + def filter(self, sentence: str = None, min_sentence_length=3) -> bool: def get_phonemes(word: str): # We are adding some digraphs to avoid 'sand' and 'shady' to alliterate. 
# Then we check for these digraphs first @@ -27,20 +27,30 @@ def get_phonemes(word: str): else: return word[:1] + # If the input contains multiple sentences, only take the first sentence that has the min_sentence_length + sent = self.nlp(sentence.lstrip()) + segmented_sentence = list(sent.sents) + for k in segmented_sentence: + # Skip any too short 'sentences' that contain no alphanumeric characters + if len(k.text) > min_sentence_length and k.text.lower().islower(): + first_sentence = k.text + break + # Convert to lower, remove punctuation, tokenize into words sentenceS = ( - sentence.lower() + first_sentence.lower() .translate(str.maketrans("", "", string.punctuation)) .split() ) # if self.stopwords: # This somehow does not work, it always returns interfaces.SentenceOperation.SentenceOperation, even with a getter method if self.stopwords: - all_stopwords = self.nlp.Defaults.stop_words - # Remove all stopwords from our sentence - sentenceS = [ - word for word in sentenceS if word not in all_stopwords - ] + if not set(sentenceS).issubset(self.nlp.Defaults.stop_words): + all_stopwords = self.nlp.Defaults.stop_words + # Remove all stopwords from our sentence + sentenceS = [ + word for word in sentenceS if word not in all_stopwords + ] # tokenized = self.nlp(sentence, disable=["parser", "tagger", "ner"]) # tokenizedB = [token.text for token in tokenized if token.text.isalpha()] @@ -54,6 +64,6 @@ def get_phonemes(word: str): # Alliteration(SentenceOperation).filter("It is I in it.") # Alliteration(SentenceOperation).filter("It is not my fault.") -# print(Alliteration(SentenceOperation).filter("She showed Shawn shady shandy.")) +# print(Alliteration(SentenceOperation).filter("She showed Shawn shady shandy. 
This is the second sentence.")) # print(Alliteration(SentenceOperation).filter("She showed Shawn some shady shandy.")) # print(Alliteration(SentenceOperation).filter("Peter Piper picked a peck of pickled peppers.")) From 64e174d5f6678d5c4190ddec3720977470855d65 Mon Sep 17 00:00:00 2001 From: ahoimarie Date: Fri, 10 Sep 2021 18:51:57 +0200 Subject: [PATCH 04/12] added Data statement --- filters/alliteration/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/filters/alliteration/README.md b/filters/alliteration/README.md index ab1459de5..484fc6446 100644 --- a/filters/alliteration/README.md +++ b/filters/alliteration/README.md @@ -43,7 +43,11 @@ Here is the performance of the model on the filtered set: `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "roberta-large-mnli" -d "multi_nli" -p 20`\ The accuracy on this subset which has 22 examples = 91.0 -## Related Work + +## Data and code source +Data was fully created by the author. +Only the test case involving "Peter and his famous pickled peppers" first appeared in print in 1813 in John Harris's Peter Piper's Practical Principles of Plain and Perfect Pronunciation. + ## What are the limitations of this filter? There may be phonetic alliterations that are not captured by a graphematic approach. For example, `Phonetic` and `Fine` are phonetic alliterations but not graphematic ones. 
From 994084009326a3287cbfdca22fbb7f07200f7c95 Mon Sep 17 00:00:00 2001 From: ahoimarie Date: Wed, 22 Sep 2021 16:35:23 +0200 Subject: [PATCH 05/12] added minimum_alliteration_length, included all input text sentences, and amended README --- filters/alliteration/README.md | 53 ++++++++++++++----- filters/alliteration/filter.py | 94 +++++++++++++++++++++++----------- filters/alliteration/test.json | 37 ++++++++++++- 3 files changed, 140 insertions(+), 44 deletions(-) diff --git a/filters/alliteration/README.md b/filters/alliteration/README.md index 484fc6446..58e3bd2fa 100644 --- a/filters/alliteration/README.md +++ b/filters/alliteration/README.md @@ -7,41 +7,68 @@ marie.tolkiehn@desy.de ## What type of a filter is this? -This filter returns True if a sentence is an alliteration and False otherwise. -There is an option to remove stopwords from the sentences, and the default is True (remove stopwords). However, should the sentence solely consist of stop words, will they not be removed +This filter returns True if any of the input sentences is an alliteration and False otherwise. +By default, stop words are removed and do not count to the alliteration. +However, should the sentence solely consist of stop words, will they not be removed. -If the input contains more than one sentence, only the first sentence is used and filtered. +A sentence is deemed an alliteration if it contains words starting with the same character or digraph ("ch", "ph", "sh", "th"). +The minimum alliteration length then governs how many words starting with the same first phoneme are required to be deemed a valid alliteration. +The default minimum alliteration length is 3. +These alliterative words do not need to appear contiguously in the sentence. +This means that e.g. "Peter Aquarium prepared a pepperoni pizza." is a valid alliteration +as it contains more than (default) 3 alliterative non-stopword words (despite "Aquarium"). + +## Why is it a challenge? +Alliterations attract audiences. 
+Alliterations are a stylistic device and trope of literature or poetry. +However, alliterations are around us all the time. From newspaper headlines +("Beer Baron Beats Banner" or "Banner Bars Booze (Booze Barred By Banner)" (c) The Simpsons) +over ads ("Taco Tuesdays"), and company/brand names ("Coca Cola", "Bed, Bath & Beyond", "PayPal"), +protagonists ("Peter Pevensie", "Peter Pan", "Bilbo Baggins", "Donald Duck") +and even academic publications, writers often use alliterations to catch the reader's (or listener's) attention, +as through sound repetition, they are catchy and easy to remember. +Alliterations generally sound pleasing and different phonemes create different rhythms and vibes. +For example, alliterations starting with S are often connected to snake-like features, +whereas alliterations with plosives such as P create a particular rhythm. +This filter could check just how prevalent alliterations are in various types of texts and whether there are particular areas in which they are especially prevalent. +A good language model may then be able to generate synonymous alliterations from non-alliterative texts. 
## Robustness Evaluation -### Removing Stopwords (True) +### Removing Stopwords (True), minimum alliteration length = 3 Here is the performance of the model on the filtered set: * **IMDB**\ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-imdb" -d "imdb" -p 20`\ - The accuracy on this subset which has 24 examples = 100.0 + The accuracy on this subset which has 597 examples = 94.0 + * **SST-2**\ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-SST-2" -d "sst2" -p 20`\ - The accuracy on this subset which has 4 examples = 100.0 + The accuracy on this subset which has 21 examples = 90.0 + * **QQP** \ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/bert-base-uncased-QQP" -d "qqp" -p 20`\ - The accuracy on this subset which has 28 examples = 96.0 + The accuracy on this subset which has 27 examples = 96.0 + * **MNLI**\ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "roberta-large-mnli" -d "multi_nli" -p 20`\ - The accuracy on this subset which has 77 examples = 91.0 + The accuracy on this subset which has 92 examples = 97.0 + -### Not removing stopwords (False) +### Not removing stopwords (False), minimum alliteration length = 3 * **IMDB**\ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-imdb" -d "imdb" -p 20`\ - The accuracy on this subset which has 8 examples = 100.0 + The accuracy on this subset which has 943 examples = 95.0 * **SST-2**\ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-SST-2" -d "sst2" -p 20`\ - The accuracy on this subset which has 1 examples = 100.0 + The accuracy on this subset which has 54 examples = 96.0 * **QQP** \ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/bert-base-uncased-QQP" -d "qqp" -p 20`\ - The accuracy on this subset which has 1 examples = 100.0 + The accuracy on this subset which 
has 101 examples = 94.0 * **MNLI**\ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "roberta-large-mnli" -d "multi_nli" -p 20`\ - The accuracy on this subset which has 22 examples = 91.0 + The accuracy on this subset which has 294 examples = 93.0\ + ## Data and code source diff --git a/filters/alliteration/filter.py b/filters/alliteration/filter.py index 9131d400a..0fba3d970 100644 --- a/filters/alliteration/filter.py +++ b/filters/alliteration/filter.py @@ -12,12 +12,22 @@ class Alliteration(SentenceOperation): languages = ["en"] keywords = ["morphological"] - def __init__(self, stopwords: bool = True): + def __init__(self, stopwords: bool = True, min_alliteration_length=3): super().__init__() self.stopwords = stopwords + self.min_alliteration_length = min_alliteration_length self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") def filter(self, sentence: str = None, min_sentence_length=3) -> bool: + """ + This filter returns True if any of the input sentences is an alliteration. + A sentence is deemed an alliteration if it contains a minimum alliteration length of (Default) 3. + These alliterative words do not need to appear contiguously. + This means that e.g. "Peter Aquarium prepared a pepperoni pizza." is an alliteration + as it contains more than 3 alliterative non-stopword words (despite "Aquarium"). + By default, stop words are removed and do not count to the alliteration. + """ + def get_phonemes(word: str): # We are adding some digraphs to avoid 'sand' and 'shady' to alliterate. 
# Then we check for these digraphs first @@ -27,43 +37,69 @@ def get_phonemes(word: str): else: return word[:1] - # If the input contains multiple sentences, only take the first sentence that has the min_sentence_length - sent = self.nlp(sentence.lstrip()) - segmented_sentence = list(sent.sents) - for k in segmented_sentence: - # Skip any too short 'sentences' that contain no alphanumeric characters - if len(k.text) > min_sentence_length and k.text.lower().islower(): - first_sentence = k.text - break - - # Convert to lower, remove punctuation, tokenize into words - sentenceS = ( - first_sentence.lower() - .translate(str.maketrans("", "", string.punctuation)) - .split() - ) - - # if self.stopwords: # This somehow does not work, it always returns interfaces.SentenceOperation.SentenceOperation, even with a getter method - if self.stopwords: - if not set(sentenceS).issubset(self.nlp.Defaults.stop_words): - all_stopwords = self.nlp.Defaults.stop_words - # Remove all stopwords from our sentence - sentenceS = [ - word for word in sentenceS if word not in all_stopwords - ] + def segment_sentences(sentence, min_sentence_length): + """ + If the input contains multiple sentences, only take the sentences that have the min_sentence_length and that do contain alphanumeric characters. 
+ """ + sent = self.nlp(sentence.lstrip()) + segmented_sentence = list(sent.sents) + all_stopwords = self.nlp.Defaults.stop_words + filt_sentences = [] + for k in segmented_sentence: + # Skip any too short 'sentences' that contain no alphanumeric characters + if ( + len(k.text) > min_sentence_length + and k.text.lower().islower() + ): + valid_sentences = k.text + else: + continue + + # Convert to lower, remove punctuation, tokenize into words + sentenceS = ( + valid_sentences.lower() + .translate(str.maketrans("", "", string.punctuation)) + .split() + ) + + if self.stopwords: + if not set(sentenceS).issubset( + self.nlp.Defaults.stop_words + ): + # Remove all stopwords from our sentence + sentenceS = [ + word + for word in sentenceS + if word not in all_stopwords + ] + filt_sentences.append(sentenceS) + + return filt_sentences # tokenized = self.nlp(sentence, disable=["parser", "tagger", "ner"]) # tokenizedB = [token.text for token in tokenized if token.text.isalpha()] # tokened_text = [w.lower() for w in tokenizedB] # make it lowercase - first_phon = get_phonemes(sentenceS[0]) - start_phon = [get_phonemes(word) == first_phon for word in sentenceS] + # Process input sentences + sentenceS = segment_sentences(sentence, min_sentence_length) + + # Iterate through sentences + sentence_count = [] + for sen in sentenceS: + + first_phon = get_phonemes(sen[0]) + start_phon = [get_phonemes(word) == first_phon for word in sen] + sentence_count.append( + sum(start_phon) >= self.min_alliteration_length + ) - return all(start_phon) + return any( + sentence_count + ) # return True if any of the input sentences are alliterative # Alliteration(SentenceOperation).filter("It is I in it.") # Alliteration(SentenceOperation).filter("It is not my fault.") -# print(Alliteration(SentenceOperation).filter("She showed Shawn shady shandy. This is the second sentence.")) +# print(Alliteration(SentenceOperation).filter("4 *((( ::). She showed Aquarium Shawn shady shandy. 
This is the second sentence Sandy sorted. It is imminent in Iowa.")) # print(Alliteration(SentenceOperation).filter("She showed Shawn some shady shandy.")) # print(Alliteration(SentenceOperation).filter("Peter Piper picked a peck of pickled peppers.")) diff --git a/filters/alliteration/test.json b/filters/alliteration/test.json index bdfad698f..891137c02 100644 --- a/filters/alliteration/test.json +++ b/filters/alliteration/test.json @@ -69,7 +69,7 @@ "inputs": { "sentence": "She showed Shawn some shady shandy." }, - "outputs": false + "outputs": true }, { "class": "Alliteration", @@ -79,7 +79,40 @@ "inputs": { "sentence": "Peter Piper picked a peck of pickled peppers." }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": true + }, + "inputs": { + "sentence": "4 *((( ::). She showed Aquarium Shawn shady shandy. This is the second sentence Sandy sorted. It is imminent in Iowa." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": false, + "min_alliteration_length": 5 + }, + "inputs": { + "sentence": "4 *((( ::). She offered Shawn super shandy. This is the second sentence Sandy sorted. It is imminent in Iowa." + }, + "outputs": true + }, + { + "class": "Alliteration", + "args": { + "stopwords": true, + "min_alliteration_length": 5 + }, + "inputs": { + "sentence": "4 *((( ::). She offered Shawn super shandy. This is the second sentence Sandy sorted. It is imminent in Iowa." 
+ }, "outputs": false } ] -} \ No newline at end of file +} + From dbcd9fbb3187785885988611970bacc9bf0f58c1 Mon Sep 17 00:00:00 2001 From: ahoimarie Date: Thu, 23 Sep 2021 20:53:19 +0200 Subject: [PATCH 06/12] updated README and robustness scores --- filters/alliteration/README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/filters/alliteration/README.md b/filters/alliteration/README.md index 58e3bd2fa..995d70cdf 100644 --- a/filters/alliteration/README.md +++ b/filters/alliteration/README.md @@ -9,7 +9,7 @@ marie.tolkiehn@desy.de This filter returns True if any of the input sentences is an alliteration and False otherwise. By default, stop words are removed and do not count to the alliteration. -However, should the sentence solely consist of stop words, will they not be removed. +However, should the sentence solely consist of stop words, they will not be removed. A sentence is deemed an alliteration if it contains words starting with the same character or digraph ("ch", "ph", "sh", "th"). The minimum alliteration length then governs how many words starting with the same first phoneme are required to be deemed a valid alliteration. @@ -19,7 +19,7 @@ These alliterative words do not need to appear contiguously in the sentence. This means that e.g. "Peter Aquarium prepared a pepperoni pizza." is a valid alliteration as it contains more than (default) 3 alliterative non-stopword words (despite "Aquarium"). -## Why is it a challenge? +## Why is this filter important? Alliterations attract audiences. Alliterations are a stylistic device and trope of literature or poetry. However, alliterations are around us all the time. 
From newspaper headlines @@ -40,34 +40,34 @@ A good language model may then be able to generate synonymous alliterations from Here is the performance of the model on the filtered set: * **IMDB**\ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-imdb" -d "imdb" -p 20`\ - The accuracy on this subset which has 597 examples = 94.0 + The accuracy on this subset which has 612 examples = 95.0 * **SST-2**\ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-SST-2" -d "sst2" -p 20`\ - The accuracy on this subset which has 21 examples = 90.0 + The accuracy on this subset which has 17 examples = 88.0 * **QQP** \ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/bert-base-uncased-QQP" -d "qqp" -p 20`\ - The accuracy on this subset which has 27 examples = 96.0 + The accuracy on this subset which has 31 examples = 97.0 * **MNLI**\ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "roberta-large-mnli" -d "multi_nli" -p 20`\ - The accuracy on this subset which has 92 examples = 97.0 + The accuracy on this subset which has 128 examples = 91.0 ### Not removing stopwords (False), minimum alliteration length = 3 * **IMDB**\ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-imdb" -d "imdb" -p 20`\ - The accuracy on this subset which has 943 examples = 95.0 + The accuracy on this subset which has 886 examples = 95.0 * **SST-2**\ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/roberta-base-SST-2" -d "sst2" -p 20`\ - The accuracy on this subset which has 54 examples = 96.0 + The accuracy on this subset which has 34 examples = 97.0 * **QQP** \ `python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "textattack/bert-base-uncased-QQP" -d "qqp" -p 20`\ - The accuracy on this subset which has 101 examples = 94.0 + The accuracy on this subset which has 111 examples = 94.0 * **MNLI**\ 
`python evaluate.py -f Alliteration -task "TEXT_CLASSIFICATION" -m "roberta-large-mnli" -d "multi_nli" -p 20`\ - The accuracy on this subset which has 294 examples = 93.0\ + The accuracy on this subset which has 233 examples = 92.0\ From 7172a668fef835efa8ac60a9076d3ec2ba1d8eab Mon Sep 17 00:00:00 2001 From: ahoimarie Date: Thu, 23 Sep 2021 20:54:14 +0200 Subject: [PATCH 07/12] changed criterion to check for alliterations --- filters/alliteration/filter.py | 81 +++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 21 deletions(-) diff --git a/filters/alliteration/filter.py b/filters/alliteration/filter.py index 0fba3d970..5f5bcc733 100644 --- a/filters/alliteration/filter.py +++ b/filters/alliteration/filter.py @@ -1,5 +1,9 @@ +#!/usr/bin/env python3 +# *_* coding: utf-8 *_* + import string +import numpy as np import spacy from initialize import spacy_nlp @@ -12,10 +16,16 @@ class Alliteration(SentenceOperation): languages = ["en"] keywords = ["morphological"] - def __init__(self, stopwords: bool = True, min_alliteration_length=3): + def __init__( + self, + stopwords: bool = True, + min_alliteration_length: int = 3, + allowed_offwords: int = 2, + ): super().__init__() self.stopwords = stopwords self.min_alliteration_length = min_alliteration_length + self.allowed_offwords = allowed_offwords self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") def filter(self, sentence: str = None, min_sentence_length=3) -> bool: @@ -29,17 +39,20 @@ def filter(self, sentence: str = None, min_sentence_length=3) -> bool: """ def get_phonemes(word: str): - # We are adding some digraphs to avoid 'sand' and 'shady' to alliterate. - # Then we check for these digraphs first + """ + We are adding some digraphs to avoid 'sand' and 'shady' to alliterate. 
+ Then we check for these digraphs first + """ digraphs = ["ch", "ph", "sh", "th"] if word[:2] in digraphs: return word[:2] else: return word[:1] - def segment_sentences(sentence, min_sentence_length): + def segment_sentences(self, sentence, min_sentence_length): """ - If the input contains multiple sentences, only take the sentences that have the min_sentence_length and that do contain alphanumeric characters. + If the input contains multiple sentences, only take the sentences that have the min_sentence_length + and that do contain alphanumeric characters. """ sent = self.nlp(sentence.lstrip()) segmented_sentence = list(sent.sents) @@ -76,30 +89,56 @@ def segment_sentences(sentence, min_sentence_length): return filt_sentences - # tokenized = self.nlp(sentence, disable=["parser", "tagger", "ner"]) - # tokenizedB = [token.text for token in tokenized if token.text.isalpha()] - # tokened_text = [w.lower() for w in tokenizedB] # make it lowercase + def rolling_window(data, windowlen): + """ + Create a 1-dimensional rolling window of size windowlen. + If the windowlen is smaller than the length of the data, use the length of the data instead. + """ + if len(data) < windowlen: + windowlen = len(data) + shape = data.shape[:-1] + ( + data.shape[-1] - windowlen + 1, + windowlen, + ) + strides = data.strides + (data.strides[-1],) + return np.lib.stride_tricks.as_strided( + data, shape=shape, strides=strides + ) + + def find_contiguous_elements( + elements, min_alliteration_length, allowed_offwords + ): + """ + Create rolling windows of size min_alliteration_length + allowed_offwords + and check if any window contains a block of the same elements of the size min_alliteration_length. + Return True if any window with the min_alliteration_length is found, False otherwise. 
+ """ + rolling_sent = rolling_window( + elements, min_alliteration_length + allowed_offwords + ) + + for windows in rolling_sent: + if ( + windows == max(set(windows), key=sorted(windows).count) + ).sum() >= min_alliteration_length: + return True + + return False # Process input sentences - sentenceS = segment_sentences(sentence, min_sentence_length) + sentenceS = segment_sentences(self, sentence, min_sentence_length) # Iterate through sentences sentence_count = [] for sen in sentenceS: - - first_phon = get_phonemes(sen[0]) - start_phon = [get_phonemes(word) == first_phon for word in sen] - sentence_count.append( - sum(start_phon) >= self.min_alliteration_length + cat_sentence = np.array([get_phonemes(word) for word in sen]) + phonemes_bool = find_contiguous_elements( + cat_sentence, + self.min_alliteration_length, + self.allowed_offwords, ) + sentence_count.append(phonemes_bool) return any( sentence_count ) # return True if any of the input sentences are alliterative - - -# Alliteration(SentenceOperation).filter("It is I in it.") -# Alliteration(SentenceOperation).filter("It is not my fault.") -# print(Alliteration(SentenceOperation).filter("4 *((( ::). She showed Aquarium Shawn shady shandy. This is the second sentence Sandy sorted. 
It is imminent in Iowa.")) -# print(Alliteration(SentenceOperation).filter("She showed Shawn some shady shandy.")) -# print(Alliteration(SentenceOperation).filter("Peter Piper picked a peck of pickled peppers.")) From 1ac30cff9de9b3b170512ba84d7344d8197f661c Mon Sep 17 00:00:00 2001 From: ahoimarie Date: Fri, 24 Sep 2021 15:37:20 +0200 Subject: [PATCH 08/12] corrected docstring for rolling_window --- filters/alliteration/filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filters/alliteration/filter.py b/filters/alliteration/filter.py index 5f5bcc733..14264d17f 100644 --- a/filters/alliteration/filter.py +++ b/filters/alliteration/filter.py @@ -92,7 +92,7 @@ def segment_sentences(self, sentence, min_sentence_length): def rolling_window(data, windowlen): """ Create a 1-dimensional rolling window of size windowlen. - If the windowlen is smaller than the length of the data, use the length of the data instead. + If the windowlen is larger than the length of the data, use the length of the data instead. 
""" if len(data) < windowlen: windowlen = len(data) From 866a31095606ce34e51cd588a3c7830fb36ce9bf Mon Sep 17 00:00:00 2001 From: ahoimarie Date: Wed, 29 Sep 2021 16:23:45 +0200 Subject: [PATCH 09/12] Update Makefile --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 8b7314430..5e35507d4 100644 --- a/Makefile +++ b/Makefile @@ -15,5 +15,5 @@ clean: style # Test .PHONY: test -# test: -# pytest -v --cov-config=pyproject.toml +test: + pytest -v --cov-config=pyproject.toml From 58e744cf21c551eed9dd07f4796d0bef720b8c21 Mon Sep 17 00:00:00 2001 From: ahoimarie Date: Sun, 3 Oct 2021 08:58:42 +0200 Subject: [PATCH 10/12] remove Makefile from tracking --- Makefile | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 Makefile diff --git a/Makefile b/Makefile deleted file mode 100644 index 5e35507d4..000000000 --- a/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# Styling -.PHONY: style -style: - black . - flake8 - isort . - -# Cleaning -.PHONY: clean -clean: style - find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf - find . | grep -E ".pytest_cache" | xargs rm -rf - find . | grep -E ".ipynb_checkpoints" | xargs rm -rf - rm -f .coverage - -# Test -.PHONY: test -test: - pytest -v --cov-config=pyproject.toml From 5abb695c4984da93d82ad783fc3eaf88b11ddfa6 Mon Sep 17 00:00:00 2001 From: ahoimarie Date: Sat, 30 Oct 2021 11:30:26 +0200 Subject: [PATCH 11/12] added Makefile --- Makefile | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..5e35507d4 --- /dev/null +++ b/Makefile @@ -0,0 +1,19 @@ +# Styling +.PHONY: style +style: + black . + flake8 + isort . + +# Cleaning +.PHONY: clean +clean: style + find . | grep -E "(__pycache__|\.pyc|\.pyo)" | xargs rm -rf + find . | grep -E ".pytest_cache" | xargs rm -rf + find . 
| grep -E ".ipynb_checkpoints" | xargs rm -rf + rm -f .coverage + +# Test +.PHONY: test +test: + pytest -v --cov-config=pyproject.toml From 95e2d675584950028c6b299f2b6585e43b1aa8aa Mon Sep 17 00:00:00 2001 From: ahoimarie Date: Tue, 16 Nov 2021 13:37:33 +0100 Subject: [PATCH 12/12] Removed a modified makefile from pull request --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5e35507d4..66418d375 100644 --- a/Makefile +++ b/Makefile @@ -16,4 +16,4 @@ clean: style # Test .PHONY: test test: - pytest -v --cov-config=pyproject.toml + pytest -v --cov-config=pyproject.toml