diff --git a/README.md b/README.md index 89920a0e2..6c51831d2 100644 --- a/README.md +++ b/README.md @@ -17,9 +17,12 @@ of _[KASTEL - Institute of Information Security and Dependability](https://kaste the [KIT](https://www.kit.edu). ## User Interfaces -To be able to execute the core algorithms from this repository, you can write own user interfaces that (should) use the [ArDoCoRunner](https://github.com/ArDoCo/Core/blob/main/pipeline/pipeline-core/src/main/java/edu/kit/kastel/mcse/ardoco/core/execution/runner/ArDoCoRunner.java). -We provide an example Command Line Interface (CLI) at [ArDoCo/CLI](https://github.com/ArDoCo/CLI) as well as a simple Graphical User Interface (GUI) at [ArDoCo/GUI](https://github.com/ArDoCo/GUI). +To be able to execute the core algorithms from this repository, you can write your own user interfaces that (should) use +the [ArDoCoRunner](https://github.com/ArDoCo/Core/blob/main/pipeline/pipeline-core/src/main/java/edu/kit/kastel/mcse/ardoco/core/execution/runner/ArDoCoRunner.java). + +We provide an example Command Line Interface (CLI) at [ArDoCo/CLI](https://github.com/ArDoCo/CLI) as well as a simple Graphical User Interface (GUI) +at [ArDoCo/GUI](https://github.com/ArDoCo/GUI). Future user interfaces like an enhanced GUI or a web interface are planned. @@ -39,6 +42,7 @@ To test the Core, you could use case studies and benchmarks provided in .. ## Maven ```xml + io.github.ardoco.core @@ -49,7 +53,9 @@ To test the Core, you could use case studies and benchmarks provided in .. ``` For snapshot releases, make sure to add the following repository + ```xml + @@ -64,9 +70,31 @@ For snapshot releases, make sure to add the following repository ``` +## Microservice for text preprocessing + +Text preprocessing works locally, but there is also the option to host a microservice for this. +The benefit is that the models do not need to be loaded each time, saving some runtime (and local memory). 
+ +The microservice can be found at [ArDoCo/StanfordCoreNLP-Provider-Service](https://github.com/ArDoCo/StanfordCoreNLP-Provider-Service/). + +The microservice is secured with credentials; to use it, you must activate the microservice usage and configure the URL of the microservice. +These settings can be provided to the execution via environment variables. +To do so, set the following variables: + +```env +NLP_PROVIDER_SOURCE=microservice +MICROSERVICE_URL=[microservice_url] +SCNLP_SERVICE_USER=[your_username] +SCNLP_SERVICE_PASSWORD=[your_password] +``` + +The first variable `NLP_PROVIDER_SOURCE=microservice` activates the microservice usage. +The next three variables configure the connection; set them to the URL and credentials of your deployed microservice. + ## Attribution -The initial version of this project is based on the master thesis [Linking Software Architecture Documentation and Models](https://doi.org/10.5445/IR/1000126194). +The initial version of this project is based on the master +thesis [Linking Software Architecture Documentation and Models](https://doi.org/10.5445/IR/1000126194). ## Acknowledgements diff --git a/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/api/text/POSTag.java b/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/api/text/POSTag.java index 935df071f..a2e21f9cf 100644 --- a/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/api/text/POSTag.java +++ b/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/api/text/POSTag.java @@ -1,12 +1,16 @@ /* Licensed under MIT 2021-2023. 
*/ package edu.kit.kastel.mcse.ardoco.core.api.text; +import java.io.IOException; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonValue; + /** * This class represents all valid part-of-speech (pos) tags - * */ public enum POSTag { //@formatter:off @@ -77,4 +81,19 @@ public boolean isVerb() { public boolean isNoun() { return getTag().startsWith("NN"); } + + @JsonValue + public String toValue() { + return getTag(); + } + + @JsonCreator + public static POSTag forValue(String value) throws IOException { + try { + return get(value); + } catch (IllegalArgumentException e) { + throw new IOException("Cannot deserialize PosTag"); + } + } + } diff --git a/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/api/text/Text.java b/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/api/text/Text.java index 138d0f3ea..5800c256a 100644 --- a/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/api/text/Text.java +++ b/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/api/text/Text.java @@ -24,6 +24,14 @@ default int getLength() { */ ImmutableList words(); + /** + * Returns the word at the given index + * + * @param index the index + * @return the word at the given index + */ + Word getWord(int index); + /** * Returns the sentences of the text, ordered by appearance. 
* diff --git a/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/converter/ObjectToDtoConverter.java b/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/converter/ObjectToDtoConverter.java index 0d19e99ff..6bf8b7ed2 100644 --- a/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/converter/ObjectToDtoConverter.java +++ b/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/converter/ObjectToDtoConverter.java @@ -9,10 +9,14 @@ import org.eclipse.collections.api.list.ImmutableList; -import edu.kit.kastel.mcse.ardoco.core.api.text.*; +import edu.kit.kastel.mcse.ardoco.core.api.text.DependencyTag; +import edu.kit.kastel.mcse.ardoco.core.api.text.POSTag; +import edu.kit.kastel.mcse.ardoco.core.api.text.Phrase; +import edu.kit.kastel.mcse.ardoco.core.api.text.Sentence; +import edu.kit.kastel.mcse.ardoco.core.api.text.Text; +import edu.kit.kastel.mcse.ardoco.core.api.text.Word; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.IncomingDependencyDto; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.OutgoingDependencyDto; -import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.PosTag; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.SentenceDto; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.TextDto; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.WordDto; @@ -27,7 +31,7 @@ public class ObjectToDtoConverter { /** * converts an ArDoCo text into a text DTO - * + * * @param text the ArDoCo text * @return the text DTO */ @@ -74,7 +78,7 @@ private WordDto convertToWordDTO(Word word) throws NotConvertableException { wordDTO.setText(word.getText()); wordDTO.setLemma(word.getLemma()); try { - wordDTO.setPosTag(PosTag.forValue(word.getPosTag().toString())); + wordDTO.setPosTag(POSTag.forValue(word.getPosTag().toString())); } catch (IOException e) { throw new 
NotConvertableException(String.format("IOException when converting word with id %d to WordDto: PosTag not found.", wordDTO.getId())); } diff --git a/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/dto/PosTag.java b/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/dto/PosTag.java deleted file mode 100644 index 8b9415729..000000000 --- a/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/dto/PosTag.java +++ /dev/null @@ -1,58 +0,0 @@ -/* Licensed under MIT 2023. */ -package edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto; - -import java.io.IOException; - -import com.fasterxml.jackson.annotation.*; - -public enum PosTag { - //@formatter:off - ADJECTIVE("JJ"), ADJECTIVE_COMPARATIVE(ADJECTIVE + "R"), ADJECTIVE_SUPERLATIVE(ADJECTIVE + "S"), ADVERB("RB"), ADVERB_COMPARATIVE(ADVERB + "R"), ADVERB_SUPERLATIVE(ADVERB + "S"), ADVERB_WH( - "W" + ADVERB), CONJUNCTION_COORDINATING("CC"), CONJUNCTION_SUBORDINATING("IN"), CARDINAL_NUMBER("CD"), DETERMINER("DT"), DETERMINER_WH("W" + DETERMINER), EXISTENTIAL_THERE( - "EX"), FOREIGN_WORD("FW"), LIST_ITEM_MARKER("LS"), NOUN("NN"), NOUN_PLURAL(NOUN + "S"), NOUN_PROPER_SINGULAR(NOUN + "P"), NOUN_PROPER_PLURAL(NOUN + "PS"), PREDETERMINER( - "PDT"), POSSESSIVE_ENDING("POS"), PRONOUN_PERSONAL("PRP"), PRONOUN_POSSESSIVE("PRP$"), PRONOUN_POSSESSIVE_WH("WP$"), PRONOUN_WH("WP"), PARTICLE("RP"), SYMBOL("SYM"), TO( - "TO"), INTERJECTION("UH"), VERB("VB"), VERB_PAST_TENSE(VERB + "D"), VERB_PARTICIPLE_PRESENT(VERB + "G"), VERB_PARTICIPLE_PAST(VERB + "N"), VERB_SINGULAR_PRESENT_NONTHIRD_PERSON( - VERB + "P"), VERB_SINGULAR_PRESENT_THIRD_PERSON(VERB + "Z"), VERB_MODAL("MD"), CLOSER("."), COMMA(","), COLON(":"), LEFT_PAREN("-LRB-"), RIGHT_PAREN("-RRB-"), NONE("-NONE-"), OPEN_QUOTE( - "``"), CLOSE_QUOTE("''"), DOLLAR("$"), HASHTAG("#"), HYPH("HYPH"), NFP("NFP"), ADD("ADD"), AFX("AFX"), GW("GW"), XX("XX"); - 
//@formatter:on - - private final String tag; - - PosTag(String tag) { - this.tag = tag; - } - - /** - * Returns the encoding for this part-of-speech. - * - * @return A string representing a Penn Treebank encoding for an English part-of-speech. - */ - @Override - public String toString() { - return getTag(); - } - - /** - * Gets the tag information. - * - * @return the tag - */ - public String getTag() { - return tag; - } - - @JsonValue - public String toValue() { - return getTag(); - } - - @JsonCreator - public static PosTag forValue(String value) throws IOException { - for (PosTag v : values()) { - if (value.equals(v.getTag())) { - return v; - } - } - throw new IOException("Cannot deserialize PosTag"); - } -} diff --git a/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/dto/WordDto.java b/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/dto/WordDto.java index cb3f0c069..0886c8ac3 100644 --- a/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/dto/WordDto.java +++ b/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/dto/WordDto.java @@ -5,7 +5,9 @@ import java.util.List; import java.util.Objects; -import com.fasterxml.jackson.annotation.*; +import com.fasterxml.jackson.annotation.JsonProperty; + +import edu.kit.kastel.mcse.ardoco.core.api.text.POSTag; /** * Definition of a word @@ -17,7 +19,7 @@ public class WordDto { private List outgoingDependencies = new ArrayList<>(); private long sentenceNo; private String text; - private PosTag posTag; + private POSTag posTag; /** * The id of the word. Should be ascending from 1 for the first word in the text. 
@@ -72,12 +74,12 @@ public void setOutgoingDependencies(List value) { } @JsonProperty("posTag") - public PosTag getPosTag() { + public POSTag getPosTag() { return posTag; } @JsonProperty("posTag") - public void setPosTag(PosTag value) { + public void setPosTag(POSTag value) { this.posTag = value; } diff --git a/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/textobject/PhraseImpl.java b/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/textobject/PhraseImpl.java index c5476d34b..e6b7f2a12 100644 --- a/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/textobject/PhraseImpl.java +++ b/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/textobject/PhraseImpl.java @@ -22,24 +22,28 @@ public class PhraseImpl implements Phrase { private static final String PUNCTUATION_WITH_SPACE = "\\s+([.,;:?!])"; private static final String BRACKETS_WITH_SPACE = "\\s+([()\\[\\]{}<>])"; + private final PhraseType type; + private final ImmutableList childPhrases; private final ImmutableList nonPhraseWords; private ImmutableList phraseWords; - + private ImmutableList containedWords; + private ImmutableList subPhrases; + private ImmutableSortedMap phraseVector; + private int sentenceNo = -1; private String text; - private final PhraseType type; - - private final List childPhrases; - public PhraseImpl(ImmutableList nonPhraseWords, PhraseType type, List childPhrases) { this.nonPhraseWords = nonPhraseWords == null ? 
Lists.immutable.empty() : nonPhraseWords; this.type = type; - this.childPhrases = childPhrases; + this.childPhrases = Lists.immutable.ofAll(childPhrases); } @Override - public int getSentenceNo() { - return getContainedWords().get(0).getSentenceNo(); + public synchronized int getSentenceNo() { + if (sentenceNo < 0) { + sentenceNo = getContainedWords().get(0).getSentenceNo(); + } + return sentenceNo; } @Override @@ -60,70 +64,78 @@ public PhraseType getPhraseType() { } @Override - public ImmutableList getContainedWords() { - if (phraseWords == null) { - List collectedWords = new ArrayList<>(); - for (Phrase subphrase : childPhrases) { - collectedWords.addAll(subphrase.getContainedWords().castToList()); + public synchronized ImmutableList getContainedWords() { + if (containedWords == null) { + if (phraseWords == null) { + List collectedWords = new ArrayList<>(); + for (Phrase subphrase : childPhrases) { + collectedWords.addAll(subphrase.getContainedWords().castToList()); + } + this.phraseWords = Lists.immutable.ofAll(collectedWords); } - this.phraseWords = Lists.immutable.ofAll(collectedWords); + + MutableList words = Lists.mutable.ofAll(nonPhraseWords); + words.addAllIterable(phraseWords); + words.sortThis(Comparator.comparingInt(Word::getPosition)); + containedWords = words.toImmutable(); } - MutableList words = Lists.mutable.ofAll(nonPhraseWords); - words.addAllIterable(phraseWords); - words.sortThis(Comparator.comparingInt(Word::getPosition)); - return words.toImmutable(); + return containedWords; } @Override - public ImmutableList getSubPhrases() { - List subPhrases = new ArrayList<>(childPhrases); - for (Phrase childPhrase : childPhrases) { - subPhrases.addAll(childPhrase.getSubPhrases().toList()); + public synchronized ImmutableList getSubPhrases() { + if (subPhrases == null) { + MutableList tempSubPhrases = Lists.mutable.ofAll(childPhrases); + for (Phrase childPhrase : childPhrases) { + tempSubPhrases.addAll(childPhrase.getSubPhrases().toList()); + } + 
subPhrases = tempSubPhrases.toImmutable(); } - return Lists.immutable.ofAll(subPhrases); + return subPhrases; } @Override public boolean isSuperPhraseOf(Phrase other) { - List subphrases = this.childPhrases; + MutableList subphrases = Lists.mutable.ofAll(this.getSubPhrases()); while (!subphrases.isEmpty()) { if (subphrases.contains(other)) { return true; } - List newSubphrases = new ArrayList<>(); - for (Phrase subphrase : subphrases) { - newSubphrases.addAll(subphrase.getSubPhrases().castToList()); - } - subphrases = newSubphrases; + subphrases = getSubPhrasesOfPhrases(subphrases); } return false; } + private static MutableList getSubPhrasesOfPhrases(MutableList subphrases) { + MutableList subPhrasesOfPhrases = Lists.mutable.empty(); + for (Phrase subphrase : subphrases) { + subPhrasesOfPhrases.addAll(subphrase.getSubPhrases().castToList()); + } + return subPhrasesOfPhrases; + } + @Override public boolean isSubPhraseOf(Phrase other) { - List subphrases = other.getSubPhrases().castToList(); + MutableList subphrases = Lists.mutable.ofAll(other.getSubPhrases()); while (!subphrases.isEmpty()) { if (subphrases.contains(this)) { return true; } - List newSubphrases = new ArrayList<>(); - for (Phrase subphrase : subphrases) { - newSubphrases.addAll(subphrase.getSubPhrases().castToList()); - } - subphrases = newSubphrases; + subphrases = getSubPhrasesOfPhrases(subphrases); } return false; } @Override - public ImmutableSortedMap getPhraseVector() { - MutableSortedMap phraseVector = SortedMaps.mutable.empty(); - - var grouped = getContainedWords().groupBy(Word::getText).toMap(); - grouped.forEach((key, value) -> phraseVector.put(value.getAny(), value.size())); - - return phraseVector.toImmutable(); + public synchronized ImmutableSortedMap getPhraseVector() { + if (this.phraseVector == null) { + MutableSortedMap tempPhraseVector = SortedMaps.mutable.empty(); + var grouped = getContainedWords().groupBy(Word::getText).toMap(); + grouped.forEach((key, value) -> 
tempPhraseVector.put(value.getAny(), value.size())); + this.phraseVector = tempPhraseVector.toImmutable(); + } + return this.phraseVector; } @Override diff --git a/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/textobject/TextImpl.java b/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/textobject/TextImpl.java index 88647367e..e849717f3 100644 --- a/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/textobject/TextImpl.java +++ b/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/textobject/TextImpl.java @@ -2,6 +2,8 @@ package edu.kit.kastel.mcse.ardoco.core.textproviderjson.textobject; import java.util.Objects; +import java.util.SortedMap; +import java.util.TreeMap; import org.eclipse.collections.api.factory.Lists; import org.eclipse.collections.api.list.ImmutableList; @@ -15,6 +17,9 @@ public class TextImpl implements Text { private ImmutableList sentences; private ImmutableList words; + private final SortedMap wordsIndex = new TreeMap<>(); + + private int length = -1; public TextImpl() { sentences = Lists.immutable.empty(); @@ -26,10 +31,13 @@ public void setSentences(ImmutableList sentences) { } @Override - public int getLength() { - int length = 0; - for (Sentence sentence : sentences) { - length += sentence.getText().length(); + public synchronized int getLength() { + if (this.length < 0) { + int calculatedLength = 0; + for (Sentence sentence : sentences) { + calculatedLength += sentence.getWords().size(); + } + this.length = calculatedLength; } return length; } @@ -38,10 +46,23 @@ public int getLength() { public ImmutableList words() { if (words.isEmpty()) { words = collectWords(); + int index = 0; + for (Word word : words) { + wordsIndex.put(index, word); + index++; + } } return words; } + @Override + public synchronized Word getWord(int index) { + if (wordsIndex.isEmpty()) { + words(); + 
} + return wordsIndex.get(index); + } + @Override public ImmutableList getSentences() { return sentences; @@ -50,7 +71,7 @@ public ImmutableList getSentences() { private ImmutableList collectWords() { MutableList collectedWords = Lists.mutable.empty(); for (Sentence sentence : sentences) { - collectedWords.addAll(sentence.getWords().castToCollection()); + collectedWords.addAll(sentence.getWords().toList()); } return collectedWords.toImmutable(); } diff --git a/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/textobject/WordImpl.java b/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/textobject/WordImpl.java index 7703c8967..3c048c0e5 100644 --- a/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/textobject/WordImpl.java +++ b/framework/text-provider-json/src/main/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/textobject/WordImpl.java @@ -7,7 +7,12 @@ import org.eclipse.collections.api.factory.Lists; import org.eclipse.collections.api.list.ImmutableList; -import edu.kit.kastel.mcse.ardoco.core.api.text.*; +import edu.kit.kastel.mcse.ardoco.core.api.text.DependencyTag; +import edu.kit.kastel.mcse.ardoco.core.api.text.POSTag; +import edu.kit.kastel.mcse.ardoco.core.api.text.Phrase; +import edu.kit.kastel.mcse.ardoco.core.api.text.Sentence; +import edu.kit.kastel.mcse.ardoco.core.api.text.Text; +import edu.kit.kastel.mcse.ardoco.core.api.text.Word; public class WordImpl implements Word { @@ -63,7 +68,7 @@ public POSTag getPosTag() { public Word getPreWord() { int preWordIndex = indexInText - 1; if (preWord == null && preWordIndex > 0) { - preWord = parent.words().get(preWordIndex); + preWord = parent.getWord(preWordIndex); } return preWord; } @@ -72,7 +77,7 @@ public Word getPreWord() { public Word getNextWord() { int nextWordIndex = indexInText + 1; if (nextWord == null && nextWordIndex < parent.getLength()) { - nextWord = 
parent.words().get(nextWordIndex); + nextWord = parent.getWord(nextWordIndex); } return nextWord; } @@ -90,14 +95,14 @@ public String getLemma() { @Override public ImmutableList getOutgoingDependencyWordsWithType(DependencyTag dependencyTag) { List dependenciesOfType = this.outgoingDependencies.stream().filter(x -> x.getDependencyTag() == dependencyTag).toList(); - List words = dependenciesOfType.stream().map(x -> this.parent.words().get((int) x.getWordId())).toList(); + List words = dependenciesOfType.stream().map(x -> this.parent.getWord((int) x.getWordId())).toList(); return Lists.immutable.ofAll(words); } @Override public ImmutableList getIncomingDependencyWordsWithType(DependencyTag dependencyTag) { List dependenciesOfType = this.ingoingDependencies.stream().filter(x -> x.getDependencyTag() == dependencyTag).toList(); - List words = dependenciesOfType.stream().map(x -> this.parent.words().get((int) x.getWordId())).toList(); + List words = dependenciesOfType.stream().map(x -> this.parent.getWord((int) x.getWordId())).toList(); return Lists.immutable.ofAll(words); } @@ -125,13 +130,13 @@ public boolean equals(Object o) { return true; if (!(o instanceof WordImpl word)) return false; - return indexInText == word.indexInText && sentenceNo == word.sentenceNo && Objects.equals(preWord, word.preWord) && Objects.equals(nextWord, - word.nextWord) && Objects.equals(text, word.text) && posTag == word.posTag && Objects.equals(lemma, word.lemma) && Objects.equals( - ingoingDependencies, word.ingoingDependencies) && Objects.equals(outgoingDependencies, word.outgoingDependencies); + return indexInText == word.indexInText && sentenceNo == word.sentenceNo && Objects.equals(text, word.text) && posTag == word.posTag && Objects.equals( + lemma, word.lemma) && Objects.equals(ingoingDependencies, word.ingoingDependencies) && Objects.equals(outgoingDependencies, + word.outgoingDependencies); } @Override public int hashCode() { - return Objects.hash(indexInText, preWord, nextWord, 
sentenceNo, text, posTag, lemma, ingoingDependencies, outgoingDependencies); + return Objects.hash(indexInText, sentenceNo, text, posTag, lemma, ingoingDependencies, outgoingDependencies); } } diff --git a/framework/text-provider-json/src/main/resources/schemas/text.json b/framework/text-provider-json/src/main/resources/schemas/text.json index 2ce17633f..ee5974868 100644 --- a/framework/text-provider-json/src/main/resources/schemas/text.json +++ b/framework/text-provider-json/src/main/resources/schemas/text.json @@ -1,237 +1,243 @@ { - "$schema": "http://json-schema.org/draft-04/schema#", - "$id": "https://raw.githubusercontent.com/ArDoCo/Core/main/framework/text-provider-json/src/main/resources/schemas/text.json", - "title": "Text", - "description": "A definition of a text", - "type": "object", - "required": [ - "sentences" - ], - "properties": { - "sentences": { - "description": "the words that are contained in this sentence", - "type": "array", - "uniqueItems": true, - "items": { - "description": "Sentence in a text", - "type": "object", - "required": [ - "sentenceNo", - "text", - "constituencyTree", - "words" - ], - "properties": { - "sentenceNo": { - "description": "index of the sentence", - "type": "integer", - "minimum": 0 - }, - "text": { - "description": "the text of the sentence", - "type": "string" - }, - "constituencyTree": { - "description": "the constituency tree of the sentence in bracket notation", - "type": "string" - }, - "words": { + "$schema": "https://json-schema.org/draft/2020-12/schema#", + "$id": "https://raw.githubusercontent.com/ArDoCo/Core/main/framework/text-provider-json/src/main/resources/schemas/text.json", + "title": "Text", + "description": "A definition of a text", + "type": "object", + "required": [ + "sentences" + ], + "properties": { + "sentences": { "description": "the words that are contained in this sentence", "type": "array", "uniqueItems": true, "items": { - "description": "Definition of a word", - "type": "object", - 
"required": [ - "sentenceNo", - "id", - "text", - "lemma", - "posTag", - "outgoingDependencies", - "incomingDependencies" - ], - "properties": { - "sentenceNo": { - "description": "index of the sentence the word is contained in", - "type": "integer", - "minimum": 0 - }, - "id": { - "description": "The id of the word. Should be ascending from 1 for the first word in the text.", - "type": "integer", - "minimum": 1 - }, - "text": { - "description": "the text of the word", - "type": "string" - }, - "lemma": { - "description": "the lemma of the word", - "type": "string" - }, - "posTag": { - "$ref": "#/$defs/posTags" - }, - "outgoingDependencies": { - "description": "the outgoing dependencies", - "type": "array", - "uniqueItems": false, - "items": { - "type": "object", - "required": [ - "targetWordId", - "dependencyType" - ], - "properties": { - "targetWordId": { - "description": "The id of the word the dependency points to.", - "type": "integer" - }, - "dependencyType": { - "$refs": "#/$defs/dependencyTypes" - } + "description": "Sentence in a text", + "type": "object", + "required": [ + "sentenceNo", + "text", + "constituencyTree", + "words" + ], + "properties": { + "sentenceNo": { + "description": "index of the sentence", + "type": "integer", + "minimum": 0 + }, + "text": { + "description": "the text of the sentence", + "type": "string" + }, + "constituencyTree": { + "description": "the constituency tree of the sentence in bracket notation", + "type": "string" + }, + "words": { + "description": "the words that are contained in this sentence", + "type": "array", + "uniqueItems": true, + "items": { + "description": "Definition of a word", + "type": "object", + "required": [ + "sentenceNo", + "id", + "text", + "lemma", + "posTag", + "outgoingDependencies", + "incomingDependencies" + ], + "properties": { + "sentenceNo": { + "description": "index of the sentence the word is contained in", + "type": "integer", + "minimum": 0 + }, + "id": { + "description": "The id of the 
word. Should be ascending from 1 for the first word in the text.", + "type": "integer", + "minimum": 1 + }, + "text": { + "description": "the text of the word", + "type": "string" + }, + "lemma": { + "description": "the lemma of the word", + "type": "string" + }, + "posTag": { + "$ref": "#/$defs/posTags" + }, + "outgoingDependencies": { + "description": "the outgoing dependencies", + "type": "array", + "uniqueItems": false, + "items": { + "type": "object", + "required": [ + "targetWordId", + "dependencyType" + ], + "properties": { + "targetWordId": { + "description": "The id of the word the dependency points to.", + "type": "integer" + }, + "dependencyType": { + "$refs": "#/$defs/dependencyTypes" + } + } + } + }, + "incomingDependencies": { + "description": "the incoming dependencies", + "type": "array", + "uniqueItems": false, + "items": { + "type": "object", + "required": [ + "sourceWordId", + "dependencyType" + ], + "properties": { + "sourceWordId": { + "description": "The id of the word the dependency originates from.", + "type": "integer" + }, + "dependencyType": { + "$refs": "#/$defs/dependencyTypes" + } + } + } + } + } + } } - } - }, - "incomingDependencies": { - "description": "the incoming dependencies", - "type": "array", - "uniqueItems": false, - "items": { - "type": "object", - "required": [ - "sourceWordId", - "dependencyType" - ], - "properties": { - "sourceWordId": { - "description": "The id of the word the dependency originates from.", - "type": "integer" - }, - "dependencyType": { - "$refs": "#/$defs/dependencyTypes" - } - } - } } - } } - } } - } - } - }, - "$defs": { - "posTags": { - "description": "the lemma of the word", - "type": "string", - "enum": [ - "JJ", - "JJR", - "JJS", - "RB", - "RBR", - "RBS", - "WRB", - "CC", - "IN", - "CD", - "DT", - "WDT", - "EX", - "FW", - "HYPH", - "LS", - "NN", - "NNS", - "NNP", - "NNPS", - "PDT", - "POS", - "PRP", - "PRP$", - "WP$", - "WP", - "RP", - "SYM", - "TO", - "UH", - "VB", - "VBD", - "VBG", - "VBN", - 
"VBP", - "VBZ", - "MD", - ".", - ",", - ":", - "-LRB-", - "-RRB-", - "-NONE-", - "``", - "''", - "$", - "#" - ] }, - "dependencyTypes": { - "description": "The valid dependency tags", - "type": "string", - "enum": [ - "APPOS", - "NSUBJ", - "POSS", - "OBJ", - "IOBJ", - "NMOD", - "NSUBJPASS", - "POBJ", - "AGENT", - "NUM", - "PREDET", - "RCMOD", - "CSUBJ", - "CCOMP", - "XCOMP", - "OBL", - "VOCATIVE", - "EXPL", - "DISLOCATED", - "ADVCL", - "ADVMOD", - "DISCOURSE", - "AUXILIARY", - "COP", - "MARK", - "ACL", - "AMOD", - "DET", - "CLF", - "CASE", - "CONJ", - "CC", - "FIXED", - "FLAT", - "COMPOUND", - "LIST", - "PARATAXIS", - "ORPHAN", - "GOES_WITH", - "REPARANDUM", - "PUNCT", - "CSUBJ_PASS", - "ACL_RELCL", - "COMPOUND_PRT", - "NMOD_POSS", - "REF", - "NSUBJ_XSUBJ", - "NSUBJ_PASS_XSUBJ", - "NSUBJ_RELSUBJ", - "NSUBJ_PASS_RELSUBJ", - "OBJ_RELOBJ" - ] + "$defs": { + "posTags": { + "description": "the lemma of the word", + "type": "string", + "enum": [ + "JJ", + "JJR", + "JJS", + "RB", + "RBR", + "RBS", + "WRB", + "CC", + "IN", + "CD", + "DT", + "WDT", + "EX", + "FW", + "HYPH", + "LS", + "NN", + "NNS", + "NNP", + "NNPS", + "PDT", + "POS", + "PRP", + "PRP$", + "WP$", + "WP", + "RP", + "SYM", + "TO", + "UH", + "VB", + "VBD", + "VBG", + "VBN", + "VBP", + "VBZ", + "MD", + ".", + ",", + ":", + "-LRB-", + "-RRB-", + "-NONE-", + "``", + "''", + "$", + "#", + "HYPH", + "NFP", + "ADD", + "AFX", + "GW", + "XX" + ] + }, + "dependencyTypes": { + "description": "The valid dependency tags", + "type": "string", + "enum": [ + "APPOS", + "NSUBJ", + "POSS", + "OBJ", + "IOBJ", + "NMOD", + "NSUBJPASS", + "POBJ", + "AGENT", + "NUM", + "PREDET", + "RCMOD", + "CSUBJ", + "CCOMP", + "XCOMP", + "OBL", + "VOCATIVE", + "EXPL", + "DISLOCATED", + "ADVCL", + "ADVMOD", + "DISCOURSE", + "AUXILIARY", + "COP", + "MARK", + "ACL", + "AMOD", + "DET", + "CLF", + "CASE", + "CONJ", + "CC", + "FIXED", + "FLAT", + "COMPOUND", + "LIST", + "PARATAXIS", + "ORPHAN", + "GOES_WITH", + "REPARANDUM", + "PUNCT", + "CSUBJ_PASS", 
+ "ACL_RELCL", + "COMPOUND_PRT", + "NMOD_POSS", + "REF", + "NSUBJ_XSUBJ", + "NSUBJ_PASS_XSUBJ", + "NSUBJ_RELSUBJ", + "NSUBJ_PASS_RELSUBJ", + "OBJ_RELOBJ" + ] + } } - } -} \ No newline at end of file +} diff --git a/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/PhraseImplTest.java b/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/PhraseImplTest.java new file mode 100644 index 000000000..cb77b6bfc --- /dev/null +++ b/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/PhraseImplTest.java @@ -0,0 +1,92 @@ +/* Licensed under MIT 2023. */ +package edu.kit.kastel.mcse.ardoco.core.textproviderjson; + +import java.io.IOException; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import edu.kit.kastel.mcse.ardoco.core.api.text.Phrase; +import edu.kit.kastel.mcse.ardoco.core.api.text.Text; +import edu.kit.kastel.mcse.ardoco.core.textproviderjson.converter.DtoToObjectConverter; +import edu.kit.kastel.mcse.ardoco.core.textproviderjson.error.NotConvertableException; +import edu.kit.kastel.mcse.ardoco.core.textproviderjson.textobject.PhraseImpl; + +class PhraseImplTest { + + private static final DtoToObjectConverter CONVERTER = new DtoToObjectConverter(); + private static Phrase baselinePhrase; + private PhraseImpl phraseImplInstance; + + @BeforeAll + static void initAll() { + Text baselineText = TestUtil.generateTextWithMultipleSentences(); + baselinePhrase = baselineText.getSentences().get(1).getPhrases().get(0); + } + + @BeforeEach + void init() { + try { + Text textImplInstance = CONVERTER.convertText(TestUtil.generateDTOWithMultipleSentences()); + phraseImplInstance = (PhraseImpl) textImplInstance.getSentences().get(1).getPhrases().get(0); + } catch (NotConvertableException | IOException e) { + throw new RuntimeException(e); + } + } 
+ + @Test + void testGetSentenceNo() { + Assertions.assertEquals(baselinePhrase.getSentenceNo(), phraseImplInstance.getSentenceNo()); + } + + @Test + void testGetText() { + Assertions.assertEquals(baselinePhrase.getText(), phraseImplInstance.getText()); + } + + @Test + void testGetPhraseType() { + Assertions.assertEquals(baselinePhrase.getPhraseType(), phraseImplInstance.getPhraseType()); + } + + @Test + void testGetContainedWords() { + Assertions.assertEquals(baselinePhrase.getContainedWords().size(), phraseImplInstance.getContainedWords().size()); + } + + @Test + void testGetSubPhrases() { + Assertions.assertEquals(baselinePhrase.getSubPhrases().size(), phraseImplInstance.getSubPhrases().size()); + } + + @Test + void testIsSuperPhraseOf() { + Phrase subphrase = phraseImplInstance.getSubPhrases().get(0); + Assertions.assertAll(// + () -> Assertions.assertTrue(phraseImplInstance.isSuperPhraseOf(subphrase)), () -> Assertions.assertFalse(phraseImplInstance.isSuperPhraseOf( + phraseImplInstance)), () -> Assertions.assertFalse(subphrase.isSuperPhraseOf(phraseImplInstance))// + ); + } + + @Test + void testIsSubPhraseOf() { + Phrase subphrase = phraseImplInstance.getSubPhrases().get(0); + Assertions.assertAll(// + () -> Assertions.assertFalse(phraseImplInstance.isSubPhraseOf(subphrase)), () -> Assertions.assertFalse(phraseImplInstance.isSubPhraseOf( + phraseImplInstance)), () -> Assertions.assertTrue(subphrase.isSubPhraseOf(phraseImplInstance))// + ); + } + + @Test + void testGetPhraseVector() { + Assertions.assertEquals(baselinePhrase.getPhraseVector().size(), phraseImplInstance.getPhraseVector().size()); + } + + @Test + void simpleHashCodeTest() { + Assertions.assertEquals(phraseImplInstance.hashCode(), phraseImplInstance.hashCode()); + } + +} diff --git a/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/TestUtil.java b/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/TestUtil.java 
index e362e16ca..4e2015f0c 100644 --- a/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/TestUtil.java +++ b/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/TestUtil.java @@ -7,10 +7,14 @@ import org.eclipse.collections.api.factory.Lists; -import edu.kit.kastel.mcse.ardoco.core.api.text.*; +import edu.kit.kastel.mcse.ardoco.core.api.text.DependencyTag; +import edu.kit.kastel.mcse.ardoco.core.api.text.POSTag; +import edu.kit.kastel.mcse.ardoco.core.api.text.Phrase; +import edu.kit.kastel.mcse.ardoco.core.api.text.PhraseType; +import edu.kit.kastel.mcse.ardoco.core.api.text.Sentence; +import edu.kit.kastel.mcse.ardoco.core.api.text.Text; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.IncomingDependencyDto; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.OutgoingDependencyDto; -import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.PosTag; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.SentenceDto; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.TextDto; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.WordDto; @@ -30,7 +34,7 @@ private TestUtil() { /** * generates a default textDTO without dependencies between the words - * + * * @return a default textDTO */ public static TextDto generateDefaultDTO() throws IOException { @@ -39,28 +43,28 @@ public static TextDto generateDefaultDTO() throws IOException { word1.setSentenceNo(1); word1.setLemma("this"); word1.setText("This"); - word1.setPosTag(PosTag.forValue("DT")); + word1.setPosTag(POSTag.forValue("DT")); WordDto word2 = new WordDto(); word2.setId(2); word2.setSentenceNo(1); word2.setLemma("be"); word2.setText("is"); - word2.setPosTag(PosTag.forValue("VBZ")); + word2.setPosTag(POSTag.forValue("VBZ")); WordDto word3 = new WordDto(); word3.setId(3); word3.setSentenceNo(1); word3.setLemma("I"); word3.setText("me"); - word3.setPosTag(PosTag.forValue("PRP")); + 
word3.setPosTag(POSTag.forValue("PRP")); WordDto word4 = new WordDto(); word4.setId(4); word4.setSentenceNo(1); word4.setLemma("."); word4.setText("."); - word4.setPosTag(PosTag.forValue(".")); + word4.setPosTag(POSTag.forValue(".")); List words = new ArrayList<>(List.of(word1, word2, word3, word4)); @@ -126,28 +130,28 @@ public static TextDto generateDTOWithMultipleSentences() throws IOException { word1.setSentenceNo(1); word1.setLemma("this"); word1.setText("This"); - word1.setPosTag(PosTag.forValue("DT")); + word1.setPosTag(POSTag.forValue("DT")); WordDto word2 = new WordDto(); word2.setId(2); word2.setSentenceNo(1); word2.setLemma("be"); word2.setText("is"); - word2.setPosTag(PosTag.forValue("VBZ")); + word2.setPosTag(POSTag.forValue("VBZ")); WordDto word3 = new WordDto(); word3.setId(3); word3.setSentenceNo(1); word3.setLemma("I"); word3.setText("me"); - word3.setPosTag(PosTag.forValue("PRP")); + word3.setPosTag(POSTag.forValue("PRP")); WordDto word4 = new WordDto(); word4.setId(4); word4.setSentenceNo(1); word4.setLemma("."); word4.setText("."); - word4.setPosTag(PosTag.forValue(".")); + word4.setPosTag(POSTag.forValue(".")); List words = new ArrayList<>(List.of(word1, word2, word3, word4)); @@ -162,28 +166,28 @@ public static TextDto generateDTOWithMultipleSentences() throws IOException { word5.setSentenceNo(2); word5.setLemma("this"); word5.setText("This"); - word5.setPosTag(PosTag.forValue("DT")); + word5.setPosTag(POSTag.forValue("DT")); WordDto word6 = new WordDto(); word6.setId(6); word6.setSentenceNo(2); word6.setLemma("be"); word6.setText("is"); - word6.setPosTag(PosTag.forValue("VBZ")); + word6.setPosTag(POSTag.forValue("VBZ")); WordDto word7 = new WordDto(); word7.setId(7); word7.setSentenceNo(2); word7.setLemma("you"); word7.setText("you"); - word7.setPosTag(PosTag.forValue("PRP")); + word7.setPosTag(POSTag.forValue("PRP")); WordDto word8 = new WordDto(); word8.setId(8); word8.setSentenceNo(2); word8.setLemma("."); word8.setText("."); - 
word8.setPosTag(PosTag.forValue(".")); + word8.setPosTag(POSTag.forValue(".")); List words2 = new ArrayList<>(List.of(word5, word6, word7, word8)); @@ -268,7 +272,7 @@ public static TextDto generateTextDtoWithDependencies() throws IOException { word1.setSentenceNo(1); word1.setLemma("hello"); word1.setText("Hello"); - word1.setPosTag(PosTag.forValue("UH")); + word1.setPosTag(POSTag.forValue("UH")); OutgoingDependencyDto outgoingDependency = new OutgoingDependencyDto(); outgoingDependency.setTargetWordId(2); outgoingDependency.setDependencyTag(DependencyTag.PUNCT); @@ -279,7 +283,7 @@ public static TextDto generateTextDtoWithDependencies() throws IOException { word2.setSentenceNo(1); word2.setLemma("."); word2.setText("."); - word2.setPosTag(PosTag.forValue(".")); + word2.setPosTag(POSTag.forValue(".")); IncomingDependencyDto incomingDependency = new IncomingDependencyDto(); incomingDependency.setSourceWordId(1); incomingDependency.setDependencyTag(DependencyTag.PUNCT); diff --git a/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/TextImplTest.java b/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/TextImplTest.java new file mode 100644 index 000000000..d3d3ee2eb --- /dev/null +++ b/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/TextImplTest.java @@ -0,0 +1,59 @@ +/* Licensed under MIT 2023. 
*/ +package edu.kit.kastel.mcse.ardoco.core.textproviderjson; + +import java.io.IOException; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import edu.kit.kastel.mcse.ardoco.core.api.text.Text; +import edu.kit.kastel.mcse.ardoco.core.textproviderjson.converter.DtoToObjectConverter; +import edu.kit.kastel.mcse.ardoco.core.textproviderjson.error.NotConvertableException; +import edu.kit.kastel.mcse.ardoco.core.textproviderjson.textobject.TextImpl; + +class TextImplTest { + private static final DtoToObjectConverter CONVERTER = new DtoToObjectConverter(); + private static Text baselineText; + private TextImpl textImplInstance; + + @BeforeAll + static void initAll() { + baselineText = TestUtil.generateTextWithMultipleSentences(); + } + + @BeforeEach + void init() { + try { + textImplInstance = (TextImpl) CONVERTER.convertText(TestUtil.generateDTOWithMultipleSentences()); + } catch (NotConvertableException | IOException e) { + throw new RuntimeException(e); + } + } + + @Test + void getLengthTest() { + Assertions.assertEquals(baselineText.getLength(), textImplInstance.getLength()); + } + + @Test + void wordsTest() { + Assertions.assertEquals(baselineText.words().size(), textImplInstance.words().size()); + } + + @Test + void getWordTest() { + Assertions.assertEquals(baselineText.getWord(0), textImplInstance.getWord(0)); + } + + @Test + void getSentencesTest() { + Assertions.assertEquals(baselineText.getSentences().size(), textImplInstance.getSentences().size()); + } + + @Test + void simpleHashCodeTest() { + Assertions.assertEquals(textImplInstance.hashCode(), textImplInstance.hashCode()); + } +} diff --git a/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/converter/JsonConverterTest.java b/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/converter/JsonConverterTest.java 
index 591438e1e..5e38ab8b8 100644 --- a/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/converter/JsonConverterTest.java +++ b/framework/text-provider-json/src/test/java/edu/kit/kastel/mcse/ardoco/core/textproviderjson/converter/JsonConverterTest.java @@ -11,9 +11,9 @@ import org.junit.jupiter.api.Test; import edu.kit.kastel.mcse.ardoco.core.api.text.DependencyTag; +import edu.kit.kastel.mcse.ardoco.core.api.text.POSTag; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.IncomingDependencyDto; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.OutgoingDependencyDto; -import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.PosTag; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.SentenceDto; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.TextDto; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.dto.WordDto; @@ -65,7 +65,7 @@ private TextDto getValidTextDtoExample() throws IOException { expectedWord.setSentenceNo(1); expectedWord.setLemma("hello"); expectedWord.setText("Hello"); - expectedWord.setPosTag(PosTag.forValue("UH")); + expectedWord.setPosTag(POSTag.forValue("UH")); OutgoingDependencyDto expectedOutDep = new OutgoingDependencyDto(); expectedOutDep.setTargetWordId(1); diff --git a/report/pom.xml b/report/pom.xml index ad2095ee0..957c05cc7 100644 --- a/report/pom.xml +++ b/report/pom.xml @@ -116,6 +116,11 @@ + + io.github.ardoco.core + text-provider-json + ${revision} + org.junit.jupiter junit-jupiter-engine diff --git a/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/TextImpl.java b/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/TextImpl.java index 8f06559aa..6528703b3 100644 --- a/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/TextImpl.java +++ 
b/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/TextImpl.java @@ -1,6 +1,9 @@ /* Licensed under MIT 2022-2023. */ package edu.kit.kastel.mcse.ardoco.core.text.providers.informants.corenlp; +import java.util.SortedMap; +import java.util.TreeMap; + import org.eclipse.collections.api.factory.Lists; import org.eclipse.collections.api.list.ImmutableList; import org.eclipse.collections.api.list.MutableList; @@ -15,6 +18,7 @@ public class TextImpl implements Text { final CoreDocument coreDocument; private ImmutableList sentences = Lists.immutable.empty(); private ImmutableList words = Lists.immutable.empty(); + private final SortedMap wordsIndex = new TreeMap<>(); public TextImpl(CoreDocument coreDocument) { this.coreDocument = coreDocument; @@ -28,6 +32,14 @@ public ImmutableList words() { return words; } + @Override + public synchronized Word getWord(int index) { + if (wordsIndex.isEmpty()) { + words(); + } + return wordsIndex.get(index); + } + @Override public ImmutableList getSentences() { if (sentences.isEmpty()) { @@ -56,6 +68,11 @@ private void iterateDocumentForWordsAndSentences() { sentences = sentenceList.toImmutable(); words = wordList.toImmutable(); + int index = 0; + for (Word word : words) { + wordsIndex.put(index, word); + index++; + } } } diff --git a/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/WordImpl.java b/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/WordImpl.java index 02f20726e..c53b2d6c6 100644 --- a/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/WordImpl.java +++ b/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/WordImpl.java @@ -84,7 +84,7 @@ public POSTag getPosTag() { public Word getPreWord() { int preWordIndex = index - 1; if (preWord == null && 
preWordIndex > 0) { - preWord = parent.words().get(preWordIndex); + preWord = parent.getWord(preWordIndex); } return preWord; } @@ -93,7 +93,7 @@ public Word getPreWord() { public Word getNextWord() { int nextWordIndex = index + 1; if (nextWord == null && nextWordIndex < parent.getLength()) { - nextWord = parent.words().get(nextWordIndex); + nextWord = parent.getWord(nextWordIndex); } return nextWord; } @@ -103,10 +103,6 @@ public int getPosition() { return index; } - protected int getPositionInSentence() { - return this.token.index(); - } - protected int getBeginCharPosition() { return this.token.beginPosition(); } @@ -148,7 +144,7 @@ public ImmutableList getIncomingDependencyWordsWithType(DependencyTag depe private Word getCorrespondingWordForFirstTokenBasedOnSecondToken(CoreLabel firstToken, CoreLabel secondToken) { var firstTokenIndex = (firstToken.index() - secondToken.index()) + index; - return parent.words().get(firstTokenIndex); + return parent.getWord(firstTokenIndex); } private List getDependenciesOfType(DependencyTag dependencyTag) { diff --git a/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/config/ConfigManager.java b/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/config/ConfigManager.java index 3135eb6c6..8bcf043bb 100644 --- a/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/config/ConfigManager.java +++ b/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/config/ConfigManager.java @@ -32,7 +32,7 @@ private ConfigManager() { logger.warn("Could not load config file. 
", e); properties.setProperty(PROPERTY_MICROSERVICE_URL, "http://localhost:8080"); properties.setProperty(PROPERTY_NLP_PROVIDER_SOURCE, "local"); - properties.setProperty(PROPERTY_CORENLP_SERVICE, "/stanfordnlp?text="); + properties.setProperty(PROPERTY_CORENLP_SERVICE, "/stanfordnlp"); properties.setProperty(PROPERTY_HEALTH_SERVICE, "/stanfordnlp/health"); } if (System.getenv("MICROSERVICE_URL") != null) { diff --git a/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/textprocessor/HttpCommunicator.java b/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/textprocessor/HttpCommunicator.java index 4aa468dac..eb97cee3b 100644 --- a/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/textprocessor/HttpCommunicator.java +++ b/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/textprocessor/HttpCommunicator.java @@ -2,22 +2,29 @@ package edu.kit.kastel.mcse.ardoco.core.text.providers.informants.corenlp.textprocessor; import java.io.IOException; +import java.nio.charset.StandardCharsets; import org.apache.hc.client5.http.auth.AuthScope; import org.apache.hc.client5.http.auth.UsernamePasswordCredentials; import org.apache.hc.client5.http.classic.methods.HttpGet; +import org.apache.hc.client5.http.classic.methods.HttpPost; import org.apache.hc.client5.http.impl.auth.BasicCredentialsProvider; import org.apache.hc.client5.http.impl.classic.BasicHttpClientResponseHandler; import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; import org.apache.hc.client5.http.impl.classic.HttpClients; +import org.apache.hc.core5.http.ContentType; +import org.apache.hc.core5.http.io.entity.StringEntity; public class HttpCommunicator { + public static final String ENV_USERNAME = "SCNLP_SERVICE_USER"; + public static final String ENV_PASSWORD = 
"SCNLP_SERVICE_PASSWORD"; + public String sendAuthenticatedGetRequest(String requestUrl) throws IOException { - String username = System.getenv("USERNAME"); - String password = System.getenv("PASSWORD"); + String username = System.getenv(ENV_USERNAME); + String password = System.getenv(ENV_PASSWORD); if (username == null || password == null) { - throw new IOException("Environment variables USERNAME and PASSWORD must be set."); + throw new IOException("Environment variables " + ENV_USERNAME + " and " + ENV_PASSWORD + " must be set."); } HttpGet request = new HttpGet(requestUrl); @@ -27,4 +34,22 @@ public String sendAuthenticatedGetRequest(String requestUrl) throws IOException return httpClient.execute(request, new BasicHttpClientResponseHandler()); } } + + public String sendAuthenticatedPostRequest(String requestUrl, String body) throws IOException { + String username = System.getenv(ENV_USERNAME); + String password = System.getenv(ENV_PASSWORD); + if (username == null || password == null) { + throw new IOException("Environment variables " + ENV_USERNAME + " and " + ENV_PASSWORD + " must be set."); + } + + HttpPost request = new HttpPost(requestUrl); + StringEntity requestEntity = new StringEntity(body, ContentType.APPLICATION_JSON, StandardCharsets.UTF_8.toString(), false); + request.setEntity(requestEntity); + + BasicCredentialsProvider provider = new BasicCredentialsProvider(); + provider.setCredentials(new AuthScope(null, -1), new UsernamePasswordCredentials(username, password.toCharArray())); + try (CloseableHttpClient httpClient = HttpClients.custom().setDefaultCredentialsProvider(provider).build()) { + return httpClient.execute(request, new BasicHttpClientResponseHandler()); + } + } } diff --git a/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/textprocessor/TextProcessor.java 
b/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/textprocessor/TextProcessor.java index 163269750..bba62d495 100644 --- a/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/textprocessor/TextProcessor.java +++ b/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/textprocessor/TextProcessor.java @@ -37,9 +37,8 @@ public Text processText(String inputText) { int numberOfTry = 0; while (numberOfTry < MAX_FAILED_SERVICE_REQUESTS) { try { - Text processedText = processService(inputText); - logger.info("Processed text with CoreNLP microservice."); - return processedText; + logger.info("Processing text with CoreNLP microservice."); + return processService(inputText); } catch (IOException e) { numberOfTry++; logger.warn("Could not process text with CoreNLP microservice. Trying again. ", e); @@ -50,7 +49,7 @@ public Text processText(String inputText) { } logger.warn("Could not process text with CoreNLP microservice. 
Processing locally instead."); } - logger.info("Processed text locally."); + logger.info("Processing text locally."); return processLocally(inputText); } diff --git a/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/textprocessor/TextProcessorService.java b/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/textprocessor/TextProcessorService.java index 59d281489..f1c6412ef 100644 --- a/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/textprocessor/TextProcessorService.java +++ b/stages/text-preprocessing/src/main/java/edu/kit/kastel/mcse/ardoco/core/text/providers/informants/corenlp/textprocessor/TextProcessorService.java @@ -5,6 +5,8 @@ import java.net.URLEncoder; import java.nio.charset.StandardCharsets; +import org.jetbrains.annotations.NotNull; + import edu.kit.kastel.mcse.ardoco.core.api.text.Text; import edu.kit.kastel.mcse.ardoco.core.text.providers.informants.corenlp.config.ConfigManager; import edu.kit.kastel.mcse.ardoco.core.textproviderjson.converter.DtoToObjectConverter; @@ -26,20 +28,30 @@ public class TextProcessorService { */ public Text processText(String inputText) throws IOException, InvalidJsonException, NotConvertableException { TextDto textDto; - String jsonText = sendCorenlpRequest(inputText); + String jsonText = sendCoreNlpRequest(inputText); textDto = JsonConverter.fromJsonString(jsonText); return new DtoToObjectConverter().convertText(textDto); } - private String sendCorenlpRequest(String inputText) throws IOException { - inputText = URLEncoder.encode(inputText, StandardCharsets.UTF_8); + private String sendCoreNlpRequest(String inputText) throws IOException { + String encodedText = encodeText(inputText); ConfigManager configManager = ConfigManager.INSTANCE; - String requestUrl = configManager.getMicroserviceUrl() + configManager.getCorenlpService() + inputText; - return 
sendAuthenticatedGetRequest(requestUrl); + String requestUrl = configManager.getMicroserviceUrl() + configManager.getCorenlpService(); + return sendAuthenticatedPostRequest(requestUrl, encodedText); + } + + private static String encodeText(String inputText) { + return URLEncoder.encode(inputText, StandardCharsets.UTF_8); } - private String sendAuthenticatedGetRequest(String requestUrl) throws IOException { + private String sendAuthenticatedPostRequest(String requestUrl, String encodedText) throws IOException { HttpCommunicator httpCommunicator = new HttpCommunicator(); - return httpCommunicator.sendAuthenticatedGetRequest(requestUrl); + String body = getRequestBodyString(encodedText); + return httpCommunicator.sendAuthenticatedPostRequest(requestUrl, body); + } + + @NotNull + private static String getRequestBodyString(String encodedText) { + return "{\"text\": \"" + encodedText + "\"}"; } } diff --git a/stages/text-preprocessing/src/main/resources/config.properties b/stages/text-preprocessing/src/main/resources/config.properties index e25bff7ea..bf99a5e1e 100644 --- a/stages/text-preprocessing/src/main/resources/config.properties +++ b/stages/text-preprocessing/src/main/resources/config.properties @@ -1,5 +1,5 @@ nlpProviderSource=microservice -microserviceUrl= http://localhost:8080 -corenlpService=/stanfordnlp?text= +microserviceUrl=http://localhost:8080 +corenlpService=/stanfordnlp healthService=/stanfordnlp/health