Merge pull request #1437 from dkpro/feature/1422-Trim-field-values-by-default

#1422 - Trim field values by default
reckart authored Nov 30, 2019
2 parents de18859 + 3a2ced0 commit 2d84c18
Showing 10 changed files with 279 additions and 184 deletions.
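
The changes below swap the readers' base class from JCasResourceCollectionReader_ImplBase to a new ConllReader_ImplBase and route every raw column value through its trim() and cleanTag() helpers. The new base class is among the changed files not rendered on this page, so the following is only a rough sketch of what those helpers might do, inferred from how they are called in the diffs below; the method bodies and the extends clause are assumptions, not the actual implementation.

package org.dkpro.core.io.conll.internal;

import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;

public abstract class ConllReader_ImplBase
    extends JCasResourceCollectionReader_ImplBase
{
    // Hypothetical sketch: strip leading/trailing whitespace from a raw field value.
    protected String trim(String aValue)
    {
        return aValue != null ? aValue.trim() : null;
    }

    // Hypothetical sketch: trim a tag value and intern it, replacing the inline
    // word[POSTAG].intern() calls the readers used before this change.
    protected String cleanTag(String aValue)
    {
        return aValue != null ? aValue.trim().intern() : null;
    }
}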
Conll2000Reader.java
@@ -42,12 +42,12 @@
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.core.api.io.IobDecoder;
- import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import org.dkpro.core.api.lexmorph.pos.POSUtils;
import org.dkpro.core.api.parameter.ComponentParameters;
import org.dkpro.core.api.parameter.MimeTypes;
import org.dkpro.core.api.resources.CompressionUtils;
import org.dkpro.core.api.resources.MappingProvider;
+ import org.dkpro.core.io.conll.internal.ConllReader_ImplBase;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
@@ -70,7 +70,7 @@
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk"})
public class Conll2000Reader
- extends JCasResourceCollectionReader_ImplBase
+ extends ConllReader_ImplBase
{
private static final int FORM = 0;
private static final int POSTAG = 1;
@@ -141,7 +141,7 @@ public class Conll2000Reader
ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION;
@ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false)
protected String chunkMappingLocation;

private MappingProvider posMappingProvider;
private MappingProvider chunkMappingProvider;

@@ -213,22 +213,24 @@ private void convert(JCas aJCas, BufferedReader aReader)
int i = 0;
for (String[] word : words) {
// Read token
- Token token = doc.add(word[FORM], Token.class);
+ Token token = doc.add(trim(word[FORM]), Token.class);
sentenceEnd = token.getEnd();
doc.add(" ");

if (posEnabled) {
- Type posTag = posMappingProvider.getTagType(word[POSTAG]);
+ String posTagValue = cleanTag(word[POSTAG]);
+
+ Type posTag = posMappingProvider.getTagType(posTagValue);
POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
token.getEnd());
- pos.setPosValue(word[POSTAG] != null ? word[POSTAG].intern() : null);
+ pos.setPosValue(posTagValue);
POSUtils.assignCoarseValue(pos);
pos.addToIndexes();
token.setPos(pos);
}

tokens.add(token);
- chunkTags[i] = word[IOB];
+ chunkTags[i] = trim(word[IOB]);
i++;
}

Conll2002Reader.java
@@ -41,11 +41,11 @@
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.core.api.io.IobDecoder;
- import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import org.dkpro.core.api.parameter.ComponentParameters;
import org.dkpro.core.api.parameter.MimeTypes;
import org.dkpro.core.api.resources.CompressionUtils;
import org.dkpro.core.api.resources.MappingProvider;
+ import org.dkpro.core.io.conll.internal.ConllReader_ImplBase;

import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
@@ -121,7 +121,7 @@
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"})
public class Conll2002Reader
- extends JCasResourceCollectionReader_ImplBase
+ extends ConllReader_ImplBase
{
/**
* Column Separators
@@ -317,14 +317,14 @@ private void convert(JCas aJCas, BufferedReader aReader)
String[] word = wordIterator.next();

// Read token
- Token token = doc.add(word[FORM], Token.class);
+ Token token = doc.add(trim(word[FORM]), Token.class);
sentenceEnd = token.getEnd();
if (wordIterator.hasNext()) {
doc.add(" ");
}

tokens.add(token);
- namedEntityTags[i] = word[IOB];
+ namedEntityTags[i] = cleanTag(word[IOB]);
i++;
}

Conll2003Reader.java
@@ -42,12 +42,12 @@
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.core.api.io.IobDecoder;
- import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import org.dkpro.core.api.lexmorph.pos.POSUtils;
import org.dkpro.core.api.parameter.ComponentParameters;
import org.dkpro.core.api.parameter.MimeTypes;
import org.dkpro.core.api.resources.CompressionUtils;
import org.dkpro.core.api.resources.MappingProvider;
+ import org.dkpro.core.io.conll.internal.ConllReader_ImplBase;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
@@ -72,7 +72,7 @@
"de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk",
"de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" })
public class Conll2003Reader
- extends JCasResourceCollectionReader_ImplBase
+ extends ConllReader_ImplBase
{
private static final int FORM = 0;
private static final int POSTAG = 1;
@@ -251,23 +251,25 @@ private void convert(JCas aJCas, BufferedReader aReader)
int i = 0;
for (String[] word : words) {
// Read token
- Token token = doc.add(word[FORM], Token.class);
+ Token token = doc.add(trim(word[FORM]), Token.class);
sentenceEnd = token.getEnd();
doc.add(" ");

if (posEnabled) {
- Type posTag = posMappingProvider.getTagType(word[POSTAG]);
+ String posTagValue = cleanTag(word[POSTAG]);
+
+ Type posTag = posMappingProvider.getTagType(posTagValue);
POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
token.getEnd());
- pos.setPosValue(word[POSTAG] != null ? word[POSTAG].intern() : null);
+ pos.setPosValue(posTagValue);
POSUtils.assignCoarseValue(pos);
pos.addToIndexes();
token.setPos(pos);
}

tokens.add(token);
- chunkTags[i] = word[CHUNK];
- namedEntityTags[i] = word[NAMED_ENTITY];
+ chunkTags[i] = cleanTag(word[CHUNK]);
+ namedEntityTags[i] = cleanTag(word[NAMED_ENTITY]);
i++;
}

Conll2006Reader.java
@@ -17,6 +17,7 @@
*/
package org.dkpro.core.io.conll;

+ import static de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor.BASIC;
import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider;

@@ -41,19 +42,18 @@
import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
- import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import org.dkpro.core.api.parameter.ComponentParameters;
import org.dkpro.core.api.parameter.MimeTypes;
import org.dkpro.core.api.resources.CompressionUtils;
import org.dkpro.core.api.resources.MappingProvider;
+ import org.dkpro.core.io.conll.internal.ConllReader_ImplBase;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
- import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT;
import eu.openminted.share.annotations.api.DocumentationResource;

@@ -75,7 +75,7 @@
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
"de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" })
public class Conll2006Reader
- extends JCasResourceCollectionReader_ImplBase
+ extends ConllReader_ImplBase
{
/**
* Character encoding of the input data.
@@ -228,33 +228,34 @@ public void convert(JCas aJCas, BufferedReader aReader)
while (wordIterator.hasNext()) {
String[] word = wordIterator.next();
// Read token
- Token token = doc.add(word[FORM], Token.class);
- tokens.put(Integer.valueOf(word[ID]), token);
+ Token token = doc.add(trim(word[FORM]), Token.class);
+ tokens.put(Integer.valueOf(trim(word[ID])), token);
if (wordIterator.hasNext()) {
doc.add(" ");
}

// Read lemma
if (!UNUSED.equals(word[LEMMA]) && readLemma) {
Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
- lemma.setValue(word[LEMMA]);
+ lemma.setValue(trim(word[LEMMA]));
lemma.addToIndexes();
token.setLemma(lemma);
}

// Read part-of-speech tag
POS pos = null;
- String tag = useCPosAsPos ? word[CPOSTAG] : word[POSTAG];
+ String cPosTag = cleanTag(word[CPOSTAG]);
+ String tag = useCPosAsPos ? cPosTag : cleanTag(word[POSTAG]);
if (!UNUSED.equals(tag) && readPos) {
Type posTag = posMappingProvider.getTagType(tag);
pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
token.getEnd());
- pos.setPosValue(tag != null ? tag.intern() : null);
+ pos.setPosValue(tag);
}

// Read coarse part-of-speech tag
- if (!UNUSED.equals(word[CPOSTAG]) && readCPos && pos != null) {
- pos.setCoarseValue(word[CPOSTAG] != null ? word[CPOSTAG].intern() : null);
+ if (!UNUSED.equals(cPosTag) && readCPos && pos != null) {
+ pos.setCoarseValue(cPosTag);
}

if (pos != null) {
@@ -263,10 +264,11 @@ public void convert(JCas aJCas, BufferedReader aReader)
}

// Read morphological features
- if (!UNUSED.equals(word[FEATS]) && readMorph) {
+ String featsValue = cleanTag(word[FEATS]);
+ if (!UNUSED.equals(featsValue) && readMorph) {
MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas,
token.getBegin(), token.getEnd());
- morphtag.setValue(word[FEATS]);
+ morphtag.setValue(featsValue);
morphtag.addToIndexes();
token.setMorph(morphtag);
}
@@ -277,29 +279,30 @@ public void convert(JCas aJCas, BufferedReader aReader)
// Read dependencies
if (readDependency) {
for (String[] word : words) {
- if (!UNUSED.equals(word[DEPREL])) {
- int depId = Integer.valueOf(word[ID]);
- int govId = Integer.valueOf(word[HEAD]);
+ String depRel = cleanTag(word[DEPREL]);
+ if (!UNUSED.equals(depRel)) {
+ int depId = Integer.valueOf(trim(word[ID]));
+ int govId = Integer.valueOf(trim(word[HEAD]));

// Model the root as a loop onto itself
if (govId == 0) {
Dependency rel = new ROOT(aJCas);
rel.setGovernor(tokens.get(depId));
rel.setDependent(tokens.get(depId));
- rel.setDependencyType(word[DEPREL]);
+ rel.setDependencyType(depRel);
rel.setBegin(rel.getDependent().getBegin());
rel.setEnd(rel.getDependent().getEnd());
- rel.setFlavor(DependencyFlavor.BASIC);
+ rel.setFlavor(BASIC);
rel.addToIndexes();
}
else {
Dependency rel = new Dependency(aJCas);
rel.setGovernor(tokens.get(govId));
rel.setDependent(tokens.get(depId));
- rel.setDependencyType(word[DEPREL]);
+ rel.setDependencyType(depRel);
rel.setBegin(rel.getDependent().getBegin());
rel.setEnd(rel.getDependent().getEnd());
- rel.setFlavor(DependencyFlavor.BASIC);
+ rel.setFlavor(BASIC);
rel.addToIndexes();
}
}
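
For context, these readers are normally wired up via uimaFIT. Below is a minimal usage sketch, assuming the standard DKPro Core reader parameters PARAM_SOURCE_LOCATION and PARAM_LANGUAGE and a placeholder input path; with this change, column values such as FORM, LEMMA, POSTAG, and DEPREL are trimmed before being written to the CAS.

import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.dkpro.core.io.conll.Conll2006Reader;

public class Conll2006ReaderExample
{
    public static void main(String[] args) throws Exception
    {
        // Build a reader description; the source location pattern is a placeholder.
        CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
                Conll2006Reader.class,
                Conll2006Reader.PARAM_SOURCE_LOCATION, "input/*.conll",
                Conll2006Reader.PARAM_LANGUAGE, "en");

        // The description can then be run, e.g., with SimplePipeline.runPipeline(reader, ...)
        // together with downstream analysis engines.
    }
}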