Skip to content
This repository has been archived by the owner on Mar 13, 2023. It is now read-only.

Commit

Permalink
Version bump, last minute changes.
Browse files Browse the repository at this point in the history
  • Loading branch information
benknoll-umn committed May 24, 2017
1 parent 6b7bae4 commit 9987d94
Show file tree
Hide file tree
Showing 10 changed files with 361 additions and 143 deletions.
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@
<groupId>edu.umn.biomedicus</groupId>
<artifactId>biomedicus-gpl</artifactId>
<packaging>jar</packaging>
<version>1.6.0-SNAPSHOT</version>
<version>1.6.0</version>

<name>biomedicus-gpl</name>
<description>BioMedICUS Annotation System - GPL Extensions</description>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<biomedicus.version>1.6.0-SNAPSHOT</biomedicus.version>
<biomedicus.version>1.6.0</biomedicus.version>
</properties>

<dependencies>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
/*
* Copyright (C) 2016 Regents of the University of Minnesota
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package edu.umn.biomedicus.gpl.penntree;

import com.google.inject.Inject;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.trees.PennTreeReaderFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.umn.biomedicus.common.types.syntax.PartOfSpeech;
import edu.umn.biomedicus.common.types.syntax.PartsOfSpeech;
import edu.umn.biomedicus.common.types.text.ImmutableParseToken;
import edu.umn.biomedicus.common.types.text.Sentence;
import edu.umn.biomedicus.framework.store.Document;
import edu.umn.biomedicus.framework.store.Label;
import edu.umn.biomedicus.framework.store.Span;
import edu.umn.biomedicus.framework.store.TextView;
import edu.umn.biomedicus.uima.adapter.UimaAdapters;
import edu.umn.biomedicus.uima.files.InputFileAdapter;
import edu.umn.biomedicus.uima.labels.LabelAdapters;
import java.io.IOException;
import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.apache.uima.cas.CAS;
import org.apache.uima.collection.CollectionException;

/**
 * Adapts Penn treebank format files to CAS files.
 *
 * <p>Every tree read from the file becomes one sentence. The document text is rebuilt from the
 * tagged words of each tree: a single space is written before every tagged word, bracket and
 * quote escapes are converted back to their literal characters, and each word that is not tagged
 * {@code -NONE-} is labeled with a parse token and a part of speech.
 *
 * @author Ben Knoll
 * @since 1.3.0
 */
public final class PennTreebankInputFileAdapter implements InputFileAdapter {

  /**
   * Tag that occurs in some documents when there is an assumed phrase; such elements produce no
   * token.
   */
  private static final String NONE_TAG = "-NONE-";

  /**
   * The penn tree reader factory.
   */
  private final PennTreeReaderFactory pennTreeReaderFactory = new PennTreeReaderFactory();

  private final LabelAdapters labelAdapters;

  /**
   * Picks one alternative when a word carries an ambiguous tag such as {@code A|B}. Shared so
   * that a generator is not allocated for every ambiguous token.
   */
  private final Random random = new Random();

  /**
   * The view name to load into.
   */
  private String viewName;

  @Inject
  public PennTreebankInputFileAdapter(LabelAdapters labelAdapters) {
    this.labelAdapters = labelAdapters;
  }

  /**
   * Reads all trees from {@code path}, builds the document text from them, and labels sentences,
   * parse tokens and parts of speech on a new text view named by {@link #setTargetView(String)}.
   *
   * @param cas the CAS the document is created in
   * @param path the Penn treebank file to read; its file name is passed on when the document is
   *     created
   * @throws CollectionException if reading the file fails with an {@link IOException}
   * @throws AssertionError if a tag cannot be resolved to a part of speech
   */
  @Override
  public void adaptFile(CAS cas, Path path) throws CollectionException {
    StringBuilder text = new StringBuilder();
    List<SentenceBuilder> sentences = new ArrayList<>();
    try (Reader reader = Files.newBufferedReader(path)) {
      TreeReader treeReader = pennTreeReaderFactory.newTreeReader(reader);
      Tree tree;
      while ((tree = treeReader.readTree()) != null) {
        sentences.add(readSentence(tree, text));
      }
    } catch (IOException e) {
      throw new CollectionException(e);
    }

    Document document = UimaAdapters.createDocument(cas, labelAdapters,
        path.getFileName().toString());
    TextView textView = document.newTextView()
        .withText(text.toString())
        .withName(viewName)
        .build();

    for (SentenceBuilder sentence : sentences) {
      textView.label(Label.create(sentence.sentenceSpan, Sentence.create()));

      for (TokenBuilder token : sentence.tokenBuilders) {
        Span tokenSpan = token.tokenSpan;

        textView.label(Label.create(tokenSpan,
            ImmutableParseToken.builder()
                .text(tokenSpan.getCovered(text).toString())
                .hasSpaceAfter(true)
                .build()
        ));

        // Never null: resolvePartOfSpeech rejects unresolved tags before a token is recorded.
        textView.label(Label.create(tokenSpan, token.partOfSpeech));
      }
    }
  }

  @Override
  public void setTargetView(String viewName) {
    this.viewName = viewName;
  }

  /**
   * Appends the words of one tree to {@code text} and records the resulting sentence and token
   * offsets.
   *
   * @param tree the tree whose tagged yield is read
   * @param text the document text built so far; appended to in place
   * @return the sentence span together with the tokens found inside it
   */
  private SentenceBuilder readSentence(Tree tree, StringBuilder text) {
    int sentenceStart = text.length();

    List<TaggedWord> taggedWords = tree.taggedYield();
    List<TokenBuilder> tokens = new ArrayList<>(taggedWords.size());
    for (TaggedWord taggedWord : taggedWords) {
      String tag = taggedWord.tag();
      String word = taggedWord.word();
      // The separator is written for every tagged word, -NONE- elements included.
      text.append(" ");

      if (NONE_TAG.equals(tag)) {
        continue;
      }

      int tokenStart = text.length();
      text.append(unescape(word));
      int tokenEnd = text.length();

      tokens.add(new TokenBuilder(new Span(tokenStart, tokenEnd), resolvePartOfSpeech(tag)));
    }

    return new SentenceBuilder(new Span(sentenceStart, text.length()), tokens);
  }

  /**
   * Converts the treebank escape for a bracket or quote back to the literal character; any other
   * word is returned unchanged.
   */
  private static String unescape(String word) {
    if (word == null) {
      // Keeps the null tolerance of the equals chain this switch replaces.
      return null;
    }
    switch (word) {
      case "-LRB-":
        return "(";
      case "-RRB-":
        return ")";
      case "-LCB-":
        return "{";
      case "-RCB-":
        return "}";
      case "-LSB-":
        return "[";
      case "-RSB-":
        return "]";
      case "``":
      case "''":
        return "\"";
      default:
        return word;
    }
  }

  /**
   * Resolves a treebank tag to a part of speech. For an ambiguous tag of the form {@code A|B}
   * one of the alternatives is chosen at random.
   *
   * @throws AssertionError if the tag does not resolve to a part of speech
   */
  private PartOfSpeech resolvePartOfSpeech(String tag) {
    if ("-LRB-".equals(tag)) {
      return PartOfSpeech.LEFT_PAREN;
    }
    if ("-RRB-".equals(tag)) {
      return PartOfSpeech.RIGHT_PAREN;
    }

    String chosenTag = tag;
    if (tag.contains("|")) {
      String[] alternatives = tag.split("\\|");
      chosenTag = alternatives[random.nextInt(alternatives.length)];
    }

    PartOfSpeech partOfSpeech = PartsOfSpeech.forTag(chosenTag);
    if (partOfSpeech == null) {
      throw new AssertionError("part of speech should not be null for tag: " + tag);
    }
    return partOfSpeech;
  }

  /**
   * Used to build sentences.
   */
  private static final class SentenceBuilder {

    /**
     * The begin and end of the sentence.
     */
    private final Span sentenceSpan;

    /**
     * The tokens of the sentence.
     */
    private final List<TokenBuilder> tokenBuilders;

    private SentenceBuilder(Span sentenceSpan, List<TokenBuilder> tokenBuilders) {
      this.sentenceSpan = sentenceSpan;
      this.tokenBuilders = tokenBuilders;
    }
  }

  /**
   * Used to build tokens.
   */
  private static final class TokenBuilder {

    /**
     * The begin and end of the token.
     */
    private final Span tokenSpan;

    /**
     * The part of speech.
     */
    private final PartOfSpeech partOfSpeech;

    private TokenBuilder(Span tokenSpan, PartOfSpeech partOfSpeech) {
      this.tokenSpan = tokenSpan;
      this.partOfSpeech = partOfSpeech;
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,41 +17,44 @@

package edu.umn.biomedicus.gpl.stanford.parser;

import edu.umn.biomedicus.framework.DocumentProcessor;
import edu.umn.biomedicus.framework.store.TextView;
import edu.umn.biomedicus.framework.store.Label;
import edu.umn.biomedicus.framework.store.LabelIndex;
import edu.umn.biomedicus.framework.store.Labeler;
import edu.umn.biomedicus.common.types.semantics.SocialHistoryCandidate;
import edu.umn.biomedicus.common.types.syntax.PartOfSpeech;
import edu.umn.biomedicus.common.types.text.ConstituencyParse;
import edu.umn.biomedicus.common.types.text.ParseToken;
import edu.umn.biomedicus.exc.BiomedicusException;

import edu.umn.biomedicus.framework.DocumentProcessor;
import edu.umn.biomedicus.framework.store.Document;
import edu.umn.biomedicus.framework.store.Label;
import edu.umn.biomedicus.framework.store.LabelIndex;
import edu.umn.biomedicus.framework.store.Labeler;
import edu.umn.biomedicus.framework.store.TextView;
import javax.inject.Inject;

/**
 * Document processor that hands every {@link SocialHistoryCandidate} label of a text view to the
 * {@link StanfordConstituencyParserModel}, together with the view's parse token and part of
 * speech label indices and a labeler for {@link ConstituencyParse}.
 */
public class SHStanfordConstituencyParser implements DocumentProcessor {
private final LabelIndex<PartOfSpeech> partOfSpeechLabelIndex;
private final Labeler<ConstituencyParse> constituencyParseLabeler;
private final LabelIndex<ParseToken> parseTokenLabelIndex;
private final StanfordConstituencyParserModel stanfordConstituencyParserModel;
private final LabelIndex<SocialHistoryCandidate> socialHistoryCandidateLabelIndex;

@Inject
SHStanfordConstituencyParser(TextView document,
StanfordConstituencyParserModel stanfordConstituencyParserModel) {
socialHistoryCandidateLabelIndex = document.getLabelIndex(SocialHistoryCandidate.class);
parseTokenLabelIndex = document.getLabelIndex(ParseToken.class);
partOfSpeechLabelIndex = document.getLabelIndex(PartOfSpeech.class);
constituencyParseLabeler = document.getLabeler(ConstituencyParse.class);
this.stanfordConstituencyParserModel = stanfordConstituencyParserModel;
}
private final LabelIndex<PartOfSpeech> partOfSpeechLabelIndex;
private final Labeler<ConstituencyParse> constituencyParseLabeler;
private final LabelIndex<ParseToken> parseTokenLabelIndex;
private final StanfordConstituencyParserModel stanfordConstituencyParserModel;
private final LabelIndex<SocialHistoryCandidate> labelIndex;

/**
 * Looks up the label indices and the constituency parse labeler on the injected text view.
 */
@Inject
public SHStanfordConstituencyParser(
TextView textView,
StanfordConstituencyParserModel stanfordConstituencyParserModel
) {
labelIndex = textView.getLabelIndex(SocialHistoryCandidate.class);
parseTokenLabelIndex = textView.getLabelIndex(ParseToken.class);
partOfSpeechLabelIndex = textView.getLabelIndex(PartOfSpeech.class);
constituencyParseLabeler = textView.getLabeler(ConstituencyParse.class);
this.stanfordConstituencyParserModel = stanfordConstituencyParserModel;
}

@Override
public void process() throws BiomedicusException {
for (Label<SocialHistoryCandidate> socialHistoryCandidateLabel : socialHistoryCandidateLabelIndex) {
stanfordConstituencyParserModel.parseSentence(socialHistoryCandidateLabel, parseTokenLabelIndex,
partOfSpeechLabelIndex, constituencyParseLabeler);
}
/**
 * Calls the parser model once for each social history candidate label in the index.
 */
@Override
public void process() throws BiomedicusException {
for (Label<SocialHistoryCandidate> label : labelIndex) {
stanfordConstituencyParserModel.parseSentence(label, parseTokenLabelIndex,
partOfSpeechLabelIndex, constituencyParseLabeler);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,41 +17,41 @@

package edu.umn.biomedicus.gpl.stanford.parser;

import edu.umn.biomedicus.framework.DocumentProcessor;
import edu.umn.biomedicus.framework.store.TextView;
import edu.umn.biomedicus.framework.store.Label;
import edu.umn.biomedicus.framework.store.LabelIndex;
import edu.umn.biomedicus.framework.store.Labeler;
import edu.umn.biomedicus.common.types.syntax.PartOfSpeech;
import edu.umn.biomedicus.common.types.text.ConstituencyParse;
import edu.umn.biomedicus.common.types.text.ParseToken;
import edu.umn.biomedicus.common.types.text.Sentence;
import edu.umn.biomedicus.exc.BiomedicusException;

import edu.umn.biomedicus.framework.DocumentProcessor;
import edu.umn.biomedicus.framework.store.Label;
import edu.umn.biomedicus.framework.store.LabelIndex;
import edu.umn.biomedicus.framework.store.Labeler;
import edu.umn.biomedicus.framework.store.TextView;
import javax.inject.Inject;

/**
 * Document processor that hands every {@link Sentence} label of a text view to the
 * {@link StanfordConstituencyParserModel}, together with the view's parse token and part of
 * speech label indices and a labeler for {@link ConstituencyParse}.
 */
public class StanfordConstituencyParser implements DocumentProcessor {
private final LabelIndex<Sentence> sentenceLabelIndex;
private final LabelIndex<PartOfSpeech> partOfSpeechLabelIndex;
private final Labeler<ConstituencyParse> constituencyParseLabeler;
private final LabelIndex<ParseToken> parseTokenLabelIndex;
private final StanfordConstituencyParserModel stanfordConstituencyParserModel;

@Inject
public StanfordConstituencyParser(TextView document,
StanfordConstituencyParserModel stanfordConstituencyParserModel) {
sentenceLabelIndex = document.getLabelIndex(Sentence.class);
parseTokenLabelIndex = document.getLabelIndex(ParseToken.class);
partOfSpeechLabelIndex = document.getLabelIndex(PartOfSpeech.class);
constituencyParseLabeler = document.getLabeler(ConstituencyParse.class);
this.stanfordConstituencyParserModel = stanfordConstituencyParserModel;
}
private final LabelIndex<Sentence> sentenceLabelIndex;
private final LabelIndex<PartOfSpeech> partOfSpeechLabelIndex;
private final Labeler<ConstituencyParse> constituencyParseLabeler;
private final LabelIndex<ParseToken> parseTokenLabelIndex;
private final StanfordConstituencyParserModel stanfordConstituencyParserModel;

/**
 * Looks up the label indices and the constituency parse labeler on the injected text view.
 */
@Inject
public StanfordConstituencyParser(TextView textView,
StanfordConstituencyParserModel stanfordConstituencyParserModel) {
sentenceLabelIndex = textView.getLabelIndex(Sentence.class);
parseTokenLabelIndex = textView.getLabelIndex(ParseToken.class);
partOfSpeechLabelIndex = textView.getLabelIndex(PartOfSpeech.class);
constituencyParseLabeler = textView.getLabeler(ConstituencyParse.class);
this.stanfordConstituencyParserModel = stanfordConstituencyParserModel;
}

@Override
public void process() throws BiomedicusException {
for (Label<Sentence> sentenceLabel : sentenceLabelIndex) {
stanfordConstituencyParserModel.parseSentence(sentenceLabel, parseTokenLabelIndex, partOfSpeechLabelIndex,
constituencyParseLabeler);
}
/**
 * Calls the parser model once for each sentence label in the index.
 */
@Override
public void process() throws BiomedicusException {
for (Label<Sentence> sentenceLabel : sentenceLabelIndex) {
stanfordConstituencyParserModel.parseSentence(sentenceLabel, parseTokenLabelIndex,
partOfSpeechLabelIndex, constituencyParseLabeler);
}
}
}
Loading

0 comments on commit 9987d94

Please sign in to comment.