Skip to content

Commit 3a9be83

Browse files
committed
Integrated the Illinois cleaner tool into the IllinoisOutputReader.
* Replaced the old, non-working fixAddedSpaces function with the Illinois cleaner tool.
1 parent bf7feb2 commit 3a9be83

File tree

3 files changed

+118
-37
lines changed

3 files changed

+118
-37
lines changed

src/IllinoisOutputCleaner/src/net/iyiuykular/apps/Main.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,9 @@ public static void main(String[] args) throws IOException {
5353
String rawFilePath = "data/originalfile.txt";
5454
String referenceFilePath = "data/afterannotation.txt";
5555

56-
rawFilePath = "/tmp/annotations/Adolf_hitler/Adolf_hitler";
57-
referenceFilePath = "/tmp/annotated.txt";
56+
rawFilePath = "/tmp/annotations/Schrodinger/Schrodinger";
57+
referenceFilePath = "/tmp/illinoistooloutfile.txt";
58+
referenceFilePath="/tmp/out-1355782783734";
5859
// TODO Auto-generated method stub
5960
String rawFile = readFile(rawFilePath);
6061
String referenceFile = readFile(referenceFilePath);

src/SocialNetworkExtractorNihai/src/gs/yasa/outputunifier/illinois/IllinoisOutputReader.java

Lines changed: 112 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
import java.nio.MappedByteBuffer;
1616
import java.nio.channels.FileChannel;
1717
import java.nio.charset.Charset;
18+
import java.text.CharacterIterator;
19+
import java.text.StringCharacterIterator;
1820
import java.util.ArrayList;
21+
import java.util.Properties;
1922
import java.util.Scanner;
2023
import java.util.regex.Matcher;
2124
import java.util.regex.Pattern;
@@ -52,6 +55,7 @@ private static void writeFile(String path, String content) throws IOException {
5255
}
5356
}
5457

58+
5559
/* (non-Javadoc)
5660
* @see gs.yasa.outputunifier.OutputReader#read(java.io.File)
5761
*/
@@ -99,14 +103,14 @@ public ArrayList<Annotation> read(String annotatedString)
99103
}
100104

101105
//remove the extra and unnecessary spaces (only for illinois) in the document
102-
//annotatedString = fixAddedSpaces(annotatedString);
106+
annotatedString = fixAddedSpaces(annotatedString);
103107

104-
try {
105-
annotatedString = readFile("/tmp/cleared.txt");
106-
} catch (IOException e1) {
107-
// TODO Auto-generated catch block
108-
e1.printStackTrace();
109-
}
108+
// try {
109+
// annotatedString = readFile("/tmp/cleared.txt");
110+
// } catch (IOException e1) {
111+
// // TODO Auto-generated catch block
112+
// e1.printStackTrace();
113+
// }
110114

111115
if(debug)
112116
{
@@ -174,32 +178,108 @@ public ArrayList<Annotation> read(String annotatedString)
174178
*/
175179
private String fixAddedSpaces(String annotatedText)
176180
{
177-
String annotatedString = annotatedText;
178-
annotatedString=annotatedString.replaceAll("\\( ", "\\(");
181+
String propertiesFilePath = "/home/samet/.bin/Dropbox/workspace/config/sne.properties_linux";
182+
Properties prop = new Properties();
183+
try {
184+
prop.load(new FileInputStream(propertiesFilePath));
185+
} catch (FileNotFoundException e1) {
186+
e1.printStackTrace();
187+
} catch (IOException e1) {
188+
e1.printStackTrace();
189+
}
190+
191+
String outputDirectory = prop.getProperty("outputDirectory");
179192

180-
annotatedString=annotatedString.replaceAll(" \\)", "\\)");
181-
annotatedString=annotatedString.replaceAll(" ,", ",");
182-
annotatedString=annotatedString.replaceAll(" ;", ";");
183-
annotatedString=annotatedString.replaceAll(" \\. ", ". ");
184-
annotatedString=annotatedString.replaceAll("\\] '", "\\]'");
185-
annotatedString=annotatedString.replaceAll(" 's", "'s");
186-
// Samet added
187-
annotatedString=annotatedString.replaceAll("&#160 ; ;", "&#160; ;");
188-
annotatedString=annotatedString.replaceAll(";;", "; ;");
189-
annotatedString=annotatedString.replaceAll(": \\]", " \\]:");
190-
annotatedString=annotatedString.replaceAll(" \\.\\) ", "\\.\\) ");
191-
annotatedString=annotatedString.replaceAll(" \\.\\)\\.", "\\.\\)\\.");
192-
annotatedString=annotatedString.replaceAll("\\.\" \\:", "\\.\"\\:");
193-
annotatedString=annotatedString.replaceAll("\" \\.:", "\"\\.:");
194-
annotatedString=annotatedString.replaceAll("\\. \\.", "\\. \\.\n");
195-
annotatedString=annotatedString.replaceAll(" ! ", "! ");
196-
annotatedString=annotatedString.replaceAll("\" — ", "\"— ");
197-
annotatedString=annotatedString.replaceAll(". .\n ", ". .\n");
193+
String rawFilePath = outputDirectory + "input";
194+
195+
String rawText = "";
196+
try {
197+
rawText = readFile(rawFilePath);
198+
} catch (IOException e) {
199+
System.out.println("Raw text file cannot be found at: " + rawFilePath);
200+
e.printStackTrace();
201+
}
202+
CharacterIterator rawFileCharIterator = new StringCharacterIterator(rawText);
203+
CharacterIterator refFileCharIterator = new StringCharacterIterator(annotatedText);
204+
StringBuffer stringBuffer = new StringBuffer();
205+
char rawChar = rawFileCharIterator.first();
206+
char refChar = refFileCharIterator.first();
207+
208+
for(; rawChar != rawFileCharIterator.DONE && refChar != refFileCharIterator.DONE;) {
198209

199-
// This line was here.
200-
annotatedString=annotatedString.replaceAll("\" (.+?) \"", "\"$1\"");
201-
// End of Samet added
202-
annotatedString=annotatedString.replaceAll(" \\?\"\\. ", "\\?\"\\. ");
203-
return annotatedString;
210+
if (rawChar == refChar) {
211+
stringBuffer.append(String.valueOf(rawChar));
212+
rawChar = rawFileCharIterator.next();
213+
refChar = refFileCharIterator.next();
214+
log(stringBuffer.toString());
215+
} else {
216+
if (refChar == '[') {
217+
StringBuilder s = new StringBuilder();
218+
for (int i=0; i<3; i++) {
219+
s.append(refFileCharIterator.next());
220+
}
221+
String threeChars = s.toString();
222+
if (threeChars.equals("PER") || threeChars.equals("MIS") || threeChars.equals("ORG") || threeChars.equals("LOC")) {
223+
StringBuilder detectedAnnotation = new StringBuilder();
224+
detectedAnnotation.append("[");
225+
detectedAnnotation.append(threeChars);
226+
227+
// Annotation types are represented with three chars, except MISC type.
228+
if (threeChars.toString().equals("MIS")) {
229+
detectedAnnotation.append("C");
230+
refChar = refFileCharIterator.next();
231+
}
232+
233+
// Add another space, coming after annotation type:
234+
detectedAnnotation.append(" ");
235+
refChar = refFileCharIterator.next();
236+
237+
while(refChar != ']') {
238+
refChar = refFileCharIterator.next();
239+
detectedAnnotation.append(refChar);
240+
log(detectedAnnotation.toString());
241+
}
242+
stringBuffer.append(detectedAnnotation.toString());
243+
244+
// Annotation is represented as this: [PER Person Name ]
245+
// So we subtract 8 chars to find the real length of the annotation.
246+
int trimmedAnnotationLength = detectedAnnotation.toString().length() - 8;
247+
if (threeChars.equals("MIS")) {
248+
trimmedAnnotationLength--;
249+
}
250+
for(int i=0; i<trimmedAnnotationLength; i++) {
251+
rawChar = rawFileCharIterator.next();
252+
}
253+
254+
log(stringBuffer.toString());
255+
refChar = refFileCharIterator.next();
256+
} else {
257+
stringBuffer.append("[");
258+
stringBuffer.append(s.toString());
259+
for (int i=0; i<3; i++) {
260+
rawFileCharIterator.next();
261+
}
262+
}
263+
} else if (rawChar == ';'){
264+
if (refChar == ' ' && refFileCharIterator.next() == ';') {
265+
refChar = ';';
266+
}
267+
} else if (rawChar == '\n') {
268+
stringBuffer.append(rawChar);
269+
rawChar = rawFileCharIterator.next();
270+
} else {
271+
refChar = refFileCharIterator.next();
272+
}
273+
}
274+
275+
}
276+
String content = stringBuffer.toString();
277+
return content;
278+
}
279+
private static void log(String text) {
280+
Boolean debug = false;
281+
if (debug) {
282+
System.out.println(text);
283+
}
204284
}
205285
}

src/SocialNetworkExtractorNihai/src/gs/yasa/sne/SocialNetworkExtractor.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -124,11 +124,11 @@ public static void main(String[] args) {
124124

125125
//initialize the tools
126126
ArrayList<NERTool> tools = new ArrayList<NERTool>();
127-
tools.add(new StanfordNERTool());
127+
//tools.add(new StanfordNERTool());
128128
tools.add(new IllinoisNERTool());
129-
tools.add(new OpenCalaisNERTool());
129+
//tools.add(new OpenCalaisNERTool());
130130
//tools.add(new LinkedEntityRecognizerNERTool());
131-
tools.add(new DateParserNERTool());
131+
//tools.add(new DateParserNERTool());
132132
//perform the annotations
133133
for (NERTool nerTool : tools) {
134134
try

0 commit comments

Comments
 (0)