|
15 | 15 | import java.nio.MappedByteBuffer;
|
16 | 16 | import java.nio.channels.FileChannel;
|
17 | 17 | import java.nio.charset.Charset;
|
| 18 | +import java.text.CharacterIterator; |
| 19 | +import java.text.StringCharacterIterator; |
18 | 20 | import java.util.ArrayList;
|
| 21 | +import java.util.Properties; |
19 | 22 | import java.util.Scanner;
|
20 | 23 | import java.util.regex.Matcher;
|
21 | 24 | import java.util.regex.Pattern;
|
@@ -52,6 +55,7 @@ private static void writeFile(String path, String content) throws IOException {
|
52 | 55 | }
|
53 | 56 | }
|
54 | 57 |
|
| 58 | + |
55 | 59 | /* (non-Javadoc)
|
56 | 60 | * @see gs.yasa.outputunifier.OutputReader#read(java.io.File)
|
57 | 61 | */
|
@@ -99,14 +103,14 @@ public ArrayList<Annotation> read(String annotatedString)
|
99 | 103 | }
|
100 | 104 |
|
101 | 105 | //remove the extra and unnecessary spaces (only for illinois) in the document
|
102 |
| - //annotatedString = fixAddedSpaces(annotatedString); |
| 106 | + annotatedString = fixAddedSpaces(annotatedString); |
103 | 107 |
|
104 |
| - try { |
105 |
| - annotatedString = readFile("/tmp/cleared.txt"); |
106 |
| - } catch (IOException e1) { |
107 |
| - // TODO Auto-generated catch block |
108 |
| - e1.printStackTrace(); |
109 |
| - } |
| 108 | +// try { |
| 109 | +// annotatedString = readFile("/tmp/cleared.txt"); |
| 110 | +// } catch (IOException e1) { |
| 111 | +// // TODO Auto-generated catch block |
| 112 | +// e1.printStackTrace(); |
| 113 | +// } |
110 | 114 |
|
111 | 115 | if(debug)
|
112 | 116 | {
|
@@ -174,32 +178,108 @@ public ArrayList<Annotation> read(String annotatedString)
|
174 | 178 | */
|
175 | 179 | private String fixAddedSpaces(String annotatedText)
|
176 | 180 | {
|
177 |
| - String annotatedString = annotatedText; |
178 |
| - annotatedString=annotatedString.replaceAll("\\( ", "\\("); |
| 181 | + String propertiesFilePath = "/home/samet/.bin/Dropbox/workspace/config/sne.properties_linux"; |
| 182 | + Properties prop = new Properties(); |
| 183 | + try { |
| 184 | + prop.load(new FileInputStream(propertiesFilePath)); |
| 185 | + } catch (FileNotFoundException e1) { |
| 186 | + e1.printStackTrace(); |
| 187 | + } catch (IOException e1) { |
| 188 | + e1.printStackTrace(); |
| 189 | + } |
| 190 | + |
| 191 | + String outputDirectory = prop.getProperty("outputDirectory"); |
179 | 192 |
|
180 |
| - annotatedString=annotatedString.replaceAll(" \\)", "\\)"); |
181 |
| - annotatedString=annotatedString.replaceAll(" ,", ","); |
182 |
| - annotatedString=annotatedString.replaceAll(" ;", ";"); |
183 |
| - annotatedString=annotatedString.replaceAll(" \\. ", ". "); |
184 |
| - annotatedString=annotatedString.replaceAll("\\] '", "\\]'"); |
185 |
| - annotatedString=annotatedString.replaceAll(" 's", "'s"); |
186 |
| - // Samet added |
187 |
| - annotatedString=annotatedString.replaceAll("  ; ;", "  ;"); |
188 |
| - annotatedString=annotatedString.replaceAll(";;", "; ;"); |
189 |
| - annotatedString=annotatedString.replaceAll(": \\]", " \\]:"); |
190 |
| - annotatedString=annotatedString.replaceAll(" \\.\\) ", "\\.\\) "); |
191 |
| - annotatedString=annotatedString.replaceAll(" \\.\\)\\.", "\\.\\)\\."); |
192 |
| - annotatedString=annotatedString.replaceAll("\\.\" \\:", "\\.\"\\:"); |
193 |
| - annotatedString=annotatedString.replaceAll("\" \\.:", "\"\\.:"); |
194 |
| - annotatedString=annotatedString.replaceAll("\\. \\.", "\\. \\.\n"); |
195 |
| - annotatedString=annotatedString.replaceAll(" ! ", "! "); |
196 |
| - annotatedString=annotatedString.replaceAll("\" — ", "\"— "); |
197 |
| - annotatedString=annotatedString.replaceAll(". .\n ", ". .\n"); |
| 193 | + String rawFilePath = outputDirectory + "input"; |
| 194 | + |
| 195 | + String rawText = ""; |
| 196 | + try { |
| 197 | + rawText = readFile(rawFilePath); |
| 198 | + } catch (IOException e) { |
| 199 | + System.out.println("Raw text file cannot be found at: " + rawFilePath); |
| 200 | + e.printStackTrace(); |
| 201 | + } |
| 202 | + CharacterIterator rawFileCharIterator = new StringCharacterIterator(rawText); |
| 203 | + CharacterIterator refFileCharIterator = new StringCharacterIterator(annotatedText); |
| 204 | + StringBuffer stringBuffer = new StringBuffer(); |
| 205 | + char rawChar = rawFileCharIterator.first(); |
| 206 | + char refChar = refFileCharIterator.first(); |
| 207 | + |
| 208 | + for(; rawChar != rawFileCharIterator.DONE && refChar != refFileCharIterator.DONE;) { |
198 | 209 |
|
199 |
| - // This line was here. |
200 |
| - annotatedString=annotatedString.replaceAll("\" (.+?) \"", "\"$1\""); |
201 |
| - // End of Samet added |
202 |
| - annotatedString=annotatedString.replaceAll(" \\?\"\\. ", "\\?\"\\. "); |
203 |
| - return annotatedString; |
| 210 | + if (rawChar == refChar) { |
| 211 | + stringBuffer.append(String.valueOf(rawChar)); |
| 212 | + rawChar = rawFileCharIterator.next(); |
| 213 | + refChar = refFileCharIterator.next(); |
| 214 | + log(stringBuffer.toString()); |
| 215 | + } else { |
| 216 | + if (refChar == '[') { |
| 217 | + StringBuilder s = new StringBuilder(); |
| 218 | + for (int i=0; i<3; i++) { |
| 219 | + s.append(refFileCharIterator.next()); |
| 220 | + } |
| 221 | + String threeChars = s.toString(); |
| 222 | + if (threeChars.equals("PER") || threeChars.equals("MIS") || threeChars.equals("ORG") || threeChars.equals("LOC")) { |
| 223 | + StringBuilder detectedAnnotation = new StringBuilder(); |
| 224 | + detectedAnnotation.append("["); |
| 225 | + detectedAnnotation.append(threeChars); |
| 226 | + |
| 227 | + // Annotation types are represented with three chars, except MISC type. |
| 228 | + if (threeChars.toString().equals("MIS")) { |
| 229 | + detectedAnnotation.append("C"); |
| 230 | + refChar = refFileCharIterator.next(); |
| 231 | + } |
| 232 | + |
| 233 | + // Add another space, coming after annotation type: |
| 234 | + detectedAnnotation.append(" "); |
| 235 | + refChar = refFileCharIterator.next(); |
| 236 | + |
| 237 | + while(refChar != ']') { |
| 238 | + refChar = refFileCharIterator.next(); |
| 239 | + detectedAnnotation.append(refChar); |
| 240 | + log(detectedAnnotation.toString()); |
| 241 | + } |
| 242 | + stringBuffer.append(detectedAnnotation.toString()); |
| 243 | + |
| 244 | + // Annotation is represented as this: [PER Person Name ] |
| 245 | + // So that we substract 8 chars to find real length of annotation. |
| 246 | + int trimmedAnnotationLength = detectedAnnotation.toString().length() - 8; |
| 247 | + if (threeChars.equals("MIS")) { |
| 248 | + trimmedAnnotationLength--; |
| 249 | + } |
| 250 | + for(int i=0; i<trimmedAnnotationLength; i++) { |
| 251 | + rawChar = rawFileCharIterator.next(); |
| 252 | + } |
| 253 | + |
| 254 | + log(stringBuffer.toString()); |
| 255 | + refChar = refFileCharIterator.next(); |
| 256 | + } else { |
| 257 | + stringBuffer.append("["); |
| 258 | + stringBuffer.append(s.toString()); |
| 259 | + for (int i=0; i<3; i++) { |
| 260 | + rawFileCharIterator.next(); |
| 261 | + } |
| 262 | + } |
| 263 | + } else if (rawChar == ';'){ |
| 264 | + if (refChar == ' ' && refFileCharIterator.next() == ';') { |
| 265 | + refChar = ';'; |
| 266 | + } |
| 267 | + } else if (rawChar == '\n') { |
| 268 | + stringBuffer.append(rawChar); |
| 269 | + rawChar = rawFileCharIterator.next(); |
| 270 | + } else { |
| 271 | + refChar = refFileCharIterator.next(); |
| 272 | + } |
| 273 | + } |
| 274 | + |
| 275 | + } |
| 276 | + String content = stringBuffer.toString(); |
| 277 | + return content; |
| 278 | + } |
| 279 | + private static void log(String text) { |
| 280 | + Boolean debug = false; |
| 281 | + if (debug) { |
| 282 | + System.out.println(text); |
| 283 | + } |
204 | 284 | }
|
205 | 285 | }
|
0 commit comments