diff --git a/src/main/java/edu/jhu/nlp/wikipedia/SAXPageCallbackHandler.java b/src/main/java/edu/jhu/nlp/wikipedia/SAXPageCallbackHandler.java index 402fc35..a087c57 100644 --- a/src/main/java/edu/jhu/nlp/wikipedia/SAXPageCallbackHandler.java +++ b/src/main/java/edu/jhu/nlp/wikipedia/SAXPageCallbackHandler.java @@ -20,6 +20,7 @@ public class SAXPageCallbackHandler extends DefaultHandler { private StringBuilder currentWikitext; private StringBuilder currentTitle; private StringBuilder currentID; + private StringBuilder currentTimestamp; private String language = null; @@ -42,6 +43,10 @@ public void startElement(String uri, String name, String qName, Attributes attr) insideRevision = true; } + if (qName.equals("timestamp")) { + currentTimestamp = new StringBuilder(""); + } + } @Override @@ -49,16 +54,21 @@ public void endElement(String uri, String name, String qName){ if (qName.equals("revision")){ insideRevision = false; } + if (qName.equals("page")){ currentPage.setTitle(currentTitle.toString()); currentPage.setID(currentID.toString()); currentPage.setWikiText(currentWikitext.toString(), language); pageHandler.process(currentPage); } - if (qName.equals("mediawiki")) - { + + if (qName.equals("mediawiki")) { // TODO hasMoreElements() should now return false } + + if (qName.equals("timestamp")) { + currentPage.setTimestamp(currentTimestamp.toString()); + } } @Override @@ -73,5 +83,8 @@ else if ((currentTag.equals("id")) && !insideRevision){ else if (currentTag.equals("text")){ currentWikitext = currentWikitext.append(ch, start, length); } + else if (currentTag.equals("timestamp")) { + currentTimestamp.append(ch, start, length); + } } } \ No newline at end of file diff --git a/src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java b/src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java index c2eadcf..cf85c90 100755 --- a/src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java +++ b/src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java @@ -15,6 +15,7 @@ public class WikiPage { private String title = null; private WikiTextParser wikiTextParser = null; private String id = null; + private String timestamp = null; private Language language = null; private Pattern disambCatPattern = null; //Pattern.compile("\\("+language.getDisambiguationLabel()+"\\)", Pattern.CASE_INSENSITIVE); private Pattern categoryPattern = null; //Pattern.compile( language.getLocalizedCategoryLabel()+ "\\W\\w+", Pattern.CASE_INSENSITIVE); @@ -156,10 +157,19 @@ public HashSet getLinks() { } public void setID(String id) { - this.id = id; + // Trim to get rid of extraneous whitespace and newline + this.id = id.trim(); } public String getID() { return id; } + + public void setTimestamp(String timestamp) { + this.timestamp = timestamp; + } + + public String getTimestamp() { + return timestamp; + } } \ No newline at end of file diff --git a/src/test/java/edu/jhu/nlp/wikipedia/WikiXMLParserTest.java b/src/test/java/edu/jhu/nlp/wikipedia/WikiXMLParserTest.java index cc90d19..32d2b86 100644 --- a/src/test/java/edu/jhu/nlp/wikipedia/WikiXMLParserTest.java +++ b/src/test/java/edu/jhu/nlp/wikipedia/WikiXMLParserTest.java @@ -16,6 +16,7 @@ public void process(WikiPage page) { assertEquals("Isaac Newton", page.getTitle()); assertEquals("14627", page.getID()); + assertEquals("2010-01-04T10:29:28Z", page.getTimestamp()); } });