From 4c51672dce77e0957263ad6b6ed3a367baab4149 Mon Sep 17 00:00:00 2001 From: Prudhvi Boyapalli Date: Sun, 19 Feb 2017 19:15:42 -0600 Subject: [PATCH 1/2] Fixed failing WikiXMLParserTest by removing whitespace and newline from page ID --- src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java b/src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java index c2eadcf..d96a938 100755 --- a/src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java +++ b/src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java @@ -156,7 +156,8 @@ public HashSet getLinks() { } public void setID(String id) { - this.id = id; + // Trim to get rid of extraneous whitespace and newline + this.id = id.trim(); } public String getID() { From 87b2962b268eadf72812ff1f1bbfda52057ab078 Mon Sep 17 00:00:00 2001 From: Prudhvi Boyapalli Date: Sun, 19 Feb 2017 19:23:10 -0600 Subject: [PATCH 2/2] Added extraction of timestamp from XML and made it available as part of WikiPage --- .../nlp/wikipedia/SAXPageCallbackHandler.java | 17 +++++++++++++++-- .../java/edu/jhu/nlp/wikipedia/WikiPage.java | 9 +++++++++ .../jhu/nlp/wikipedia/WikiXMLParserTest.java | 1 + 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/jhu/nlp/wikipedia/SAXPageCallbackHandler.java b/src/main/java/edu/jhu/nlp/wikipedia/SAXPageCallbackHandler.java index 402fc35..a087c57 100644 --- a/src/main/java/edu/jhu/nlp/wikipedia/SAXPageCallbackHandler.java +++ b/src/main/java/edu/jhu/nlp/wikipedia/SAXPageCallbackHandler.java @@ -20,6 +20,7 @@ public class SAXPageCallbackHandler extends DefaultHandler { private StringBuilder currentWikitext; private StringBuilder currentTitle; private StringBuilder currentID; + private StringBuilder currentTimestamp; private String language = null; @@ -42,6 +43,10 @@ public void startElement(String uri, String name, String qName, Attributes attr) insideRevision = true; } + if (qName.equals("timestamp")) { + currentTimestamp = new StringBuilder(""); + } + } @Override @@ -49,16 +54,21 @@ public void endElement(String uri, String name, String qName){ if (qName.equals("revision")){ insideRevision = false; } + if (qName.equals("page")){ currentPage.setTitle(currentTitle.toString()); currentPage.setID(currentID.toString()); currentPage.setWikiText(currentWikitext.toString(), language); pageHandler.process(currentPage); } - if (qName.equals("mediawiki")) - { + + if (qName.equals("mediawiki")) { // TODO hasMoreElements() should now return false } + + if (qName.equals("timestamp")) { + currentPage.setTimestamp(currentTimestamp.toString()); + } } @Override @@ -73,5 +83,8 @@ else if ((currentTag.equals("id")) && !insideRevision){ else if (currentTag.equals("text")){ currentWikitext = currentWikitext.append(ch, start, length); } + else if (currentTag.equals("timestamp")) { + currentTimestamp.append(ch, start, length); + } } } \ No newline at end of file diff --git a/src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java b/src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java index d96a938..cf85c90 100755 --- a/src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java +++ b/src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java @@ -15,6 +15,7 @@ public class WikiPage { private String title = null; private WikiTextParser wikiTextParser = null; private String id = null; + private String timestamp = null; private Language language = null; private Pattern disambCatPattern = null; //Pattern.compile("\\("+language.getDisambiguationLabel()+"\\)", Pattern.CASE_INSENSITIVE); private Pattern categoryPattern = null; //Pattern.compile( language.getLocalizedCategoryLabel()+ "\\W\\w+", Pattern.CASE_INSENSITIVE); @@ -163,4 +164,12 @@ public void setID(String id) { public String getID() { return id; } + + public void setTimestamp(String timestamp) { + this.timestamp = timestamp; + } + + public String getTimestamp() { + return timestamp; + } } \ No newline at end of file diff --git a/src/test/java/edu/jhu/nlp/wikipedia/WikiXMLParserTest.java b/src/test/java/edu/jhu/nlp/wikipedia/WikiXMLParserTest.java index cc90d19..32d2b86 100644 --- a/src/test/java/edu/jhu/nlp/wikipedia/WikiXMLParserTest.java +++ b/src/test/java/edu/jhu/nlp/wikipedia/WikiXMLParserTest.java @@ -16,6 +16,7 @@ public void process(WikiPage page) { assertEquals("Isaac Newton", page.getTitle()); assertEquals("14627", page.getID()); + assertEquals("2010-01-04T10:29:28Z", page.getTimestamp()); } });