Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions src/main/java/edu/jhu/nlp/wikipedia/SAXPageCallbackHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ public class SAXPageCallbackHandler extends DefaultHandler {
private StringBuilder currentWikitext;
private StringBuilder currentTitle;
private StringBuilder currentID;
private StringBuilder currentTimestamp;
private String language = null;


Expand All @@ -42,23 +43,32 @@ public void startElement(String uri, String name, String qName, Attributes attr)
insideRevision = true;
}

if (qName.equals("timestamp")) {
currentTimestamp = new StringBuilder("");
}

}

@Override
public void endElement(String uri, String name, String qName){
if (qName.equals("revision")){
insideRevision = false;
}

if (qName.equals("page")){
currentPage.setTitle(currentTitle.toString());
currentPage.setID(currentID.toString());
currentPage.setWikiText(currentWikitext.toString(), language);
pageHandler.process(currentPage);
}
if (qName.equals("mediawiki"))
{

if (qName.equals("mediawiki")) {
// TODO hasMoreElements() should now return false
}

if (qName.equals("timestamp")) {
currentPage.setTimestamp(currentTimestamp.toString());
}
}

@Override
Expand All @@ -73,5 +83,8 @@ else if ((currentTag.equals("id")) && !insideRevision){
else if (currentTag.equals("text")){
currentWikitext = currentWikitext.append(ch, start, length);
}
else if (currentTag.equals("timestamp")) {
currentTimestamp.append(ch, start, length);
}
}
}
12 changes: 11 additions & 1 deletion src/main/java/edu/jhu/nlp/wikipedia/WikiPage.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ public class WikiPage {
private String title = null;
private WikiTextParser wikiTextParser = null;
private String id = null;
private String timestamp = null;
private Language language = null;
private Pattern disambCatPattern = null; //Pattern.compile("\\("+language.getDisambiguationLabel()+"\\)", Pattern.CASE_INSENSITIVE);
private Pattern categoryPattern = null; //Pattern.compile( language.getLocalizedCategoryLabel()+ "\\W\\w+", Pattern.CASE_INSENSITIVE);
Expand Down Expand Up @@ -156,10 +157,19 @@ public HashSet<String> getLinks() {
}

/**
 * Sets the identifier of this page.
 *
 * @param id the raw identifier text; surrounding whitespace is removed.
 *           A {@code null} value is stored as {@code null} instead of
 *           raising a {@link NullPointerException}.
 */
public void setID(String id) {
    // Trim to get rid of extraneous whitespace and newline
    this.id = (id == null) ? null : id.trim();
}

/**
 * Returns the identifier currently stored for this page.
 *
 * @return the stored id, or {@code null} if none has been set
 */
public String getID() {
    return this.id;
}

/**
 * Stores the timestamp text for this page exactly as given
 * (no trimming or parsing is applied).
 *
 * @param timestamp the timestamp string to store; presumably an ISO-8601
 *                  value such as {@code 2010-01-04T10:29:28Z} (confirm
 *                  against the dump format)
 */
public void setTimestamp(String timestamp) {
    this.timestamp = timestamp;
}

/**
 * Returns the timestamp text currently stored for this page.
 *
 * @return the stored timestamp, or {@code null} if none has been set
 */
public String getTimestamp() {
    return this.timestamp;
}
}
1 change: 1 addition & 0 deletions src/test/java/edu/jhu/nlp/wikipedia/WikiXMLParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ public void process(WikiPage page) {

assertEquals("Isaac Newton", page.getTitle());
assertEquals("14627", page.getID());
assertEquals("2010-01-04T10:29:28Z", page.getTimestamp());

}
});
Expand Down