package edu.jhu.nlp.wikipedia; import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; /** * * A Wrapper class for the PageCallbackHandler * * @author Jason Smith * */ public class SAXPageCallbackHandler extends DefaultHandler { private PageCallbackHandler pageHandler; private WikiPage currentPage; private String currentTag; private StringBuilder currentWikitext; private StringBuilder currentTitle; private StringBuilder currentID; public SAXPageCallbackHandler(PageCallbackHandler ph){ pageHandler = ph; } public void startElement(String uri, String name, String qName, Attributes attr){ currentTag = qName; if (qName.equals("page")){ currentPage = new WikiPage(); currentWikitext = new StringBuilder(""); currentTitle = new StringBuilder(""); currentID = new StringBuilder(""); } } public void endElement(String uri, String name, String qName){ if (qName.equals("page")){ currentPage.setTitle(currentTitle.toString()); currentPage.setID(currentID.toString()); currentPage.setWikiText(currentWikitext.toString()); pageHandler.process(currentPage); } if (qName.equals("mediawiki")) { // TODO hasMoreElements() should now return false } } public void characters(char ch[], int start, int length){ if (currentTag.equals("title")){ currentTitle = currentTitle.append(ch, start, length); } // TODO: To avoid looking at the revision ID, only the first ID is taken. // I'm not sure how big the block size is in each call to characters(), // so this may be unsafe. else if ((currentTag.equals("id")) && (currentID.length() == 0)){ currentID = new StringBuilder(); currentID.append(ch, start, length); } else if (currentTag.equals("text")){ currentWikitext = currentWikitext.append(ch, start, length); } } }