package org.elasticsearch.river.wikipedia.support; import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; /** * A Wrapper class for the PageCallbackHandler * * @author Jason Smith */ public class SAXPageCallbackHandler extends DefaultHandler { private PageCallbackHandler pageHandler; private WikiPage currentPage; private String currentTag; private String currentWikitext; private String currentTitle; private String currentID; public SAXPageCallbackHandler(PageCallbackHandler ph) { pageHandler = ph; } public void startElement(String uri, String name, String qName, Attributes attr) { currentTag = qName; if (qName.equals("page")) { currentPage = new WikiPage(); currentWikitext = ""; currentTitle = ""; currentID = ""; } } public void endElement(String uri, String name, String qName) { if (qName.equals("page")) { currentPage.setTitle(currentTitle); currentPage.setID(currentID); currentPage.setWikiText(currentWikitext); pageHandler.process(currentPage); } if (qName.equals("mediawiki")) { // TODO hasMoreElements() should now return false } } public void characters(char ch[], int start, int length) { if (currentTag.equals("title")) { currentTitle = currentTitle.concat(new String(ch, start, length)); } // TODO: To avoid looking at the revision ID, only the first ID is taken. // I'm not sure how big the block size is in each call to characters(), // so this may be unsafe. else if ((currentTag.equals("id")) && (currentID.length() == 0)) { currentID = new String(ch, start, length); } else if (currentTag.equals("text")) { currentWikitext = currentWikitext.concat(new String(ch, start, length)); } } }