package edu.jhu.nlp.wikipedia;

import java.io.InputStream;

import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * A SAX parser for Wikipedia XML dumps.
 *
 * <p>Pages are delivered to a {@link PageCallbackHandler}. Unless a custom
 * handler is installed via {@link #setPageCallback(PageCallbackHandler)}, an
 * {@link IteratorHandler} created at construction time is used.
 *
 * @author Jason Smith
 */
public class WikiXMLSAXParser extends WikiXMLParser {

    /** Underlying SAX reader; stays {@code null} if it could not be created. */
    private XMLReader xmlReader;

    /** Handler wrapped in a {@link SAXPageCallbackHandler} when {@link #parse()} runs. */
    private PageCallbackHandler pageHandler = null;

    /**
     * Failure recorded while setting up the reader and default handler, or
     * {@code null} if setup succeeded. Reported as the cause when
     * {@link #parse()} finds that no reader is available.
     */
    private SAXException initFailure = null;

    /**
     * Creates a parser that reads the dump from a stream.
     *
     * @param is stream passed to the superclass constructor
     */
    public WikiXMLSAXParser(InputStream is) {
        super(is);
        this.initReaderHandler();
    }

    /**
     * Creates a parser that reads the dump from a named file.
     *
     * @param fileName file name passed to the superclass constructor
     */
    public WikiXMLSAXParser(String fileName) {
        super(fileName);
        this.initReaderHandler();
    }

    /**
     * Creates the SAX reader and installs the default {@link IteratorHandler}.
     *
     * <p>A {@link SAXException} raised here is not propagated, so the
     * constructors keep their non-throwing signatures. It is remembered in
     * {@link #initFailure} instead, so that {@link #parse()} can report the
     * real cause rather than failing on a {@code null} reader.
     */
    private void initReaderHandler() {
        try {
            xmlReader = XMLReaderFactory.createXMLReader();
            pageHandler = new IteratorHandler(this);
        } catch (SAXException e) {
            initFailure = e;
        }
    }

    /**
     * Set a callback handler. The callback is executed every time a
     * page instance is detected in the stream. Custom handlers are
     * implementations of {@link PageCallbackHandler}.
     *
     * @param handler the handler that replaces the current one
     * @throws Exception declared for compatibility; this implementation does not throw
     */
    public void setPageCallback(PageCallbackHandler handler) throws Exception {
        pageHandler = handler;
    }

    /**
     * The main parse method. Wraps the current page handler in a
     * {@link SAXPageCallbackHandler}, registers it as the reader's content
     * handler and parses the input source supplied by {@code getInputSource()}.
     *
     * @throws IllegalStateException if the SAX reader could not be created
     *         when this parser was constructed; the original
     *         {@link SAXException} is attached as the cause
     * @throws Exception if reading or parsing the input fails
     */
    public void parse() throws Exception {
        if (xmlReader == null) {
            // Fail with the recorded cause instead of a bare NullPointerException.
            throw new IllegalStateException(
                    "SAX XMLReader could not be created; unable to parse.", initFailure);
        }
        xmlReader.setContentHandler(new SAXPageCallbackHandler(pageHandler));
        xmlReader.parse(getInputSource());
    }

    /**
     * This parser is event driven, so it can't provide a page iterator.
     * The method therefore never returns normally.
     *
     * @return never returns
     * @throws Exception if the current page handler is not an {@link IteratorHandler}
     * @throws UnsupportedOperationException if the current page handler is an
     *         {@link IteratorHandler}
     */
    @Override
    public WikiPageIterator getIterator() throws Exception {
        if (!(pageHandler instanceof IteratorHandler)) {
            throw new Exception("Custom page callback found. Will not iterate.");
        }
        throw new UnsupportedOperationException();
    }

    /**
     * A convenience method for the Wikipedia SAX interface. Obtains a SAX
     * parser from {@link WikiXMLParserFactory}, installs the handler and
     * runs the parse.
     *
     * @param dumpFile path to the Wikipedia dump
     * @param handler callback handler used for parsing
     * @throws Exception if installing the handler or parsing fails
     */
    public static void parseWikipediaDump(String dumpFile,
            PageCallbackHandler handler) throws Exception {
        WikiXMLParser wxsp = WikiXMLParserFactory.getSAXParser(dumpFile);
        wxsp.setPageCallback(handler);
        wxsp.parse();
    }
}