/* * Licensed to ElasticSearch and Shay Banon under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. ElasticSearch licenses this * file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.river.wikipedia.support; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; import java.net.URL; /** * A SAX Parser for Wikipedia XML dumps. * * @author Jason Smith */ public class WikiXMLSAXParser extends WikiXMLParser { private XMLReader xmlReader; private PageCallbackHandler pageHandler = null; public WikiXMLSAXParser(URL fileName) { super(fileName); try { xmlReader = XMLReaderFactory.createXMLReader(); pageHandler = new IteratorHandler(this); } catch (SAXException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * Set a callback handler. The callback is executed every time a * page instance is detected in the stream. Custom handlers are * implementations of {@link PageCallbackHandler} * * @param handler * @throws Exception */ public void setPageCallback(PageCallbackHandler handler) throws Exception { pageHandler = handler; } /** * The main parse method. * * @throws Exception */ public void parse() throws Exception { xmlReader.setContentHandler(new SAXPageCallbackHandler(pageHandler)); xmlReader.parse(getInputSource()); } /** * This parser is event driven, so it * can't provide a page iterator. */ @Override public WikiPageIterator getIterator() throws Exception { if (!(pageHandler instanceof IteratorHandler)) { throw new Exception("Custom page callback found. Will not iterate."); } throw new UnsupportedOperationException(); } /** * A convenience method for the Wikipedia SAX interface * * @param dumpFile - path to the Wikipedia dump * @param handler - callback handler used for parsing * @throws Exception */ public static void parseWikipediaDump(URL dumpFile, PageCallbackHandler handler) throws Exception { WikiXMLParser wxsp = WikiXMLParserFactory.getSAXParser(dumpFile); wxsp.setPageCallback(handler); wxsp.parse(); } }