/*
* Copyright 2007 T-Rank AS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package no.trank.openpipe.wikipedia.producer;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Splits a Wikipedia XML dump into {@code <page>...</page>} chunks.
 *
 * <p>The dump is read lazily through a StAX event reader: every call to {@link #hasNext()} /
 * {@link #next()} advances the reader to the next start element whose local name is
 * {@code page} and re-serializes all events up to and including the matching end element into a
 * string. Events outside {@code page} elements are skipped.
 *
 * <p>DTD processing and external entity resolution are switched off on the underlying parser,
 * since dump files are external input and do not need either feature.
 *
 * <p>Instances hold mutable state and perform no synchronization; use one instance per thread.
 * Call {@link #close()} when done with the iterator.
 *
 * @version $Revision$
 */
public class WikiDocumentSplitter implements Iterator<String> {
  private static final Logger log = LoggerFactory.getLogger(WikiDocumentSplitter.class);
  private static final String PAGE_ELEMENT = "page";
  private static final String ENCODING = "UTF-8";

  private final XMLEventReader xmlEventReader;
  private final XMLOutputFactory xmlOutputFactory = XMLOutputFactory.newInstance();
  /** Page that has been read ahead by {@link #hasNext()} but not yet handed out, or null. */
  private String next;

  /**
   * Creates a splitter reading from the given stream.
   *
   * @param input the XML dump to split; must not be null
   * @throws IllegalArgumentException if {@code input} is null
   * @throws XMLStreamException if the XML reader can not be created
   */
  public WikiDocumentSplitter(InputStream input) throws XMLStreamException {
    if (input == null) {
      throw new IllegalArgumentException("Input can not be <null>");
    }
    xmlEventReader = newHardenedInputFactory().createXMLEventReader(input);
  }

  /** Creates an input factory with DTD support and external entity resolution disabled (XXE). */
  private static XMLInputFactory newHardenedInputFactory() {
    XMLInputFactory factory = XMLInputFactory.newInstance();
    disable(factory, XMLInputFactory.SUPPORT_DTD);
    disable(factory, XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES);
    return factory;
  }

  /** Sets a boolean factory property to false, only warning if the parser rejects it. */
  private static void disable(XMLInputFactory factory, String property) {
    try {
      factory.setProperty(property, Boolean.FALSE);
    } catch (IllegalArgumentException e) {
      // Keep working with parsers that do not know the property, as before the hardening.
      log.warn("XML parser does not support property " + property, e);
    }
  }

  /**
   * Reads ahead to the next complete {@code page} element, if one has not been buffered already.
   *
   * @return true if a page is available from {@link #next()}; false if the input is exhausted
   *         or ends before the current page is terminated
   * @throws RuntimeException wrapping the {@link XMLStreamException} if the input can not be
   *         parsed
   */
  @Override
  public boolean hasNext() {
    if (next != null) {
      return true;
    }
    try {
      while (xmlEventReader.hasNext()) {
        XMLEvent xmlEvent = xmlEventReader.nextEvent();
        if (xmlEvent.isStartElement()
            && PAGE_ELEMENT.equals(xmlEvent.asStartElement().getName().getLocalPart())) {
          next = parsePage(xmlEvent);
          return next != null;
        }
      }
      return false;
    } catch (XMLStreamException e) {
      throw new RuntimeException("Could not parse xml", e);
    } catch (UnsupportedEncodingException e) {
      throw new RuntimeException("UTF-8 not supported", e);
    }
  }

  /**
   * Copies events from the reader into a fresh writer until the end of the current page.
   *
   * @param pageStartEvent the already consumed start event of the page element
   * @return the serialized page, or null if the input ended before the closing page tag
   * @throws XMLStreamException if reading or writing an event fails
   * @throws UnsupportedEncodingException if UTF-8 is not available
   */
  private String parsePage(XMLEvent pageStartEvent)
      throws XMLStreamException, UnsupportedEncodingException {
    // Most recent event obtained from the reader; used for diagnostics only.
    XMLEvent lastEvent = pageStartEvent;
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    XMLEventWriter xmlEventWriter = xmlOutputFactory.createXMLEventWriter(out, ENCODING);
    boolean writerClosed = false;
    try {
      xmlEventWriter.add(pageStartEvent);
      while (xmlEventReader.hasNext()) {
        lastEvent = xmlEventReader.nextEvent();
        xmlEventWriter.add(lastEvent);
        if (lastEvent.isEndElement()
            && PAGE_ELEMENT.equals(lastEvent.asEndElement().getName().getLocalPart())) {
          // Close before reading the buffer so that any pending output is flushed into it.
          xmlEventWriter.close();
          writerClosed = true;
          return out.toString(ENCODING);
        }
      }
      // Input ran out inside the page: there is no complete chunk to return.
      return null;
    } catch (XMLStreamException e) {
      log.info("Failed on event {}", lastEvent);
      throw e;
    } finally {
      if (!writerClosed) {
        closeQuietly(xmlEventWriter);
      }
    }
  }

  /** Releases a writer on a failure path without masking the original problem. */
  private static void closeQuietly(XMLEventWriter writer) {
    try {
      writer.close();
    } catch (XMLStreamException e) {
      log.debug("Could not close xml writer", e);
    }
  }

  /**
   * Returns the next page and clears the read-ahead buffer.
   *
   * @return the next serialized {@code page} element
   * @throws NoSuchElementException if there are no more pages
   */
  @Override
  public String next() {
    if (!hasNext()) {
      throw new NoSuchElementException();
    }
    String page = next;
    next = null;
    return page;
  }

  /**
   * Not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException("Operation not supported");
  }

  /**
   * Closes the underlying XML reader. Failures are logged and never propagated.
   *
   * <p>Per the StAX API contract, closing the reader does not close the input stream that was
   * passed to the constructor; the caller remains responsible for it.
   */
  public void close() {
    try {
      xmlEventReader.close();
    } catch (XMLStreamException e) {
      // Best effort: closing must not fail the caller, but the problem should be visible.
      log.warn("Could not close xml reader", e);
    }
  }
}