/******************************************************************************
* Copyright (c) 2010 Basis Technology Corp.
*
* Basis Technology Corp. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.basistech.readability;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import org.xml.sax.Attributes;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
import org.cyberneko.html.parsers.SAXParser;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Due to bugs in the Jsoup parser, we want a class that uses Neko to do the parse.
* The same trick could be played with JSoup.
*/
public class NekoJsoupParser {
private static final Logger LOG = LoggerFactory.getLogger(NekoJsoupParser.class);
public NekoJsoupParser() {
//
}
private final class LocalErrorHandler implements ErrorHandler {
@Override
public void error(SAXParseException e) throws SAXException {
LOG.error("Parse error", e);
throw e;
}
@Override
public void fatalError(SAXParseException e) throws SAXException {
LOG.error("Parse error", e);
throw e;
}
@Override
public void warning(SAXParseException e) throws SAXException {
LOG.warn("Parse warning", e);
}
}
private class Handler extends DefaultHandler {
private Document document;
private Element currentElement;
private int depth;
Handler(Document document) {
this.document = document;
}
@Override
public void characters(char[] data, int start, int length) throws SAXException {
assert currentElement != null;
currentElement.appendText(new String(data, start, length));
}
@Override
public void endDocument() throws SAXException {
assert depth == 0;
}
@Override
public void endElement(String uri, String localName, String qname) throws SAXException {
LOG.debug("end element " + qname);
currentElement = currentElement.parent();
depth--;
}
@Override
public void ignorableWhitespace(char[] data, int start, int length) throws SAXException {
characters(data, start, length);
}
@Override
public void startDocument() throws SAXException {
currentElement = document;
}
@Override
public void startElement(String uri, String localName, String qname, Attributes attrs) throws SAXException {
LOG.debug("start element " + qname + " " + depth);
Element newElement;
newElement = currentElement.appendElement(localName);
for (int ax = 0; ax < attrs.getLength(); ax++) {
String name = attrs.getQName(ax);
String value = attrs.getValue(ax);
newElement.attr(name, value);
}
currentElement = newElement;
depth++;
}
}
public Document parse(InputStream data, String baseUri) throws SAXException, IOException {
InputSource source = new InputSource();
source.setByteStream(data);
SAXParser nekoParser = new SAXParser();
Document document = new Document(baseUri);
nekoParser.setContentHandler(new Handler(document));
nekoParser.setErrorHandler(new LocalErrorHandler());
nekoParser.parse(source);
return document;
}
public Document parse(String data, String baseUri) throws SAXException, IOException {
InputSource source = new InputSource();
source.setCharacterStream(new StringReader(data));
SAXParser nekoParser = new SAXParser();
Document document = new Document(baseUri);
nekoParser.setContentHandler(new Handler(document));
nekoParser.setErrorHandler(new LocalErrorHandler());
nekoParser.parse(source);
return document;
}
public Document parse(String data) throws SAXException, IOException {
return Jsoup.parse(data);
}
}