package org.basex.build.file; import static org.basex.util.Token.*; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.StringWriter; import java.io.Writer; import java.lang.reflect.Constructor; import org.basex.build.xml.XMLParser; import org.basex.core.Prop; import org.basex.io.IO; import org.basex.io.IOContent; import org.basex.io.in.ArrayInput; import org.basex.util.Reflect; import org.basex.util.Util; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; /** * This class uses TagSoup to convert HTML input to well-formed XML. * If TagSoup is not found in the classpath, the original document is passed on. * * TagSoup was written by John Cowan and is based on the Apache 2.0 License: * {@code http://home.ccil.org/~cowan/XML/tagsoup/}. * * @author BaseX Team 2005-12, BSD License * @author Christian Gruen */ public final class HTMLParser extends XMLParser { /** HTML reader. */ private static final Class<?> READER = Reflect.find( "org.ccil.cowan.tagsoup.Parser"); /** HTML reader. */ private static final Constructor<?> WRITER = Reflect.find(Reflect.find( "org.ccil.cowan.tagsoup.XMLWriter"), Writer.class); /** * Checks if a CatalogResolver is available. * @return result of check */ public static boolean available() { return READER != null; } /** * Constructor. * @param source document source * @param target target path * @param prop database properties * @throws IOException I/O exception */ public HTMLParser(final IO source, final String target, final Prop prop) throws IOException { super(toXML(source), target, prop); } /** * Converts an HTML document to XML. * @param io io reference * @return parser * @throws IOException I/O exception */ private static IO toXML(final IO io) throws IOException { // reader could not be initialized; fall back to XML if(READER == null) return io; try { // tries to extract the encoding from the input byte[] content = io.read(); final ArrayInput ai = new ArrayInput(content); String enc = ai.encoding(); content = ai.readBytes(); // looks for a charset definition final byte[] encoding = token("charset="); int cs = indexOf(content, encoding); if(cs > 0) { // extracts the encoding string cs += encoding.length; int ce = cs; while(++ce < content.length && content[ce] > 0x28); enc = string(substring(content, cs, ce)); } // define input final InputSource is = new InputSource(new ByteArrayInputStream(content)); is.setEncoding(supported(enc) ? normEncoding(enc, null) : UTF8); // define output final StringWriter sw = new StringWriter(); final XMLReader reader = (XMLReader) Reflect.get(READER); reader.setContentHandler((ContentHandler) Reflect.get(WRITER, sw)); reader.parse(is); return new IOContent(token(sw.toString()), io.name()); } catch(final SAXException ex) { Util.debug(ex); return io; } } }