/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package mj.ocraptor.extraction.tika.parser.html; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Set; import mj.ocraptor.extraction.tika.parser.txt.AutoDetectReader; import org.apache.tika.config.ServiceLoader; import org.apache.tika.exception.TikaException; import org.apache.tika.io.CloseShieldInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.ccil.cowan.tagsoup.HTMLSchema; import org.ccil.cowan.tagsoup.Schema; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; /** * HTML parser. Uses TagSoup to turn the input document to HTML SAX events, and * post-processes the events to produce XHTML and metadata expected by Tika * clients. */ public class HtmlParser extends AbstractParser { /** Serial version UID */ private static final long serialVersionUID = 7895315240498733128L; private static final Set<MediaType> SUPPORTED_TYPES = Collections .unmodifiableSet(new HashSet<MediaType>(Arrays.asList( MediaType.text("html"), MediaType.application("xhtml+xml"), MediaType.application("vnd.wap.xhtml+xml"), MediaType.application("x-asp")))); private static final ServiceLoader LOADER = new ServiceLoader( HtmlParser.class.getClassLoader()); /** * HTML schema singleton used to amortise the heavy instantiation time. */ private static final Schema HTML_SCHEMA = new HTMLSchema(); public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // Automatically detect the character encoding AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream( stream), metadata, context.get(ServiceLoader.class, LOADER)); try { Charset charset = reader.getCharset(); // charset = Charset.forName("utf-8"); String previous = metadata.get(Metadata.CONTENT_TYPE); if (previous == null || previous.startsWith("text/html")) { MediaType type = new MediaType(MediaType.TEXT_HTML, charset); metadata.set(Metadata.CONTENT_TYPE, type.toString()); } // deprecated, see TIKA-431 metadata.set(Metadata.CONTENT_ENCODING, charset.name()); // Get the HTML mapper from the parse context HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper()); // Parse the HTML document org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser(); // Use schema from context or default Schema schema = context.get(Schema.class, HTML_SCHEMA); // TIKA-528: Reuse share schema to avoid heavy instantiation parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema); // TIKA-599: Shared schema is thread-safe only if bogons are ignored parser .setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true); parser.setContentHandler(new XHTMLDowngradeHandler(new HtmlHandler( mapper, handler, metadata))); parser.parse(reader.asInputSource()); } finally { reader.close(); } } /** * Maps "safe" HTML element names to semantic XHTML equivalents. If the given * element is unknown or deemed unsafe for inclusion in the parse output, then * this method returns <code>null</code> and the element will be ignored but * the content inside it is still processed. See the * {@link #isDiscardElement(String)} method for a way to discard the entire * contents of an element. * <p> * Subclasses can override this method to customize the default mapping. * * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML * mapping. This method will be removed in Tika 1.0. * @since Apache Tika 0.5 * @param name * HTML element name (upper case) * @return XHTML element name (lower case), or <code>null</code> if the * element is unsafe */ protected String mapSafeElement(String name) { return DefaultHtmlMapper.INSTANCE.mapSafeElement(name); } /** * Checks whether all content within the given HTML element should be * discarded instead of including it in the parse output. Subclasses can * override this method to customize the set of discarded elements. * * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML * mapping. This method will be removed in Tika 1.0. * @since Apache Tika 0.5 * @param name * HTML element name (upper case) * @return <code>true</code> if content inside the named element should be * ignored, <code>false</code> otherwise */ protected boolean isDiscardElement(String name) { return DefaultHtmlMapper.INSTANCE.isDiscardElement(name); } /** * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML * mapping. This method will be removed in Tika 1.0. **/ public String mapSafeAttribute(String elementName, String attributeName) { return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName); } /** * Adapter class that maintains backwards compatibility with the protected * HtmlParser methods. Making HtmlParser implement HtmlMapper directly would * require those methods to be public, which would break backwards * compatibility with subclasses. * * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML * mapping. This class will be removed in Tika 1.0. */ private class HtmlParserMapper implements HtmlMapper { public String mapSafeElement(String name) { return HtmlParser.this.mapSafeElement(name); } public boolean isDiscardElement(String name) { return HtmlParser.this.isDiscardElement(name); } public String mapSafeAttribute(String elementName, String attributeName) { return HtmlParser.this.mapSafeAttribute(elementName, attributeName); } } }