/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package mj.ocraptor.extraction.tika.parser.html;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import mj.ocraptor.extraction.tika.parser.txt.AutoDetectReader;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.ccil.cowan.tagsoup.HTMLSchema;
import org.ccil.cowan.tagsoup.Schema;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* HTML parser. Uses TagSoup to turn the input document into HTML SAX events,
* and post-processes the events to produce XHTML and metadata expected by Tika
* clients.
*/
public class HtmlParser extends AbstractParser {

    /** Serial version UID */
    private static final long serialVersionUID = 7895315240498733128L;

    /** Media types returned by {@link #getSupportedTypes(ParseContext)}. */
    private static final Set<MediaType> SUPPORTED_TYPES = Collections
            .unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                    MediaType.text("html"),
                    MediaType.application("xhtml+xml"),
                    MediaType.application("vnd.wap.xhtml+xml"),
                    MediaType.application("x-asp"))));

    /**
     * Fallback service loader passed to the {@link AutoDetectReader} when the
     * parse context does not provide a {@link ServiceLoader} of its own.
     */
    private static final ServiceLoader LOADER = new ServiceLoader(
            HtmlParser.class.getClassLoader());

    /**
     * HTML schema singleton used to amortise the heavy instantiation time.
     */
    private static final Schema HTML_SCHEMA = new HTMLSchema();

    /**
     * Returns the fixed, unmodifiable set of media types handled by this
     * parser.
     *
     * @param context
     *          parse context (not consulted by this implementation)
     * @return the supported media types
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses an HTML document: detects the character encoding, records it in
     * the metadata and feeds the document through TagSoup into the given
     * content handler.
     * <p>
     * The content type metadata entry is only overwritten when it is absent or
     * already starts with <code>text/html</code>; the content encoding entry
     * is always set to the detected charset name.
     * <p>
     * A {@link ServiceLoader}, {@link HtmlMapper} and {@link Schema} found in
     * the parse context take precedence over the defaults of this class.
     *
     * @param stream
     *          the document stream; it is wrapped in a
     *          {@link CloseShieldInputStream} before being handed to the reader
     * @param handler
     *          receiver of the resulting SAX events
     * @param metadata
     *          document metadata, read and updated by this method
     * @param context
     *          parse context used to look up optional overrides
     * @throws IOException
     *           propagated from reading the stream
     * @throws SAXException
     *           propagated from the TagSoup parser or the content handler
     * @throws TikaException
     *           propagated from the encoding detection
     */
    public void parse(InputStream stream, ContentHandler handler,
            Metadata metadata, ParseContext context) throws IOException,
            SAXException, TikaException {
        // Automatically detect the character encoding. The stream is wrapped in
        // a close shield so that closing the reader in the finally block should
        // leave the caller's stream open.
        AutoDetectReader reader = new AutoDetectReader(
                new CloseShieldInputStream(stream), metadata,
                context.get(ServiceLoader.class, LOADER));
        try {
            Charset charset = reader.getCharset();

            // Only refine the content type when it is missing or already HTML,
            // so that a more specific type set by the caller is preserved.
            String previous = metadata.get(Metadata.CONTENT_TYPE);
            if (previous == null || previous.startsWith("text/html")) {
                MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
                metadata.set(Metadata.CONTENT_TYPE, type.toString());
            }
            // deprecated, see TIKA-431
            metadata.set(Metadata.CONTENT_ENCODING, charset.name());

            // Get the HTML mapper from the parse context; the default adapter
            // routes mapping decisions through the overridable methods below.
            HtmlMapper mapper = context.get(HtmlMapper.class,
                    new HtmlParserMapper());

            // Parse the HTML document
            org.ccil.cowan.tagsoup.Parser parser =
                    new org.ccil.cowan.tagsoup.Parser();

            // Use schema from context or default
            Schema schema = context.get(Schema.class, HTML_SCHEMA);

            // TIKA-528: Reuse shared schema to avoid heavy instantiation
            parser.setProperty(
                    org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);

            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
            parser.setFeature(
                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

            parser.setContentHandler(new XHTMLDowngradeHandler(
                    new HtmlHandler(mapper, handler, metadata)));

            parser.parse(reader.asInputSource());
        } finally {
            reader.close();
        }
    }

    /**
     * Maps "safe" HTML element names to semantic XHTML equivalents. If the given
     * element is unknown or deemed unsafe for inclusion in the parse output, then
     * this method returns <code>null</code> and the element will be ignored but
     * the content inside it is still processed. See the
     * {@link #isDiscardElement(String)} method for a way to discard the entire
     * contents of an element.
     * <p>
     * Subclasses can override this method to customize the default mapping.
     * This implementation delegates to {@link DefaultHtmlMapper#INSTANCE}.
     *
     * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML
     *             mapping. This method will be removed in Tika 1.0.
     * @since Apache Tika 0.5
     * @param name
     *          HTML element name (upper case)
     * @return XHTML element name (lower case), or <code>null</code> if the
     *         element is unsafe
     */
    @Deprecated
    protected String mapSafeElement(String name) {
        return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
    }

    /**
     * Checks whether all content within the given HTML element should be
     * discarded instead of including it in the parse output. Subclasses can
     * override this method to customize the set of discarded elements.
     * This implementation delegates to {@link DefaultHtmlMapper#INSTANCE}.
     *
     * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML
     *             mapping. This method will be removed in Tika 1.0.
     * @since Apache Tika 0.5
     * @param name
     *          HTML element name (upper case)
     * @return <code>true</code> if content inside the named element should be
     *         ignored, <code>false</code> otherwise
     */
    @Deprecated
    protected boolean isDiscardElement(String name) {
        return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
    }

    /**
     * Asks {@link DefaultHtmlMapper#INSTANCE} how the given attribute of the
     * given element should be mapped. Subclasses can override this method to
     * customize the attribute mapping.
     *
     * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML
     *             mapping. This method will be removed in Tika 1.0.
     * @param elementName
     *          name of the element that carries the attribute
     * @param attributeName
     *          name of the attribute to map
     * @return whatever the default mapper returns for this element/attribute
     *         pair
     **/
    @Deprecated
    public String mapSafeAttribute(String elementName, String attributeName) {
        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName,
                attributeName);
    }

    /**
     * Adapter class that maintains backwards compatibility with the protected
     * HtmlParser methods. Making HtmlParser implement HtmlMapper directly would
     * require those methods to be public, which would break backwards
     * compatibility with subclasses.
     * <p>
     * This is deliberately a non-static inner class: every call is forwarded
     * to the enclosing parser instance so that subclass overrides are honoured.
     *
     * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML
     *             mapping. This class will be removed in Tika 1.0.
     */
    @Deprecated
    private class HtmlParserMapper implements HtmlMapper {

        /** Forwards to {@link HtmlParser#mapSafeElement(String)}. */
        public String mapSafeElement(String name) {
            return HtmlParser.this.mapSafeElement(name);
        }

        /** Forwards to {@link HtmlParser#isDiscardElement(String)}. */
        public boolean isDiscardElement(String name) {
            return HtmlParser.this.isDiscardElement(name);
        }

        /** Forwards to {@link HtmlParser#mapSafeAttribute(String, String)}. */
        public String mapSafeAttribute(String elementName, String attributeName) {
            return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
        }
    }
}