/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.parser.html; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; /** * The default HTML mapping rules in Tika. * * @since Apache Tika 0.6 */ @SuppressWarnings("serial") public class DefaultHtmlMapper implements HtmlMapper { /** * @since Apache Tika 0.8 */ public static final HtmlMapper INSTANCE = new DefaultHtmlMapper(); // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{ put("H1", "h1"); put("H2", "h2"); put("H3", "h3"); put("H4", "h4"); put("H5", "h5"); put("H6", "h6"); put("P", "p"); put("PRE", "pre"); put("BLOCKQUOTE", "blockquote"); put("Q", "q"); put("UL", "ul"); put("OL", "ol"); put("MENU", "ul"); put("LI", "li"); put("DL", "dl"); put("DT", "dt"); put("DD", "dd"); put("TABLE", "table"); put("THEAD", "thead"); put("TBODY", "tbody"); put("TR", "tr"); put("TH", "th"); put("TD", "td"); put("ADDRESS", "address"); // TIKA-460 - add anchors put("A", "a"); // TIKA-463 - add additional elements that contain URLs (and their sub-elements) put("MAP", "map"); put("AREA", "area"); put("IMG", "img"); put("FRAMESET", "frameset"); put("FRAME", "frame"); put("IFRAME", "iframe"); put("OBJECT", "object"); put("PARAM", "param"); put("INS", "ins"); put("DEL", "del"); }}; private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{ add("STYLE"); add("SCRIPT"); }}; // For information on tags & attributes, see: // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict // http://www.w3schools.com/TAGS/ private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{ put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords")); put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap")); put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling")); put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width")); put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media")); put("map", attrSet("id", "class", "style", "title", "name")); put("area", attrSet("shape", "coords", "href", "nohref", "alt")); put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace")); put("param", attrSet("id", "name", "value", "valuetype", "type")); put("blockquote", attrSet("cite")); put("ins", attrSet("cite", "datetime")); put("del", attrSet("cite", "datetime")); put("q", attrSet("cite")); // TODO - fill out this set. Include core, i18n, etc sets where appropriate. }}; private static Set<String> attrSet(String... attrs) { Set<String> result = new HashSet<String>(); for (String attr : attrs) { result.add(attr); } return result; } public String mapSafeElement(String name) { return SAFE_ELEMENTS.get(name); } /** * Normalizes an attribute name. Assumes that the element name * is valid and normalized */ public String mapSafeAttribute(String elementName, String attributeName) { Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName); if ((safeAttrs != null) && safeAttrs.contains(attributeName)) { return attributeName; } else { return null; } } public boolean isDiscardElement(String name) { return DISCARDABLE_ELEMENTS.contains(name); } }