/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.parser.html; import java.net.MalformedURLException; import java.net.URL; import java.util.Arrays; import java.util.HashSet; import java.util.Locale; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.sax.TextContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; class HtmlHandler extends TextContentHandler { // List of attributes that need to be resolved. private static final Set<String> URI_ATTRIBUTES = new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite")); private static final Pattern ICBM = Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*"); private final HtmlMapper mapper; private final XHTMLContentHandler xhtml; private final Metadata metadata; private final StringBuilder title = new StringBuilder(); private int bodyLevel = 0; private int discardLevel = 0; private int titleLevel = 0; private boolean isTitleSetToMetadata = false; private HtmlHandler( HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) { super(xhtml); this.mapper = mapper; this.xhtml = xhtml; this.metadata = metadata; // Try to determine the default base URL, if one has not been given if (metadata.get(Metadata.CONTENT_LOCATION) == null) { String name = metadata.get(Metadata.RESOURCE_NAME_KEY); if (name != null) { name = name.trim(); try { new URL(name); // test URL format metadata.set(Metadata.CONTENT_LOCATION, name); } catch (MalformedURLException e) { // The resource name is not a valid URL, ignore it } } } } public HtmlHandler( HtmlMapper mapper, ContentHandler handler, Metadata metadata) { this(mapper, new XHTMLContentHandler(handler, metadata), metadata); } @Override public void startElement( String uri, String local, String name, Attributes atts) throws SAXException { if ("TITLE".equals(name) || titleLevel > 0) { titleLevel++; } if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) { bodyLevel++; } if (mapper.isDiscardElement(name) || discardLevel > 0) { discardLevel++; } if (bodyLevel == 0 && discardLevel == 0) { if ("META".equals(name) && atts.getValue("content") != null) { // TIKA-478: For cases where we have either a name or // "http-equiv", assume that XHTMLContentHandler will emit // these in the <head>, thus passing them through safely. if (atts.getValue("http-equiv") != null) { addHtmlMetadata( atts.getValue("http-equiv"), atts.getValue("content")); } else if (atts.getValue("name") != null) { // Record the meta tag in the metadata addHtmlMetadata( atts.getValue("name"), atts.getValue("content")); } else if (atts.getValue("property") != null) { // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags metadata.add( atts.getValue("property"), atts.getValue("content")); } } else if ("BASE".equals(name) && atts.getValue("href") != null) { startElementWithSafeAttributes("base", atts); xhtml.endElement("base"); metadata.set( Metadata.CONTENT_LOCATION, resolve(atts.getValue("href"))); } else if ("LINK".equals(name)) { startElementWithSafeAttributes("link", atts); xhtml.endElement("link"); } else if ("SCRIPT".equals(name) && atts.getValue("src") != null) { startElementWithSafeAttributes("script", atts); xhtml.endElement("script"); } } if (bodyLevel > 0 && discardLevel == 0) { String safe = mapper.mapSafeElement(name); if (safe != null) { startElementWithSafeAttributes(safe, atts); } } title.setLength(0); } /** * Adds a metadata setting from the HTML <head/> to the Tika metadata * object. The name and value are normalized where possible. */ private void addHtmlMetadata(String name, String value) { if (name == null || value == null) { // ignore } else if (name.equalsIgnoreCase("ICBM")) { Matcher m = ICBM.matcher(value); if (m.matches()) { metadata.set("ICBM", m.group(1) + ", " + m.group(2)); metadata.set(Metadata.LATITUDE, m.group(1)); metadata.set(Metadata.LONGITUDE, m.group(2)); } else { metadata.set("ICBM", value); } } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) { //don't overwrite Metadata.CONTENT_TYPE! MediaType type = MediaType.parse(value); if (type != null) { metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString()); } else { metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value); } } else { metadata.add(name, value); } } private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException { if (atts.getLength() == 0) { xhtml.startElement(name); return; } boolean isObject = name.equals("object"); String codebase = null; if (isObject) { codebase = atts.getValue("", "codebase"); if (codebase != null) { codebase = resolve(codebase); } else { codebase = metadata.get(Metadata.CONTENT_LOCATION); } } AttributesImpl newAttributes = new AttributesImpl(atts); for (int att = 0; att < newAttributes.getLength(); att++) { String attrName = newAttributes.getLocalName(att); String normAttrName = mapper.mapSafeAttribute(name, attrName); if (normAttrName == null) { newAttributes.removeAttribute(att); att--; } else { // We have a remapped attribute name, so set it as it might have changed. newAttributes.setLocalName(att, normAttrName); // And resolve relative links. Eventually this should be pushed // into the HtmlMapper code. if (URI_ATTRIBUTES.contains(normAttrName)) { newAttributes.setValue(att, resolve(newAttributes.getValue(att))); } else if (isObject && "codebase".equals(normAttrName)) { newAttributes.setValue(att, codebase); } else if (isObject && ("data".equals(normAttrName) || "classid".equals(normAttrName))) { newAttributes.setValue( att, resolve(codebase, newAttributes.getValue(att))); } } } if ("img".equals(name) && newAttributes.getValue("", "alt") == null) { newAttributes.addAttribute("", "alt", "alt", "CDATA", ""); } xhtml.startElement(name, newAttributes); } @Override public void endElement( String uri, String local, String name) throws SAXException { if (bodyLevel > 0 && discardLevel == 0) { String safe = mapper.mapSafeElement(name); if (safe != null) { xhtml.endElement(safe); } else if (XHTMLContentHandler.ENDLINE.contains( name.toLowerCase(Locale.ENGLISH))) { // TIKA-343: Replace closing block tags (and <br/>) with a // newline unless the HtmlMapper above has already mapped // them to something else xhtml.newline(); } } if (titleLevel > 0) { titleLevel--; if (titleLevel == 0 && !isTitleSetToMetadata) { metadata.set(TikaCoreProperties.TITLE, title.toString().trim()); isTitleSetToMetadata = true; } } if (bodyLevel > 0) { bodyLevel--; } if (discardLevel > 0) { discardLevel--; } } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (titleLevel > 0 && bodyLevel == 0) { title.append(ch, start, length); } if (bodyLevel > 0 && discardLevel == 0) { super.characters(ch, start, length); } } @Override public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { if (bodyLevel > 0 && discardLevel == 0) { super.ignorableWhitespace(ch, start, length); } } private String resolve(String url) { return resolve(metadata.get(Metadata.CONTENT_LOCATION), url); } private String resolve(String base, String url) { url = url.trim(); // Return the URL as-is if no base URL is available or if the URL // matches a common non-hierarchical or pseudo URI prefix String lower = url.toLowerCase(Locale.ENGLISH); if (base == null || lower.startsWith("urn:") || lower.startsWith("mailto:") || lower.startsWith("tel:") || lower.startsWith("data:") || lower.startsWith("javascript:") || lower.startsWith("about:")) { return url; } try { URL baseURL = new URL(base.trim()); // We need to handle one special case, where the relativeUrl is // just a query string (like "?pid=1"), and the baseUrl doesn't // end with a '/'. In that case, the URL class removes the last // portion of the path, which we don't want. String path = baseURL.getPath(); if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) { return new URL( baseURL.getProtocol(), baseURL.getHost(), baseURL.getPort(), baseURL.getPath() + url).toExternalForm(); } else { return new URL(baseURL, url).toExternalForm(); } } catch (MalformedURLException e) { // Unknown or broken format; just return the URL as received. return url; } } }