/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.util; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import org.w3c.dom.DocumentFragment; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import com.digitalpebble.stormcrawler.Metadata; /** * Normalises the robots instructions provided by the HTML meta tags or the HTTP * X-Robots-Tag headers. **/ public class RobotsTags { public final static String ROBOTS_NO_INDEX = "robots.noIndex"; public final static String ROBOTS_NO_FOLLOW = "robots.noFollow"; /** * Whether to interpret the noFollow directive strictly (remove links) or * not (remove anchor and do not track original URL). True by default. **/ public final static String ROBOTS_NO_FOLLOW_STRICT = "robots.noFollow.strict"; public final static String ROBOTS_NO_CACHE = "robots.noCache"; private boolean noIndex = false; private boolean noFollow = false; private boolean noCache = false; private static XPathExpression expression; static { XPath xpath = XPathFactory.newInstance().newXPath(); try { expression = xpath.compile("//META"); } catch (XPathExpressionException e) { throw new RuntimeException(e); } } /** Get the values from the fetch metadata **/ public RobotsTags(Metadata metadata) { // HTTP headers // X-Robots-Tag: noindex String[] values = metadata.getValues("X-Robots-Tag"); if (values == null) return; if (values.length == 1) { // just in case they put all the values on a single line values = values[0].split(" *, *"); } parseValues(values); } public RobotsTags() { } // set the values based on the meta tags // HTML tags // <meta name="robots" content="noarchive, nofollow"/> // called by the parser bolts public void extractMetaTags(DocumentFragment doc) throws XPathExpressionException { NodeList nodes = (NodeList) expression.evaluate(doc, XPathConstants.NODESET); if (nodes == null) return; int numNodes = nodes.getLength(); for (int i = 0; i < numNodes; i++) { Node n = (Node) nodes.item(i); // iterate on the attributes // and check that it has name=robots and content // whatever the case is boolean isRobots = false; String content = null; NamedNodeMap attrs = n.getAttributes(); for (int att = 0; att < attrs.getLength(); att++) { Node keyval = attrs.item(att); if ("name".equalsIgnoreCase(keyval.getNodeName()) && "robots".equalsIgnoreCase(keyval.getNodeValue())) { isRobots = true; continue; } if ("content".equalsIgnoreCase(keyval.getNodeName())) { content = keyval.getNodeValue(); continue; } } if (isRobots && content != null) { // got a value - split it String[] vals = content.split(" *, *"); parseValues(vals); } } } private void parseValues(String[] values) { for (String v : values) { v = v.trim(); if ("noindex".equalsIgnoreCase(v)) { noIndex = true; } else if ("nofollow".equalsIgnoreCase(v)) { noFollow = true; } else if ("noarchive".equalsIgnoreCase(v)) { noCache = true; } else if ("none".equalsIgnoreCase(v)) { noIndex = true; noFollow = true; noCache = true; } } } /** Adds a normalised representation of the directives in the metadata **/ public void normaliseToMetadata(Metadata metadata) { metadata.setValue(ROBOTS_NO_INDEX, Boolean.toString(noIndex)); metadata.setValue(ROBOTS_NO_CACHE, Boolean.toString(noCache)); metadata.setValue(ROBOTS_NO_FOLLOW, Boolean.toString(noFollow)); } public boolean isNoIndex() { return noIndex; } public boolean isNoFollow() { return noFollow; } public boolean isNoCache() { return noCache; } }