/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.bolt; import static com.digitalpebble.stormcrawler.Constants.StatusStreamName; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.apache.http.entity.ContentType; import org.apache.storm.metric.api.MultiCountMetric; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; import org.apache.storm.topology.OutputFieldsDeclarer; import org.apache.storm.tuple.Fields; import org.apache.storm.tuple.Tuple; import org.apache.storm.tuple.Values; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.mime.MediaType; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.LoggerFactory; import org.w3c.dom.DocumentFragment; import com.digitalpebble.stormcrawler.Constants; import com.digitalpebble.stormcrawler.Metadata; import 
com.digitalpebble.stormcrawler.parse.JSoupDOMBuilder;
import com.digitalpebble.stormcrawler.parse.Outlink;
import com.digitalpebble.stormcrawler.parse.ParseData;
import com.digitalpebble.stormcrawler.parse.ParseFilter;
import com.digitalpebble.stormcrawler.parse.ParseFilters;
import com.digitalpebble.stormcrawler.parse.ParseResult;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.digitalpebble.stormcrawler.util.RefreshTag;
import com.digitalpebble.stormcrawler.util.RobotsTags;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
 * Parser for HTML documents only which uses ICU4J to detect the charset
 * encoding. Kindly donated to storm-crawler by shopstyle.com.
 */
@SuppressWarnings("serial")
public class JSoupParserBolt extends StatusEmitterBolt {

    /** Metadata key name for tracking the anchors */
    public static final String ANCHORS_KEY_NAME = "anchors";

    private static final org.slf4j.Logger LOG = LoggerFactory
            .getLogger(JSoupParserBolt.class);

    /** Counters for successes, errors and outlink filtering. */
    private MultiCountMetric eventCounter;

    /** Filters applied to the parse result; loaded in prepare(). */
    private ParseFilter parseFilters = null;

    /** Tika detector used by {@link #guessMimeType(String, String, byte[])}. */
    private Detector detector = TikaConfig.getDefaultConfig().getDetector();

    /** Config "detect.mimetype": guess the mimetype instead of trusting the HTTP header. */
    private boolean detectMimeType = true;

    /** Config "track.anchors": store anchor texts in the outlink metadata. */
    private boolean trackAnchors = true;

    /** Config "parser.emitOutlinks": emit outlinks on the status stream. */
    private boolean emitOutlinks = true;

    /**
     * Config {@link RobotsTags#ROBOTS_NO_FOLLOW_STRICT}: when true, links
     * covered by a nofollow directive are dropped altogether.
     */
    private boolean robots_noFollow_strict = true;

    /**
     * If a Tuple is not HTML whether to send it to the status stream as an
     * error or pass it on the default stream
     **/
    private boolean treat_non_html_as_error = true;

    /** ICU4J detector, created in prepare() and reused for every tuple. */
    private CharsetDetector charsetDetector;

    /**
     * Length of content to use for detecting the charset. Set to -1 (or any
     * negative value) to use the full content (will make the parser slow), 0
     * to deactivate the detection altogether, or any other value (at least a
     * few hundred bytes).
     **/
    private int maxLengthCharsetDetection = -1;

    /**
     * Registers the metrics, loads the parse filters and reads the
     * configuration of the bolt.
     *
     * @param conf
     *            topology configuration
     * @param context
     *            topology context, used to register the metric
     * @param collector
     *            collector handed over to the parent class
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public void prepare(Map conf, TopologyContext context,
            OutputCollector collector) {

        super.prepare(conf, context, collector);

        eventCounter = context.registerMetric(this.getClass().getSimpleName(),
                new MultiCountMetric(), 10);

        parseFilters = ParseFilters.fromConf(conf);

        emitOutlinks = ConfUtils.getBoolean(conf, "parser.emitOutlinks", true);

        trackAnchors = ConfUtils.getBoolean(conf, "track.anchors", true);

        robots_noFollow_strict = ConfUtils.getBoolean(conf,
                RobotsTags.ROBOTS_NO_FOLLOW_STRICT, true);

        treat_non_html_as_error = ConfUtils.getBoolean(conf,
                "jsoup.treat.non.html.as.error", true);

        detectMimeType = ConfUtils.getBoolean(conf, "detect.mimetype", true);

        charsetDetector = new CharsetDetector();

        maxLengthCharsetDetection = ConfUtils.getInt(conf,
                "detect.charset.maxlength", -1);
    }

    /**
     * Parses the binary content of a tuple as HTML.
     * <p>
     * The steps are: check the mimetype, determine the charset, parse with
     * Jsoup, extract the links (honouring the robots directives), handle a
     * refresh-tag redirection, run the parse filters and finally emit the
     * outlinks on the status stream and the documents on the default stream.
     * Every code path acks the tuple; failures are reported on the status
     * stream through
     * {@link #handleException(String, Throwable, Metadata, Tuple, String, String)}.
     *
     * @param tuple
     *            input with the fields "content", "url" and "metadata"
     */
    @Override
    public void execute(Tuple tuple) {

        byte[] content = tuple.getBinaryByField("content");

        String url = tuple.getStringByField("url");
        Metadata metadata = (Metadata) tuple.getValueByField("metadata");

        LOG.info("Parsing : starting {}", url);

        // check that its content type is HTML
        // look at value found in HTTP headers
        boolean CT_OK = false;

        String mimeType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);

        if (detectMimeType) {
            try {
                mimeType = guessMimeType(url, mimeType, content);
            } catch (Exception e) {
                String errorMessage = "Exception while guessing mimetype on "
                        + url + ": " + e;
                handleException(url, e, metadata, tuple, "mimetype guessing",
                        errorMessage);
                return;
            }
            // store identified type in md
            metadata.setValue("parse.Content-Type", mimeType);
        }

        if (StringUtils.isNotBlank(mimeType)) {
            if (mimeType.toLowerCase().contains("html")) {
                CT_OK = true;
            }
        }
        // go ahead even if no mimetype is available
        else {
            CT_OK = true;
        }

        if (!CT_OK) {
            if (this.treat_non_html_as_error) {
                String errorMessage = "Exception content-type " + mimeType
                        + " for " + url;
                RuntimeException e = new RuntimeException(errorMessage);
                handleException(url, e, metadata, tuple,
                        "content-type checking", errorMessage);
            } else {
                LOG.info("Incorrect mimetype - passing on : {}", url);
                collector.emit(tuple, new Values(url, content, metadata, ""));
                collector.ack(tuple);
            }
            return;
        }

        long start = System.currentTimeMillis();

        String charset = getContentCharset(content, metadata);

        // get the robots tags from the fetch metadata
        RobotsTags robotsTags = new RobotsTags(metadata);

        // target URL -> anchor texts found for it
        Map<String, List<String>> slinks;
        String text = "";
        DocumentFragment fragment;
        try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
            org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url);

            fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc);

            // extracts the robots directives from the meta tags
            robotsTags.extractMetaTags(fragment);

            // store a normalised representation in metadata
            // so that the indexer is aware of it
            robotsTags.normaliseToMetadata(metadata);

            // do not extract the links if no follow has been set
            // and we are in strict mode
            if (robotsTags.isNoFollow() && robots_noFollow_strict) {
                slinks = new HashMap<>(0);
            } else {
                Elements links = jsoupDoc.select("a[href]");
                slinks = new HashMap<>(links.size());
                for (Element link : links) {
                    // abs:href tells jsoup to return fully qualified domains
                    // for relative urls.
                    // e.g.: /foo will resolve to http://shopstyle.com/foo
                    String targetURL = link.attr("abs:href");

                    // nofollow
                    boolean noFollow = "nofollow".equalsIgnoreCase(link
                            .attr("rel"));

                    // remove altogether
                    if (noFollow && robots_noFollow_strict) {
                        continue;
                    }

                    // link not specifically marked as no follow
                    // but whole page is
                    if (!noFollow && robotsTags.isNoFollow()) {
                        noFollow = true;
                    }

                    String anchor = link.text();
                    if (StringUtils.isNotBlank(targetURL)) {
                        // any existing anchors for the same target?
                        List<String> anchors = slinks.get(targetURL);
                        if (anchors == null) {
                            anchors = new LinkedList<>();
                            slinks.put(targetURL, anchors);
                        }
                        // track the anchors only if no follow is false
                        if (!noFollow && StringUtils.isNotBlank(anchor)) {
                            anchors.add(anchor);
                        }
                    }
                }
            }

            Element body = jsoupDoc.body();
            if (body != null) {
                text = body.text();
            }

        } catch (Throwable e) {
            String errorMessage = "Exception while parsing " + url + ": " + e;
            handleException(url, e, metadata, tuple, "content parsing",
                    errorMessage);
            return;
        }

        // store identified charset in md
        metadata.setValue("parse.Content-Encoding", charset);

        long duration = System.currentTimeMillis() - start;

        LOG.info("Parsed {} in {} msec", url, duration);

        // redirection?
        try {
            String redirection = RefreshTag.extractRefreshURL(fragment);

            if (StringUtils.isNotBlank(redirection)) {
                // stores the URL it redirects to
                // used for debugging mainly - do not resolve the target
                // URL
                LOG.info("Found redir in {} to {}", url, redirection);
                metadata.setValue("_redirTo", redirection);

                if (allowRedirs()) {
                    emitOutlink(tuple, new URL(url), redirection, metadata);
                }

                // Mark URL as redirected
                collector.emit(StatusStreamName, tuple, new Values(url,
                        metadata, Status.REDIRECTION));
                collector.ack(tuple);
                eventCounter.scope("tuple_success").incr();
                return;
            }
        } catch (MalformedURLException e) {
            // the source URL could not be parsed: log it and carry on with
            // the document as if no redirection had been found
            LOG.error("MalformedURLException on {}", url);
        }

        List<Outlink> outlinks = toOutlinks(url, metadata, slinks);

        ParseResult parse = new ParseResult();
        parse.setOutlinks(outlinks);

        // parse data of the parent URL
        ParseData parseData = parse.get(url);
        parseData.setMetadata(metadata);
        parseData.setText(text);
        parseData.setContent(content);

        // apply the parse filters if any
        try {
            parseFilters.filter(url, content, fragment, parse);
        } catch (RuntimeException e) {
            String errorMessage = "Exception while running parse filters on "
                    + url + ": " + e;
            handleException(url, e, metadata, tuple, "content filtering",
                    errorMessage);
            return;
        }

        if (emitOutlinks) {
            for (Outlink outlink : parse.getOutlinks()) {
                collector.emit(StatusStreamName, tuple,
                        new Values(outlink.getTargetURL(),
                                outlink.getMetadata(), Status.DISCOVERED));
            }
        }

        // emit each document/subdocument in the ParseResult object
        // there should be at least one ParseData item for the "parent" URL
        for (Map.Entry<String, ParseData> doc : parse) {
            ParseData parseDoc = doc.getValue();

            collector.emit(tuple,
                    new Values(doc.getKey(), parseDoc.getContent(),
                            parseDoc.getMetadata(), parseDoc.getText()));
        }

        collector.ack(tuple);
        eventCounter.scope("tuple_success").incr();
    }

    /**
     * Logs an error, reports the URL with the status ERROR on the status
     * stream, acks the tuple and increments the error metrics.
     *
     * @param url
     *            URL being processed
     * @param e
     *            cause of the failure; its simple class name is part of the
     *            metric name
     * @param metadata
     *            metadata of the URL, enriched with the error source and
     *            message
     * @param tuple
     *            tuple to anchor the emit on and to ack
     * @param errorSource
     *            short description of the failing step
     * @param errorMessage
     *            message to log and store in the metadata
     */
    private void handleException(String url, Throwable e, Metadata metadata,
            Tuple tuple, String errorSource, String errorMessage) {
        LOG.error(errorMessage);
        // send to status stream in case another component wants to update
        // its status
        metadata.setValue(Constants.STATUS_ERROR_SOURCE, errorSource);
        metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
        collector.emit(StatusStreamName, tuple, new Values(url, metadata,
                Status.ERROR));
        collector.ack(tuple);
        // Increment metric that is context specific
        String s = "error_" + errorSource.replace(' ', '_') + "_";
        eventCounter.scope(s + e.getClass().getSimpleName()).incrBy(1);
        // Increment general metric
        eventCounter.scope("parse exception").incrBy(1);
    }

    /**
     * Declares the default stream on top of whatever the parent class
     * declares.
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        super.declareOutputFields(declarer);
        // output of this module is the list of fields to index
        // with at least the URL, text content
        declarer.declare(new Fields("url", "content", "metadata", "text"));
    }

    /**
     * Determines the charset of the content. The charset declared in the HTTP
     * Content-Type header, if any, is used as a hint for the ICU4J detector,
     * which then runs on the first {@link #maxLengthCharsetDetection} bytes
     * of the content (or all of it when that value is negative).
     * <p>
     * When {@link #maxLengthCharsetDetection} is 0 the detector is never
     * invoked and the declared charset is returned as-is.
     *
     * @param content
     *            raw bytes of the document
     * @param metadata
     *            metadata holding the HTTP headers
     * @return name of the charset, or null if none was declared or detected;
     *         the value is handed to Jsoup unchanged
     */
    private String getContentCharset(byte[] content, Metadata metadata) {
        String charset = null;

        // check if the server specified a charset
        String specifiedContentType = metadata
                .getFirstValue(HttpHeaders.CONTENT_TYPE);
        if (specifiedContentType != null) {
            try {
                ContentType parsedContentType = ContentType
                        .parse(specifiedContentType);
                // a Content-Type without charset parameter is perfectly
                // legal: test for it instead of relying on a NPE
                if (parsedContentType.getCharset() != null) {
                    charset = parsedContentType.getCharset().name();
                }
            } catch (Exception e) {
                // malformed header or charset unknown to the JVM:
                // carry on without a hint
                charset = null;
            }
        }

        // detection deactivated: return whatever the server declared,
        // even when that is nothing at all
        if (maxLengthCharsetDetection == 0) {
            return charset;
        }

        // filter HTML tags
        charsetDetector.enableInputFilter(true);
        // give it a hint
        charsetDetector.setDeclaredEncoding(charset);

        // trim the content of the text for the detection; a negative limit
        // means the whole content is used
        byte[] subContent = content;
        if (maxLengthCharsetDetection > 0
                && content.length > maxLengthCharsetDetection) {
            subContent = Arrays.copyOfRange(content, 0,
                    maxLengthCharsetDetection);
        }
        charsetDetector.setText(subContent);

        try {
            CharsetMatch charsetMatch = charsetDetector.detect();
            if (charsetMatch != null) {
                charset = charsetMatch.getName();
            }
        } catch (Exception e) {
            // ignore and leave the charset as-is
        }
        return charset;
    }

    /**
     * Guesses the mimetype of a document with Tika, using the content, the
     * file part of the URL and the Content-Type sent by the server as clues.
     *
     * @param URL
     *            URL of the document
     * @param httpCT
     *            Content-Type returned by the server, can be blank
     * @param content
     *            raw bytes of the document
     * @return string representation of the detected media type
     * @throws IllegalStateException
     *             if the URL is malformed or the content cannot be read
     */
    public String guessMimeType(String URL, String httpCT, byte[] content) {

        org.apache.tika.metadata.Metadata metadata = new org.apache.tika.metadata.Metadata();

        if (StringUtils.isNotBlank(httpCT)) {
            // pass content type from server as a clue
            metadata.set(org.apache.tika.metadata.Metadata.CONTENT_TYPE, httpCT);
        }

        // use filename as a clue
        try {
            URL _url = new URL(URL);
            metadata.set(org.apache.tika.metadata.Metadata.RESOURCE_NAME_KEY,
                    _url.getFile());
        } catch (MalformedURLException e1) {
            throw new IllegalStateException("Malformed URL", e1);
        }

        try (InputStream stream = TikaInputStream.get(content)) {
            MediaType mt = detector.detect(stream, metadata);
            return mt.toString();
        } catch (IOException e) {
            throw new IllegalStateException("Unexpected IOException", e);
        }
    }

    /**
     * Converts the links found in a page into outlinks: each target URL goes
     * through filterOutlink() and the survivors are deduplicated on their
     * resulting target URL.
     *
     * @param url
     *            URL of the page containing the links
     * @param metadata
     *            metadata of the page containing the links
     * @param slinks
     *            target URLs mapped to the anchor texts found for them
     * @return the outlinks which were kept; empty if the source URL is
     *         malformed
     */
    private List<Outlink> toOutlinks(String url, Metadata metadata,
            Map<String, List<String>> slinks) {
        Map<String, Outlink> outlinks = new HashMap<>();
        URL sourceUrl;
        try {
            sourceUrl = new URL(url);
        } catch (MalformedURLException e) {
            // we would have known by now as previous components check whether
            // the URL is valid
            LOG.error("MalformedURLException on {}", url);
            eventCounter.scope("error_invalid_source_url").incrBy(1);
            return new LinkedList<>();
        }

        for (Map.Entry<String, List<String>> linkEntry : slinks.entrySet()) {
            String targetURL = linkEntry.getKey();

            Outlink ol = filterOutlink(sourceUrl, targetURL, metadata);
            if (ol == null) {
                eventCounter.scope("outlink_filtered").incr();
                continue;
            }

            // the same link could already be there post-normalisation
            Outlink old = outlinks.get(ol.getTargetURL());
            if (old != null) {
                ol = old;
            }

            List<String> anchors = linkEntry.getValue();
            if (trackAnchors && !anchors.isEmpty()) {
                ol.getMetadata().addValues(ANCHORS_KEY_NAME, anchors);
                // sets the first anchor
                ol.setAnchor(anchors.get(0));
            }

            if (old == null) {
                outlinks.put(ol.getTargetURL(), ol);
                eventCounter.scope("outlink_kept").incr();
            }
        }

        return new LinkedList<>(outlinks.values());
    }
}