/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.storm.crawler.bolt;

import static com.digitalpebble.storm.crawler.Constants.StatusStreamName;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.http.entity.ContentType;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;

import backtype.storm.metric.api.MultiCountMetric;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

import com.digitalpebble.storm.crawler.Constants;
import com.digitalpebble.storm.crawler.Metadata;
import com.digitalpebble.storm.crawler.filtering.URLFilters;
import com.digitalpebble.storm.crawler.parse.JSoupDOMBuilder;
import com.digitalpebble.storm.crawler.parse.Outlink;
import com.digitalpebble.storm.crawler.parse.ParseFilter;
import com.digitalpebble.storm.crawler.parse.ParseFilters;
import com.digitalpebble.storm.crawler.persistence.Status;
import com.digitalpebble.storm.crawler.protocol.HttpHeaders;
import com.digitalpebble.storm.crawler.util.ConfUtils;
import com.digitalpebble.storm.crawler.util.MetadataTransfer;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
 * Parser for HTML documents only which uses ICU4J to detect the charset
 * encoding. Kindly donated to storm-crawler by shopstyle.com.
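 * <p>
 * A minimal sketch of how this bolt might be wired into a topology. The
 * upstream component name "fetch" below is an illustrative assumption;
 * whichever component precedes this bolt must emit tuples carrying the
 * "url", "content" and "metadata" fields consumed by
 * {@link #execute(Tuple)}:
 * </p>
 *
 * <pre>
 * TopologyBuilder builder = new TopologyBuilder();
 * // "fetch" is a hypothetical upstream bolt emitting "url", "content", "metadata"
 * builder.setBolt("parse", new JSoupParserBolt()).shuffleGrouping("fetch");
 * </pre>
 *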
 */
@SuppressWarnings("serial")
public class JSoupParserBolt extends BaseRichBolt {

    /** Metadata key name for tracking the anchors */
    public static final String ANCHORS_KEY_NAME = "anchors";

    private static final Logger LOG = LoggerFactory
            .getLogger(JSoupParserBolt.class);

    private OutputCollector collector;

    private MultiCountMetric eventCounter;

    private ParseFilter parseFilters = null;

    private URLFilters urlFilters = null;

    private MetadataTransfer metadataTransfer;

    private boolean trackAnchors = true;

    private boolean emitOutlinks = true;

    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public void prepare(Map conf, TopologyContext context,
            OutputCollector collector) {
        this.collector = collector;

        eventCounter = context.registerMetric(this.getClass().getSimpleName(),
                new MultiCountMetric(), 10);

        parseFilters = ParseFilters.emptyParseFilter;

        String parseconfigfile = ConfUtils.getString(conf,
                "parsefilters.config.file", "parsefilters.json");
        if (parseconfigfile != null) {
            try {
                parseFilters = new ParseFilters(conf, parseconfigfile);
            } catch (IOException e) {
                LOG.error("Exception caught while loading the ParseFilters");
                throw new RuntimeException(
                        "Exception caught while loading the ParseFilters", e);
            }
        }

        urlFilters = URLFilters.emptyURLFilters;
        emitOutlinks = ConfUtils.getBoolean(conf, "parser.emitOutlinks", true);
        if (emitOutlinks) {
            String urlconfigfile = ConfUtils.getString(conf,
                    "urlfilters.config.file", "urlfilters.json");
            if (urlconfigfile != null) {
                try {
                    urlFilters = new URLFilters(conf, urlconfigfile);
                } catch (IOException e) {
                    LOG.error("Exception caught while loading the URLFilters");
                    throw new RuntimeException(
                            "Exception caught while loading the URLFilters", e);
                }
            }
        }

        trackAnchors = ConfUtils.getBoolean(conf, "track.anchors", true);

        metadataTransfer = MetadataTransfer.getInstance(conf);
    }

    @Override
    public void execute(Tuple tuple) {
        byte[] content = tuple.getBinaryByField("content");
        String url = tuple.getStringByField("url");
        Metadata metadata = (Metadata) tuple.getValueByField("metadata");

        LOG.info("Parsing : starting {}", url);
        long start = System.currentTimeMillis();

        String charset = getContentCharset(content, metadata);

        Map<String, List<String>> slinks;
        String text;
        DocumentFragment fragment;
        try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
            org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url);
            fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc);

            Elements links = jsoupDoc.select("a[href]");
            slinks = new HashMap<String, List<String>>(links.size());
            for (Element link : links) {
                // abs:href tells jsoup to return fully qualified domains for
                // relative urls.
                // e.g.: /foo will resolve to http://shopstyle.com/foo
                String targetURL = link.attr("abs:href");
                String anchor = link.text();
                if (StringUtils.isNotBlank(targetURL)) {
                    List<String> anchors = slinks.get(targetURL);
                    if (anchors == null) {
                        anchors = new LinkedList<String>();
                        slinks.put(targetURL, anchors);
                    }
                    if (StringUtils.isNotBlank(anchor)) {
                        anchors.add(anchor);
                    }
                }
            }

            text = jsoupDoc.body().text();

        } catch (Throwable e) {
            String errorMessage = "Exception while parsing " + url + ": " + e;
            LOG.error(errorMessage);
            // send to status stream in case another component wants to update
            // its status
            metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content parsing");
            metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
            collector.emit(StatusStreamName, tuple, new Values(url, metadata,
                    Status.ERROR));
            collector.ack(tuple);
            // Increment metric that is context specific
            eventCounter.scope(
                    "error_content_parsing_" + e.getClass().getSimpleName())
                    .incrBy(1);
            // Increment general metric
            eventCounter.scope("parse exception").incrBy(1);
            return;
        }

        // store identified charset in md
        metadata.setValue("parse.Content-Encoding", charset);

        long duration = System.currentTimeMillis() - start;
        LOG.info("Parsed {} in {} msec", url, duration);

        List<Outlink> outlinks = toOutlinks(url, metadata, slinks);

        // apply the parse filters if any
        try {
            parseFilters.filter(url, content, fragment, metadata, outlinks);
        } catch (RuntimeException e) {
            String errorMessage = "Exception while running parse filters on "
                    + url + ": " + e;
            LOG.error(errorMessage);
            metadata.setValue(Constants.STATUS_ERROR_SOURCE,
                    "content filtering");
            metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
            collector.emit(StatusStreamName, tuple, new Values(url, metadata,
                    Status.ERROR));
            collector.ack(tuple);
            // Increment metric that is context specific
            eventCounter.scope(
                    "error_content_filtering_" + e.getClass().getSimpleName())
                    .incrBy(1);
            // Increment general metric
            eventCounter.scope("parse exception").incrBy(1);
            return;
        }

        if (emitOutlinks) {
            for (Outlink outlink : outlinks) {
                collector.emit(
                        StatusStreamName,
                        tuple,
                        new Values(outlink.getTargetURL(), outlink
                                .getMetadata(), Status.DISCOVERED));
            }
        }

        collector.emit(tuple, new Values(url, content, metadata, text.trim()));
        collector.ack(tuple);
        eventCounter.scope("tuple_success").incr();
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // output of this module is the list of fields to index
        // with at least the URL, text content
        declarer.declare(new Fields("url", "content", "metadata", "text"));
        declarer.declareStream(StatusStreamName, new Fields("url", "metadata",
                "status"));
    }

    private String getContentCharset(byte[] content, Metadata metadata) {
        String charset = null;

        // check if the server specified a charset
        String specifiedContentType = metadata
                .getFirstValue(HttpHeaders.CONTENT_TYPE);
        try {
            if (specifiedContentType != null) {
                ContentType parsedContentType = ContentType
                        .parse(specifiedContentType);
                charset = parsedContentType.getCharset().name();
            }
        } catch (Exception e) {
            charset = null;
        }

        // filter HTML tags
        CharsetDetector detector = new CharsetDetector();
        detector.enableInputFilter(true);
        // give it a hint
        detector.setDeclaredEncoding(charset);
        detector.setText(content);
        try {
            CharsetMatch charsetMatch = detector.detect();
            if (charsetMatch != null) {
                charset = charsetMatch.getName();
            }
        } catch (Exception e) {
            // ignore and leave the charset as-is
        }
        return charset;
    }

    private List<Outlink> toOutlinks(String url, Metadata metadata,
            Map<String, List<String>> slinks) {
        List<Outlink> outlinks = new LinkedList<Outlink>();

        URL sourceUrl;
        try {
            sourceUrl = new URL(url);
        } catch (MalformedURLException e) {
            // we would have known by now as previous components check whether
            // the URL is valid
            LOG.error("MalformedURLException on {}", url);
            eventCounter.scope("error_invalid_source_url").incrBy(1);
            return outlinks;
        }

        Map<String, List<String>> linksKept = new HashMap<String, List<String>>();

        for (Map.Entry<String, List<String>> linkEntry : slinks.entrySet()) {
            String targetURL = linkEntry.getKey();

            // filter the urls
            if (urlFilters != null) {
                targetURL = urlFilters.filter(sourceUrl, metadata, targetURL);
                if (targetURL == null) {
                    eventCounter.scope("outlink_filtered").incr();
                    continue;
                }
            }

            // the link has survived the various filters
            if (targetURL != null) {
                List<String> anchors = linkEntry.getValue();
                linksKept.put(targetURL, anchors);
                eventCounter.scope("outlink_kept").incr();
            }
        }

        for (String outlink : linksKept.keySet()) {
            // configure which metadata gets inherited from parent
            Metadata linkMetadata = metadataTransfer.getMetaForOutlink(outlink,
                    url, metadata);
            Outlink ol = new Outlink(outlink);
            // add the anchors to the metadata?
            if (trackAnchors) {
                List<String> anchors = linksKept.get(outlink);
                if (anchors.size() > 0) {
                    linkMetadata.addValues(ANCHORS_KEY_NAME, anchors);
                    // sets the first anchor
                    ol.setAnchor(anchors.get(0));
                }
            }
            ol.setMetadata(linkMetadata);
            outlinks.add(ol);
        }
        return outlinks;
    }
}