/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.storm.crawler.bolt;

import static com.digitalpebble.storm.crawler.Constants.StatusStreamName;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.http.entity.ContentType;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;

import backtype.storm.metric.api.MultiCountMetric;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

import com.digitalpebble.storm.crawler.Constants;
import com.digitalpebble.storm.crawler.Metadata;
import com.digitalpebble.storm.crawler.filtering.URLFilters;
import com.digitalpebble.storm.crawler.parse.JSoupDOMBuilder;
import com.digitalpebble.storm.crawler.parse.Outlink;
import com.digitalpebble.storm.crawler.parse.ParseFilter;
import com.digitalpebble.storm.crawler.parse.ParseFilters;
import com.digitalpebble.storm.crawler.persistence.Status;
import com.digitalpebble.storm.crawler.protocol.HttpHeaders;
import com.digitalpebble.storm.crawler.util.ConfUtils;
import com.digitalpebble.storm.crawler.util.MetadataTransfer;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
 * Parser for HTML documents only which uses ICU4J to detect the charset
 * encoding. Kindly donated to storm-crawler by shopstyle.com.
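 * <p>
 * A minimal sketch of how this bolt might be wired into a topology. The
 * upstream component name "fetch" below is an illustrative assumption;
 * whichever component precedes this bolt must emit tuples carrying the
 * "url", "content" and "metadata" fields consumed by
 * {@link #execute(Tuple)}:
 * </p>
 *
 * <pre>
 * TopologyBuilder builder = new TopologyBuilder();
 * // "fetch" is a hypothetical upstream bolt emitting "url", "content", "metadata"
 * builder.setBolt("parse", new JSoupParserBolt()).shuffleGrouping("fetch");
 * </pre>
 *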
 */
@SuppressWarnings("serial")
public class JSoupParserBolt extends BaseRichBolt {

    /** Metadata key name for tracking the anchors */
    public static final String ANCHORS_KEY_NAME = "anchors";

    private static final Logger LOG = LoggerFactory
            .getLogger(JSoupParserBolt.class);

    private OutputCollector collector;

    private MultiCountMetric eventCounter;

    private ParseFilter parseFilters = null;

    private URLFilters urlFilters = null;

    private MetadataTransfer metadataTransfer;

    private boolean trackAnchors = true;

    private boolean emitOutlinks = true;

    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public void prepare(Map conf, TopologyContext context,
            OutputCollector collector) {
        this.collector = collector;

        eventCounter = context.registerMetric(this.getClass().getSimpleName(),
                new MultiCountMetric(), 10);

        parseFilters = ParseFilters.emptyParseFilter;

        String parseconfigfile = ConfUtils.getString(conf,
                "parsefilters.config.file", "parsefilters.json");
        if (parseconfigfile != null) {
            try {
                parseFilters = new ParseFilters(conf, parseconfigfile);
            } catch (IOException e) {
                LOG.error("Exception caught while loading the ParseFilters");
                throw new RuntimeException(
                        "Exception caught while loading the ParseFilters", e);
            }
        }

        urlFilters = URLFilters.emptyURLFilters;
        emitOutlinks = ConfUtils.getBoolean(conf, "parser.emitOutlinks", true);
        if (emitOutlinks) {
            String urlconfigfile = ConfUtils.getString(conf,
                    "urlfilters.config.file", "urlfilters.json");
            if (urlconfigfile != null) {
                try {
                    urlFilters = new URLFilters(conf, urlconfigfile);
                } catch (IOException e) {
                    LOG.error("Exception caught while loading the URLFilters");
                    throw new RuntimeException(
                            "Exception caught while loading the URLFilters", e);
                }
            }
        }

        trackAnchors = ConfUtils.getBoolean(conf, "track.anchors", true);

        metadataTransfer = MetadataTransfer.getInstance(conf);
    }

    @Override
    public void execute(Tuple tuple) {
        byte[] content = tuple.getBinaryByField("content");
        String url = tuple.getStringByField("url");
        Metadata metadata = (Metadata) tuple.getValueByField("metadata");

        LOG.info("Parsing : starting {}", url);
        long start = System.currentTimeMillis();

        String charset = getContentCharset(content, metadata);

        Map<String, List<String>> slinks;
        String text;
        DocumentFragment fragment;
        try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
            org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url);
            fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc);

            Elements links = jsoupDoc.select("a[href]");
            slinks = new HashMap<String, List<String>>(links.size());
            for (Element link : links) {
                // abs:href tells jsoup to return fully qualified domains for
                // relative urls.
                // e.g.: /foo will resolve to http://shopstyle.com/foo
                String targetURL = link.attr("abs:href");
                String anchor = link.text();
                if (StringUtils.isNotBlank(targetURL)) {
                    List<String> anchors = slinks.get(targetURL);
                    if (anchors == null) {
                        anchors = new LinkedList<String>();
                        slinks.put(targetURL, anchors);
                    }
                    if (StringUtils.isNotBlank(anchor)) {
                        anchors.add(anchor);
                    }
                }
            }

            text = jsoupDoc.body().text();

        } catch (Throwable e) {
            String errorMessage = "Exception while parsing " + url + ": " + e;
            LOG.error(errorMessage);
            // send to status stream in case another component wants to update
            // its status
            metadata.setValue(Constants.STATUS_ERROR_SOURCE, "content parsing");
            metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
            collector.emit(StatusStreamName, tuple, new Values(url, metadata,
                    Status.ERROR));
            collector.ack(tuple);
            // Increment metric that is context specific
            eventCounter.scope(
                    "error_content_parsing_" + e.getClass().getSimpleName())
                    .incrBy(1);
            // Increment general metric
            eventCounter.scope("parse exception").incrBy(1);
            return;
        }

        // store identified charset in md
        metadata.setValue("parse.Content-Encoding", charset);

        long duration = System.currentTimeMillis() - start;
        LOG.info("Parsed {} in {} msec", url, duration);

        List<Outlink> outlinks = toOutlinks(url, metadata, slinks);

        // apply the parse filters if any
        try {
            parseFilters.filter(url, content, fragment, metadata, outlinks);
        } catch (RuntimeException e) {
            String errorMessage = "Exception while running parse filters on "
                    + url + ": " + e;
            LOG.error(errorMessage);
            metadata.setValue(Constants.STATUS_ERROR_SOURCE,
                    "content filtering");
            metadata.setValue(Constants.STATUS_ERROR_MESSAGE, errorMessage);
            collector.emit(StatusStreamName, tuple, new Values(url, metadata,
                    Status.ERROR));
            collector.ack(tuple);
            // Increment metric that is context specific
            eventCounter.scope(
                    "error_content_filtering_" + e.getClass().getSimpleName())
                    .incrBy(1);
            // Increment general metric
            eventCounter.scope("parse exception").incrBy(1);
            return;
        }

        if (emitOutlinks) {
            for (Outlink outlink : outlinks) {
                collector.emit(
                        StatusStreamName,
                        tuple,
                        new Values(outlink.getTargetURL(), outlink
                                .getMetadata(), Status.DISCOVERED));
            }
        }

        collector.emit(tuple, new Values(url, content, metadata, text.trim()));
        collector.ack(tuple);
        eventCounter.scope("tuple_success").incr();
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // output of this module is the list of fields to index
        // with at least the URL, text content
        declarer.declare(new Fields("url", "content", "metadata", "text"));
        declarer.declareStream(StatusStreamName, new Fields("url", "metadata",
                "status"));
    }

    private String getContentCharset(byte[] content, Metadata metadata) {
        String charset = null;

        // check if the server specified a charset
        String specifiedContentType = metadata
                .getFirstValue(HttpHeaders.CONTENT_TYPE);
        try {
            if (specifiedContentType != null) {
                ContentType parsedContentType = ContentType
                        .parse(specifiedContentType);
                charset = parsedContentType.getCharset().name();
            }
        } catch (Exception e) {
            charset = null;
        }

        // filter HTML tags
        CharsetDetector detector = new CharsetDetector();
        detector.enableInputFilter(true);
        // give it a hint
        detector.setDeclaredEncoding(charset);
        detector.setText(content);
        try {
            CharsetMatch charsetMatch = detector.detect();
            if (charsetMatch != null) {
                charset = charsetMatch.getName();
            }
        } catch (Exception e) {
            // ignore and leave the charset as-is
        }
        return charset;
    }

    private List<Outlink> toOutlinks(String url, Metadata metadata,
            Map<String, List<String>> slinks) {
        List<Outlink> outlinks = new LinkedList<Outlink>();

        URL sourceUrl;
        try {
            sourceUrl = new URL(url);
        } catch (MalformedURLException e) {
            // we would have known by now as previous components check whether
            // the URL is valid
            LOG.error("MalformedURLException on {}", url);
            eventCounter.scope("error_invalid_source_url").incrBy(1);
            return outlinks;
        }

        Map<String, List<String>> linksKept = new HashMap<String, List<String>>();

        for (Map.Entry<String, List<String>> linkEntry : slinks.entrySet()) {
            String targetURL = linkEntry.getKey();

            // filter the urls
            if (urlFilters != null) {
                targetURL = urlFilters.filter(sourceUrl, metadata, targetURL);
                if (targetURL == null) {
                    eventCounter.scope("outlink_filtered").incr();
                    continue;
                }
            }

            // the link has survived the various filters
            if (targetURL != null) {
                List<String> anchors = linkEntry.getValue();
                linksKept.put(targetURL, anchors);
                eventCounter.scope("outlink_kept").incr();
            }
        }

        for (String outlink : linksKept.keySet()) {
            // configure which metadata gets inherited from parent
            Metadata linkMetadata = metadataTransfer.getMetaForOutlink(outlink,
                    url, metadata);
            Outlink ol = new Outlink(outlink);
            // add the anchors to the metadata?
            if (trackAnchors) {
                List<String> anchors = linksKept.get(outlink);
                if (anchors.size() > 0) {
                    linkMetadata.addValues(ANCHORS_KEY_NAME, anchors);
                    // sets the first anchor
                    ol.setAnchor(anchors.get(0));
                }
            }
            ol.setMetadata(linkMetadata);
            outlinks.add(ol);
        }
        return outlinks;
    }
}