/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.bolt;

import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import com.digitalpebble.stormcrawler.Constants;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.util.ConfUtils;

import crawlercommons.domains.PaidLevelDomain;

/**
 * Generates a partition key for a given URL based on the hostname, domain or
 * IP address.
 */
public class URLPartitionerBolt extends BaseRichBolt {

    private static final Logger LOG = LoggerFactory
            .getLogger(URLPartitionerBolt.class);

    private OutputCollector _collector;

    private MultiCountMetric eventCounter;

    private Map<String, String> cache;

    private String mode = Constants.PARTITION_MODE_HOST;

    @Override
    public void execute(Tuple tuple) {
        String url = tuple.getStringByField("url");
        Metadata metadata = null;
        if (tuple.contains("metadata"))
            metadata = (Metadata) tuple.getValueByField("metadata");
        // maybe there is a field metadata but it can be null
        // or there was no field at all
        if (metadata == null)
            metadata = Metadata.empty;

        String partitionKey = null;
        String host = "";

        // IP in metadata?
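        // an upstream component may already have stored a resolved address
        // under the metadata key "ip"; if so, reuse it as the partition key
        // and skip the DNS lookup further down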
        if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_IP)) {
            String ip_provided = metadata.getFirstValue("ip");
            if (StringUtils.isNotBlank(ip_provided)) {
                partitionKey = ip_provided;
                eventCounter.scope("provided").incrBy(1);
            }
        }

        if (partitionKey == null) {
            URL u;
            try {
                u = new URL(url);
                host = u.getHost();
            } catch (MalformedURLException e1) {
                eventCounter.scope("Invalid URL").incrBy(1);
                LOG.warn("Invalid URL: {}", url);
                // ack it so that it doesn't get replayed
                _collector.ack(tuple);
                return;
            }
        }

        // partition by hostname
        if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_HOST))
            partitionKey = host;

        // partition by domain : needs fixing
        else if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_DOMAIN)) {
            partitionKey = PaidLevelDomain.getPLD(host);
        }

        // partition by IP
        if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_IP)
                && partitionKey == null) {
            // try to get it from cache first
            partitionKey = cache.get(host);
            if (partitionKey != null) {
                eventCounter.scope("from cache").incrBy(1);
            } else {
                try {
                    long start = System.currentTimeMillis();
                    final InetAddress addr = InetAddress.getByName(host);
                    partitionKey = addr.getHostAddress();
                    long end = System.currentTimeMillis();
                    LOG.debug("Resolved IP {} in {} msec for : {}",
                            partitionKey, end - start, url);
                    // add to cache
                    cache.put(host, partitionKey);
                } catch (final Exception e) {
                    eventCounter.scope("Unable to resolve IP").incrBy(1);
                    LOG.warn("Unable to resolve IP for: {}", host);
                    _collector.ack(tuple);
                    return;
                }
            }
        }

        LOG.debug("Partition Key for: {} > {}", url, partitionKey);

        _collector.emit(tuple, new Values(url, partitionKey, metadata));
        _collector.ack(tuple);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("url", "key", "metadata"));
    }

    @Override
    public void prepare(Map stormConf, TopologyContext context,
            OutputCollector collector) {
        mode = ConfUtils.getString(stormConf,
                Constants.PARTITION_MODEParamName,
                Constants.PARTITION_MODE_HOST);

        // check that the mode is known
        if (!mode.equals(Constants.PARTITION_MODE_IP)
                && !mode.equals(Constants.PARTITION_MODE_DOMAIN)
                && !mode.equals(Constants.PARTITION_MODE_HOST)) {
            LOG.error("Unknown partition mode : {} - forcing to byHost", mode);
            mode = Constants.PARTITION_MODE_HOST;
        }

        LOG.info("Using partition mode : {}", mode);

        _collector = collector;

        // Register a "MultiCountMetric" to count different events in this bolt
        // Storm will emit the counts every n seconds to a special bolt via a
        // system stream
        // The data can be accessed by registering a "MetricConsumer" in the
        // topology
        this.eventCounter = context.registerMetric("URLPartitioner",
                new MultiCountMetric(), 10);

        final int MAX_ENTRIES = 500;
        cache = new LinkedHashMap<String, String>(MAX_ENTRIES + 1, .75F, true) {
            // This method is called just after a new entry has been added
            @Override
            public boolean removeEldestEntry(Map.Entry<String, String> eldest) {
                return size() > MAX_ENTRIES;
            }
        };

        // If the cache is to be used by multiple threads,
        // the cache must be wrapped with code to synchronize the methods
        cache = Collections.synchronizedMap(cache);
    }
}