/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.util;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.digitalpebble.stormcrawler.Constants;
import com.digitalpebble.stormcrawler.Metadata;
import crawlercommons.domains.PaidLevelDomain;
/**
* Generates a partition key for a given URL based on the hostname, domain or IP
* address. This can be called by the URLPartitionerBolt or any other component.
*/
public class URLPartitioner {
private static final Logger LOG = LoggerFactory
.getLogger(URLPartitioner.class);
private String mode = Constants.PARTITION_MODE_HOST;
/**
* Returns the host, domain, IP of a URL so that it can be partitioned for
* politeness, depending on the value of the config
* <i>partition.url.mode</i>.
**/
public String getPartition(String url, Metadata metadata) {
String partitionKey = null;
String host = "";
// IP in metadata?
if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_IP)) {
String ip_provided = metadata.getFirstValue("ip");
if (StringUtils.isNotBlank(ip_provided)) {
partitionKey = ip_provided;
}
}
if (partitionKey == null) {
URL u;
try {
u = new URL(url);
host = u.getHost();
} catch (MalformedURLException e1) {
LOG.warn("Invalid URL: {}", url);
return null;
}
}
// partition by hostname
if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_HOST))
partitionKey = host;
// partition by domain : needs fixing
else if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_DOMAIN)) {
partitionKey = PaidLevelDomain.getPLD(host);
}
// partition by IP
if (mode.equalsIgnoreCase(Constants.PARTITION_MODE_IP)
&& partitionKey == null) {
try {
long start = System.currentTimeMillis();
final InetAddress addr = InetAddress.getByName(host);
partitionKey = addr.getHostAddress();
long end = System.currentTimeMillis();
LOG.debug("Resolved IP {} in {} msec for : {}", partitionKey,
end - start, url);
} catch (final Exception e) {
LOG.warn("Unable to resolve IP for: {}", host);
return null;
}
}
LOG.debug("Partition Key for: {} > {}", url, partitionKey);
return partitionKey;
}
public void configure(Map stormConf) {
mode = ConfUtils.getString(stormConf,
Constants.PARTITION_MODEParamName,
Constants.PARTITION_MODE_HOST);
// check that the mode is known
if (!mode.equals(Constants.PARTITION_MODE_IP)
&& !mode.equals(Constants.PARTITION_MODE_DOMAIN)
&& !mode.equals(Constants.PARTITION_MODE_HOST)) {
LOG.error("Unknown partition mode : {} - forcing to byHost", mode);
mode = Constants.PARTITION_MODE_HOST;
}
LOG.info("Using partition mode : {}", mode);
}
}