/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.filtering.robots;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.filtering.URLFilter;
import com.digitalpebble.stormcrawler.protocol.HttpRobotRulesParser;
import com.digitalpebble.stormcrawler.protocol.ProtocolFactory;
import com.fasterxml.jackson.databind.JsonNode;
import org.apache.storm.Config;
import crawlercommons.robots.BaseRobotRules;
/**
* URLFilter which discards URLs based on the robots.txt directives. This is
* meant to be used on small, limited crawls where the number of hosts is
* finite. Using this on a larger or open crawl would have a negative impact on
* performance as the filter would try to retrieve the robots.txt files for any
* host found.
**/
public class RobotsFilter implements URLFilter {

    /** Parser used to obtain the robots.txt rules applying to a URL. */
    private HttpRobotRulesParser robots;

    /** Supplies the protocol implementation handed to the rules parser. */
    private ProtocolFactory factory;

    /**
     * When true, the robots.txt rules are only consulted for target URLs
     * whose host matches the host of the source URL; all other URLs are
     * returned without a robots lookup. Disabled by default.
     */
    private boolean limitToSameHost = false;

    /**
     * Applies the robots.txt directives to a candidate URL.
     *
     * @param sourceUrl
     *            URL of the page on which the candidate was found; only read
     *            when {@code limitToSameHost} is enabled. A {@code null}
     *            value is tolerated and treated as "host unknown".
     * @param sourceMetadata
     *            metadata of the source page; not used by this filter
     * @param urlToFilter
     *            the candidate URL
     * @return {@code urlToFilter} unchanged if it is kept, or {@code null}
     *         if it is malformed or disallowed by the robots rules
     */
    @Override
    public String filter(URL sourceUrl, Metadata sourceMetadata,
            String urlToFilter) {
        final URL target;
        try {
            target = new URL(urlToFilter);
        } catch (MalformedURLException e) {
            // a URL which cannot be parsed is discarded
            return null;
        }

        // With limitToSameHost the robots rules are consulted only for
        // targets on the same host as the source. Without a source URL the
        // hosts cannot be compared, so the target is handled like one on a
        // different host (kept, no robots lookup) instead of failing with a
        // NullPointerException.
        if (limitToSameHost && (sourceUrl == null
                || !target.getHost().equalsIgnoreCase(sourceUrl.getHost()))) {
            return urlToFilter;
        }

        BaseRobotRules rules = robots.getRobotRulesSet(
                factory.getProtocol(target), urlToFilter);
        return rules.isAllowed(urlToFilter) ? urlToFilter : null;
    }

    /**
     * Builds the protocol factory and the robots rules parser from the Storm
     * configuration and reads the optional boolean parameter
     * {@code limitToSameHost}.
     *
     * @param stormConf
     *            the Storm configuration, copied into a new {@link Config}
     * @param filterParams
     *            JSON parameters of this filter; may be {@code null} or lack
     *            the {@code limitToSameHost} entry, in which case the current
     *            value of the flag is left untouched
     */
    @Override
    public void configure(Map stormConf, JsonNode filterParams) {
        Config conf = new Config();
        conf.putAll(stormConf);
        factory = new ProtocolFactory(conf);
        robots = new HttpRobotRulesParser(conf);

        // tolerate a filter declared without any parameters
        if (filterParams == null) {
            return;
        }
        JsonNode node = filterParams.get("limitToSameHost");
        // non-boolean values are ignored rather than coerced
        if (node != null && node.isBoolean()) {
            limitToSameHost = node.booleanValue();
        }
    }
}