/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.urlfilter.domain;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.LinkedHashSet;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilter;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.util.URLUtil;
/**
* <p>Filters URLs based on a file containing domain suffixes, domain names, and
* hostnames. Only a url that matches one of the suffixes, domains, or hosts
* present in the file is allowed.</p>
*
* <p>Urls are checked in order of domain suffix, domain name, and hostname
* against entries in the domain file. The domain file would be setup as follows
* with one entry per line:
 *
 * <pre>
 * com
 * apache.org
 * www.apache.org
 * </pre>
 *
* <p>The first line is an example of a filter that would allow all .com
* domains. The second line allows all urls from apache.org and all of its
* subdomains such as lucene.apache.org and hadoop.apache.org. The third line
 * would allow only urls from www.apache.org. There is no specific ordering to
 * entries. The entries are from more general to more specific with the more
 * general overriding the more specific.</p>
*
* The domain file defaults to domain-urlfilter.txt in the classpath but can be
* overridden using the:
*
 * <ol>
 * <li>property "urlfilter.domain.file" in ./conf/nutch-*.xml, and</li>
 * <li>attribute "file" in plugin.xml of this plugin</li>
 * </ol>
*
* the attribute "file" has higher precedence if defined.
*/
public class DomainURLFilter
  implements URLFilter {

  private static final Log LOG = LogFactory.getLog(DomainURLFilter.class);

  // the "file" attribute from this plugin's plugin.xml. Instance field (was
  // static) so that separately configured filter instances cannot clobber
  // each other's value; it is only ever read/written through setConf.
  private String attributeFile = null;
  private Configuration conf;
  private String domainFile = null;
  private Set<String> domainSet = new LinkedHashSet<String>();

  /**
   * Populates the domain set from the given reader, one entry per line.
   * Blank lines and comment lines (starting with '#') are skipped. Entries
   * are trimmed and lower-cased so they match the trimmed, lower-cased
   * values looked up in {@link #filter(String)}.
   *
   * @param configReader reader over the domain file; not closed here — the
   *          caller owns and closes it
   * @throws IOException if reading the file fails
   */
  private void readConfigurationFile(Reader configReader)
    throws IOException {

    // read the configuration file, line by line
    BufferedReader reader = new BufferedReader(configReader);
    String line = null;
    while ((line = reader.readLine()) != null) {
      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
        // add non-blank lines and non-commented lines; trim so stray
        // whitespace in the file cannot silently defeat an exact-match
        // lookup in filter()
        domainSet.add(StringUtils.lowerCase(line.trim()));
      }
    }
  }

  /**
   * Default constructor.
   */
  public DomainURLFilter() {
  }

  /**
   * Constructor that specifies the domain file to use.
   *
   * @param domainFile The domain file, overrides domain-urlfilter.txt default.
   */
  public DomainURLFilter(String domainFile) {
    this.domainFile = domainFile;
  }

  /**
   * Sets the configuration and loads the domain file. The file used is, in
   * order of precedence: the file given to the constructor, the "file"
   * attribute from this plugin's plugin.xml, then the "urlfilter.domain.file"
   * configuration property. The file is looked up as a classpath resource
   * first, falling back to the local filesystem.
   */
  public void setConf(Configuration conf) {
    this.conf = conf;

    // get the extensions for domain urlfilter
    String pluginName = "urlfilter-domain";
    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
      URLFilter.class.getName()).getExtensions();
    for (int i = 0; i < extensions.length; i++) {
      Extension extension = extensions[i];
      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
        attributeFile = extension.getAttribute("file");
        break;
      }
    }

    // treat a blank (whitespace-only) attribute the same as an absent one
    if (attributeFile != null && attributeFile.trim().equals("")) {
      attributeFile = null;
    }

    if (attributeFile != null) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
          + " as " + attributeFile);
      }
    }
    else {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
          + pluginName);
      }
    }

    // domain file and attribute "file" take precedence if defined
    String file = conf.get("urlfilter.domain.file");
    if (domainFile != null) {
      file = domainFile;
    }
    else if (attributeFile != null) {
      file = attributeFile;
    }

    // get the file as a classpath resource and populate the domain set with
    // the domains from the file
    Reader reader = null;
    try {
      reader = conf.getConfResourceAsReader(file);
      if (reader == null) {
        reader = new FileReader(file);
      }
      readConfigurationFile(reader);
    }
    catch (IOException e) {
      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
    finally {
      // always release the underlying stream; the previous version leaked it
      if (reader != null) {
        try {
          reader.close();
        }
        catch (IOException e) {
          LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        }
      }
    }
  }

  public Configuration getConf() {
    return this.conf;
  }

  /**
   * Returns the url unchanged if its domain suffix, domain name, or hostname
   * is present in the domain set, otherwise returns null (url filtered out).
   * Urls that cannot be parsed are rejected as well.
   *
   * @param url the url to check
   * @return the url if allowed, null if filtered out
   */
  public String filter(String url) {

    try {
      // match for suffix, domain, and host in that order. more general will
      // override more specific
      String suffix = URLUtil.getDomainSuffix(url).getDomain();
      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
      String host = URLUtil.getHost(url);
      if (domainSet.contains(suffix) || domainSet.contains(domain)
        || domainSet.contains(host)) {
        return url;
      }

      // doesn't match, don't allow
      return null;
    }
    catch (Exception e) {
      // if an error happens, reject the url (return null) rather than let a
      // malformed url through the filter
      LOG.error("Could not apply filter on url: " + url + "\n"
        + org.apache.hadoop.util.StringUtils.stringifyException(e));
      return null;
    }
  }
}