/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.urlfilter.domain;

import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilter;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.util.URLUtil;
import org.apache.nutch.util.domain.DomainSuffix;

/**
 * <p>
 * Filters URLs based on a file containing domain suffixes, domain names, and
 * hostnames. Only a url that matches one of the suffixes, domains, or hosts
 * present in the file is allowed.
 * </p>
 *
 * <p>
 * Urls are checked in order of domain suffix, domain name, and hostname against
 * entries in the domain file. The domain file would be setup as follows with
 * one entry per line:
 * </p>
 *
 * <pre>
 * com
 * apache.org
 * www.apache.org
 * </pre>
 *
 * <p>
 * The first line is an example of a filter that would allow all .com domains.
 * The second line allows all urls from apache.org and all of its subdomains
 * such as lucene.apache.org and hadoop.apache.org. The third line would allow
 * only urls from www.apache.org. There is no specific ordering to entries. The
 * entries are from more general to more specific with the more general
 * overriding the more specific.
 * </p>
 *
 * <p>
 * Blank lines and lines whose first non-whitespace character is <code>#</code>
 * are ignored. Entries are stored lower-cased.
 * </p>
 *
 * The domain file defaults to domain-urlfilter.txt in the classpath but can be
 * overridden using the:
 *
 * <ul>
 * <li>
 * property "urlfilter.domain.file" in ./conf/nutch-*.xml, and
 * </li>
 * <li>
 * attribute "file" in plugin.xml of this plugin
 * </li>
 * </ul>
 *
 * the attribute "file" has higher precedence if defined. Rules given inline
 * through the property "urlfilter.domain.rules" take precedence over any file.
 */
public class DomainURLFilter implements URLFilter {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  // read in attribute "file" of this plugin.
  // NOTE(review): this field is static but assigned from setConf(), so the
  // value found by one instance is visible to every instance - confirm that
  // this sharing is intended.
  private static String attributeFile = null;
  private Configuration conf;
  private String domainFile = null;
  private Set<String> domainSet = new LinkedHashSet<String>();

  /**
   * Reads filter entries, one per line, and adds them to the domain set.
   * Blank lines and comment lines (first non-whitespace character is
   * <code>#</code>) are skipped; every other line is trimmed and lower-cased
   * before it is stored.
   *
   * @param configReader
   *          source of the rules; it is always closed before this method
   *          returns, whether reading succeeds or fails
   * @throws IOException
   *           if reading from the source fails
   */
  private void readConfiguration(Reader configReader) throws IOException {
    // try-with-resources closes the BufferedReader and, through it, the
    // wrapped configReader, so file handles are released even on failure
    try (BufferedReader reader = new BufferedReader(configReader)) {
      String line;
      while ((line = reader.readLine()) != null) {
        if (StringUtils.isNotBlank(line)) {
          // test the trimmed text so that indented comment lines are not
          // mistaken for domain entries
          String entry = line.trim();
          if (!entry.startsWith("#")) {
            domainSet.add(StringUtils.lowerCase(entry));
          }
        }
      }
    }
  }

  /**
   * Default constructor.
   */
  public DomainURLFilter() {

  }

  /**
   * Constructor that specifies the domain file to use.
   *
   * @param domainFile
   *          The domain file, overrides domain-urlfilter.txt default.
   */
  public DomainURLFilter(String domainFile) {
    this.domainFile = domainFile;
  }

  /**
   * Sets the configuration and loads the filter rules.
   *
   * <p>
   * The rules come from the first available source in this order: the
   * property "urlfilter.domain.rules" (rules given inline), the domain file
   * passed to the constructor, the attribute "file" of this plugin, and
   * finally the property "urlfilter.domain.file". A file name is first looked
   * up as a configuration resource and, if that yields no reader, opened
   * directly as a file. I/O errors while loading are logged, not thrown.
   * </p>
   *
   * @param conf
   *          the configuration to read the properties from
   */
  public void setConf(Configuration conf) {
    this.conf = conf;

    // get the extensions for domain urlfilter
    String pluginName = "urlfilter-domain";
    Extension[] extensions = PluginRepository.get(conf)
        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
    for (Extension extension : extensions) {
      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
        attributeFile = extension.getAttribute("file");
        break;
      }
    }

    // handle blank non empty input
    if (attributeFile != null && attributeFile.trim().equals("")) {
      attributeFile = null;
    }

    if (attributeFile != null) {
      LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName,
          attributeFile);
    } else {
      LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin {}",
          pluginName);
    }

    // domain file and attribute "file" take precedence if defined
    String file = conf.get("urlfilter.domain.file");
    String stringRules = conf.get("urlfilter.domain.rules");
    if (domainFile != null) {
      file = domainFile;
    } else if (attributeFile != null) {
      file = attributeFile;
    }

    Reader reader = null;
    if (stringRules != null) { // takes precedence over files
      reader = new StringReader(stringRules);
    } else if (file != null) {
      reader = conf.getConfResourceAsReader(file);
    } else {
      // neither inline rules nor a file name: there is nothing to open, and
      // new FileReader((String) null) below would throw an unchecked
      // NullPointerException that the IOException handler does not cover
      LOG.error("No domain rules and no domain file configured for plugin {}",
          pluginName);
      return;
    }

    try {
      if (reader == null) {
        // not available as a configuration resource, open it as a plain file
        reader = new FileReader(file);
      }
      readConfiguration(reader);
    } catch (IOException e) {
      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
  }

  /**
   * Returns the configuration last passed to {@link #setConf(Configuration)}.
   *
   * @return the current configuration, or null if none has been set
   */
  public Configuration getConf() {
    return this.conf;
  }

  /**
   * Checks a url against the loaded suffix, domain and host entries.
   *
   * @param url
   *          the url to check
   * @return the url unchanged if no entries are loaded or if its domain
   *         suffix, domain name or host is among the entries; null if nothing
   *         matches or if the url could not be examined
   */
  public String filter(String url) {
    // https://issues.apache.org/jira/browse/NUTCH-2189
    if (domainSet.isEmpty()) {
      return url;
    }

    try {
      // match for suffix, domain, and host in that order. more general will
      // override more specific
      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
      String host = URLUtil.getHost(url);
      String suffix = null;
      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
      if (domainSuffix != null) {
        suffix = domainSuffix.getDomain();
      }

      if (domainSet.contains(suffix) || domainSet.contains(domain)
          || domainSet.contains(host)) {
        return url;
      }

      // doesn't match, don't allow
      return null;
    } catch (Exception e) {
      // if an error happens, log it and reject the url
      LOG.error("Could not apply filter on url: " + url + "\n"
          + org.apache.hadoop.util.StringUtils.stringifyException(e));
      return null;
    }
  }
}