/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.urlfilter.api; // JDK imports import java.lang.invoke.MethodHandles; import java.io.File; import java.io.Reader; import java.io.FileReader; import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.IOException; import java.io.StringReader; import java.net.MalformedURLException; import java.util.List; import java.util.ArrayList; // Commons Logging imports import org.slf4j.Logger; import org.slf4j.LoggerFactory; // Hadoop imports import org.apache.hadoop.conf.Configuration; // Nutch imports import org.apache.nutch.net.*; import org.apache.nutch.util.URLUtil; /** * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular * expressions. * * <p> * The regular expressions rules are expressed in a file. The file of rules is * determined for each implementation using the * {@link #getRulesReader(Configuration conf)} method. * </p> * * <p> * The format of this file is made of many rules (one per line):<br> * <code> * [+-]<regex> * </code><br> * where plus (<code>+</code>)means go ahead and index it and minus ( * <code>-</code>)means no. * </p> * * @author Jérôme Charron */ public abstract class RegexURLFilterBase implements URLFilter { /** My logger */ private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); /** An array of applicable rules */ private List<RegexRule> rules; /** The current configuration */ private Configuration conf; /** * Constructs a new empty RegexURLFilterBase */ public RegexURLFilterBase() { } /** * Constructs a new RegexURLFilter and init it with a file of rules. * * @param filename * is the name of rules file. */ public RegexURLFilterBase(File filename) throws IOException, IllegalArgumentException { this(new FileReader(filename)); } /** * Constructs a new RegexURLFilter and inits it with a list of rules. * * @param rules * string with a list of rules, one rule per line * @throws IOException * @throws IllegalArgumentException */ public RegexURLFilterBase(String rules) throws IOException, IllegalArgumentException { this(new StringReader(rules)); } /** * Constructs a new RegexURLFilter and init it with a Reader of rules. * * @param reader * is a reader of rules. */ protected RegexURLFilterBase(Reader reader) throws IOException, IllegalArgumentException { rules = readRules(reader); } /** * Creates a new {@link RegexRule}. * * @param sign * of the regular expression. A <code>true</code> value means that * any URL matching this rule must be included, whereas a * <code>false</code> value means that any URL matching this rule * must be excluded. * @param regex * is the regular expression associated to this rule. */ protected abstract RegexRule createRule(boolean sign, String regex); /** * Creates a new {@link RegexRule}. * @param * sign of the regular expression. * A <code>true</code> value means that any URL matching this rule * must be included, whereas a <code>false</code> * value means that any URL matching this rule must be excluded. * @param regex * is the regular expression associated to this rule. * @param hostOrDomain * the host or domain to which this regex belongs */ protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain); /** * Returns the name of the file of rules to use for a particular * implementation. * * @param conf * is the current configuration. * @return the name of the resource containing the rules to use. */ protected abstract Reader getRulesReader(Configuration conf) throws IOException; /* * -------------------------- * <implementation:URLFilter> * * -------------------------- */ // Inherited Javadoc public String filter(String url) { String host = URLUtil.getHost(url); String domain = null; try { domain = URLUtil.getDomainName(url); } catch (MalformedURLException e) { // shouldnt happen here right? } if (LOG.isDebugEnabled()) { LOG.debug("URL belongs to host " + host + " and domain " + domain); } for (RegexRule rule : rules) { // Skip the skip for rules that don't share the same host and domain if (rule.hostOrDomain() != null && !rule.hostOrDomain().equals(host) && !rule.hostOrDomain().equals(domain)) { if (LOG.isDebugEnabled()) { LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain()); } continue; } if (LOG.isDebugEnabled()) { LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain); } if (rule.match(url)) { return rule.accept() ? url : null; } } ; return null; } /* * --------------------------- * </implementation:URLFilter> * * --------------------------- */ /* * ----------------------------- * <implementation:Configurable> * * ----------------------------- */ public void setConf(Configuration conf) { this.conf = conf; Reader reader = null; try { reader = getRulesReader(conf); } catch (Exception e) { if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); } throw new RuntimeException(e.getMessage(), e); } try { rules = readRules(reader); } catch (IOException e) { if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); } throw new RuntimeException(e.getMessage(), e); } } public Configuration getConf() { return this.conf; } /* * ------------------------------ * </implementation:Configurable> * * ------------------------------ */ /** * Read the specified file of rules. * * @param reader * is a reader of regular expressions rules. * @return the corresponding {@RegexRule rules}. */ private List<RegexRule> readRules(Reader reader) throws IOException, IllegalArgumentException { BufferedReader in = new BufferedReader(reader); List<RegexRule> rules = new ArrayList<RegexRule>(); String line; String hostOrDomain = null; while ((line = in.readLine()) != null) { if (line.length() == 0) { continue; } char first = line.charAt(0); boolean sign = false; switch (first) { case '+': sign = true; break; case '-': sign = false; break; case ' ': case '\n': case '#': // skip blank & comment lines continue; case '>': hostOrDomain = line.substring(1).trim(); continue; case '<': hostOrDomain = null; continue; default: throw new IOException("Invalid first character: " + line); } String regex = line.substring(1); if (LOG.isTraceEnabled()) { LOG.trace("Adding rule [" + regex + "] for " + hostOrDomain); } RegexRule rule = createRule(sign, regex, hostOrDomain); rules.add(rule); } return rules; } /** * Filter the standard input using a RegexURLFilterBase. * * @param filter * is the RegexURLFilterBase to use for filtering the standard input. * @param args * some optional parameters (not used). */ public static void main(RegexURLFilterBase filter, String args[]) throws IOException, IllegalArgumentException { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String line; while ((line = in.readLine()) != null) { String out = filter.filter(line); if (out != null) { System.out.print("+"); System.out.println(out); } else { System.out.print("-"); System.out.println(line); } } } }