/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.urlfilter.api; // JDK imports import java.io.Reader; import java.io.FileReader; import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.IOException; import java.util.List; import java.util.ArrayList; // Commons Logging imports import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; // Hadoop imports import org.apache.hadoop.conf.Configuration; // Nutch imports import org.apache.nutch.net.*; /** * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on * regular expressions. * * <p>The regular expressions rules are expressed in a file. The file of rules * is provided by each implementation using the * {@link #getRulesFile(Configuration)} method.</p> * * <p>The format of this file is made of many rules (one per line):<br/> * <code> * [+-]<regex> * </code><br/> * where plus (<code>+</code>)means go ahead and index it and minus * (<code>-</code>)means no.</p> * * @author Jérôme Charron */ public abstract class RegexURLFilterBase implements URLFilter { /** My logger */ private final static Log LOG = LogFactory.getLog(RegexURLFilterBase.class); /** An array of applicable rules */ private RegexRule[] rules; /** The current configuration */ private Configuration conf; /** * Constructs a new empty RegexURLFilterBase */ public RegexURLFilterBase() { } /** * Constructs a new RegexURLFilter and init it with a file of rules. * @param filename is the name of rules file. */ public RegexURLFilterBase(String filename) throws IOException, IllegalArgumentException { this(new FileReader(filename)); } /** * Constructs a new RegexURLFilter and init it with a Reader of rules. * @param reader is a reader of rules. */ protected RegexURLFilterBase(Reader reader) throws IOException, IllegalArgumentException { rules = readRulesFile(reader); } /** * Creates a new {@link RegexRule}. * @param sign of the regular expression. * A <code>true</code> value means that any URL matching this rule * must be included, whereas a <code>false</code> * value means that any URL matching this rule must be excluded. * @param regex is the regular expression associated to this rule. */ protected abstract RegexRule createRule(boolean sign, String regex); /** * Returns the name of the file of rules to use for * a particular implementation. * @param conf is the current configuration. * @return the name of the file of rules to use. */ protected abstract String getRulesFile(Configuration conf); /* -------------------------- * * <implementation:URLFilter> * * -------------------------- */ // Inherited Javadoc public synchronized String filter(String url) { for (int i=0; i<rules.length; i++) { if (rules[i].match(url)) { return rules[i].accept() ? url : null; } }; return null; } /* --------------------------- * * </implementation:URLFilter> * * --------------------------- */ /* ----------------------------- * * <implementation:Configurable> * * ----------------------------- */ public void setConf(Configuration conf) { this.conf = conf; String file = getRulesFile(conf); Reader reader = conf.getConfResourceAsReader(file); if (reader == null) { if (LOG.isFatalEnabled()) { LOG.fatal("Can't find resource: " + file); } } else { try { rules = readRulesFile(reader); } catch (IOException e) { if (LOG.isFatalEnabled()) { LOG.fatal(e.getMessage()); } //TODO mb@media-style.com: throw Exception? Because broken api. throw new RuntimeException(e.getMessage(), e); } } } public Configuration getConf() { return this.conf; } /* ------------------------------ * * </implementation:Configurable> * * ------------------------------ */ /** * Read the specified file of rules. * @param reader is a reader of regular expressions rules. * @return the corresponding {@RegexRule rules}. */ private RegexRule[] readRulesFile(Reader reader) throws IOException, IllegalArgumentException { BufferedReader in = new BufferedReader(reader); List rules = new ArrayList(); String line; while((line=in.readLine())!=null) { if (line.length() == 0) { continue; } char first=line.charAt(0); boolean sign=false; switch (first) { case '+' : sign=true; break; case '-' : sign=false; break; case ' ' : case '\n' : case '#' : // skip blank & comment lines continue; default : throw new IOException("Invalid first character: "+line); } String regex = line.substring(1); if (LOG.isTraceEnabled()) { LOG.trace("Adding rule [" + regex + "]"); } RegexRule rule = createRule(sign, regex); rules.add(rule); } return (RegexRule[]) rules.toArray(new RegexRule[rules.size()]); } /** * Filter the standard input using a RegexURLFilterBase. * @param filter is the RegexURLFilterBase to use for filtering the * standard input. * @param args some optional parameters (not used). */ public static void main(RegexURLFilterBase filter, String args[]) throws IOException, IllegalArgumentException { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String line; while((line=in.readLine())!=null) { String out = filter.filter(line); if (out!=null) { System.out.print("+"); System.out.println(out); } else { System.out.print("-"); System.out.println(line); } } } }