/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.net.urlnormalizer.regex; import java.net.URL; import java.net.MalformedURLException; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.ArrayList; import java.util.Iterator; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.nutch.net.URLNormalizer; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.util.NutchConfiguration; import javax.xml.parsers.*; import org.w3c.dom.*; import org.apache.oro.text.regex.*; /** * Allows users to do regex substitutions on all/any URLs that are encountered, * which is useful for stripping session IDs from URLs. * * <p>This class uses the <tt>urlnormalizer.regex.file</tt> property. * It should be set to the file name of an xml file which should contain the * patterns and substitutions to be done on encountered URLs. * </p> * <p>This class also supports different rules depending on the scope. Please see * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details.</p> * * @author Luke Baker * @author Andrzej Bialecki */ public class RegexURLNormalizer extends Configured implements URLNormalizer { private static final Log LOG = LogFactory.getLog(RegexURLNormalizer.class); /** * Class which holds a compiled pattern and its corresponding substition * string. */ private static class Rule { public Perl5Pattern pattern; public String substitution; } private HashMap scopedRules; private static final List EMPTY_RULES = Collections.EMPTY_LIST; private PatternMatcher matcher = new Perl5Matcher(); /** * The default constructor which is called from UrlNormalizerFactory * (normalizerClass.newInstance()) in method: getNormalizer()* */ public RegexURLNormalizer() { super(null); } public RegexURLNormalizer(Configuration conf) { super(conf); } /** * Constructor which can be passed the file name, so it doesn't look in the * configuration files for it. */ public RegexURLNormalizer(Configuration conf, String filename) throws IOException, MalformedPatternException { super(conf); List rules = readConfigurationFile(filename); if (rules != null) scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules); } public void setConf(Configuration conf) { super.setConf(conf); if (conf == null) return; // the default constructor was called if (this.scopedRules == null) { String filename = getConf().get("urlnormalizer.regex.file"); scopedRules = new HashMap(); URL url = getConf().getResource(filename); List rules = null; if (url == null) { LOG.warn("Can't load the default config file! " + filename); rules = EMPTY_RULES; } else { try { rules = readConfiguration(url.openStream()); } catch (Exception e) { LOG.warn("Couldn't read default config from '" + url + "': " + e); rules = EMPTY_RULES; } } scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules); } } // used in JUnit test. void setConfiguration(InputStream is, String scope) { List rules = readConfiguration(is); scopedRules.put(scope, rules); LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules."); } /** * This function does the replacements by iterating through all the regex * patterns. It accepts a string url as input and returns the altered string. */ public synchronized String regexNormalize(String urlString, String scope) { List curRules = (List)scopedRules.get(scope); if (curRules == null) { // try to populate String configFile = getConf().get("urlnormalizer.regex.file." + scope); if (configFile != null) { URL resource = getConf().getResource(configFile); LOG.debug("resource for scope '" + scope + "': " + resource); if (resource == null) { LOG.warn("Can't load resource for config file: " + configFile); } else { try { InputStream is = resource.openStream(); curRules = readConfiguration(resource.openStream()); scopedRules.put(scope, curRules); } catch (Exception e) { LOG.warn("Couldn't load resource '" + resource + "': " + e); } } } if (curRules == EMPTY_RULES || curRules == null) { LOG.warn("can't find rules for scope '" + scope + "', using default"); scopedRules.put(scope, EMPTY_RULES); } } if (curRules == EMPTY_RULES || curRules == null) { // use global rules curRules = (List)scopedRules.get(URLNormalizers.SCOPE_DEFAULT); } Iterator i = curRules.iterator(); while (i.hasNext()) { Rule r = (Rule) i.next(); urlString = Util.substitute(matcher, r.pattern, new Perl5Substitution( r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual // substitution } return urlString; } public synchronized String normalize(String urlString, String scope) throws MalformedURLException { return regexNormalize(urlString, scope); } /** Reads the configuration file and populates a List of Rules. */ private List readConfigurationFile(String filename) { if (LOG.isInfoEnabled()) { LOG.info("loading " + filename); } try { FileInputStream fis = new FileInputStream(filename); return readConfiguration(fis); } catch (Exception e) { LOG.fatal("Error loading rules from '" + filename + "': " + e); return EMPTY_RULES; } } private List readConfiguration(InputStream is) { Perl5Compiler compiler = new Perl5Compiler(); List rules = new ArrayList(); try { // borrowed heavily from code in Configuration.java Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() .parse(is); Element root = doc.getDocumentElement(); if ((!"regex-normalize".equals(root.getTagName())) && (LOG.isFatalEnabled())) { LOG.fatal("bad conf file: top-level element not <regex-normalize>"); } NodeList regexes = root.getChildNodes(); for (int i = 0; i < regexes.getLength(); i++) { Node regexNode = regexes.item(i); if (!(regexNode instanceof Element)) continue; Element regex = (Element) regexNode; if ((!"regex".equals(regex.getTagName())) && (LOG.isWarnEnabled())) { LOG.warn("bad conf file: element not <regex>"); } NodeList fields = regex.getChildNodes(); String patternValue = null; String subValue = null; for (int j = 0; j < fields.getLength(); j++) { Node fieldNode = fields.item(j); if (!(fieldNode instanceof Element)) continue; Element field = (Element) fieldNode; if ("pattern".equals(field.getTagName()) && field.hasChildNodes()) patternValue = ((Text) field.getFirstChild()).getData(); if ("substitution".equals(field.getTagName()) && field.hasChildNodes()) subValue = ((Text) field.getFirstChild()).getData(); if (!field.hasChildNodes()) subValue = ""; } if (patternValue != null && subValue != null) { Rule rule = new Rule(); rule.pattern = (Perl5Pattern) compiler.compile(patternValue); rule.substitution = subValue; rules.add(rule); } } } catch (Exception e) { if (LOG.isFatalEnabled()) { LOG.fatal("error parsing conf file: " + e); } return EMPTY_RULES; } if (rules.size() == 0) return EMPTY_RULES; return rules; } /** Spits out patterns and substitutions that are in the configuration file. */ public static void main(String args[]) throws MalformedPatternException, IOException { RegexURLNormalizer normalizer = new RegexURLNormalizer(); normalizer.setConf(NutchConfiguration.create()); Iterator i = ((List)normalizer.scopedRules.get(URLNormalizers.SCOPE_DEFAULT)).iterator(); System.out.println("* Rules for 'DEFAULT' scope:"); while (i.hasNext()) { Rule r = (Rule) i.next(); System.out.print(" " + r.pattern.getPattern() + " -> "); System.out.println(r.substitution); } // load the scope if (args.length > 1) { normalizer.normalize("http://test.com", args[1]); } if (normalizer.scopedRules.size() > 1) { Iterator it = normalizer.scopedRules.keySet().iterator(); while (it.hasNext()) { String scope = (String)it.next(); if (URLNormalizers.SCOPE_DEFAULT.equals(scope)) continue; System.out.println("* Rules for '" + scope + "' scope:"); i = ((List)normalizer.scopedRules.get(scope)).iterator(); while (i.hasNext()) { Rule r = (Rule) i.next(); System.out.print(" " + r.pattern.getPattern() + " -> "); System.out.println(r.substitution); } } } if (args.length > 0) { System.out.println("\n---------- Normalizer test -----------"); String scope = URLNormalizers.SCOPE_DEFAULT; if (args.length > 1) scope = args[1]; System.out.println("Scope: " + scope); System.out.println("Input url: '" + args[0] + "'"); System.out.println("Output url: '" + normalizer.normalize(args[0], scope) + "'"); } System.exit(0); } }