/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.net.urlnormalizer.basic;

import java.net.URL;
import java.net.MalformedURLException;
import java.util.Locale;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

// Nutch imports
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.util.LogUtil;

import org.apache.hadoop.conf.Configuration;
import org.apache.oro.text.regex.*;

/**
 * Converts URLs to a normal form.
 *
 * <p>For <code>http</code> and <code>ftp</code> URLs the host is lowercased,
 * an explicitly given default port is dropped, an empty file becomes
 * <code>"/"</code>, the reference (fragment) is removed, and the path is
 * cleaned of <code>"/xx/../"</code> sequences, leading <code>"/../"</code>
 * and runs of adjacent slashes. The query string is left untouched. URLs
 * with any other protocol are rebuilt only when the protocol reported by
 * {@link URL#getProtocol()} is not a prefix of the given string.</p>
 */
public class BasicURLNormalizer implements URLNormalizer {
  public static final Log LOG = LogFactory.getLog(BasicURLNormalizer.class);

  private Perl5Compiler compiler = new Perl5Compiler();

  // One matcher per thread, so a single normalizer instance can be called
  // from several threads without sharing matcher state.
  private ThreadLocal matchers = new ThreadLocal() {
      protected synchronized Object initialValue() {
        return new Perl5Matcher();
      }
    };

  private Rule relativePathRule = null;
  private Rule leadingRelativePathRule = null;
  private Rule adjacentSlashRule = null;

  private Configuration conf;

  /**
   * Compiles the three path rewriting rules.
   *
   * @throws RuntimeException wrapping the {@link MalformedPatternException}
   *         if one of the built-in patterns fails to compile
   */
  public BasicURLNormalizer() {
    try {
      // this pattern tries to find spots like "/xx/../" in the url, which
      // could be replaced by "/" xx consists of chars, different then "/"
      // (slash) and needs to have at least one char different from "."
      relativePathRule = createRule("(/[^/]*[^/.]{1}[^/]*/\\.\\./)");

      // this pattern tries to find spots like leading "/../" in the url,
      // which could be replaced by "/"
      leadingRelativePathRule = createRule("^(/\\.\\./)+");

      // this pattern tries to find spots like "xx//yy" in the url,
      // which could be replaced by a "/"
      adjacentSlashRule = createRule("/{2,}");

    } catch (MalformedPatternException e) {
      e.printStackTrace(LogUtil.getWarnStream(LOG));
      throw new RuntimeException(e);
    }
  }

  /**
   * Builds a rule that replaces matches of <code>regex</code> by a single
   * slash.
   *
   * @param regex Perl5 expression, compiled read-only
   * @return the rule holding the compiled pattern and the "/" substitution
   * @throws MalformedPatternException if <code>regex</code> does not compile
   */
  private Rule createRule(String regex) throws MalformedPatternException {
    Rule rule = new Rule();
    rule.pattern = (Perl5Pattern)
      compiler.compile(regex, Perl5Compiler.READ_ONLY_MASK);
    rule.substitution = new Perl5Substitution("/");
    return rule;
  }

  /**
   * Normalizes a URL string.
   *
   * @param urlString the URL to normalize; the empty string is returned
   *        as is, anything else is trimmed before parsing
   * @param scope not used by this implementation
   * @return the normalized URL, or the trimmed input if nothing had to be
   *         changed
   * @throws MalformedURLException if the trimmed string cannot be parsed
   *         by {@link URL}
   */
  public String normalize(String urlString, String scope)
      throws MalformedURLException {
    if ("".equals(urlString))                     // permit empty
      return urlString;

    urlString = urlString.trim();                 // remove extra spaces

    URL url = new URL(urlString);

    String protocol = url.getProtocol();
    String host = url.getHost();
    int port = url.getPort();
    String file = url.getFile();

    boolean changed = false;

    if (!urlString.startsWith(protocol))          // protocol was lowercased
      changed = true;

    if ("http".equals(protocol) || "ftp".equals(protocol)) {

      if (host != null) {
        // Fixed locale: with the default locale a Turkish JVM would turn
        // "I" into a dotless "i" and produce a different host name.
        String newHost = host.toLowerCase(Locale.ENGLISH);
        if (!host.equals(newHost)) {
          host = newHost;
          changed = true;
        }
      }

      if (port == url.getDefaultPort()) {         // uses default port
        port = -1;                                // so don't specify it
        changed = true;
      }

      if (file == null || "".equals(file)) {      // add a slash
        file = "/";
        changed = true;
      }

      // The rebuilt URL below is made of protocol, host, port and file
      // only, so flagging the change is enough to remove the ref.
      if (url.getRef() != null) {                 // remove the ref
        changed = true;
      }

      // check for unnecessary use of "/../" and "//", but only inside the
      // path: getFile() also carries the query string, and rewriting
      // e.g. "?u=http://x//y" would alter the value sent to the server.
      int queryStart = file.indexOf('?');
      String path = (queryStart < 0) ? file : file.substring(0, queryStart);
      String query = (queryStart < 0) ? "" : file.substring(queryStart);

      String normalizedPath = substituteUnnecessaryRelativePaths(path);
      if (!path.equals(normalizedPath)) {
        changed = true;
        file = normalizedPath + query;
      }
    }

    if (changed)
      urlString = new URL(protocol, host, port, file).toString();

    return urlString;
  }

  /**
   * Repeatedly applies the three path rules until the string stops
   * shrinking.
   *
   * @param file the path to clean up
   * @return the path without "/xx/../", leading "/../" or adjacent slashes
   */
  private String substituteUnnecessaryRelativePaths(String file) {
    String fileWorkCopy = file;
    int oldLen = file.length();
    int newLen = oldLen - 1;                      // forces the first pass

    // All substitutions will be done step by step, to ensure that certain
    // constellations will be normalized, too
    //
    // For example: "/aa/bb/../../cc/../foo.html" will be normalized in the
    // following manner:
    //   "/aa/bb/../../cc/../foo.html"
    //   "/aa/../cc/../foo.html"
    //   "/cc/../foo.html"
    //   "/foo.html"
    //
    // The normalization also takes care of leading "/../", which will be
    // replaced by "/", because this is rather a sign of bad webserver
    // configuration than of a wanted link.  For example, urls like
    // "http://www.foo.com/../" should return a http 404 error instead of
    // redirecting to "http://www.foo.com".
    //
    Perl5Matcher matcher = (Perl5Matcher) matchers.get();

    // Every rule replaces its match by the shorter "/", so an unchanged
    // length means that no rule matched during the last pass.
    while (oldLen != newLen) {
      oldLen = fileWorkCopy.length();

      // substitute first occurrence of "/xx/../" by "/"
      fileWorkCopy = Util.substitute
        (matcher, relativePathRule.pattern,
         relativePathRule.substitution, fileWorkCopy, 1);

      // remove leading "/../"
      fileWorkCopy = Util.substitute
        (matcher, leadingRelativePathRule.pattern,
         leadingRelativePathRule.substitution, fileWorkCopy, 1);

      // collapse adjacent slashes with "/"
      fileWorkCopy = Util.substitute
        (matcher, adjacentSlashRule.pattern,
         adjacentSlashRule.substitution, fileWorkCopy, 1);

      newLen = fileWorkCopy.length();
    }

    return fileWorkCopy;
  }

  /**
   * Class which holds a compiled pattern and its corresponding substitution
   * string.
   */
  private static class Rule {
    public Perl5Pattern pattern;
    public Perl5Substitution substitution;
  }

  /** Stores the configuration handed in by the framework. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /** Returns the configuration previously passed to {@link #setConf}. */
  public Configuration getConf() {
    return this.conf;
  }

}