/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.canonicalize;
import java.util.regex.Pattern;
/**
 * Canonicalization rule that removes a leading {@code www.} host label from
 * http/https URLs, but only when the URL carries some path/query content
 * after the third slash.
 *
 * <p>The intent is to equate, for example,
 * {@code http://www.archive.org/index.html} with
 * {@code http://archive.org/index.html}, yielding the latter. Top-level
 * "slash page" URIs (nothing after the third slash) are deliberately left
 * alone: crawling a redundant top page is preferred over missing a whole
 * site that is reachable under only one of the www-full / www-less
 * hostnames.
 *
 * <p>Only the http and https schemes are affected. For stripping numbered
 * variants as well ({@code www01}, {@code www02}, ...), the more general
 * StripWWWNRule is the rule to use.
 *
 * @author stack
 * @version $Date$, $Revision$
 */
public class StripWWWRule extends BaseRule {
    private static final long serialVersionUID = 3L;

    /**
     * Case-insensitive pattern with two capturing groups: group 1 is the
     * {@code http://} or {@code https://} prefix; group 2 is the remainder
     * of the host, the third slash, and at least one following character.
     * The {@code www.} between them is matched by a non-capturing group.
     */
    private static final Pattern REGEX =
        Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$");

    /** Creates the rule; there is no per-instance configuration. */
    public StripWWWRule() {
        super();
    }

    /**
     * Applies the www-stripping pattern to the given URL.
     *
     * @param url the URL string to canonicalize
     * @return whatever the inherited {@code doStripRegexMatch} produces for
     *         this URL and pattern -- presumably the URL rebuilt from the
     *         two capturing groups on a match and the input unchanged
     *         otherwise (confirm against BaseRule)
     */
    public String canonicalize(String url) {
        // The inherited helper takes the regex as a String, not a Pattern.
        final String expression = REGEX.pattern();
        return doStripRegexMatch(url, expression);
    }
}