package focusedCrawler.crawler.crawlercommons.filters.basic; /** * Copyright 2016 Crawler-Commons * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.net.MalformedURLException; import java.net.URISyntaxException; import java.net.URL; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.Collections; import java.util.Comparator; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.http.NameValuePair; import org.apache.http.client.utils.URLEncodedUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Code borrowed from Apache Nutch. Converts URLs to a normal form: * <ul> * <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li> * <li>remove default ports, e.g. 80 for protocol <code>http://</code></li> * <li>normalize <a href= * "https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_in_a_URI"> * percent-encoding</a> in URL paths</li> * </ul> */ public class BasicURLNormalizer { public static final Logger LOG = LoggerFactory.getLogger(BasicURLNormalizer.class); /** * Pattern to detect whether a URL path could be normalized. Contains one of * /. or ./ /.. or ../ // */ private final static Pattern hasNormalizablePathPattern = Pattern.compile("/[./]|[.]/"); /** * Nutch 1098 - finds URL encoded parts of the URL */ private final static Pattern unescapeRulePattern = Pattern.compile("%([0-9A-Fa-f]{2})"); // charset used for encoding URLs before escaping private final static Charset utf8 = Charset.forName("UTF-8"); private static final Pattern thirtytwobithash = Pattern.compile("[a-fA-F\\d]{32}"); /** look-up table for characters which should not be escaped in URL paths */ private final static boolean[] unescapedCharacters = new boolean[128]; private final static Comparator<NameValuePair> parametersComparator = new Comparator<NameValuePair>() { @Override public int compare(NameValuePair p1, NameValuePair p2) { return p1.getName().compareTo(p2.getName()); } }; static { for (int c = 0; c < 128; c++) { /* * https://tools.ietf.org/html/rfc3986#section-2.2 For consistency, * percent-encoded octets in the ranges of ALPHA (%41-%5A and * %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), underscore * (%5F), or tilde (%7E) should not be created by URI producers and, * when found in a URI, should be decoded to their corresponding * unreserved characters by URI normalizers. */ if ((0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) || (0x30 <= c && c <= 0x39) || c == 0x2D || c == 0x2E || c == 0x5F || c == 0x7E) { unescapedCharacters[c] = true; } else { unescapedCharacters[c] = false; } } } final Set<String> queryElementsToRemove; final boolean removeHashes; public BasicURLNormalizer() { this(new TreeSet<>(), false); } public BasicURLNormalizer(Set<String> queryElementsToRemove, boolean removeHashes) { this.queryElementsToRemove = new TreeSet<>(queryElementsToRemove); this.removeHashes = removeHashes; } public String filter(String urlString) { if ("".equals(urlString)) // permit empty return urlString; urlString = urlString.trim(); // remove extra spaces urlString = processQueryElements(urlString); URL url = null; try { url = new URL(urlString); } catch (MalformedURLException e) { LOG.info("Malformed URL {}", urlString); return null; } String protocol = url.getProtocol(); String host = url.getHost(); int port = url.getPort(); String file = url.getFile(); boolean changed = false; if (!urlString.startsWith(protocol)) // protocol was lowercased changed = true; if ("http".equals(protocol) || "https".equals(protocol) || "ftp".equals(protocol)) { if (host != null) { String newHost = host.toLowerCase(Locale.ROOT); // lowercase // host if (!host.equals(newHost)) { host = newHost; changed = true; } } if (port == url.getDefaultPort()) { // uses default port port = -1; // so don't specify it changed = true; } if (file == null || "".equals(file)) { // add a slash file = "/"; changed = true; } if (url.getRef() != null) { // remove the ref changed = true; } // check for unnecessary use of "/../", "/./", and "//" String file2 = null; try { file2 = getFileWithNormalizedPath(url); } catch (MalformedURLException e) { LOG.info("Malformed URL {}", url); return null; } if (!file.equals(file2)) { changed = true; file = file2; } } // properly encode characters in path/file using percent-encoding String file2 = unescapePath(file); file2 = escapePath(file2); if (!file.equals(file2)) { changed = true; file = file2; } if (changed) { try { urlString = new URL(protocol, host, port, file).toString(); } catch (MalformedURLException e) { LOG.info("Malformed URL {}{}{}{}", protocol, host, port, file); return null; } } return urlString; } /** * Basic filter to remove query parameters from urls so parameters that * don't change the content of the page can be removed. An example would be * a google analytics query parameter like "utm_campaign" which might have * several different values for a url that points to the same content. This * is also called when removing attributes where the value is a hash. */ private String processQueryElements(String urlToFilter) { try { // Handle illegal characters by making a url first // this will clean illegal characters like | URL url = new URL(urlToFilter); String path = url.getPath(); String query = url.getQuery(); // check if the last element of the path contains parameters // if so convert them to query elements if (path.contains(";")) { String[] pathElements = path.split("/"); String last = pathElements[pathElements.length - 1]; // replace last value by part without params int semicolon = last.indexOf(";"); if (semicolon != -1) { pathElements[pathElements.length - 1] = last.substring(0, semicolon); String params = last.substring(semicolon + 1).replaceAll( ";", "&"); if (query == null) { query = params; } else { query += "&" + params; } // rebuild the path StringBuilder newPath = new StringBuilder(); for (String p : pathElements) { if (StringUtils.isNotBlank(p)) { newPath.append("/").append(p); } } path = newPath.toString(); } } if (StringUtils.isEmpty(query)) { return urlToFilter; } List<NameValuePair> pairs = URLEncodedUtils.parse(query, StandardCharsets.UTF_8); Iterator<NameValuePair> pairsIterator = pairs.iterator(); while (pairsIterator.hasNext()) { NameValuePair param = pairsIterator.next(); if (queryElementsToRemove.contains(param.getName())) { pairsIterator.remove(); } else if (removeHashes && param.getValue() != null) { Matcher m = thirtytwobithash.matcher(param.getValue()); if (m.matches()) { pairsIterator.remove(); } } } StringBuilder newFile = new StringBuilder(); if (StringUtils.isNotBlank(path)) { newFile.append(path); } if (!pairs.isEmpty()) { Collections.sort(pairs, parametersComparator); String newQueryString = URLEncodedUtils.format(pairs, StandardCharsets.UTF_8); newFile.append('?').append(newQueryString); } if (url.getRef() != null) { newFile.append('#').append(url.getRef()); } return new URL(url.getProtocol(), url.getHost(), url.getPort(), newFile.toString()).toString(); } catch (MalformedURLException e) { LOG.warn("Invalid urlToFilter {}. {}", urlToFilter, e); return null; } } private String getFileWithNormalizedPath(URL url) throws MalformedURLException { String file; if (hasNormalizablePathPattern.matcher(url.getPath()).find()) { // only normalize the path if there is something to normalize // to avoid needless work try { file = url.toURI().normalize().toURL().getFile(); // URI.normalize() does not normalize leading dot segments, // see also http://tools.ietf.org/html/rfc3986#section-5.2.4 int start = 0; while (file.startsWith("/../", start)) { start += 3; } if (start > 0) { file = file.substring(start); } } catch (URISyntaxException e) { file = url.getFile(); } } else { file = url.getFile(); } // if path is empty return a single slash if (file.isEmpty()) { file = "/"; } return file; } /** * Remove % encoding from path segment in URL for characters which should be * unescaped according to <a * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>. */ private String unescapePath(String path) { StringBuilder sb = new StringBuilder(); Matcher matcher = unescapeRulePattern.matcher(path); int end = -1; int letter; // Traverse over all encoded groups while (matcher.find()) { // Append everything up to this group sb.append(path.substring(end + 1, matcher.start())); // Get the integer representation of this hexadecimal encoded // character letter = Integer.valueOf(matcher.group().substring(1), 16); if (letter < 128 && unescapedCharacters[letter]) { // character should be unescaped in URLs sb.append(new Character((char) letter)); } else { // Append the encoded character as uppercase sb.append(matcher.group().toUpperCase(Locale.ROOT)); } end = matcher.start() + 2; } letter = path.length(); // Append the rest if there's anything if (end <= letter - 1) { sb.append(path.substring(end + 1, letter)); } // Ok! return sb.toString(); } /** * Convert path segment of URL from Unicode to UTF-8 and escape all * characters which should be escaped according to <a * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.. */ private String escapePath(String path) { StringBuilder sb = new StringBuilder(path.length()); // Traverse over all bytes in this URL for (byte b : path.getBytes(utf8)) { // Is this a control character? if (b < 33 || b == 91 || b == 93) { // Start escape sequence sb.append('%'); // Get this byte's hexadecimal representation String hex = Integer.toHexString(b & 0xFF).toUpperCase(Locale.ROOT); // Do we need to prepend a zero? if (hex.length() % 2 != 0) { sb.append('0'); sb.append(hex); } else { // No, append this hexadecimal representation sb.append(hex); } } else { // No, just append this character as-is sb.append((char) b); } } return sb.toString(); } }