/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.filtering.basic;

import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.utils.URLEncodedUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.filtering.URLFilter;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;

/**
 * URL filter which rewrites a URL into a normalised form. Depending on the
 * configuration it removes the anchor part, repairs query strings starting
 * with an ampersand, drops selected query parameters or parameters whose
 * value is a 32 character hexadecimal string, lowercases the protocol and
 * host, normalises the percent-encoding of the path and finally checks that
 * the result is a valid {@link URI}. {@link #filter} returns
 * <code>null</code> when the URL cannot be parsed or is not a valid URI.
 */
public class BasicURLNormalizer implements URLFilter {

    private static final Logger LOG = LoggerFactory
            .getLogger(BasicURLNormalizer.class);

    /**
     * Nutch 1098 - finds URL encoded parts of the URL
     */
    private static final Pattern unescapeRulePattern = Pattern
            .compile("%([0-9A-Fa-f]{2})");

    /**
     * Finds non-standard <code>%uXXXX</code> escapes, see
     * https://github.com/DigitalPebble/storm-crawler/issues/401
     **/
    private static final Pattern illegalEscapePattern = Pattern
            .compile("%u([0-9A-Fa-f]{4})");

    // charset used for encoding URLs before escaping
    private static final Charset utf8 = StandardCharsets.UTF_8;

    /** look-up table for characters which should not be escaped in URL paths */
    private static final boolean[] unescapedCharacters = new boolean[128];

    /** matches a value made of exactly 32 hexadecimal characters */
    private static final Pattern thirtytwobithash = Pattern
            .compile("[a-fA-F\\d]{32}");

    static {
        for (int c = 0; c < 128; c++) {
            /*
             * https://tools.ietf.org/html/rfc3986#section-2.2 For consistency,
             * percent-encoded octets in the ranges of ALPHA (%41-%5A and
             * %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), underscore
             * (%5F), or tilde (%7E) should not be created by URI producers and,
             * when found in a URI, should be decoded to their corresponding
             * unreserved characters by URI normalizers.
             */
            unescapedCharacters[c] = (0x41 <= c && c <= 0x5A)
                    || (0x61 <= c && c <= 0x7A)
                    || (0x30 <= c && c <= 0x39) || c == 0x2D || c == 0x2E
                    || c == 0x5F || c == 0x7E;
        }
    }

    /** whether the <code>#anchor</code> part of the URL is removed */
    boolean removeAnchorPart = true;

    /** whether a query string starting with '&amp;' is repaired */
    boolean unmangleQueryString = true;

    /** whether the normalised URL must be parseable as a {@link URI} */
    boolean checkValidURI = true;

    /** whether query parameters with a 32 hex character value are removed */
    boolean removeHashes = false;

    /** names of the query parameters which are removed from the URL */
    final Set<String> queryElementsToRemove = new TreeSet<>();

    /**
     * Normalises a URL according to the configuration of this filter.
     *
     * @param sourceUrl
     *            URL of the page the link was found in; not used here
     * @param sourceMetadata
     *            metadata of the source page; not used here
     * @param urlToFilter
     *            the URL to normalise, leading and trailing whitespace is
     *            trimmed
     * @return the normalised URL or <code>null</code> if the URL is malformed
     *         or, when <code>checkValidURI</code> is set, is not a valid URI
     */
    @Override
    public String filter(URL sourceUrl, Metadata sourceMetadata,
            String urlToFilter) {

        urlToFilter = urlToFilter.trim();

        // kept for logging only
        final String originalURL = urlToFilter;

        if (removeAnchorPart) {
            try {
                URL theURL = new URL(urlToFilter);
                String anchor = theURL.getRef();
                if (anchor != null) {
                    urlToFilter = urlToFilter.replace("#" + anchor, "");
                }
            } catch (MalformedURLException e) {
                return null;
            }
        }

        if (unmangleQueryString) {
            urlToFilter = unmangleQueryString(urlToFilter);
        }

        if (!queryElementsToRemove.isEmpty() || removeHashes) {
            urlToFilter = processQueryElements(urlToFilter);
            // processQueryElements signals an unparsable URL with null; stop
            // here instead of relying on new URL(null) failing further down
            if (urlToFilter == null) {
                return null;
            }
        }

        try {
            URL theURL = new URL(urlToFilter);
            String file = theURL.getFile();
            String protocol = theURL.getProtocol();
            String host = theURL.getHost();

            boolean hasChanged = false;

            // getProtocol() is lowercased: a mismatch with the start of the
            // string means the protocol has to be rewritten
            if (!urlToFilter.startsWith(protocol)) {
                hasChanged = true;
            }

            if (host != null) {
                String newHost = host.toLowerCase(Locale.ROOT);
                if (!host.equals(newHost)) {
                    host = newHost;
                    hasChanged = true;
                }
            }

            int port = theURL.getPort();

            // properly encode characters in path/file using percent-encoding
            String file2 = unescapePath(file);
            file2 = escapePath(file2);
            if (!file.equals(file2)) {
                hasChanged = true;
            }

            if (hasChanged) {
                // getFile() does not include the fragment: put it back so
                // that an anchor which was deliberately kept
                // (removeAnchorPart == false) is not lost as a side effect of
                // the rewrite, as done in processQueryElements
                String ref = theURL.getRef();
                if (ref != null) {
                    file2 = file2 + "#" + ref;
                }
                urlToFilter = new URL(protocol, host, port, file2).toString();
            }
        } catch (MalformedURLException e) {
            return null;
        }

        if (checkValidURI) {
            try {
                URI uri = URI.create(urlToFilter);
                urlToFilter = uri.normalize().toString();
            } catch (java.lang.IllegalArgumentException e) {
                LOG.info("Invalid URI {} from {} ", urlToFilter, originalURL);
                return null;
            }
        }

        return urlToFilter;
    }

    /**
     * Reads the optional settings <code>removeAnchorPart</code>,
     * <code>unmangleQueryString</code>, <code>queryElementsToRemove</code>
     * (an array of parameter names), <code>checkValidURI</code> and
     * <code>removeHashes</code>. Settings which are absent keep their current
     * value.
     *
     * @param stormConf
     *            the topology configuration; not used here
     * @param paramNode
     *            the JSON parameters of this filter; <code>null</code> leaves
     *            the defaults untouched
     */
    @Override
    public void configure(Map stormConf, JsonNode paramNode) {
        if (paramNode == null) {
            return;
        }

        JsonNode node = paramNode.get("removeAnchorPart");
        if (node != null) {
            removeAnchorPart = node.booleanValue();
        }

        node = paramNode.get("unmangleQueryString");
        if (node != null) {
            unmangleQueryString = node.booleanValue();
        }

        node = paramNode.get("queryElementsToRemove");
        if (node != null) {
            if (!node.isArray()) {
                LOG.warn(
                        "Failed to configure queryElementsToRemove. Not an array: {}",
                        node.toString());
            } else {
                ArrayNode array = (ArrayNode) node;
                for (JsonNode element : array) {
                    queryElementsToRemove.add(element.asText());
                }
            }
        }

        node = paramNode.get("checkValidURI");
        if (node != null) {
            checkValidURI = node.booleanValue();
        }

        node = paramNode.get("removeHashes");
        if (node != null) {
            removeHashes = node.booleanValue();
        }
    }

    /**
     * Basic filter to remove query parameters from urls so parameters that
     * don't change the content of the page can be removed. An example would be
     * a google analytics query parameter like "utm_campaign" which might have
     * several different values for a url that points to the same content. This
     * is also called when removing attributes where the value is a hash.
     * Parameters found after a ';' in the last path segment are converted to
     * query parameters and the remaining parameters are sorted by name.
     *
     * @param urlToFilter
     *            the URL to process
     * @return the rewritten URL, the input unchanged if it has no parameters,
     *         or <code>null</code> if it cannot be parsed
     */
    private String processQueryElements(String urlToFilter) {
        try {
            // Handle illegal characters by making a url first
            // this will clean illegal characters like |
            URL url = new URL(urlToFilter);

            String query = url.getQuery();
            String path = url.getPath();

            // check if the last element of the path contains parameters
            // if so convert them to query elements
            if (path.contains(";")) {
                String[] pathElements = path.split("/");
                int lastIndex = pathElements.length - 1;
                String last = pathElements[lastIndex];
                // replace last value by part without params
                int semicolon = last.indexOf(";");
                if (semicolon != -1) {
                    pathElements[lastIndex] = last.substring(0, semicolon);
                    String params = last.substring(semicolon + 1).replaceAll(
                            ";", "&");
                    if (query == null) {
                        query = params;
                    } else {
                        query += "&" + params;
                    }
                    // rebuild the path, blank segments are skipped
                    StringBuilder newPath = new StringBuilder();
                    for (String p : pathElements) {
                        if (StringUtils.isNotBlank(p)) {
                            newPath.append("/").append(p);
                        }
                    }
                    path = newPath.toString();
                }
            }

            if (StringUtils.isEmpty(query)) {
                return urlToFilter;
            }

            List<NameValuePair> pairs = URLEncodedUtils.parse(query,
                    StandardCharsets.UTF_8);
            Iterator<NameValuePair> pairsIterator = pairs.iterator();
            while (pairsIterator.hasNext()) {
                NameValuePair param = pairsIterator.next();
                if (queryElementsToRemove.contains(param.getName())) {
                    pairsIterator.remove();
                } else if (removeHashes && param.getValue() != null) {
                    Matcher m = thirtytwobithash.matcher(param.getValue());
                    if (m.matches()) {
                        pairsIterator.remove();
                    }
                }
            }

            StringBuilder newFile = new StringBuilder();
            if (StringUtils.isNotBlank(path)) {
                newFile.append(path);
            }
            if (!pairs.isEmpty()) {
                Collections.sort(pairs, comp);
                String newQueryString = URLEncodedUtils.format(pairs,
                        StandardCharsets.UTF_8);
                newFile.append('?').append(newQueryString);
            }
            if (url.getRef() != null) {
                newFile.append('#').append(url.getRef());
            }

            return new URL(url.getProtocol(), url.getHost(), url.getPort(),
                    newFile.toString()).toString();
        } catch (MalformedURLException e) {
            LOG.warn("Invalid urlToFilter {}. {}", urlToFilter, e);
            return null;
        }
    }

    /** Orders query parameters by name. */
    Comparator<NameValuePair> comp = new Comparator<NameValuePair>() {
        @Override
        public int compare(NameValuePair p1, NameValuePair p2) {
            return p1.getName().compareTo(p2.getName());
        }
    };

    /**
     * A common error to find is a query string that starts with an &amp;
     * instead of a ? This will fix that error. So http://foo.com&amp;a=b will
     * be changed to http://foo.com?a=b.
     *
     * @param urlToFilter
     *            the URL to check
     * @return the URL with its first &amp; replaced by ? if it contains an
     *         &amp; but no ?, the unchanged URL otherwise
     */
    private String unmangleQueryString(String urlToFilter) {
        int firstAmp = urlToFilter.indexOf('&');
        if (firstAmp > 0 && urlToFilter.indexOf('?') == -1) {
            return urlToFilter.replaceFirst("&", "?");
        }
        return urlToFilter;
    }

    /**
     * Remove % encoding from path segment in URL for characters which should be
     * unescaped according to <a
     * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a> as
     * well as non-standard implementations of percent encoding, see <a href=
     * "https://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations"
     * >Percent-encoding: non-standard implementations</a>. Escape sequences
     * which are kept are converted to uppercase.
     *
     * @param path
     *            the file part (path and query) of the URL
     * @return the path with the unnecessary escape sequences decoded
     */
    private String unescapePath(String path) {
        // first pass: replace the non-standard %uXXXX sequences by the
        // character they stand for
        Matcher matcher = illegalEscapePattern.matcher(path);

        StringBuilder sb = null;
        int end = 0;

        while (matcher.find()) {
            if (sb == null) {
                sb = new StringBuilder();
            }
            // Append everything up to this group
            sb.append(path, end, matcher.start());
            int letter = Integer.parseInt(matcher.group(1), 16);
            sb.append((char) letter);
            end = matcher.end();
        }

        // we got a replacement
        if (sb != null) {
            // append whatever is left
            sb.append(path.substring(end));
            path = sb.toString();
            end = 0;
        }

        // second pass: standard %XX sequences
        matcher = unescapeRulePattern.matcher(path);

        if (!matcher.find()) {
            return path;
        }

        sb = new StringBuilder();

        // Traverse over all encoded groups
        do {
            // Append everything up to this group
            sb.append(path, end, matcher.start());

            // Get the integer representation of this hexadecimal encoded
            // character
            int letter = Integer.parseInt(matcher.group(1), 16);
            if (letter < 128 && unescapedCharacters[letter]) {
                // character should be unescaped in URLs
                sb.append((char) letter);
            } else {
                // Append the whole sequence as uppercase
                sb.append(matcher.group().toUpperCase(Locale.ROOT));
            }

            end = matcher.end();
        } while (matcher.find());

        // Append the rest if there's anything left
        sb.append(path.substring(end));

        return sb.toString();
    }

    /**
     * Convert path segment of URL from Unicode to UTF-8 and escape all
     * characters which should be escaped according to <a
     * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
     *
     * @param path
     *            the file part (path and query) of the URL
     * @return the path with control characters, space, '[', '\', ']', '|' and
     *         all non-ASCII bytes percent-encoded
     */
    private String escapePath(String path) {
        StringBuilder sb = new StringBuilder(path.length());

        // Traverse over all bytes in this URL
        for (byte b : path.getBytes(utf8)) {
            // Is this a control character, a space or one of [ \ ] | ?
            // bytes are signed: the bytes of non-ASCII characters are negative
            // and therefore escaped as well
            if (b < 33 || b == 91 || b == 92 || b == 93 || b == 124) {
                // Start escape sequence
                sb.append('%');

                // Get this byte's hexadecimal representation
                String hex = Integer.toHexString(b & 0xFF).toUpperCase(
                        Locale.ROOT);

                // Prepend a zero so that the sequence has two digits
                if (hex.length() % 2 != 0) {
                    sb.append('0');
                }
                sb.append(hex);
            } else {
                // No, just append this character as-is
                sb.append((char) b);
            }
        }

        return sb.toString();
    }
}