/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.storm.crawler.filtering.basic; import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.Set; import java.util.TreeSet; import com.digitalpebble.storm.crawler.Metadata; import com.digitalpebble.storm.crawler.filtering.URLFilter; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ArrayNode; import org.apache.commons.lang.StringUtils; import org.apache.http.NameValuePair; import org.apache.http.client.utils.URLEncodedUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class BasicURLNormalizer implements URLFilter { private static final Logger LOG = LoggerFactory.getLogger(BasicURLNormalizer.class); boolean removeAnchorPart = true; boolean unmangleQueryString = true; final Set<String> queryElementsToRemove = new TreeSet<>(); @Override public String filter(URL sourceUrl, Metadata sourceMetadata, String urlToFilter) { if (removeAnchorPart) { try { URL theURL = new URL(urlToFilter); String anchor = theURL.getRef(); if (anchor != null) urlToFilter = urlToFilter.replace("#" + anchor, ""); } catch (MalformedURLException e) { return null; } } if (unmangleQueryString) { urlToFilter = unmangleQueryString(urlToFilter); } if (!queryElementsToRemove.isEmpty()) { urlToFilter = filterQueryElements(urlToFilter); } return urlToFilter; } @Override public void configure(Map stormConf, JsonNode paramNode) { JsonNode node = paramNode.get("removeAnchorPart"); if (node != null) { removeAnchorPart = node.booleanValue(); } node = paramNode.get("unmangleQueryString"); if (node != null) { unmangleQueryString = node.booleanValue(); } node = paramNode.get("queryElementsToRemove"); if (node != null) { if (!node.isArray()) { LOG.warn("Failed to configure queryElementsToRemove. Not an array: {}", node.toString()); } else { ArrayNode array = (ArrayNode) node; for (JsonNode element : array) { queryElementsToRemove.add(element.asText()); } } } } /** * Basic filter to remove query parameters from urls so parameters that don't change the content * of the page can be removed. An example would be a google analytics query parameter like * "utm_campaign" which might have several different values for a url that points to the same * content. */ private String filterQueryElements(String urlToFilter) { try { // Handle illegal characters by making a url first // this will clean illegal characters like | URL url = new URL(urlToFilter); if (StringUtils.isEmpty(url.getQuery())) { return urlToFilter; } List<NameValuePair> pairs = new ArrayList<NameValuePair>(); URLEncodedUtils.parse(pairs, new Scanner(url.getQuery()), "UTF-8"); Iterator<NameValuePair> pairsIterator = pairs.iterator(); while (pairsIterator.hasNext()) { NameValuePair param = pairsIterator.next(); if (queryElementsToRemove.contains(param.getName())) { pairsIterator.remove(); } } StringBuilder newFile = new StringBuilder(); if (url.getPath() != null) { newFile.append(url.getPath()); } if (!pairs.isEmpty()) { Collections.sort(pairs, comp); String newQueryString = URLEncodedUtils.format(pairs, StandardCharsets.UTF_8); newFile.append('?').append(newQueryString); } if (url.getRef() != null) { newFile.append('#').append(url.getRef()); } return new URL(url.getProtocol(), url.getHost(), url.getPort(), newFile.toString()).toString(); } catch (MalformedURLException e) { LOG.warn("Invalid urlToFilter {}. {}", urlToFilter, e); return null; } } Comparator<NameValuePair> comp = new Comparator<NameValuePair>() { @Override public int compare(NameValuePair p1, NameValuePair p2) { return p1.getName().compareTo(p2.getName()); } }; /** * A common error to find is a query string that starts with an & instead of a ? This will fix * that error. So http://foo.com&a=b will be changed to http://foo.com?a=b. * * @param urlToFilter * @return corrected url */ private String unmangleQueryString(String urlToFilter) { int firstAmp = urlToFilter.indexOf('&'); if (firstAmp > 0) { int firstQuestionMark = urlToFilter.indexOf('?'); if (firstQuestionMark == -1) { return urlToFilter.replaceFirst("&", "?"); } } return urlToFilter; } }