/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.streams.urls;
import org.apache.commons.codec.net.URLCodec;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.validator.routines.UrlValidator;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.Serializable;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
/**
 * Follows ("unwinds") a URL through its chain of HTTP redirects and records what
 * happened along the way in a {@link LinkDetails} instance: every hop, the final
 * response code, the final URL with known tracking parameters removed, a normalized
 * form of that URL and its path tokens.
 *
 * <p>Redirects are followed manually (automatic following is disabled on the
 * connection) so that every hop can be recorded and redirect loops detected.
 *
 * <p>If you notice a redirect that was not followed to the proper place please
 * submit a bug at https://issues.apache.org/jira/browse/STREAMS
 *
 * <p>Helpful references on the different kinds of browser redirects:
 * <ul>
 *   <li>Status codes: http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html</li>
 *   <li>Test cases: http://greenbytes.de/tech/tc/httpredirects/</li>
 *   <li>t.co behavior: https://dev.twitter.com/docs/tco-redirection-behavior</li>
 * </ul>
 *
 * <p>An instance mutates its {@link LinkDetails} and an internal host set without any
 * synchronization, so it should be confined to one thread at a time.
 */
public class LinkResolver implements Serializable {

    private static final Logger LOGGER = LoggerFactory.getLogger(LinkResolver.class);

    // We will only chase the link to its final destination a max of 30 times.
    private static final int MAX_ALLOWED_REDIRECTS = 30;

    // We will only wait a max of 10,000 milliseconds (10 seconds) for any HTTP response.
    private static final int DEFAULT_HTTP_TIMEOUT = 10000;

    // Number of times run() will try the original URL while no final URL has been found.
    private static final int MAX_ATTEMPTS = 3;

    // Header names, lower-cased to match the keys produced by createCaseInsensitiveMap.
    private static final String LOCATION_IDENTIFIER = "location";
    private static final String SET_COOKIE_IDENTIFIER = "set-cookie";

    // Matches a scheme such as "http://" or "ftp://" only at the very start of a URL.
    private static final String LEADING_SCHEME_REGEX = "^[a-zA-Z][a-zA-Z0-9+.\\-]*://";

    // If bots are not 'ok' these are the spoof settings that we'll use.
    private static final Map<String, String> SPOOF_HTTP_HEADERS = spoofHttpHeaders();

    // These are the known domains that are 'bot' friendly.
    private static final Collection<String> BOTS_ARE_OK = Collections.singleton("t.co");

    // To help canonicalize the URL, these query parameters are 'known' to be 'ok' to remove.
    private static final Collection<String> URL_TRACKING_TO_REMOVE = trackingParameters();

    // This element holds all the information about all the re-directs that have taken place
    // and the steps and HTTP codes that occurred inside of each step.
    private final LinkDetails linkDetails;

    // Lower-cased hosts this resolver has already contacted (and possibly waited for).
    private Collection<String> domainsSensitiveTo = new HashSet<>();

    /**
     * Raw string input of the URL. If the URL is invalid, the link status that is
     * recorded by {@link #run()} will indicate such.
     *
     * @param originalURL The URL you wish to unwind represented as a string.
     */
    public LinkResolver(String originalURL) {
        linkDetails = new LinkDetails();
        linkDetails.setOriginalURL(originalURL);
    }

    /**
     * Get the link details.
     *
     * @return Detailed log of every redirection that took place along with its
     *         ultimate status code.
     */
    public LinkDetails getLinkDetails() {
        return linkDetails;
    }

    /**
     * Resolves the original URL and fills in the link details: redirect information,
     * cleaned final URL, normalized URL, URL tokens and elapsed time.
     *
     * @throws NullPointerException if the original URL is null
     */
    public void run() {
        Objects.requireNonNull(linkDetails.getOriginalURL());
        linkDetails.setStartTime(DateTime.now());

        // We are going to try three times just in case we catch a slow server or one that
        // needs to be warmed up. This tends to happen many times with smaller private servers.
        // NOTE(review): redirect bookkeeping from a failed attempt is not reset, so a retry
        // that follows at least one recorded redirect is immediately reported as LOOP by
        // unwindLink. Resetting it needs LinkDetails setters whose types are not visible here.
        for (int attempt = 0; attempt < MAX_ATTEMPTS && linkDetails.getFinalURL() == null; attempt++) {
            if (linkDetails.getLinkStatus() != LinkDetails.LinkStatus.SUCCESS) {
                unwindLink(linkDetails.getOriginalURL());
            }
        }

        // Because this is a POJO we need to make sure that we set this to false if it was
        // never re-directed.
        if (linkDetails.getRedirectCount() == 0 || linkDetails.getRedirected() == null) {
            linkDetails.setRedirected(false);
        }

        linkDetails.setFinalURL(cleanURL(linkDetails.getFinalURL()));

        if (StringUtils.isNotBlank(linkDetails.getFinalURL())) {
            linkDetails.setNormalizedURL(normalizeURL(linkDetails.getFinalURL()));
        }

        // The normalized URL is split directly: sending it through tokenizeURL would
        // URL-decode it a second time.
        if (StringUtils.isNotBlank(linkDetails.getNormalizedURL())) {
            linkDetails.setUrlParts(splitIntoTokens(linkDetails.getNormalizedURL()));
        }

        this.updateTookInMillis();
    }

    /**
     * Records how many milliseconds have elapsed since the recorded start time.
     *
     * @throws NullPointerException if no start time has been recorded
     */
    protected void updateTookInMillis() {
        Objects.requireNonNull(linkDetails.getStartTime());
        long startedAt = linkDetails.getStartTime().getMillis();
        linkDetails.setTookInMills(DateTime.now().getMillis() - startedAt);
    }

    /**
     * Requests the given URL once, records the outcome in the link details and, if the
     * server answered with a redirect, recursively follows the redirect target.
     *
     * @param url the URL to request; must be a valid absolute URL, otherwise the link
     *            status is set to MALFORMED_URL
     * @throws NullPointerException if {@code url} is null
     */
    public void unwindLink(String url) {
        Objects.requireNonNull(linkDetails);
        Objects.requireNonNull(url);

        // Check url validity
        if (!new UrlValidator().isValid(url)) {
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.MALFORMED_URL);
            return;
        }

        // Check to see if they wound up in a redirect loop,
        // IE: 'A' redirects to 'B', then 'B' redirects to 'A'
        if (isRedirectLoop(url)) {
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.LOOP);
            return;
        }

        if (!linkDetails.getOriginalURL().equals(url)) {
            linkDetails.getRedirects().add(url);
        }

        HttpURLConnection connection = null;

        // Store where the redirected link will go (if there is one)
        String reDirectedLink = null;

        try {
            URL thisURL = new URL(url);

            // Be sensitive to overloading domains STREAMS-77
            throttleForDomain(thisURL);

            connection = (HttpURLConnection) thisURL.openConnection();
            applyRequestHeaders(connection, thisURL);
            connection.setReadTimeout(DEFAULT_HTTP_TIMEOUT);
            connection.setConnectTimeout(DEFAULT_HTTP_TIMEOUT);

            // We want to follow redirects on our own to ensure that we are getting to the
            // proper place and can record every hop. This is especially true with links
            // that are wound by special link winders.
            connection.setInstanceFollowRedirects(false);

            connection.connect();

            int responseCode = connection.getResponseCode();
            linkDetails.setFinalResponseCode((long) responseCode);

            Map<String, List<String>> headers = createCaseInsensitiveMap(connection.getHeaderFields());

            // If they want us to set cookies, well, then we will set cookies.
            // Example URL: http://nyti.ms/1bCpesx
            rememberCookies(headers);

            // W3C HTTP Response Codes:
            // http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html
            switch (responseCode) {
                case 200: // HTTP OK
                    linkDetails.setFinalURL(connection.getURL().toString());
                    linkDetails.setDomain(connection.getURL().getHost());
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.SUCCESS);
                    break;
                case 300: // Multiple choices
                case 301: // URI has been moved permanently
                case 302: // Found
                case 303: // Primarily for a HTTP Post
                case 304: // Not Modified
                case 306: // This status code is unused but in the redirect block.
                case 307: // Temporary re-direct
                case 308: // Permanent re-direct (RFC 7538)
                    reDirectedLink = handleRedirect(connection, thisURL, headers);
                    break;
                case 305: // User must use the specified proxy (deprecated by W3C)
                    break;
                case 401: // Unauthorized (nothing we can do here)
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.UNAUTHORIZED);
                    break;
                case 403: // HTTP Forbidden (Nothing we can do here)
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.FORBIDDEN);
                    break;
                case 404: // Not Found (Page is not found, nothing we can do with a 404)
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.NOT_FOUND);
                    break;
                case 500: // Internal Server Error
                case 501: // Not Implemented
                case 502: // Bad Gateway
                case 503: // Service Unavailable
                case 504: // Gateway Timeout
                case 505: // Version not supported
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.HTTP_ERROR_STATUS);
                    break;
                default:
                    LOGGER.info("Unrecognized HTTP Response Code: {}", linkDetails.getFinalResponseCode());
                    linkDetails.setLinkStatus(LinkDetails.LinkStatus.NOT_FOUND);
                    break;
            }
        } catch (MalformedURLException e) {
            // the URL is trash, so, it can't load it.
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.MALFORMED_URL);
        } catch (IOException ex) {
            // there was an issue we are going to set to error.
            LOGGER.debug("I/O problem while resolving {}", url, ex);
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.ERROR);
        } catch (Exception ex) {
            // there was an unknown issue we are going to set to exception.
            LOGGER.debug("Unexpected problem while resolving {}", url, ex);
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.EXCEPTION);
        } finally {
            // if the connection is not null, then we need to disconnect to close any
            // underlying resources
            if (connection != null) {
                connection.disconnect();
            }
        }

        // If there was a redirection, then we have to keep going.
        // Doing this after the finally block ensures the connection of this hop has been
        // closed before the next one is opened.
        if (reDirectedLink != null) {
            unwindLink(reDirectedLink);
        }
    }

    /**
     * Tells whether following {@code url} would revisit an address already seen during
     * this resolution, or would exceed the maximum number of redirects.
     */
    private boolean isRedirectLoop(String url) {
        if (linkDetails.getRedirectCount() == null) {
            return false;
        }
        boolean revisited = linkDetails.getRedirectCount() > 0
                && (linkDetails.getOriginalURL().equals(url) || linkDetails.getRedirects().contains(url));
        return revisited || linkDetails.getRedirectCount() > MAX_ALLOWED_REDIRECTS;
    }

    /**
     * The first time a host is contacted by this resolver, waits for as long as
     * {@link LinkResolverHelperFunctions#waitTimeForDomain} asks. This is best effort:
     * failures never prevent the request itself.
     */
    private void throttleForDomain(URL target) {
        try {
            String host = target.getHost().toLowerCase(Locale.ROOT);
            if (domainsSensitiveTo.add(host)) {
                long domainWait = LinkResolverHelperFunctions.waitTimeForDomain(target.getHost());
                if (domainWait > 0) {
                    LOGGER.debug("Waiting for domain: {}", domainWait);
                    Thread.sleep(domainWait);
                }
            }
        } catch (InterruptedException interrupted) {
            // Do not swallow the interrupt: restore the flag so callers can react to it.
            Thread.currentThread().interrupt();
        } catch (Exception ignored) {
            // Throttling is optional; carry on with the request.
            LOGGER.debug("Unable to determine wait time for {}", target, ignored);
        }
    }

    /**
     * Adds the browser-spoofing headers (for hosts that are not known to be bot friendly)
     * and any cookies collected on earlier hops to the outgoing request.
     */
    private void applyRequestHeaders(HttpURLConnection connection, URL target) {
        // Now we are going to pretend that we are a browser...
        if (!BOTS_ARE_OK.contains(target.getHost())) {
            connection.addRequestProperty("Host", target.getHost());

            // Bots are not 'ok', so we need to spoof the headers
            for (Map.Entry<String, String> header : SPOOF_HTTP_HEADERS.entrySet()) {
                connection.addRequestProperty(header.getKey(), header.getValue());
            }

            // the test to seattlemamadoc.com prompted this change.
            // they auto detect bots by checking the referrer chain and the 'user-agent'
            // this broke the t.co test. t.co URLs are EXPLICITLY ok with bots
            // there is a list for URLS that behave this way at the top in BOTS_ARE_OK
            // NOTE(review): this condition can never be true here because the enclosing
            // branch already requires the host NOT to be in BOTS_ARE_OK, and the standard
            // header name is "Referer". Left as-is until the intended behavior is confirmed.
            if (linkDetails.getRedirectCount() > 0 && BOTS_ARE_OK.contains(target.getHost())) {
                connection.addRequestProperty("Referrer", linkDetails.getOriginalURL());
            }
        }

        if (linkDetails.getCookies() != null) {
            for (String cookie : linkDetails.getCookies()) {
                // Only the leading "name=value" pair is sent back; attributes such as Path
                // or Expires are dropped. The limit must be 2: a limit of 1 never splits.
                connection.addRequestProperty("Cookie", cookie.split(";", 2)[0]);
            }
        }
    }

    /**
     * Stores every Set-Cookie value of the response (skipping ones already stored) so
     * they can be sent on the following hops.
     */
    private void rememberCookies(Map<String, List<String>> headers) {
        List<String> setCookies = headers.get(SET_COOKIE_IDENTIFIER);
        if (setCookies == null || linkDetails.getCookies() == null) {
            return;
        }
        for (String setCookie : setCookies) {
            if (setCookie != null && !linkDetails.getCookies().contains(setCookie)) {
                linkDetails.getCookies().add(setCookie);
            }
        }
    }

    /**
     * Handles a redirect-class response: records the hop and works out where to go next.
     *
     * <p>Note (Smashew, 2013-11-15): it is possible that we have already found our final
     * URL. In that event we save this URL as long as it isn't the original URL. We still
     * follow the redirect, but in the case of yet another redirect (seen with the redbull
     * test) this can be followed by a 304; a browser would still render the page.
     *
     * @return the absolute URL to follow next, or null when the response carried no
     *         Location header (the status is then set to REDIRECT_ERROR)
     * @throws MalformedURLException if the Location header cannot be turned into a URL
     */
    private String handleRedirect(HttpURLConnection connection, URL currentURL,
                                  Map<String, List<String>> headers) throws MalformedURLException {
        String currentAddress = connection.getURL().toString();
        if (!linkDetails.getOriginalURL().equalsIgnoreCase(currentAddress)) {
            linkDetails.setFinalURL(currentAddress);
        }

        String location = headers.containsKey(LOCATION_IDENTIFIER)
                ? connection.getHeaderField(LOCATION_IDENTIFIER)
                : null;
        if (location == null) {
            LOGGER.info("Headers: {}", headers);
            linkDetails.setLinkStatus(LinkDetails.LinkStatus.REDIRECT_ERROR);
            return null;
        }

        linkDetails.setRedirected(Boolean.TRUE);
        linkDetails.setRedirectCount(linkDetails.getRedirectCount() + 1);

        // A Location header may be relative (e.g. "/login"); resolve it against the URL we
        // just requested so the next hop is always absolute.
        return new URL(currentURL, location.trim()).toExternalForm();
    }

    /**
     * Copies the given header map using lower-cased header names so lookups do not depend
     * on the casing chosen by the server. Entries with a null name or null value list
     * are left out.
     */
    private Map<String, List<String>> createCaseInsensitiveMap(Map<String, List<String>> input) {
        Map<String, List<String>> toReturn = new HashMap<>();
        for (Map.Entry<String, List<String>> header : input.entrySet()) {
            if (header.getKey() != null && header.getValue() != null) {
                // Locale.ROOT keeps the result independent of the JVM's default locale.
                toReturn.put(header.getKey().toLowerCase(Locale.ROOT), header.getValue());
            }
        }
        return toReturn;
    }

    /**
     * Removes the known tracking parameters from the query string of the URL and flags
     * the link as tracked when at least one was removed. The remaining parameters, their
     * order and any fragment are kept, and the result remains a well-formed query string
     * even when the first parameter is the one removed.
     *
     * @param url the URL to clean, may be null
     * @return the cleaned URL, the unchanged URL when nothing was removed, or null for null
     */
    private String cleanURL(String url) {
        // If they pass us a null URL then we are going to pass that right back to them.
        if (url == null) {
            return null;
        }

        int queryStart = url.indexOf('?');
        if (queryStart < 0) {
            return url;
        }
        int fragmentStart = url.indexOf('#', queryStart);
        int queryEnd = fragmentStart < 0 ? url.length() : fragmentStart;

        StringBuilder keptQuery = new StringBuilder();
        boolean removedAny = false;
        boolean firstKept = true;

        // The -1 limit keeps trailing empty parameters so untouched parts are reproduced.
        for (String parameter : url.substring(queryStart + 1, queryEnd).split("&", -1)) {
            int equalsAt = parameter.indexOf('=');
            if (equalsAt > 0 && URL_TRACKING_TO_REMOVE.contains(parameter.substring(0, equalsAt))) {
                removedAny = true;
                continue;
            }
            if (!firstKept) {
                keptQuery.append('&');
            }
            keptQuery.append(parameter);
            firstKept = false;
        }

        if (!removedAny) {
            return url;
        }

        // Something was stripped, so the URL carried tracking information.
        linkDetails.setTracked(Boolean.TRUE);

        StringBuilder cleaned = new StringBuilder(url.substring(0, queryStart));
        if (keptQuery.length() > 0) {
            cleaned.append('?').append(keptQuery);
        }
        return cleaned.append(url.substring(queryEnd)).toString();
    }

    /**
     * Removes the protocol, if it exists, from the front and
     * removes any URL encoding (%20 and similar).
     * Extend this to do other url cleaning/pre-processing.
     *
     * <p>Only a scheme at the very start is removed; a "://" appearing later in the URL
     * (for example inside a query parameter) is left untouched. If the URL cannot be
     * decoded, decoding is skipped and the protocol is still removed.
     *
     * @param url - The String URL to normalize, may be null
     * @return normalizedUrl - The String URL that has no junk or surprises, or null if
     *         {@code url} was null
     */
    public static String normalizeURL(String url) {
        if (url == null) {
            return null;
        }

        String normalizedUrl = url;
        try {
            // Replaced URLDecode with commons-codec b/c of failing tests
            normalizedUrl = new URLCodec().decode(url);
        } catch (Exception e) {
            LOGGER.warn("Unable to URL-decode '{}'. Decoding skipped.", url, e);
        }

        // Remove the protocol, http:// ftp:// or similar from the front
        // Room here to do more pre-processing
        return normalizedUrl.replaceFirst(LEADING_SCHEME_REGEX, "");
    }

    /**
     * Goal is to get the different parts of the URL path, for example to feed a
     * classifier. The URL is normalized with {@link #normalizeURL(String)} first and then
     * split on forward slashes; every token is lower-cased.
     *
     * <p>Reference:
     * http://stackoverflow.com/questions/10046178/pattern-matching-for-url-classification
     *
     * @param url - Url to be tokenized, may be null
     * @return tokens - A list of all the tokens; empty if {@code url} was null
     */
    public static List<String> tokenizeURL(String url) {
        String normalized = normalizeURL(url);
        if (normalized == null) {
            return new ArrayList<>();
        }
        return splitIntoTokens(normalized);
    }

    /**
     * Splits an already normalized URL on runs of forward slashes and lower-cases the parts.
     */
    private static List<String> splitIntoTokens(String normalizedUrl) {
        // I assume that we're going to use the whole URL to find tokens in.
        // If you want to just look in the GET parameters, or you want to ignore the domain
        // or you want to use the domain as a token itself, that would have to be
        // processed before the split, and only the remaining parts split.
        //
        // One could alternatively use a more complex regex to remove more invalid matches.
        // Destroys anything that's not alphanumeric and things that are
        // alphanumeric but only 1 character long:
        //   "(?:[\\W_]+\\w)*[\\W_]+"
        // Destroys anything that's not alphanumeric and things that are
        // alphanumeric but only 1 or 2 characters long:
        //   "(?:[\\W_]+\\w{1,2})*[\\W_]+"
        List<String> toReturn = new ArrayList<>();

        // Split the URL by forward slashes. Most modern browsers will accept a URL
        // this malformed such as http://www.smashew.com/hello//how////are/you
        // hence the '+' in the regular expression.
        for (String part : normalizedUrl.split("/+")) {
            toReturn.add(part.toLowerCase(Locale.ROOT));
        }
        return toReturn;
    }

    /** Builds the headers used to look like a regular browser. */
    private static Map<String, String> spoofHttpHeaders() {
        Map<String, String> headers = new HashMap<>();
        headers.put("Connection", "Keep-Alive");
        headers.put("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48 Safari/537.36");
        headers.put("Accept-Language", "en-US,en;q=0.8,zh;q=0.6");
        return Collections.unmodifiableMap(headers);
    }

    /**
     * Builds the set of query parameter names that are removed by cleanURL.
     *
     * <p>Google uses parameters in the URL string to track referrers on their Google
     * Analytics and promotions. These are the identified parameters, see
     * https://support.google.com/analytics/answer/1033867?hl=en
     */
    private static Collection<String> trackingParameters() {
        Collection<String> names = new HashSet<>();
        // Required. Use utm_source to identify a search engine, newsletter name, or other source.
        names.add("utm_source");
        // Required. Use utm_medium to identify a medium such as email or cost-per-click.
        names.add("utm_medium");
        // Used for paid search. Use utm_term to note the keywords for this ad.
        names.add("utm_term");
        // Used for A/B testing and content-targeted ads. Use utm_content to differentiate
        // ads or links that point to the same URL.
        names.add("utm_content");
        // Used for keyword analysis. Use utm_campaign to identify a specific product
        // promotion or strategic campaign.
        names.add("utm_campaign");
        return Collections.unmodifiableCollection(names);
    }
}