// ================================================================================================= // Copyright 2011 Twitter, Inc. // ------------------------------------------------------------------------------------------------- // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this work except in compliance with the License. // You may obtain a copy of the License in the LICENSE file, or at: // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ================================================================================================= package com.twitter.common.net; import com.google.common.base.Function; import com.google.common.base.Functions; import com.google.common.base.Preconditions; import com.twitter.common.base.MorePreconditions; import java.io.IOException; import java.net.HttpURLConnection; import java.net.Proxy; import java.net.Proxy.Type; import java.net.URISyntaxException; import java.net.URL; import java.util.Map; import java.util.logging.Logger; import javax.annotation.Nullable; /** * A utility that can resolve HTTP urls. * * @author John Sirois */ class UrlResolverUtil { private static final Logger LOG = Logger.getLogger(UrlResolverUtil.class.getName()); // Default user-agent string to user for HTTP requests. private static final String DEFAULT_USER_AGENT = "Lynxy/6.6.6dev.8 libwww-FM/3.14159FM"; private static Map<String, String> checkNotBlank(Map<String, String> hostToUserAgent) { Preconditions.checkNotNull(hostToUserAgent); MorePreconditions.checkNotBlank(hostToUserAgent.entrySet()); return hostToUserAgent; } private final Function<? super URL, String> urlToUserAgent; UrlResolverUtil(Map<String, String> hostToUserAgent) { this(Functions.compose(Functions.forMap(checkNotBlank(hostToUserAgent), DEFAULT_USER_AGENT), new Function<URL, String>() { @Override public String apply(URL url) { return url.getHost(); } })); } UrlResolverUtil(Function<? super URL, String> urlToUserAgent) { this.urlToUserAgent = Preconditions.checkNotNull(urlToUserAgent); } /** * Returns the URL that {@code url} lands on, which will be the result of a 3xx redirect, * or {@code url} if the url does not redirect using an HTTP 3xx response code. If there is a * non-2xx or 3xx HTTP response code null is returned. * * @param url The URL to follow. * @return The redirected URL, or {@code url} if {@code url} returns a 2XX response, otherwise * null * @throws java.io.IOException If an error occurs while trying to follow the url. */ String getEffectiveUrl(String url, @Nullable ProxyConfig proxyConfig) throws IOException { Preconditions.checkNotNull(url); // Don't follow https. if (url.startsWith("https://")) { url = url.replace("https://", "http://"); } else if (!url.startsWith("http://")) { url = "http://" + url; } URL urlObj = new URL(url); HttpURLConnection con; if (proxyConfig != null) { Proxy proxy = new Proxy(Type.HTTP, proxyConfig.getProxyAddress()); con = (HttpURLConnection) urlObj.openConnection(proxy); ProxyAuthorizer.adapt(proxyConfig).authorize(con); } else { con = (HttpURLConnection) urlObj.openConnection(); } try { // TODO(John Sirois): several commonly tweeted hosts 406 or 400 on HEADs and only work with GETs // fix the call chain to be able to specify retry-with-GET con.setRequestMethod("HEAD"); con.setUseCaches(true); con.setConnectTimeout(5000); con.setReadTimeout(5000); con.setInstanceFollowRedirects(false); // I hate to have to do this, but some URL shorteners don't respond otherwise. con.setRequestProperty("User-Agent", urlToUserAgent.apply(urlObj)); try { con.connect(); } catch (StringIndexOutOfBoundsException e) { LOG.info("Got StringIndexOutOfBoundsException when fetching headers for " + url); return null; } int responseCode = con.getResponseCode(); switch (responseCode / 100) { case 2: return url; case 3: String location = con.getHeaderField("Location"); if (location == null) { if (responseCode != 304 /* not modified */) { LOG.info( String.format("[%d] Location header was null for URL: %s", responseCode, url)); } return url; } // HTTP 1.1 spec says this should be an absolute URI, but i see lots of instances where it // is relative, so we need to check. try { String domain = UrlHelper.getDomainChecked(location); if (domain == null || domain.isEmpty()) { // This is a relative URI. location = "http://" + UrlHelper.getDomain(url) + location; } } catch (URISyntaxException e) { LOG.info("location contained an invalid URI: " + location); } return location; default: LOG.info("Failed to resolve url: " + url + " with: " + responseCode + " -> " + con.getResponseMessage()); return null; } } finally { con.disconnect(); } } }