// ================================================================================================= // Copyright 2011 Twitter, Inc. // ------------------------------------------------------------------------------------------------- // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this work except in compliance with the License. // You may obtain a copy of the License in the LICENSE file, or at: // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // ================================================================================================= package com.twitter.common.net; import java.io.IOException; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.Executor; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.logging.Level; import java.util.logging.Logger; import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Functions; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.util.concurrent.ListenableFutureTask; import com.google.common.util.concurrent.ThreadFactoryBuilder; import com.twitter.common.base.ExceptionalFunction; import com.twitter.common.net.UrlResolver.ResolvedUrl.EndState; import com.twitter.common.quantity.Amount; import com.twitter.common.quantity.Time; import com.twitter.common.stats.PrintableHistogram; import com.twitter.common.util.BackoffStrategy; import com.twitter.common.util.Clock; import com.twitter.common.util.TruncatedBinaryBackoff; import com.twitter.common.util.caching.Cache; import com.twitter.common.util.caching.LRUCache; /** * Class to aid in resolving URLs by following redirects, which can optionally be performed * asynchronously using a thread pool. * * @author William Farner */ public class UrlResolver { private static final Logger LOG = Logger.getLogger(UrlResolver.class.getName()); private static final String TWITTER_UA = "Twitterbot/0.1"; private static final UrlResolverUtil URL_RESOLVER = new UrlResolverUtil(Functions.constant(TWITTER_UA)); private static final ExceptionalFunction<String, String, IOException> RESOLVER = new ExceptionalFunction<String, String, IOException>() { @Override public String apply(String url) throws IOException { return URL_RESOLVER.getEffectiveUrl(url, null); } }; private static ExceptionalFunction<String, String, IOException> getUrlResolver(final @Nullable ProxyConfig proxyConfig) { if (proxyConfig != null) { return new ExceptionalFunction<String, String, IOException>() { @Override public String apply(String url) throws IOException { return URL_RESOLVER.getEffectiveUrl(url, proxyConfig); } }; } else { return RESOLVER; } } private final ExceptionalFunction<String, String, IOException> resolver; private final int maxRedirects; // Tracks the number of active tasks (threads in use). private final Semaphore poolEntrySemaphore; private final Integer threadPoolSize; // Helps with signaling the handler. private final Executor handlerExecutor; // Manages the thread pool and task execution. private ExecutorService executor; // Cache to store resolved URLs. private final Cache<String, String> urlCache = LRUCache.<String, String>builder() .maxSize(10000) .makeSynchronized(true) .build(); // Variables to track connection/request stats. private AtomicInteger requestCount = new AtomicInteger(0); private AtomicInteger cacheHits = new AtomicInteger(0); private AtomicInteger failureCount = new AtomicInteger(0); // Tracks the time (in milliseconds) required to resolve URLs. private final PrintableHistogram urlResolutionTimesMs = new PrintableHistogram( 1, 5, 10, 25, 50, 75, 100, 150, 200, 250, 300, 500, 750, 1000, 1500, 2000); private final Clock clock; private final BackoffStrategy backoffStrategy; @VisibleForTesting UrlResolver(Clock clock, BackoffStrategy backoffStrategy, ExceptionalFunction<String, String, IOException> resolver, int maxRedirects) { this(clock, backoffStrategy, resolver, maxRedirects, null); } /** * Creates a new asynchronous URL resolver. A thread pool will be used to resolve URLs, and * resolved URLs will be announced via {@code handler}. * * @param maxRedirects The maximum number of HTTP redirects to follow. * @param threadPoolSize The number of threads to use for resolving URLs. * @param proxyConfig The proxy settings with which to make the HTTP request, or null for the * default configured proxy. */ public UrlResolver(int maxRedirects, int threadPoolSize, @Nullable ProxyConfig proxyConfig) { this(Clock.SYSTEM_CLOCK, new TruncatedBinaryBackoff(Amount.of(100L, Time.MILLISECONDS), Amount.of(1L, Time.SECONDS)), getUrlResolver(proxyConfig), maxRedirects, threadPoolSize); } public UrlResolver(int maxRedirects, int threadPoolSize) { this(maxRedirects, threadPoolSize, null); } private UrlResolver(Clock clock, BackoffStrategy backoffStrategy, ExceptionalFunction<String, String, IOException> resolver, int maxRedirects, @Nullable Integer threadPoolSize) { this.clock = clock; this.backoffStrategy = backoffStrategy; this.resolver = resolver; this.maxRedirects = maxRedirects; if (threadPoolSize != null) { this.threadPoolSize = threadPoolSize; Preconditions.checkState(threadPoolSize > 0); poolEntrySemaphore = new Semaphore(threadPoolSize); // Start up the thread pool. reset(); // Executor to send notifications back to the handler. This also needs to be // a daemon thread. handlerExecutor = Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().setDaemon(true).build()); } else { this.threadPoolSize = null; poolEntrySemaphore = null; handlerExecutor = null; } } public Future<ResolvedUrl> resolveUrlAsync(final String url, final ResolvedUrlHandler handler) { Preconditions.checkNotNull( "Asynchronous URL resolution cannot be performed without a valid handler.", handler); try { poolEntrySemaphore.acquire(); } catch (InterruptedException e) { LOG.log(Level.SEVERE, "Interrupted while waiting for thread to resolve URL: " + url, e); return null; } final ListenableFutureTask<ResolvedUrl> future = ListenableFutureTask.create( new Callable<ResolvedUrl>() { @Override public ResolvedUrl call() { return resolveUrl(url); } }); future.addListener(new Runnable() { @Override public void run() { try { handler.resolved(future); } finally { poolEntrySemaphore.release(); } } }, handlerExecutor); executor.execute(future); return future; } private void logThreadpoolInfo() { LOG.info("Shutting down thread pool, available permits: " + poolEntrySemaphore.availablePermits()); LOG.info("Queued threads? " + poolEntrySemaphore.hasQueuedThreads()); LOG.info("Queue length: " + poolEntrySemaphore.getQueueLength()); } public void reset() { Preconditions.checkState(threadPoolSize != null); if (executor != null) { Preconditions.checkState(executor.isShutdown(), "The thread pool must be shut down before resetting."); Preconditions.checkState(executor.isTerminated(), "There may still be pending async tasks."); } // Create a thread pool with daemon threads, so that they may be terminated when no // application threads are running. executor = Executors.newFixedThreadPool(threadPoolSize, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("UrlResolver[%d]").build()); } /** * Terminates the thread pool, waiting at most {@code waitSeconds} for active threads to complete. * After this method is called, no more URLs may be submitted for resolution. * * @param waitSeconds The number of seconds to wait for active threads to complete. */ public void clearAsyncTasks(int waitSeconds) { Preconditions.checkState(threadPoolSize != null, "finish() should not be called on a synchronous URL resolver."); logThreadpoolInfo(); executor.shutdown(); // Disable new tasks from being submitted. try { // Wait a while for existing tasks to terminate if (!executor.awaitTermination(waitSeconds, TimeUnit.SECONDS)) { LOG.info("Pool did not terminate, forcing shutdown."); logThreadpoolInfo(); List<Runnable> remaining = executor.shutdownNow(); LOG.info("Tasks still running: " + remaining); // Wait a while for tasks to respond to being cancelled if (!executor.awaitTermination(waitSeconds, TimeUnit.SECONDS)) { LOG.warning("Pool did not terminate."); logThreadpoolInfo(); } } } catch (InterruptedException e) { LOG.log(Level.WARNING, "Interrupted while waiting for threadpool to finish.", e); // (Re-)Cancel if current thread also interrupted executor.shutdownNow(); // Preserve interrupt status Thread.currentThread().interrupt(); } } /** * Resolves a URL synchronously. * * @param url The URL to resolve. * @return The resolved URL. */ public ResolvedUrl resolveUrl(String url) { ResolvedUrl resolvedUrl = new ResolvedUrl(); resolvedUrl.setStartUrl(url); String cached = urlCache.get(url); if (cached != null) { cacheHits.incrementAndGet(); resolvedUrl.setNextResolve(cached); resolvedUrl.setEndState(EndState.CACHED); return resolvedUrl; } String currentUrl = url; long backoffMs = 0L; String next = null; for (int i = 0; i < maxRedirects; i++) { try { next = resolveOnce(currentUrl); // If there was a 4xx or a 5xx, we''ll get a null back, so we pretend like we never advanced // to allow for a retry within the redirect limit. // TODO(John Sirois): we really need access to the return code here to do the right thing; ie: // retry for internal server errors but probably not for unauthorized if (next == null) { if (i < maxRedirects - 1) { // don't wait if we're about to exit the loop backoffMs = backoffStrategy.calculateBackoffMs(backoffMs); try { clock.waitFor(backoffMs); } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw new RuntimeException( "Interrupted waiting to retry a failed resolution for: " + currentUrl, e); } } continue; } backoffMs = 0L; if (next.equals(currentUrl)) { // We've reached the end of the redirect chain. resolvedUrl.setEndState(EndState.REACHED_LANDING); urlCache.put(url, currentUrl); for (String intermediateUrl : resolvedUrl.getIntermediateUrls()) { urlCache.put(intermediateUrl, currentUrl); } return resolvedUrl; } else if (!url.equals(next)) { resolvedUrl.setNextResolve(next); } currentUrl = next; } catch (IOException e) { LOG.log(Level.INFO, "Failed to resolve url: " + url, e); resolvedUrl.setEndState(EndState.ERROR); return resolvedUrl; } } resolvedUrl.setEndState(next == null || url.equals(currentUrl) ? EndState.ERROR : EndState.REDIRECT_LIMIT); return resolvedUrl; } /** * Resolves a url, following at most one redirect. Thread-safe. * * @param url The URL to resolve. * @return The result of following the URL through at most one redirect or null if the url could * not be followed * @throws IOException If an error occurs while resolving the URL. */ private String resolveOnce(String url) throws IOException { requestCount.incrementAndGet(); String resolvedUrl = urlCache.get(url); if (resolvedUrl != null) { cacheHits.incrementAndGet(); return resolvedUrl; } try { long startTimeMs = System.currentTimeMillis(); resolvedUrl = resolver.apply(url); if (resolvedUrl == null) { return null; } urlCache.put(url, resolvedUrl); synchronized (urlResolutionTimesMs) { urlResolutionTimesMs.addValue(System.currentTimeMillis() - startTimeMs); } return resolvedUrl; } catch (IOException e) { failureCount.incrementAndGet(); throw e; } } @Override public String toString() { return String.format("Cache: %s\nFailed requests: %d,\nResolution Times: %s", urlCache, failureCount.get(), urlResolutionTimesMs.toString()); } /** * Class to wrap the result of a URL resolution. */ public static class ResolvedUrl { public enum EndState { REACHED_LANDING, ERROR, CACHED, REDIRECT_LIMIT } private String startUrl; private final List<String> resolveChain; private EndState endState; public ResolvedUrl() { resolveChain = Lists.newArrayList(); } @VisibleForTesting public ResolvedUrl(EndState endState, String startUrl, String... resolveChain) { this.endState = endState; this.startUrl = startUrl; this.resolveChain = Lists.newArrayList(resolveChain); } public String getStartUrl() { return startUrl; } void setStartUrl(String startUrl) { this.startUrl = startUrl; } /** * Returns the last URL resolved following a redirect chain, or null if the startUrl is a * landing URL. */ public String getEndUrl() { return resolveChain.isEmpty() ? null : Iterables.getLast(resolveChain); } void setNextResolve(String endUrl) { this.resolveChain.add(endUrl); } /** * Returns any immediate URLs encountered on the resolution chain. If the startUrl redirects * directly to the endUrl or they are the same the imtermediate URLs will be empty. */ public Iterable<String> getIntermediateUrls() { return resolveChain.size() <= 1 ? ImmutableList.<String>of() : resolveChain.subList(0, resolveChain.size() - 1); } public EndState getEndState() { return endState; } void setEndState(EndState endState) { this.endState = endState; } public String toString() { return String.format("%s -> %s [%s, %d redirects]", startUrl, Joiner.on(" -> ").join(resolveChain), endState, resolveChain.size()); } } /** * Interface to use for notifying the caller of resolved URLs. */ public interface ResolvedUrlHandler { /** * Signals that a URL has been resolved to its target. The implementation of this method must * be thread safe. * * @param future The future that has finished resolving a URL. */ public void resolved(Future<ResolvedUrl> future); } }