package focusedCrawler.crawler.async;
import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.codahale.metrics.Counter;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.Timer;
import com.codahale.metrics.Timer.Context;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import focusedCrawler.crawler.async.fetcher.FetcherFactory;
import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.BaseFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.BaseFetcher;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.util.MetricsManager;
/**
 * Manages the thread pools used for downloading links. Since downloading is an
 * I/O-bound process (network I/O), a large number of threads is used for
 * downloads, whereas the downloaded data is processed by a smaller number of
 * threads, since this is usually a CPU-bound task (and thus, the
 * parallelization performance is limited by the number of CPU cores available).
 *
 * <p>Each download goes through two stages: a {@code RequestTask} executed by the
 * download pool fetches the URL, and a {@code FetchFinishedHandler} executed by the
 * dispatch pool invokes the user-supplied {@link Callback}.
 *
 * @author aeciosantos
 *
 */
public class HttpDownloader implements Closeable {

    private static final int CPU_CORES = Runtime.getRuntime().availableProcessors();
    private static final Logger logger = LoggerFactory.getLogger(HttpDownloader.class);

    /** Sleep interval, in milliseconds, used while polling queue sizes and counters. */
    private static final long POLL_INTERVAL_MS = 10;
    /** Accumulated waiting time, in milliseconds, between two progress messages of {@link #await()}. */
    private static final long PROGRESS_LOG_INTERVAL_MS = 5000;

    private final BaseFetcher fetcher;
    private final ExecutorService downloadThreadPool;
    private final ExecutorService dispatchThreadPool;
    private final LinkedBlockingQueue<Runnable> downloadQueue;
    private final LinkedBlockingQueue<Runnable> dispatchQueue;

    /** Downloads that were accepted but whose handler has not finished yet. */
    private final AtomicInteger numberOfDownloads = new AtomicInteger(0);
    /** Request tasks currently being executed by the download pool. */
    private final AtomicInteger runningRequests = new AtomicInteger(0);
    /** Handlers currently being executed by the dispatch pool. */
    private final AtomicInteger runningHandlers = new AtomicInteger(0);

    /** Queue size at which {@code dipatchDownload} starts blocking the caller. */
    private final int maxQueueSize;
    /** Tab-separated request log; {@code null} when no data path was configured. */
    private final PrintWriter requestLog;

    private Timer fetchTimer;
    private Timer handlerTimer;
    private Counter counterAborted;
    private Counter counterSuccess;
    private Counter counterHttpStatus2xx;
    private Counter counterErrors;

    /**
     * Creates a downloader with a default {@link HttpDownloaderConfig}, no request
     * log file and a {@code MetricsManager} created with {@code false}.
     */
    public HttpDownloader() {
        this(new HttpDownloaderConfig(), null, new MetricsManager(false));
    }

    /**
     * Creates a downloader.
     *
     * @param config         supplies the download thread pool size, the fetcher
     *                       settings and the optional list of valid MIME types
     * @param dataPath       directory under which
     *                       {@code data_monitor/downloadrequests.csv} is appended
     *                       to; when {@code null} no request log is written
     * @param metricsManager registry where timers, counters and gauges are registered
     * @throws RuntimeException if the request log file cannot be opened
     */
    public HttpDownloader(HttpDownloaderConfig config, String dataPath, MetricsManager metricsManager) {
        ThreadFactory downloadThreadFactory =
                new ThreadFactoryBuilder().setNameFormat("downloader-%d").build();
        ThreadFactory dispatcherThreadFactory =
                new ThreadFactoryBuilder().setNameFormat("dispatcher-%d").build();

        this.downloadQueue = new LinkedBlockingQueue<>();
        this.dispatchQueue = new LinkedBlockingQueue<>();

        int threadPoolSize = config.getDownloadThreadPoolSize();
        this.downloadThreadPool = new ThreadPoolExecutor(threadPoolSize, threadPoolSize,
                0L, TimeUnit.MILLISECONDS, this.downloadQueue, downloadThreadFactory);
        this.dispatchThreadPool = new ThreadPoolExecutor(CPU_CORES, CPU_CORES,
                0L, TimeUnit.MILLISECONDS, this.dispatchQueue, dispatcherThreadFactory);

        this.maxQueueSize = threadPoolSize * 2;

        this.fetcher = FetcherFactory.createFetcher(config);
        if (config.getValidMimeTypes() != null) {
            for (String mimeType : config.getValidMimeTypes()) {
                this.fetcher.addValidMimeType(mimeType);
            }
        }

        if (dataPath == null) {
            this.requestLog = null;
        } else {
            Path logPath = Paths.get(dataPath, "data_monitor", "downloadrequests.csv");
            try {
                Files.createDirectories(logPath.getParent());
                this.requestLog = openLogFile(logPath);
            } catch (IOException e) {
                throw new RuntimeException("Failed to open downloader log at path: " + logPath.toString(), e);
            }
        }

        setupMetrics(metricsManager);
    }

    /** Obtains the timers and counters used by the tasks and registers the queue/counter gauges. */
    private void setupMetrics(MetricsManager metrics) {
        fetchTimer = metrics.getTimer("downloader.fetch.time");
        handlerTimer = metrics.getTimer("downloader.handler.time");

        counterAborted = metrics.getCounter("downloader.fetches.aborted");
        counterSuccess = metrics.getCounter("downloader.fetches.successes");
        counterErrors = metrics.getCounter("downloader.fetches.errors");
        counterHttpStatus2xx = metrics.getCounter("downloader.http_response.status.2xx");

        Gauge<Integer> downloadQueueGauge = downloadQueue::size;
        metrics.register("downloader.download_queue.size", downloadQueueGauge);

        Gauge<Integer> dispatchQueueGauge = dispatchQueue::size;
        metrics.register("downloader.dispatch_queue.size", dispatchQueueGauge);

        Gauge<Integer> numberOfDownloadsGauge = numberOfDownloads::get;
        metrics.register("downloader.pending_downloads", numberOfDownloadsGauge);

        Gauge<Integer> runningRequestsGauge = runningRequests::get;
        metrics.register("downloader.running_requests", runningRequestsGauge);

        Gauge<Integer> runningHandlersGauge = runningHandlers::get;
        metrics.register("downloader.running_handlers", runningHandlersGauge);
    }

    /** Opens the given file in append mode, wrapped in a buffered, auto-flushing writer. */
    private PrintWriter openLogFile(Path path) throws FileNotFoundException {
        boolean append = true;
        BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(path.toFile(), append));
        boolean autoFlush = true;
        return new PrintWriter(bos, autoFlush);
    }

    /**
     * Schedules the download of the given URL without any callback.
     *
     * @param url the URL to be downloaded
     * @return a future holding the fetched result ({@code null} if the fetch failed)
     * @throws IllegalArgumentException if {@code url} is not a valid URL
     */
    public Future<FetchedResult> dipatchDownload(String url) {
        try {
            return dipatchDownload(new URL(url), null);
        } catch (MalformedURLException e) {
            throw new IllegalArgumentException("Invalid URL provided: " + url, e);
        }
    }

    /**
     * Schedules the download of the given URL, wrapped in a {@link LinkRelevance}
     * with relevance {@code 0}.
     *
     * @param url      the URL to be downloaded
     * @param callback notified when the download finishes; may be {@code null}
     * @return a future holding the fetched result ({@code null} if the fetch failed)
     */
    public Future<FetchedResult> dipatchDownload(URL url, Callback callback) {
        return dipatchDownload(new LinkRelevance(url, 0d), callback);
    }

    /**
     * Schedules the download of the given link. The calling thread is blocked
     * while either the download queue or the dispatch queue holds
     * {@code maxQueueSize} or more entries. If the caller is interrupted while
     * waiting, the wait is abandoned, the interrupt flag is restored and the
     * download is submitted anyway.
     *
     * @param link     the link to be downloaded
     * @param callback notified when the download finishes; may be {@code null}
     * @return a future holding the fetched result ({@code null} if the fetch failed)
     */
    public Future<FetchedResult> dipatchDownload(LinkRelevance link, Callback callback) {
        try {
            while (downloadQueue.size() >= maxQueueSize || dispatchQueue.size() >= maxQueueSize) {
                Thread.sleep(POLL_INTERVAL_MS);
            }
        } catch (InterruptedException e) {
            // Stop waiting for free capacity, but keep the interruption visible to the caller.
            Thread.currentThread().interrupt();
        }
        // Count the download BEFORE submitting it: the task may finish (and decrement
        // the counter) before submit() even returns.
        numberOfDownloads.incrementAndGet();
        try {
            return downloadThreadPool.submit(new RequestTask(link, callback));
        } catch (RuntimeException e) {
            // The task was not accepted (e.g. the pool is shut down), so it will never
            // release the pending slot itself.
            numberOfDownloads.decrementAndGet();
            throw e;
        }
    }

    /**
     * Shuts both thread pools down immediately, waits up to 10 seconds for each
     * of them to terminate and closes the request log (if any).
     *
     * @throws RuntimeException if the calling thread is interrupted while waiting;
     *                          the interrupt flag is restored and the request log
     *                          is still closed
     */
    @Override
    public void close() {
        downloadThreadPool.shutdownNow();
        dispatchThreadPool.shutdownNow();
        try {
            downloadThreadPool.awaitTermination(10, TimeUnit.SECONDS);
            dispatchThreadPool.awaitTermination(10, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException("Failed to shutdown downloader threads.", e);
        } finally {
            // Always release the log file, even when the wait above was interrupted.
            if (requestLog != null) {
                requestLog.close();
            }
        }
    }

    /**
     * Blocks until all scheduled downloads were fetched and all handlers finished,
     * then shuts both pools down gracefully, waiting up to 5 minutes for each.
     * A progress message is logged every {@value #PROGRESS_LOG_INTERVAL_MS} ms of
     * accumulated waiting.
     *
     * @throws RuntimeException if the calling thread is interrupted while waiting;
     *                          the interrupt flag is restored
     */
    public void await() {
        try {
            logger.info("Waiting downloads be finalized...");
            long timeWaited = 0;
            while (downloadQueue.size() > 0 || runningRequests.get() > 0) {
                Thread.sleep(POLL_INTERVAL_MS);
                timeWaited += POLL_INTERVAL_MS;
                if (timeWaited % PROGRESS_LOG_INTERVAL_MS == 0) {
                    logger.info("Still waiting to finish downloads...");
                }
            }
            while (dispatchQueue.size() > 0 || runningHandlers.get() > 0) {
                Thread.sleep(POLL_INTERVAL_MS);
                timeWaited += POLL_INTERVAL_MS;
                if (timeWaited % PROGRESS_LOG_INTERVAL_MS == 0) {
                    logger.info("Still waiting to process downloaded pages...");
                }
            }
            downloadThreadPool.shutdown();
            dispatchThreadPool.shutdown();
            downloadThreadPool.awaitTermination(5, TimeUnit.MINUTES);
            dispatchThreadPool.awaitTermination(5, TimeUnit.MINUTES);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException("Thread interrupted while waiting downloader threads finalize.", e);
        }
    }

    /**
     * @return {@code true} while at least one accepted download has not been
     *         completely handled yet
     */
    public boolean hasPendingDownloads() {
        return numberOfDownloads.get() > 0;
    }

    /**
     * Receives the outcome of a download. Callbacks are invoked by threads of the
     * dispatch pool.
     */
    public interface Callback {

        /** Invoked when the fetch finished without a {@link BaseFetchException}. */
        public void completed(LinkRelevance link, FetchedResult result);

        /** Invoked when the fetch failed with a {@link BaseFetchException}. */
        public void failed(LinkRelevance link, Exception e);

    }

    /**
     * Fetches one link on the download pool, updates metrics and the request log,
     * and hands the outcome over to a {@link FetchFinishedHandler} on the dispatch pool.
     */
    private final class RequestTask implements Callable<FetchedResult> {

        private final Callback callback;
        private final LinkRelevance link;
        /** Set once the handler was accepted by the dispatch pool; only touched by the executing thread. */
        private boolean handlerSubmitted = false;

        public RequestTask(LinkRelevance url, Callback callback) {
            this.link = url;
            this.callback = callback;
        }

        /**
         * Runs the request, never propagating failures.
         *
         * @return the fetched result, or {@code null} if the fetch failed
         */
        @Override
        public FetchedResult call() {
            runningRequests.incrementAndGet();
            try {
                return doRequest();
            } catch (Throwable e) {
                logger.error("Failed to execute download request", e);
                return null;
            } finally {
                if (!handlerSubmitted) {
                    // No handler will ever run for this download (unexpected failure or
                    // rejected submission), so the pending slot must be released here;
                    // otherwise hasPendingDownloads() would report true forever.
                    numberOfDownloads.decrementAndGet();
                }
                runningRequests.decrementAndGet();
            }
        }

        private FetchedResult doRequest() {
            BaseFetchException exception = null;
            FetchedResult result = null;
            String url = link.getURL().toString();

            final Timer.Context context = fetchTimer.time();
            try {
                result = fetcher.get(url);
                counterSuccess.inc();
            } catch (BaseFetchException e) {
                exception = e;
                if (e instanceof AbortedFetchException) {
                    counterAborted.inc();
                }
            } finally {
                context.stop();
            }

            // Anything that is not a 2xx response (including a failed fetch) counts as an error.
            if (result != null && result.getStatusCode() >= 200 && result.getStatusCode() < 300) {
                counterHttpStatus2xx.inc();
            } else {
                counterErrors.inc();
            }

            if (requestLog != null) {
                if (result != null) {
                    requestLog.printf("%d\t%s\t%s\t%s\n", result.getFetchTime(),
                            result.getStatusCode(), result.getHostAddress(), url);
                } else {
                    // Failed fetch: log the current time with placeholder status and host.
                    requestLog.printf("%d\t%s\t%s\t%s\n", System.currentTimeMillis(),
                            -1, "unknown", url);
                }
            }

            dispatchThreadPool.submit(new FetchFinishedHandler(link, result, callback, exception));
            handlerSubmitted = true;
            return result;
        }

    }

    /**
     * Notifies the callback (if any) about the outcome of a fetch and releases the
     * pending-download slot once done.
     */
    private final class FetchFinishedHandler implements Runnable {

        private final FetchedResult response;
        private final Callback callback;
        private final BaseFetchException exception;
        private final LinkRelevance link;

        public FetchFinishedHandler(LinkRelevance link, FetchedResult response,
                                    Callback callback, BaseFetchException exception) {
            this.link = link;
            this.response = response;
            this.callback = callback;
            this.exception = exception;
        }

        @Override
        public void run() {
            runningHandlers.incrementAndGet();
            try {
                doHandle();
            } catch (Throwable e) {
                logger.error("Failed to execute result handler", e);
            } finally {
                runningHandlers.decrementAndGet();
                numberOfDownloads.decrementAndGet();
            }
        }

        private void doHandle() {
            if (callback == null) {
                return;
            }
            Context context = handlerTimer.time();
            try {
                if (exception != null) {
                    callback.failed(link, exception);
                } else {
                    callback.completed(link, response);
                }
            } finally {
                context.stop();
            }
        }

    }

}