package com.github.sefler1987.javaworker.worker.mapreduce;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.concurrent.TimeUnit;

import com.github.sefler1987.javaworker.worker.TaskProcessor;
import com.github.sefler1987.javaworker.worker.WorkerTask;

/**
 * Task processor that downloads the page content for a
 * {@link MapReducePageURLMiningTask} and stores it on the task.
 *
 * <p>Fetching is best-effort: any failure (bad URL, timeout, I/O error) leaves
 * the task's content unset and its done-flag false. In every case waiters
 * blocked on the task monitor are released via {@code notifyAll()} in the
 * {@code finally} block, so a failed fetch never hangs a consumer.
 */
public class PageContentFetchProcessor implements TaskProcessor {
    /** Hard cap on accumulated page content (10 MiB) to bound memory use. */
    private static final int MAX_PAGE_SIZE = 1024 * 1024 * 10;
    /** Character buffer size for the reader (128 KiB). */
    private static final int BUFFER_SIZE = 128 * 1024;

    /**
     * Fetches the task's target URL and records the page content on the task.
     *
     * @param task must be a {@link MapReducePageURLMiningTask}
     * @throws IllegalArgumentException if {@code task} is of any other type
     */
    @Override
    public void process(WorkerTask<?> task) {
        if (!(task instanceof MapReducePageURLMiningTask)) {
            throw new IllegalArgumentException(
                    "Expected PageURLMiningTask but was: " + task.getClass().getSimpleName());
        }

        MapReducePageURLMiningTask mapReduceURLMiningTask = (MapReducePageURLMiningTask) task;
        try {
            URL url = new URL(mapReduceURLMiningTask.getTargetURL());
            URLConnection urlConnection = url.openConnection();
            urlConnection.setConnectTimeout((int) TimeUnit.SECONDS.toMillis(2));
            urlConnection.setReadTimeout((int) TimeUnit.SECONDS.toMillis(2));

            // try-with-resources guarantees the stream/reader are closed on every
            // path (the original leaked them on both success and failure).
            // NOTE(review): decodes with the platform default charset — consider an
            // explicit StandardCharsets.UTF_8, but confirm downstream expectations first.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(urlConnection.getInputStream()), BUFFER_SIZE)) {
                StringBuilder pageContent = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    pageContent.append(line);
                    // After the append, pageContent.length() >= line.length(), so a
                    // single check on the accumulated size is sufficient.
                    if (pageContent.length() > MAX_PAGE_SIZE) {
                        break;
                    }
                }
                mapReduceURLMiningTask.setPageContent(pageContent.toString());
                mapReduceURLMiningTask.setDone(true);
            }
        } catch (Exception ignored) {
            // Best-effort fetch: failures (bad URL, timeout, I/O error) are
            // intentionally dropped; the task simply stays not-done.
        } finally {
            // Always wake anyone waiting on this task, success or failure.
            synchronized (mapReduceURLMiningTask) {
                mapReduceURLMiningTask.notifyAll();
            }
        }
    }
}