package com.github.sefler1987.javaworker.worker.mapreduce;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.concurrent.TimeUnit;

import com.github.sefler1987.javaworker.worker.TaskProcessor;
import com.github.sefler1987.javaworker.worker.WorkerTask;

/**
 * Task processor that downloads the page content for a
 * {@link MapReducePageURLMiningTask} and stores it on the task.
 *
 * <p>Fetching is best-effort: any failure (bad URL, timeout, I/O error) leaves
 * the task's content unset and its done-flag false. In every case waiters
 * blocked on the task monitor are released via {@code notifyAll()} in the
 * {@code finally} block, so a failed fetch never hangs a consumer.
 */
public class PageContentFetchProcessor implements TaskProcessor {
    /** Hard cap on accumulated page content (10 MiB) to bound memory use. */
    private static final int MAX_PAGE_SIZE = 1024 * 1024 * 10;
    /** Character buffer size for the reader (128 KiB). */
    private static final int BUFFER_SIZE = 128 * 1024;

    /**
     * Fetches the task's target URL and records the page content on the task.
     *
     * @param task must be a {@link MapReducePageURLMiningTask}
     * @throws IllegalArgumentException if {@code task} is of any other type
     */
    @Override
    public void process(WorkerTask<?> task) {
        if (!(task instanceof MapReducePageURLMiningTask)) {
            throw new IllegalArgumentException(
                    "Expected PageURLMiningTask but was: " + task.getClass().getSimpleName());
        }

        MapReducePageURLMiningTask mapReduceURLMiningTask = (MapReducePageURLMiningTask) task;
        try {
            URL url = new URL(mapReduceURLMiningTask.getTargetURL());
            URLConnection urlConnection = url.openConnection();
            urlConnection.setConnectTimeout((int) TimeUnit.SECONDS.toMillis(2));
            urlConnection.setReadTimeout((int) TimeUnit.SECONDS.toMillis(2));

            // try-with-resources guarantees the stream/reader are closed on every
            // path (the original leaked them on both success and failure).
            // NOTE(review): decodes with the platform default charset — consider an
            // explicit StandardCharsets.UTF_8, but confirm downstream expectations first.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(urlConnection.getInputStream()), BUFFER_SIZE)) {
                StringBuilder pageContent = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    pageContent.append(line);
                    // After the append, pageContent.length() >= line.length(), so a
                    // single check on the accumulated size is sufficient.
                    if (pageContent.length() > MAX_PAGE_SIZE) {
                        break;
                    }
                }
                mapReduceURLMiningTask.setPageContent(pageContent.toString());
                mapReduceURLMiningTask.setDone(true);
            }
        } catch (Exception ignored) {
            // Best-effort fetch: failures (bad URL, timeout, I/O error) are
            // intentionally dropped; the task simply stays not-done.
        } finally {
            // Always wake anyone waiting on this task, success or failure.
            synchronized (mapReduceURLMiningTask) {
                mapReduceURLMiningTask.notifyAll();
            }
        }
    }
}