PageURLMiningProcessor.java example

Explorer
zava-master
- src
package com.github.sefler1987.javaworker.worker.linear;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.github.sefler1987.javaworker.worker.TaskProcessor;
import com.github.sefler1987.javaworker.worker.WorkerTask;

/**
 * Given a specified URL, the processor downloads the page and mines every URL that matches
 * {@link #URL_PATTERN} out of its content, handing each match to
 * {@code PageURLMiningTask#addMinedURL}.
 *
 * <p>Uniqueness of the mined URLs is not enforced by this class; it presumably depends on how
 * {@code PageURLMiningTask#addMinedURL} stores them (verify against that class).
 *
 * <p>Processing is best effort: if the page cannot be fetched or mined, the task is simply not
 * marked as done. In every case, threads waiting on the task's monitor are notified once
 * processing ends.
 *
 * @author xuanyin.zy E-mail:xuanyin.zy@taobao.com
 * @since Sep 15, 2012 4:19:15 PM
 */
public class PageURLMiningProcessor implements TaskProcessor {
    private static final String URL_PATTERN = "http(s)?://[\\w\\.\\/]*(\\.htm|\\.do|\\.html|\\.xhtm|\\.xhtml)";

    /** Compiled once: {@link Pattern} is immutable, so there is no need to recompile per task. */
    private static final Pattern URL_REGEX = Pattern.compile(URL_PATTERN);

    /** Upper bound, in characters, on how much of a page is read and scanned. */
    private static final int MAX_PAGE_SIZE = 1024 * 1024 * 10;

    /** Size, in characters, of the reader's internal buffer. */
    private static final int BUFFER_SIZE = 128 * 1024;

    /** Size, in characters, of the chunks copied from the reader into the page content. */
    private static final int CHUNK_SIZE = 8 * 1024;

    /**
     * Fetches the task's target URL, mines all matching URLs from the page and marks the task as
     * done on success.
     *
     * @param task the task to process; must be a {@code PageURLMiningTask}
     * @throws IllegalArgumentException if {@code task} is {@code null} or not a
     *         {@code PageURLMiningTask}
     */
    @Override
    public void process(WorkerTask<?> task) {
        if (!(task instanceof PageURLMiningTask)) {
            // instanceof is false for null, so guard the getClass() call to avoid an NPE here.
            String actualType = (task == null) ? "null" : task.getClass().getSimpleName();
            throw new IllegalArgumentException("Expected PageURLMiningTask but was: " + actualType);
        }

        PageURLMiningTask urlMiningTask = (PageURLMiningTask) task;

        try {
            // First, fetch the target URL supplied by the task.
            CharSequence pageContent = fetchPage(urlMiningTask.getTargetURL());

            // Look for links on the target page.
            Matcher matcher = URL_REGEX.matcher(pageContent);
            while (matcher.find()) {
                // Add each hit to the task's collection of mined URLs.
                urlMiningTask.addMinedURL(matcher.group());
            }

            // The target page has been visited, so this task is finished. Following the mined
            // pages is not handled here.
            urlMiningTask.setDone(true);
        } catch (Exception ignored) {
            // Deliberate best effort: an unreachable or unreadable page must not take the worker
            // down. The task is left not-done so whoever waits on it can tell it did not succeed.
        } finally {
            // Always wake up threads waiting on the task, whether mining succeeded or not.
            synchronized (urlMiningTask) {
                urlMiningTask.notifyAll();
            }
        }
    }

    /**
     * Downloads at most {@link #MAX_PAGE_SIZE} characters of the page at {@code targetURL}.
     *
     * <p>The content is read in raw chunks rather than line by line, so line terminators are
     * preserved (text on adjacent lines can no longer fuse into a bogus URL) and a page consisting
     * of one huge line cannot exceed the size cap. A URL straddling the cut-off point may be
     * truncated.
     *
     * @param targetURL the address of the page to download
     * @return the (possibly truncated) page content, decoded with the platform default charset
     * @throws IOException if the URL is malformed, or connecting or reading fails or times out
     */
    private static CharSequence fetchPage(String targetURL) throws IOException {
        URLConnection urlConnection = new URL(targetURL).openConnection();
        urlConnection.setConnectTimeout((int) TimeUnit.SECONDS.toMillis(2));
        urlConnection.setReadTimeout((int) TimeUnit.SECONDS.toMillis(2));

        BufferedReader reader =
                new BufferedReader(new InputStreamReader(urlConnection.getInputStream()), BUFFER_SIZE);
        try {
            StringBuilder pageContent = new StringBuilder();
            char[] chunk = new char[CHUNK_SIZE];
            int remaining = MAX_PAGE_SIZE;

            while (remaining > 0) {
                int read = reader.read(chunk, 0, Math.min(chunk.length, remaining));
                if (read < 0) {
                    break; // end of stream
                }
                pageContent.append(chunk, 0, read);
                remaining -= read;
            }

            return pageContent;
        } finally {
            // Closing the reader also closes the connection's input stream.
            reader.close();
        }
    }
}