package com.github.sefler1987.javaworker.main;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.TimeUnit;

import com.github.sefler1987.javaworker.worker.ConfigurableWorker;
import com.github.sefler1987.javaworker.worker.SimpleURLComparator;
import com.github.sefler1987.javaworker.worker.WorkerEvent;
import com.github.sefler1987.javaworker.worker.WorkerListener;
import com.github.sefler1987.javaworker.worker.WorkerTask;
import com.github.sefler1987.javaworker.worker.mapreduce.Map2ReduceConnector;
import com.github.sefler1987.javaworker.worker.mapreduce.MapReducePageURLMiningTask;
import com.github.sefler1987.javaworker.worker.mapreduce.PageContentFetchProcessor;
import com.github.sefler1987.javaworker.worker.mapreduce.URLMatchingProcessor;

/**
 * MapReduce version of page URL mining: mapper workers fetch page content and a
 * reducer worker extracts the URLs contained in it.
 *
 * @author xuanyin.zy E-mail:xuanyin.zy@taobao.com
 * @since Sep 16, 2012 5:35:40 PM
 */
public class MapReduceURLMiningMain implements WorkerListener {
    private static final String EMPTY_STRING = "";

    private static final int URL_SIZE_TO_MINE = 10000;

    private static ConcurrentHashMap<String, WorkerTask<?>> taskID2TaskMap = new ConcurrentHashMap<String, WorkerTask<?>>();

    private static ConcurrentSkipListSet<String> foundURLs = new ConcurrentSkipListSet<String>(new SimpleURLComparator());

    public static void main(String[] args) throws InterruptedException {
        long startTime = System.currentTimeMillis();

        // four mappers
        List<ConfigurableWorker> mappers = new ArrayList<ConfigurableWorker>(4);
        ConfigurableWorker mapWorker_1 = new ConfigurableWorker("W_M1");
        ConfigurableWorker mapWorker_2 = new ConfigurableWorker("W_M2");
        ConfigurableWorker mapWorker_3 = new ConfigurableWorker("W_M3");
        ConfigurableWorker mapWorker_4 = new ConfigurableWorker("W_M4");

        // map step: visit the target URL and fetch its content
        mapWorker_1.setTaskProcessor(new PageContentFetchProcessor());
        mapWorker_2.setTaskProcessor(new PageContentFetchProcessor());
        mapWorker_3.setTaskProcessor(new PageContentFetchProcessor());
        mapWorker_4.setTaskProcessor(new PageContentFetchProcessor());

        mappers.add(mapWorker_1);
        mappers.add(mapWorker_2);
        mappers.add(mapWorker_3);
        mappers.add(mapWorker_4);

        // one reducer
        ConfigurableWorker reduceWorker_1 = new ConfigurableWorker("W_R1");
        // reduce step: extract the links contained in the page content fetched by the mappers
        reduceWorker_1.setTaskProcessor(new URLMatchingProcessor());

        // bind the reducer to the final result class
        MapReduceURLMiningMain main = new MapReduceURLMiningMain();
        reduceWorker_1.addListener(main);

        // initiate tasks
        addTask2Worker(mapWorker_1, new MapReducePageURLMiningTask("http://www.taobao.com"));
        addTask2Worker(mapWorker_2, new MapReducePageURLMiningTask("http://www.xinhuanet.com"));
        addTask2Worker(mapWorker_3, new MapReducePageURLMiningTask("http://www.zol.com.cn"));
        addTask2Worker(mapWorker_4, new MapReducePageURLMiningTask("http://www.sina.com.cn/"));

        // bind the mappers to the reducer
        Map2ReduceConnector connector = new Map2ReduceConnector(Arrays.asList(reduceWorker_1));
        // register the connector as a listener on every map worker
        mapWorker_1.addListener(connector);
        mapWorker_2.addListener(connector);
        mapWorker_3.addListener(connector);
        mapWorker_4.addListener(connector);

        // start all workers
        mapWorker_1.start();
        mapWorker_2.start();
        mapWorker_3.start();
        mapWorker_4.start();
        reduceWorker_1.start();

        String targetURL = EMPTY_STRING;
        int lastIndex = 0;
        while (foundURLs.size() < URL_SIZE_TO_MINE) {
            targetURL = foundURLs.pollFirst();

            if (targetURL == null) {
                TimeUnit.MILLISECONDS.sleep(50);
                continue;
            }

            // dispatch the next URL to the mappers in round-robin order
            lastIndex = (lastIndex + 1) % mappers.size();
            MapReducePageURLMiningTask task = new MapReducePageURLMiningTask(targetURL);
            taskID2TaskMap.putIfAbsent(mappers.get(lastIndex).addTask(task), task);

            TimeUnit.MILLISECONDS.sleep(100);
        }

        // stop all workers
        mapWorker_1.stop();
        mapWorker_2.stop();
        mapWorker_3.stop();
        mapWorker_4.stop();
        reduceWorker_1.stop();

        for (String string : foundURLs) {
            System.out.println(string);
        }

        System.out.println("Time Cost: " + (System.currentTimeMillis() - startTime) + "ms");
    }

    private static void addTask2Worker(ConfigurableWorker worker, MapReducePageURLMiningTask task) {
        String taskID = worker.addTask(task);
        taskID2TaskMap.put(taskID, task);
    }

    @Override
    public List<WorkerEvent> intrests() {
        return Arrays.asList(WorkerEvent.TASK_COMPLETE, WorkerEvent.TASK_FAILED);
    }

    @Override
    public void onEvent(WorkerEvent event, Object... args) {
        if (WorkerEvent.TASK_FAILED == event) {
            System.err.println("Error while extracting URLs");
            return;
        }

        if (WorkerEvent.TASK_COMPLETE != event)
            return;

        MapReducePageURLMiningTask task = (MapReducePageURLMiningTask) args[0];
        if (!taskID2TaskMap.containsKey(task.getTaskID()))
            return;

        foundURLs.addAll(task.getMinedURLs());

        System.out.println("Found URL size: " + foundURLs.size());

        taskID2TaskMap.remove(task.getTaskID());
    }
}