package com.github.sefler1987.javaworker.main;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.TimeUnit;
import com.github.sefler1987.javaworker.worker.ConfigurableWorker;
import com.github.sefler1987.javaworker.worker.SimpleURLComparator;
import com.github.sefler1987.javaworker.worker.WorkerEvent;
import com.github.sefler1987.javaworker.worker.WorkerListener;
import com.github.sefler1987.javaworker.worker.WorkerTask;
import com.github.sefler1987.javaworker.worker.linear.PageURLMiningProcessor;
import com.github.sefler1987.javaworker.worker.linear.PageURLMiningTask;
/**
* Linear version of page URL mining. It's slow but simple.
* Average time cost for 1000 URLs is: 3800ms
*
* @author xuanyin.zy E-mail:xuanyin.zy@taobao.com
* @since Sep 16, 2012 5:35:40 PM
*/
public class LinearURLMiningMain implements WorkerListener {
private static final String EMPTY_STRING = "";
private static final int URL_SIZE_TO_MINE = 1000;
//任务id-->工人执行的任务
private static ConcurrentHashMap<String, WorkerTask<?>> taskID2TaskMap = new ConcurrentHashMap<String, WorkerTask<?>>();
private static ConcurrentSkipListSet<String> foundURLs = new ConcurrentSkipListSet<String>(new SimpleURLComparator());
public static void main(String[] args) throws InterruptedException {
long startTime = System.currentTimeMillis();
//创建一个工人,并为这个工人指定要怎么执行任务: 任务处理器
ConfigurableWorker worker = new ConfigurableWorker("W001");
worker.setTaskProcessor(new PageURLMiningProcessor());
//给工人分配多个任务
addTask2Worker(worker, new PageURLMiningTask("http://www.taobao.com"));
addTask2Worker(worker, new PageURLMiningTask("http://www.xinhuanet.com"));
addTask2Worker(worker, new PageURLMiningTask("http://www.zol.com.cn"));
addTask2Worker(worker, new PageURLMiningTask("http://www.163.com"));
LinearURLMiningMain mainListener = new LinearURLMiningMain();
//给工人注册监听器
worker.addListener(mainListener);
//启动工人的后台工作线程
worker.start();
String targetURL = EMPTY_STRING;
//只挖掘给定数量的URL, 如果不设置, 将一直挖掘下去.
while (foundURLs.size() < URL_SIZE_TO_MINE) {
//这是我们从目标URL的页面内容中找到的需要继续挖掘的URL
targetURL = foundURLs.pollFirst();
if (targetURL == null) {
TimeUnit.MILLISECONDS.sleep(50);
continue;
}
//以这个待挖掘的URL作为一个新的任务. 并再次分配给工人.
PageURLMiningTask task = new PageURLMiningTask(targetURL);
//工人真苦逼,以为完成了这个任务就可以收工回家了. 其实还有很多任务等着呢.
//注意要将task的id和task添加到map映射中. 添加任务给工人会返回一个任务id.
taskID2TaskMap.putIfAbsent(worker.addTask(task), task);
//addTask2Worker(worker, task); //It's OK too.
TimeUnit.MILLISECONDS.sleep(100);
}
//一共找到了指定数量的URL, 工人可以下班了
worker.stop();
//这是要给领导看的. 打印出来工人今天完成了多少工作
int i=0;
for (String string : foundURLs) {
System.out.println(i++ + string);
}
System.out.println("Time Cost: " + (System.currentTimeMillis() - startTime) + "ms");
}
private static void addTask2Worker(ConfigurableWorker worker, PageURLMiningTask task) {
String taskID = worker.addTask(task);
taskID2TaskMap.put(taskID, task);
}
//注册了两种类型的事件: 任务完成,任务失败
@Override
public List<WorkerEvent> intrests() {
return Arrays.asList(WorkerEvent.TASK_COMPLETE, WorkerEvent.TASK_FAILED);
}
//监听器的自定义实现类, 必须对注册到工人上的不同事件做出不同的响应.
@Override
public void onEvent(WorkerEvent event, Object... args) {
if (WorkerEvent.TASK_FAILED == event) {
System.err.println("Error while extracting URLs");
return;
}
if (WorkerEvent.TASK_COMPLETE != event)
return;
// TASK_COMPLETE. args is the task we already done!
PageURLMiningTask task = (PageURLMiningTask) args[0];
// 这个任务应该是最开始我们分配给工人的. 如果返回的没有在map中. 则不处理.
if (!taskID2TaskMap.containsKey(task.getTaskID()))
return;
// 工人的任务虽然完成了, 但是这个任务里还有需要挖掘的urls. 添加到foundURLs(被找到的url集合)
foundURLs.addAll(task.getMinedURLs());
System.out.println("Found URL size: " + foundURLs.size());
// 任务完成了, 从map中移除
taskID2TaskMap.remove(task.getTaskID());
}
}