package com.github.sefler1987.javaworker.worker.linear;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.github.sefler1987.javaworker.worker.TaskProcessor;
import com.github.sefler1987.javaworker.worker.WorkerTask;
/**
* Given a specified URL, the processor will try to mine all of the URLs out from the page.
* Uniqueness of the mined URLs is expected to be enforced by the task's URL collection
* (see {@code PageURLMiningTask#addMinedURL}) — confirm against that implementation.
*
* @author xuanyin.zy E-mail:xuanyin.zy@taobao.com
* @since Sep 15, 2012 4:19:15 PM
*/
public class PageURLMiningProcessor implements TaskProcessor {
    /** Matches http/https links that end in a common page extension. */
    private static final String URL_PATTERN = "http(s)?://[\\w\\.\\/]*(\\.htm|\\.do|\\.html|\\.xhtm|\\.xhtml)";

    /** Compiled once: Pattern is immutable and thread-safe; avoids recompiling per task. */
    private static final Pattern URL_REGEX = Pattern.compile(URL_PATTERN);

    /** Stop reading once the accumulated page content exceeds 10 MiB. */
    private static final int MAX_PAGE_SIZE = 1024 * 1024 * 10;

    /** Buffer size for the page reader (128 KiB). */
    private static final int BUFFER_SIZE = 128 * 1024;

    /**
     * Fetches the task's target URL, extracts every link matching {@link #URL_PATTERN} from the
     * page body, and records each match on the task via {@code addMinedURL}.
     *
     * <p>Best-effort: any fetch/parse failure leaves the task not-done ({@code setDone(true)} is
     * only reached on success), but threads waiting on the task monitor are always notified.
     *
     * @param task must be a {@code PageURLMiningTask}
     * @throws IllegalArgumentException if {@code task} is not a {@code PageURLMiningTask}
     */
    @Override
    public void process(WorkerTask<?> task) {
        if (!(task instanceof PageURLMiningTask)) {
            throw new IllegalArgumentException("Expected PageURLMiningTask but was: " + task.getClass().getSimpleName());
        }
        PageURLMiningTask urlMiningTask = (PageURLMiningTask) task;
        try {
            // Connect to the target URL supplied by the task, with short timeouts so a
            // stalled server cannot hold the worker thread indefinitely.
            URL url = new URL(urlMiningTask.getTargetURL());
            URLConnection urlConnection = url.openConnection();
            urlConnection.setConnectTimeout((int) TimeUnit.SECONDS.toMillis(2));
            urlConnection.setReadTimeout((int) TimeUnit.SECONDS.toMillis(2));

            // Accumulated page content of the target URL, capped at MAX_PAGE_SIZE.
            StringBuilder pageContent = new StringBuilder();
            // try-with-resources: the original leaked the stream/reader on every exit path.
            // UTF-8 is specified explicitly so decoding does not depend on the platform charset.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(urlConnection.getInputStream(), StandardCharsets.UTF_8), BUFFER_SIZE)) {
                String line;
                while ((line = reader.readLine()) != null) {
                    pageContent.append(line);
                    // pageContent already includes the line just appended, so this single
                    // check also bounds any oversized individual line.
                    if (pageContent.length() > MAX_PAGE_SIZE) {
                        break;
                    }
                }
            }

            // Mine links out of the fetched page and record them on the task.
            Matcher matcher = URL_REGEX.matcher(pageContent);
            while (matcher.find()) {
                urlMiningTask.addMinedURL(matcher.group());
            }
            urlMiningTask.setDone(true);
        } catch (Exception ignored) {
            // Deliberate best-effort swallow: a failed fetch simply leaves the task not-done.
            // Waiters are still woken in the finally block below.
        } finally {
            // Wake any thread blocked in wait() on this task object.
            synchronized (urlMiningTask) {
                urlMiningTask.notifyAll();
            }
        }
    }
}