package com.github.sefler1987.javaworker.worker.mapreduce;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.github.sefler1987.javaworker.worker.TaskProcessor;
import com.github.sefler1987.javaworker.worker.WorkerTask;
/**
 * {@link TaskProcessor} that scans the page content of a
 * {@link MapReducePageURLMiningTask} with a regular expression and records
 * every match on the task as a mined URL.
 */
public class URLMatchingProcessor implements TaskProcessor {
    /**
     * Matches {@code http://} or {@code https://} followed by any run of word
     * characters, dots and slashes, ending in one of the extensions
     * {@code .html}, {@code .htm}, {@code .do}, {@code .xhtm} or {@code .xhtml}.
     */
    private static final String URL_PATTERN = "http(s)?://[\\w\\.\\/]*(\\.html|\\.htm|\\.do|\\.xhtm|\\.xhtml)";

    /**
     * Compiled once and shared: a compiled {@link Pattern} is immutable and safe
     * to reuse, so there is no need to recompile it for every processed task.
     */
    private static final Pattern COMPILED_URL_PATTERN = Pattern.compile(URL_PATTERN);

    /**
     * Adds every URL found in the task's page content to the task and, on
     * success, marks the task as done. Threads waiting on the task's monitor
     * are notified whether or not the scan succeeded.
     *
     * @param task the task to process; must be a {@link MapReducePageURLMiningTask}
     * @throws IllegalArgumentException if {@code task} is {@code null} or is not a
     *         {@link MapReducePageURLMiningTask}
     */
    @Override
    public void process(WorkerTask<?> task) {
        if (!(task instanceof MapReducePageURLMiningTask)) {
            // instanceof is false for null as well, so never dereference task
            // unconditionally while building the message.
            String actualType = (task == null) ? "null" : task.getClass().getSimpleName();
            throw new IllegalArgumentException("Expected PageURLMiningTask but was: " + actualType);
        }

        MapReducePageURLMiningTask mapReduceURLMiningTask = (MapReducePageURLMiningTask) task;

        try {
            Matcher matcher = COMPILED_URL_PATTERN.matcher(mapReduceURLMiningTask.getPageContent());

            // group() with no argument yields the whole matched URL.
            while (matcher.find()) {
                mapReduceURLMiningTask.addMinedURL(matcher.group());
            }

            mapReduceURLMiningTask.setDone(true);
        } catch (Exception e) {
            // NOTE(review): on failure the task is not marked done; the error is
            // only reported on stderr and waiters are still woken below.
            System.err.println("Error while fetching specified URL: " + mapReduceURLMiningTask.getTargetURL()
                    + "\nException: " + e.toString());
        } finally {
            // notifyAll() requires holding the monitor of the object it is called on.
            synchronized (mapReduceURLMiningTask) {
                mapReduceURLMiningTask.notifyAll();
            }
        }
    }
}