package focusedCrawler.target.classifier;
import java.nio.file.Path;
import java.util.List;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import focusedCrawler.target.model.Page;
import focusedCrawler.util.LinkFilter;
import focusedCrawler.util.LinkFilter.LinkBlackList;
import focusedCrawler.util.LinkFilter.LinkWhiteList;
/**
 * A {@link TargetClassifier} that decides relevance by looking only at the
 * page URL: the URL string is handed to a {@link LinkFilter} and the page is
 * considered relevant exactly when the filter accepts it.
 */
public class UrlRegexTargetClassifier implements TargetClassifier {

    /** Filter consulted with the page URL on every call to {@link #classify(Page)}. */
    private final LinkFilter linkFilter;

    /**
     * Creates a classifier backed by the given filter.
     *
     * @param linkfilter the filter used to accept or reject page URLs
     */
    public UrlRegexTargetClassifier(LinkFilter linkfilter) {
        this.linkFilter = linkfilter;
    }

    /**
     * Creates a classifier backed by a {@link LinkWhiteList} built from the
     * given file name.
     *
     * @param regexFilename file name passed to the {@link LinkWhiteList} constructor
     */
    public UrlRegexTargetClassifier(String regexFilename) {
        this(new LinkFilter(new LinkWhiteList(regexFilename)));
    }

    /**
     * Creates a classifier backed by a {@link LinkFilter} built from a list of
     * URL patterns.
     *
     * @param urlPatterns patterns passed to the {@link LinkFilter} constructor
     */
    public UrlRegexTargetClassifier(List<String> urlPatterns) {
        this(new LinkFilter(urlPatterns));
    }

    /**
     * Classifies a page using only its URL.
     *
     * @param page the page to classify
     * @return a relevance of {@code (true, 1.0)} when the link filter accepts
     *         the page URL, {@code (false, 0.0)} otherwise
     */
    @Override
    public TargetRelevance classify(Page page) throws TargetClassifierException {
        String url = page.getURL().toString();
        if (linkFilter.accept(url)) {
            return new TargetRelevance(true, 1.0);
        }
        return new TargetRelevance(false, 0.0);
    }

    /**
     * Creates a classifier from a list of regular expressions.
     *
     * @param regularExpressions patterns passed to the {@link LinkFilter} constructor
     * @return the new classifier
     */
    public static UrlRegexTargetClassifier fromRegularExpressions(List<String> regularExpressions) {
        return new UrlRegexTargetClassifier(regularExpressions);
    }

    /**
     * Creates a classifier from a whitelist file only.
     *
     * @param whitelistFilename file name passed to the {@link LinkWhiteList} constructor
     * @return the new classifier
     */
    public static UrlRegexTargetClassifier fromWhitelistFile(String whitelistFilename) {
        LinkWhiteList whitelist = new LinkWhiteList(whitelistFilename);
        return new UrlRegexTargetClassifier(new LinkFilter(whitelist));
    }

    /**
     * Creates a classifier from a blacklist file only.
     *
     * @param blacklistFilename file name passed to the {@link LinkBlackList} constructor
     * @return the new classifier
     */
    public static UrlRegexTargetClassifier fromBlacklistFile(String blacklistFilename) {
        LinkBlackList blacklist = new LinkBlackList(blacklistFilename);
        return new UrlRegexTargetClassifier(new LinkFilter(blacklist));
    }

    /**
     * Creates a classifier from both a whitelist file and a blacklist file.
     *
     * @param whitelistFilename file name passed to the {@link LinkWhiteList} constructor
     * @param blacklistFilename file name passed to the {@link LinkBlackList} constructor
     * @return the new classifier
     */
    public static UrlRegexTargetClassifier fromWhitelistAndBlacklistFiles(String whitelistFilename,
                                                                          String blacklistFilename) {
        LinkWhiteList whitelist = new LinkWhiteList(whitelistFilename);
        LinkBlackList blacklist = new LinkBlackList(blacklistFilename);
        return new UrlRegexTargetClassifier(new LinkFilter(whitelist, blacklist));
    }

    /**
     * Configuration object populated by Jackson from the classifier
     * parameters. Field names are part of the configuration format and must
     * not be renamed.
     */
    static class UrlRegexClassifierConfig {
        /** Inline list of URL patterns; used only when no list file is configured. */
        public List<String> regular_expressions;
        /** Whitelist file, resolved against the base path given to the builder. */
        public String whitelist_file;
        /** Blacklist file, resolved against the base path given to the builder. */
        public String blacklist_file;
    }

    /** Builds a {@link UrlRegexTargetClassifier} from its configuration parameters. */
    public static class Builder {

        /**
         * Builds the classifier described by {@code parameters}.
         *
         * <p>Precedence: when {@code whitelist_file} and/or
         * {@code blacklist_file} are present they are used and
         * {@code regular_expressions} is ignored; otherwise a non-empty
         * {@code regular_expressions} list is used.
         *
         * @param basePath   directory against which the configured file names are resolved
         * @param yaml       mapper used to convert {@code parameters} into the config object
         * @param parameters the classifier parameters
         * @return the configured classifier, never {@code null}
         * @throws JsonProcessingException  if {@code parameters} cannot be mapped to the
         *                                  configuration object
         * @throws IllegalArgumentException if none of {@code regular_expressions},
         *                                  {@code whitelist_file} or {@code blacklist_file}
         *                                  is configured
         */
        public TargetClassifier build(Path basePath, ObjectMapper yaml, JsonNode parameters)
                throws JsonProcessingException {

            UrlRegexClassifierConfig params =
                    yaml.treeToValue(parameters, UrlRegexClassifierConfig.class);

            // treeToValue may hand back null for a missing parameters node;
            // treat that the same as an empty configuration.
            if (params == null) {
                throw missingConfiguration();
            }

            boolean hasWhitelist = params.whitelist_file != null;
            boolean hasBlacklist = params.blacklist_file != null;

            if (hasWhitelist && hasBlacklist) {
                return UrlRegexTargetClassifier.fromWhitelistAndBlacklistFiles(
                        resolve(basePath, params.whitelist_file),
                        resolve(basePath, params.blacklist_file));
            } else if (hasWhitelist) {
                return UrlRegexTargetClassifier.fromWhitelistFile(
                        resolve(basePath, params.whitelist_file));
            } else if (hasBlacklist) {
                return UrlRegexTargetClassifier.fromBlacklistFile(
                        resolve(basePath, params.blacklist_file));
            } else if (params.regular_expressions != null
                    && !params.regular_expressions.isEmpty()) {
                return UrlRegexTargetClassifier.fromRegularExpressions(params.regular_expressions);
            }

            // Nothing usable was configured: fail here with a clear message
            // instead of returning null and failing somewhere downstream.
            throw missingConfiguration();
        }

        /** Resolves a configured file name against the base path and returns it as a string. */
        private static String resolve(Path basePath, String filename) {
            return basePath.resolve(filename).toString();
        }

        /** Creates the exception reported when the configuration defines no URL rules. */
        private static IllegalArgumentException missingConfiguration() {
            return new IllegalArgumentException(
                    "URL regex classifier requires at least one of the parameters: "
                    + "regular_expressions, whitelist_file, blacklist_file");
        }
    }
}