package focusedCrawler.target.classifier;
import java.nio.file.Path;
import java.util.List;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import focusedCrawler.target.model.Page;
import focusedCrawler.util.RegexMatcher;
/**
 * A {@link TargetClassifier} that decides page relevance by running a
 * {@link RegexMatcher} over the page content.
 *
 * <p>The matcher is created once, either from a file of patterns or from an
 * in-memory list of patterns, and is reused for every call to
 * {@link #classify(Page)}.
 */
public class BodyRegexTargetClassifier implements TargetClassifier {

    /** Matcher applied to the content string of each classified page. */
    private final RegexMatcher matcher;

    /**
     * Creates a classifier whose matcher is loaded via
     * {@code RegexMatcher.fromFile}.
     *
     * @param regexFilename name of the file handed to {@code RegexMatcher.fromFile}
     */
    public BodyRegexTargetClassifier(String regexFilename) {
        this.matcher = RegexMatcher.fromFile(regexFilename);
    }

    /**
     * Creates a classifier whose matcher is built via
     * {@code RegexMatcher.fromList}.
     *
     * @param patterns the patterns handed to {@code RegexMatcher.fromList}
     */
    public BodyRegexTargetClassifier(List<String> patterns) {
        this.matcher = RegexMatcher.fromList(patterns);
    }

    /**
     * Classifies a page according to whether the matcher accepts its content.
     *
     * @param page the page whose content (as a string) is tested
     * @return {@code TargetRelevance.RELEVANT} when the matcher reports a match
     *         on {@code page.getContentAsString()}, otherwise
     *         {@code TargetRelevance.IRRELEVANT}
     * @throws TargetClassifierException declared by the interface contract
     */
    @Override
    public TargetRelevance classify(Page page) throws TargetClassifierException {
        // NOTE(review): the exact matching semantics (full match vs. find, any vs. all
        // patterns) are defined by RegexMatcher, which is not visible here.
        boolean contentMatches = matcher.matches(page.getContentAsString());
        return contentMatches ? TargetRelevance.RELEVANT : TargetRelevance.IRRELEVANT;
    }

    /**
     * Configuration holder populated from the classifier parameters tree.
     */
    public static class BodyRegexClassifierConfig {
        /** Patterns for the classifier; stays {@code null} when not configured. */
        public List<String> regular_expressions;
    }

    /**
     * Builds {@link BodyRegexTargetClassifier} instances from parsed
     * configuration parameters.
     */
    public static class Builder {

        /**
         * Converts the parameter tree into a {@link BodyRegexClassifierConfig}
         * and creates a classifier from its pattern list.
         *
         * @param basePath   base path of the configuration (not used by this builder)
         * @param yaml       mapper used to bind {@code parameters} to the config class
         * @param parameters parameter tree describing the classifier
         * @return a new classifier, or {@code null} when the bound configuration
         *         has no {@code regular_expressions} value
         * @throws JsonProcessingException if the parameters cannot be bound to
         *         {@link BodyRegexClassifierConfig}
         */
        public TargetClassifier build(Path basePath, ObjectMapper yaml, JsonNode parameters)
                throws JsonProcessingException {
            BodyRegexClassifierConfig config =
                    yaml.treeToValue(parameters, BodyRegexClassifierConfig.class);

            // Without a pattern list there is nothing to build; signal that with null.
            if (config.regular_expressions == null) {
                return null;
            }
            return new BodyRegexTargetClassifier(config.regular_expressions);
        }
    }
}