package focusedCrawler.target.classifier;
import java.nio.file.Path;
import java.util.List;
import java.util.regex.Pattern;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import focusedCrawler.target.model.Page;
/**
 * Classifies whether a page is relevant by matching lists of regular expressions against multiple
 * fields of the web page: its title, its URL and its content.
 *
 * <p>Each field has its own list of regular expressions and its own boolean operator that decides
 * whether all expressions ({@code "AND"}, the default) or at least one of them ({@code "OR"})
 * must match the field. A top-level boolean operator then combines the per-field results in the
 * same way. Fields without any regular expression are ignored.
 *
 * <p>Expressions are compiled with {@link Pattern#CASE_INSENSITIVE} and {@link Pattern#DOTALL}
 * and must match the <em>entire</em> field value (see {@link java.util.regex.Matcher#matches()}).
 */
public class RegexTargetClassifier implements TargetClassifier {

    /** Shared marker for "no expressions configured for this field". */
    private static final Pattern[] NO_PATTERNS = new Pattern[0];

    private final RegexClassifierConfig params;
    private final Pattern[] urlPatterns;
    private final Pattern[] titlePatterns;
    private final Pattern[] contentPatterns;

    /** True when the top-level operator is exactly {@code "OR"}. */
    private final boolean anyFieldSuffices;
    /** True when the top-level operator is exactly {@code "AND"}. */
    private final boolean allFieldsRequired;

    /**
     * Creates a classifier from an already-parsed configuration and compiles all configured
     * regular expressions up front.
     *
     * @param params the classifier configuration
     * @throws java.util.regex.PatternSyntaxException if any configured expression is invalid
     */
    public RegexTargetClassifier(RegexClassifierConfig params) {
        this.params = params;
        this.urlPatterns = compilePatterns(params.url);
        this.titlePatterns = compilePatterns(params.title);
        this.contentPatterns = compilePatterns(params.content);
        this.anyFieldSuffices = "OR".equals(params.boolean_operator);
        this.allFieldsRequired = "AND".equals(params.boolean_operator);
    }

    /** Returns true when the given field configuration carries at least one expression. */
    private static boolean hasRegexes(RegexList field) {
        return field != null && field.regexes != null && !field.regexes.isEmpty();
    }

    /**
     * Compiles the expressions configured for one field.
     *
     * @return the compiled patterns, or an empty array when the field has no expressions
     */
    private static Pattern[] compilePatterns(RegexList field) {
        if (!hasRegexes(field)) {
            return NO_PATTERNS;
        }
        List<String> regexes = field.regexes;
        Pattern[] patterns = new Pattern[regexes.size()];
        for (int i = 0; i < patterns.length; i++) {
            patterns[i] = Pattern.compile(regexes.get(i), Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
        }
        return patterns;
    }

    /**
     * Classifies the page by evaluating the title, URL and content expressions, in that order.
     *
     * <p>With the top-level operator {@code "OR"} the page is relevant as soon as one configured
     * field matches; with {@code "AND"} it is irrelevant as soon as one configured field fails to
     * match. Later fields are not evaluated once the outcome is decided. For any other operator
     * value all configured fields are evaluated and the page is relevant if at least one matched.
     *
     * @param page the page to classify
     * @return {@link TargetRelevance#RELEVANT} or {@link TargetRelevance#IRRELEVANT}
     */
    @Override
    public TargetRelevance classify(Page page) throws TargetClassifierException {
        boolean matchesAll = true;
        boolean matchesOne = false;

        if (titlePatterns.length > 0) {
            boolean matches = regexMatchesField(
                    page.getParsedData().getTitle(), params.title.boolean_operator, titlePatterns);
            matchesAll = matchesAll && matches;
            matchesOne = matchesOne || matches;
            if (matchesOne && anyFieldSuffices) {
                return TargetRelevance.RELEVANT;
            }
            if (!matchesAll && allFieldsRequired) {
                return TargetRelevance.IRRELEVANT;
            }
        }

        if (urlPatterns.length > 0) {
            boolean matches = regexMatchesField(
                    page.getURL().toString(), params.url.boolean_operator, urlPatterns);
            matchesAll = matchesAll && matches;
            matchesOne = matchesOne || matches;
            if (matchesOne && anyFieldSuffices) {
                return TargetRelevance.RELEVANT;
            }
            if (!matchesAll && allFieldsRequired) {
                return TargetRelevance.IRRELEVANT;
            }
        }

        if (contentPatterns.length > 0) {
            boolean matches = regexMatchesField(
                    page.getContentAsString(), params.content.boolean_operator, contentPatterns);
            matchesAll = matchesAll && matches;
            matchesOne = matchesOne || matches;
            if (matchesOne && anyFieldSuffices) {
                return TargetRelevance.RELEVANT;
            }
            if (!matchesAll && allFieldsRequired) {
                return TargetRelevance.IRRELEVANT;
            }
        }

        if (allFieldsRequired) {
            return matchesAll ? TargetRelevance.RELEVANT : TargetRelevance.IRRELEVANT;
        }
        return matchesOne ? TargetRelevance.RELEVANT : TargetRelevance.IRRELEVANT;
    }

    /**
     * Tests a single field value against a list of patterns.
     *
     * @param field the field value; a {@code null} or empty value never matches
     * @param boolOp {@code "OR"} if one matching pattern is enough; any other value (including
     *        {@code null}) requires every pattern to match
     * @param patterns the patterns to test; each must match the whole field value
     * @return whether the field satisfies the patterns under the given operator
     */
    public boolean regexMatchesField(String field, String boolOp, Pattern[] patterns) {
        if (field == null || field.isEmpty()) {
            return false;
        }
        if ("OR".equals(boolOp)) {
            for (Pattern pattern : patterns) {
                if (pattern.matcher(field).matches()) {
                    return true;
                }
            }
            return false;
        }
        // Every operator other than "OR" is treated as "AND".
        for (Pattern pattern : patterns) {
            if (!pattern.matcher(field).matches()) {
                return false;
            }
        }
        return true;
    }

    /** Expressions and boolean operator configured for a single page field. */
    public static class RegexList {
        /** {@code "OR"} or {@code "AND"}; defaults to {@code "AND"}. */
        public String boolean_operator = "AND";
        /** The regular expressions for the field; stays {@code null} when not configured. */
        public List<String> regexes;
    }

    /** Configuration of the classifier as bound from the classifier parameters. */
    public static class RegexClassifierConfig {
        /** Operator combining the per-field results; defaults to {@code "AND"}. */
        public String boolean_operator = "AND";
        public RegexList url = new RegexList();
        public RegexList title = new RegexList();
        public RegexList content = new RegexList();
    }

    /** Creates {@link RegexTargetClassifier} instances from a parsed parameter tree. */
    public static class Builder {

        /**
         * Binds the parameter tree to a {@link RegexClassifierConfig} and builds the classifier.
         *
         * @param basePath not used by this builder
         * @param yaml the mapper used to bind {@code parameters}
         * @param parameters the classifier parameters
         * @return the configured classifier
         * @throws JsonProcessingException if the parameters cannot be bound to the configuration
         * @throws IllegalArgumentException if no regular expression is configured for any field
         */
        public TargetClassifier build(Path basePath, ObjectMapper yaml, JsonNode parameters) throws JsonProcessingException {
            RegexClassifierConfig params = yaml.treeToValue(parameters, RegexClassifierConfig.class);
            // A field that is absent from the configuration leaves its regexes list null, so the
            // emptiness check has to be null-safe.
            boolean noRegexes = params == null
                    || (!hasRegexes(params.url)
                            && !hasRegexes(params.title)
                            && !hasRegexes(params.content));
            if (noRegexes) {
                throw new IllegalArgumentException(
                        "Failed to configure " + RegexTargetClassifier.class.getSimpleName() +
                        ". At least one regular expression needs to be provided.");
            }
            return new RegexTargetClassifier(params);
        }
    }
}