package org.meaningfulweb.cext.processors;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.meaningfulweb.cext.HtmlContentProcessor;
import org.meaningfulweb.util.XMLUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jdom.Document;
public class RegexProcessor
extends HtmlContentProcessor {
public static final Log LOG = LogFactory.getLog(RegexProcessor.class);
private Map<String, List<String>> regexes = new LinkedHashMap<String, List<String>>();
@Override
public boolean processContent(Document document) {
if (regexes != null && regexes.size() > 0) {
Document tempDoc = new Document();
tempDoc.addContent(document.cloneContent());
for (Entry<String, List<String>> entry : regexes.entrySet()) {
String name = entry.getKey();
List<String> regexValues = entry.getValue();
for (String regex : regexValues) {
try {
Pattern curPattern = Pattern.compile(regex);
String cleanedHtml = XMLUtils.toXml(tempDoc);
Matcher curMatcher = curPattern.matcher(cleanedHtml);
Set<String> uniqueMatches = new LinkedHashSet<String>();
while (curMatcher.find()) {
for (int i = 0; i <= curMatcher.groupCount(); i++) {
uniqueMatches.add(curMatcher.group(i));
}
}
List<String> matches = new ArrayList<String>();
if (uniqueMatches != null && uniqueMatches.size() > 0) {
matches.addAll(uniqueMatches);
addExtractedValue(name + ".matches", matches);
}
}
catch (Exception e) {
// continue to the next regex
}
}
}
}
return true;
}
public Map<String, List<String>> getRegexes() {
return regexes;
}
public void setRegexes(Map<String, List<String>> regexes) {
this.regexes = regexes;
}
}