package org.archive.cdxserver.filter; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; import org.archive.format.cdx.CDXLine; import org.archive.format.cdx.FieldSplitFormat; import org.archive.format.cdx.FieldSplitLine; /** * Matches a FieldSplitLine against a string of regex * Supports matching against individual fields if specified * eg: * * ~<containsstr> = look for containing sting <containsstr> and not a regex * * <regex> = match whole line * <field>:<regex> = match <field> in FieldSplitLine, by name or number, and match only that field * * Supports !<regex> for not matching * * @author ilya * */ public class FieldRegexFilter implements CDXFilter { final static String INVERT_CHAR = "!"; final static String CONTAINS_CHAR = "~"; final static String FIELD_SEP_CHAR = ":"; final protected FieldSplitFormat names; final protected List<RegexMatch> regexMatchers; class RegexMatch { final Pattern regex; final boolean inverted; final String containsStr; final int fieldIndex; RegexMatch(String str) { boolean contains = false; if (str.startsWith(CONTAINS_CHAR)) { str = str.substring(1); contains = true; } if (str.startsWith(INVERT_CHAR)) { str = str.substring(1); inverted = true; } else { inverted = false; } int sepIndex = str.indexOf(FIELD_SEP_CHAR); // Match entire line if (sepIndex < 0) { fieldIndex = -1; if (contains) { containsStr = str; regex = null; } else { containsStr = null; regex = Pattern.compile(str); } return; } String field = str.substring(0, sepIndex); String pattern = str.substring(sepIndex + 1); int index = -1; // First try parsing as int try { index = Integer.parseInt(field); } catch (NumberFormatException n) { } // Then try names if available if ((index < 0) && (names != null)) { index = names.getFieldIndex(field); } fieldIndex = index; if (contains) { containsStr = pattern; regex = null; } else { containsStr = null; regex = Pattern.compile(pattern); } } boolean matches(FieldSplitLine line) { boolean matched; if (fieldIndex < 0) { if (containsStr != null) { matched = line.toString().contains(containsStr); } else { matched = regex.matcher(line.toString()).matches(); } } else { if (containsStr != null) { matched = line.getField(fieldIndex).contains(containsStr); } else { matched = regex.matcher(line.getField(fieldIndex)).matches(); } } if (inverted) { matched = !matched; } return matched; } } public FieldRegexFilter(String[] regexs, FieldSplitFormat names) { this.names = names; this.regexMatchers = new ArrayList<RegexMatch>(regexs.length); for (String regex : regexs) { if (!regex.isEmpty()) { regexMatchers.add(new RegexMatch(regex)); } } } public boolean include(CDXLine line) { for (RegexMatch regexMatch : regexMatchers) { if (!regexMatch.matches(line)) { return false; } } return true; } }