package org.archive.cdxserver.filter;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.format.cdx.CDXLine;
import org.archive.util.GeneralURIStreamFactory;
import org.archive.util.binsearch.SeekableLineReaderFactory;
import org.archive.util.binsearch.SeekableLineReaderIterator;

/**
 * {@link CDXFilter} that decides whether to keep a {@link CDXLine} by looking
 * at its filename.
 *
 * <p>{@link #include(CDXLine)} applies up to three checks, in this order:
 * <ol>
 * <li>{@code containsMatch}: when set, a filename that does not contain this
 * substring is treated as "not matched" and no further check is made;</li>
 * <li>{@code prefixList}: a filename starting with any listed prefix is
 * treated as "matched";</li>
 * <li>{@code patterns} + {@code paramSet}: a filename is "matched" when a
 * pattern is found in it and capture group {@code paramIndex} is one of the
 * values loaded by {@link #loadParamFile()}.</li>
 * </ol>
 * In exclusion mode (the default) matched lines are rejected and all others
 * kept; otherwise only matched lines are kept.
 */
public class FilenamePrefixFilter implements CDXFilter {

    private static final Logger LOGGER = Logger
            .getLogger(FilenamePrefixFilter.class.getName());

    /** Location handed to {@link GeneralURIStreamFactory} by {@link #loadParamFile()}. */
    protected String paramFile;

    /** Capture group of a matching pattern whose text is looked up in {@code paramSet}. */
    protected int paramIndex = 1;

    /** {@code true}: matched lines are rejected; {@code false}: only matched lines are kept. */
    protected boolean isExclusion = true;

    /** Parameter-file lines are cut at the first occurrence of this character. */
    protected char delim = '\t';

    /** Optional substring a filename must contain before any other check is applied. */
    protected String containsMatch;

    /** Values read by {@link #loadParamFile()}; {@code null} until it has completed. */
    protected Set<String> paramSet = null;

    /** Compiled patterns searched for in the filename; {@code null} when not configured. */
    protected Pattern[] patterns = null;

    /** Filename prefixes; {@code null} when not configured. */
    protected List<String> prefixList = null;

    /**
     * Default constructor. Call setters to configure.
     */
    public FilenamePrefixFilter() {
    }

    /**
     * Initialize with essential configuration parameters.
     *
     * @param prefixList List of filename prefixes
     * @param exclusion disposition: {@code true} for exclusion
     */
    public FilenamePrefixFilter(List<String> prefixList, boolean exclusion) {
        this.prefixList = prefixList;
        this.isExclusion = exclusion;
    }

    /**
     * Reads {@code paramFile} and stores the first word of each line in
     * {@code paramSet}.
     *
     * <p>Lines are trimmed; empty lines and lines starting with {@code #} are
     * skipped. This method should be set as the init-method in the spring
     * config ({@code init-method="loadParamFile"}) when using this filter.
     *
     * <p>{@code paramSet} is replaced only after the whole file has been read;
     * if reading fails, its previous value is left untouched.
     *
     * @throws IOException if the file cannot be opened, read or closed
     */
    public void loadParamFile() throws IOException {
        SeekableLineReaderFactory fact = null;
        SeekableLineReaderIterator iter = null;
        // Collect into a local set so a failed or in-progress load never
        // exposes a partially filled paramSet.
        Set<String> loaded = new HashSet<String>();

        try {
            fact = GeneralURIStreamFactory.createSeekableStreamFactory(
                    paramFile, false);
            iter = new SeekableLineReaderIterator(fact.get());

            while (iter.hasNext()) {
                String param = iter.next().trim();

                if (param.isEmpty() || param.startsWith("#")) {
                    continue;
                }

                // Use only the first word, ignore the rest
                int wordEnd = param.indexOf(delim);
                if (wordEnd > 0) {
                    param = param.substring(0, wordEnd);
                }

                loaded.add(param);
            }
        } finally {
            // Nested so the factory is still closed when closing the
            // iterator throws.
            try {
                if (iter != null) {
                    iter.close();
                }
            } finally {
                if (fact != null) {
                    fact.close();
                }
            }
        }

        paramSet = loaded;
    }

    /**
     * Decides whether {@code line} passes this filter, based on its filename.
     *
     * @param line CDX line to test
     * @return {@code true} to keep the line, {@code false} to drop it
     * @throws IllegalStateException if a pattern matches the filename but no
     *         parameter set has been loaded via {@link #loadParamFile()}
     */
    public boolean include(CDXLine line) {
        final String file = line.getFilename();

        // Gate: a filename lacking the required substring counts as
        // "not matched" and skips the remaining checks.
        if (containsMatch != null && !file.contains(containsMatch)) {
            return isExclusion;
        }

        if (matchesPrefix(file)) {
            return !isExclusion;
        }

        // Exclusion mode rejects matches; inclusion mode keeps only matches.
        return matchesPattern(file) != isExclusion;
    }

    /** Returns whether {@code file} starts with any configured prefix. */
    private boolean matchesPrefix(String file) {
        if (prefixList == null) {
            return false;
        }
        for (String prefix : prefixList) {
            if (file.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns whether any configured pattern is found in {@code file} with its
     * {@code paramIndex} group present in {@code paramSet}.
     */
    private boolean matchesPattern(String file) {
        if (patterns == null) {
            return false;
        }
        for (Pattern pattern : patterns) {
            Matcher matcher = pattern.matcher(file);
            if (!matcher.find()) {
                continue;
            }
            if (paramSet == null) {
                // Fail loudly rather than with a bare NullPointerException;
                // quietly treating this as "no match" would disable an
                // exclusion filter.
                throw new IllegalStateException(
                        "patterns are configured but no parameter set is loaded;"
                                + " call loadParamFile() first (paramFile="
                                + paramFile + ")");
            }
            String param = matcher.group(paramIndex);
            if (paramSet.contains(param)) {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("Excluding (w)arc: " + file);
                }
                return true;
            }
        }
        return false;
    }

    // Getters/Setters

    public String getParamFile() {
        return paramFile;
    }

    public void setParamFile(String paramFile) {
        this.paramFile = paramFile;
    }

    public int getParamIndex() {
        return paramIndex;
    }

    public void setParamIndex(int paramIndex) {
        this.paramIndex = paramIndex;
    }

    public boolean isExclusion() {
        return isExclusion;
    }

    public void setExclusion(boolean isExclusion) {
        this.isExclusion = isExclusion;
    }

    public char getDelim() {
        return delim;
    }

    public void setDelim(char delim) {
        this.delim = delim;
    }

    public String getContainsMatch() {
        return containsMatch;
    }

    public void setContainsMatch(String containsMatch) {
        this.containsMatch = containsMatch;
    }

    /**
     * Returns the source strings of the configured patterns.
     *
     * @return a new list of pattern strings; empty when no patterns are set
     */
    public List<String> getPatterns() {
        if (patterns == null) {
            return new ArrayList<String>();
        }
        ArrayList<String> s = new ArrayList<String>(patterns.length);
        for (Pattern p : patterns) {
            s.add(p.pattern());
        }
        return s;
    }

    /**
     * Compiles and stores the given regular expressions.
     *
     * @param patternStrings regular expressions to compile; {@code null}
     *        clears any configured patterns
     */
    public void setPatterns(List<String> patternStrings) {
        if (patternStrings == null) {
            patterns = null;
            return;
        }
        int size = patternStrings.size();
        Pattern[] compiled = new Pattern[size];
        for (int i = 0; i < size; i++) {
            compiled[i] = Pattern.compile(patternStrings.get(i));
        }
        // Assign only after every pattern compiled, so a bad regex leaves
        // the previous configuration intact.
        patterns = compiled;
    }

    public List<String> getPrefixList() {
        return prefixList;
    }

    public void setPrefixList(List<String> prefixList) {
        this.prefixList = prefixList;
    }
}