package org.archive.cdxserver.filter;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang.math.NumberUtils;
import org.archive.format.cdx.CDXLine;
import org.archive.format.cdx.FieldSplitFormat;
import org.archive.format.cdx.FieldSplitLine;
/**
* Dedupes a FieldSplitLine by a specific field, or part of a specific field eg:
*
* <field> = if <field> matches previous match, then its a dupe <field>:<n> = if
* first <n> character of <field> match, then its a dupe
*
*
* @author ilya
*
*/
public class CollapseFieldFilter implements CDXFilter {
final static String FIELD_SEP_CHAR = ":";
final protected FieldSplitFormat names;
final protected List<DupeMatch> dupeMatchers;
class DupeMatch {
final int fieldIndex;
final int substrLength;
String prevValue;
DupeMatch(String str) {
try {
str = URLDecoder.decode(str, "UTF-8");
} catch (UnsupportedEncodingException e) {
}
int sepIndex = str.indexOf(FIELD_SEP_CHAR);
String field;
// Match entire field
if (sepIndex < 0) {
field = str;
substrLength = -1;
} else {
field = str.substring(0, sepIndex);
substrLength = NumberUtils.toInt(str.substring(sepIndex + 1));
}
// First try parsing as int
int index = NumberUtils.toInt(field, -1);
// Then try names if available
if ((index < 0) && (names != null)) {
index = names.getFieldIndex(field);
}
fieldIndex = index;
}
boolean isUnique(FieldSplitLine line) {
String currValue = line.getField(fieldIndex);
if ((substrLength > 0) && (substrLength <= currValue.length())) {
currValue = currValue.substring(0, substrLength);
}
boolean unique = false;
if ((prevValue == null) || !currValue.equals(prevValue)) {
unique = true;
prevValue = currValue;
}
return unique;
}
void clear()
{
prevValue = null;
}
}
public CollapseFieldFilter(String[] fields, FieldSplitFormat names) {
this.names = names;
this.dupeMatchers = new ArrayList<DupeMatch>(fields.length);
for (String field : fields) {
if (!field.isEmpty()) {
dupeMatchers.add(new DupeMatch(field));
}
}
}
public boolean include(CDXLine line) {
for (DupeMatch duper : dupeMatchers) {
if (!duper.isUnique(line)) {
return false;
}
}
return true;
// boolean anyUnique = false;
//
// for (DupeMatch duper : dupeMatchers) {
// if (anyUnique) {
// duper.clear();
// } else {
// anyUnique = anyUnique || duper.isUnique(line);
// }
// }
//
// return anyUnique;
}
}