package org.archive.cdxserver.processor;
import org.apache.commons.lang.math.NumberUtils;
import org.archive.cdxserver.CDXServer;
import org.archive.format.cdx.CDXLine;
/**
* Performs <i>timestamp-based collapsing</i>, that is to group
* CDX lines by {@code timestamp} prefix and filter out all but just
* one CDX line per group.
* <p>Timestamp prefix is specified in terms of the number of digits
* (from left). If two CDX lines have {@code timestamp}s whose prefixes are
* identical, they are considered to be in the same group.</p>
* <p>It picks the first CDX line with the best (i.e. smallest)
* {@code statuscode} field within each group.</p>
* <p>CDX lines with {@code filename} that starts with any of prefixes
* specified in {@code noCollapsePrefix} are written out regardless of its
* {@code timestamp} or {@code statuscode}, in addition to the one picked for
* the group.</p>
* <p>Instantiated by {@link CDXServer} as part of CDX line processing pipeline.</p>
*/
public class DupeTimestampBestStatusFilter extends WrappedProcessor {
final static int WORST_HTTP_CODE = 9999;
protected String lastTimestamp;
protected int bestHttpCode = WORST_HTTP_CODE;
protected int timestampDedupLength;
protected String[] noCollapsePrefix;
public DupeTimestampBestStatusFilter(BaseProcessor output,
int timestampDedupLength, String[] noCollapsePrefix) {
super(output);
this.timestampDedupLength = timestampDedupLength;
this.noCollapsePrefix = noCollapsePrefix;
}
/**
* Return {@code true} if {@code line} is to be passed through,
* as specified by {@code noCollapsePrefix}.
* <p>Soft-blocked captures are also passed-through.</p>
* @param line CDX line
* @return boolean
*/
protected final boolean passThrough(CDXLine line) {
return isBlocked(line) || noCollapse(line);
}
protected final boolean isBlocked(CDXLine line) {
String robotflags = line.getRobotFlags();
// TODO: give 'X' a constant symbol - CaptureSearchResult.CAPTURE_ROBOT_BLOCKED
// is exactly that, but wayback-cdx-server cannot use it.
return robotflags != null && robotflags.indexOf('X') >= 0;
}
protected final boolean noCollapse(CDXLine line) {
if (noCollapsePrefix != null) {
for (String prefix : noCollapsePrefix) {
if (line.getFilename().startsWith(prefix)) {
return true;
}
}
}
return false;
}
@Override
public int writeLine(CDXLine line) {
if (include(line)) {
return super.writeLine(line);
} else {
return 0;
}
}
protected boolean include(CDXLine line) {
if (timestampDedupLength <= 0) {
return true;
}
// If starts with special no collapse prefix, then always include
if (passThrough(line)) return true;
String timestamp = line.getTimestamp();
timestamp = timestamp.substring(0, Math.min(timestampDedupLength, timestamp.length()));
int httpCode = NumberUtils.toInt(line.getStatusCode(), WORST_HTTP_CODE);
boolean isDupe = false;
if ((lastTimestamp != null) && timestamp.equals(lastTimestamp)) {
if (httpCode < bestHttpCode) {
bestHttpCode = httpCode;
} else {
isDupe = true;
}
} else {
bestHttpCode = httpCode;
}
lastTimestamp = timestamp;
return !isDupe;
}
}