package org.archive.cdxserver.processor;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.math.NumberUtils;
import org.archive.format.cdx.CDXLine;

/**
 * A variant of {@link DupeTimestampBestStatusFilter} that emits the
 * <em>last</em> best capture of each collapse group rather than the first.
 * Supporting {@code noCollapsePrefix} makes the processing more involved, so
 * this may be slightly slower than {@link DupeTimestampBestStatusFilter}.
 * <p>The semantics of {@link #writeLine(CDXLine)} differ slightly from other
 * processors: it returns 1 when some (non-pass-through) CDX line is written
 * out, and that line is not necessarily the one passed as the argument.
 * The count of ones would therefore be one less than with other processors.</p>
 * <p>Tests are in {@link DupeTimestampBestStatusFilterTest}.</p>
 */
public class DupeTimestampLastBestStatusFilter extends DupeTimestampBestStatusFilter {

    /**
     * The best CDX line seen so far within the current collapse group,
     * or {@code null} if no collapsible line has been seen yet.
     */
    protected CDXLine bestLine;

    /**
     * CDX lines matching {@code noCollapsePrefix} that cannot be written yet
     * because their {@code timestamp}s are larger than
     * {@code bestLine.timestamp}; they are held back until the best line
     * preceding them has been settled.
     */
    protected List<CDXLine> pendingPassThroughs;

    public DupeTimestampLastBestStatusFilter(BaseProcessor output,
            int timestampDedupLength, String[] noCollapsePrefix) {
        super(output, timestampDedupLength, noCollapsePrefix);
        pendingPassThroughs = new ArrayList<CDXLine>();
    }

    /**
     * Writes every buffered pass-through line to {@code inner}, in the order
     * they were buffered, and then empties the buffer.
     */
    protected final void flushPassThrough() {
        for (CDXLine pending : pendingPassThroughs) {
            // NB: deliberately bypasses super.writeLine()
            inner.writeLine(pending);
        }
        pendingPassThroughs.clear();
    }

    /**
     * Computes the collapse-group key of {@code line}.
     * @param line CDX line
     * @return the first {@code timestampDedupLength} characters of the
     *   line's {@code timestamp} (the whole timestamp if it is shorter)
     */
    protected final String groupKey(CDXLine line) {
        final String ts = line.getTimestamp();
        final int keyLength = Math.min(timestampDedupLength, ts.length());
        return ts.substring(0, keyLength);
    }

    /**
     * Writes the current best line (if there is one) to {@code inner},
     * followed by all buffered pass-through lines.
     * @return what {@code inner.writeLine()} returned for the best line,
     *   or 0 if there was no best line to write
     */
    private int emitBestLine() {
        if (bestLine == null) {
            return 0;
        }
        final int result = inner.writeLine(bestLine);
        flushPassThrough();
        return result;
    }

    @Override
    public int writeLine(CDXLine line) {
        // NB: every write in this method goes straight to inner;
        // super.writeLine() must not be called.
        if (timestampDedupLength <= 0) {
            // collapsing disabled: plain forwarding
            return inner.writeLine(line);
        }

        if (passThrough(line)) {
            if (bestLine == null) {
                // nothing is being held back, so order is preserved
                return inner.writeLine(line);
            }
            // must come out after the (still undecided) best line
            pendingPassThroughs.add(line);
            return 1;
        }

        final String key = groupKey(line);
        final int status = NumberUtils.toInt(line.getStatusCode(), WORST_HTTP_CODE);

        // key is never null and equals(null) is false, so this is also
        // false while no group has been started yet.
        if (key.equals(lastTimestamp)) {
            // same collapse group: "<=" makes a later line with an equally
            // good status replace the earlier one (last best wins)
            if (status <= bestHttpCode) {
                // the previous best is dropped, so lines buffered behind it
                // can go out now; they precede the new best line
                flushPassThrough();
                bestLine = line;
                bestHttpCode = status;
            }
            return 0;
        }

        // a new collapse group starts: emit the outcome of the previous
        // group (there is none for the very first collapsible line)
        final int written = emitBestLine();
        bestLine = line;
        bestHttpCode = status;
        lastTimestamp = key;
        return written;
    }

    @Override
    public void end() {
        // Emit the final collapse group. There is no best line only when
        // writeLine() never saw a collapsible line, or when
        // timestampDedupLength <= 0.
        emitBestLine();
        super.end();
    }
}