package org.archive.wayback.resourceindex.cdxserver; import java.util.HashMap; import java.util.LinkedList; import org.apache.commons.lang.math.NumberUtils; import org.archive.cdxserver.CDXQuery; import org.archive.cdxserver.CDXServer; import org.archive.cdxserver.auth.AuthChecker; import org.archive.format.cdx.CDXLine; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.FastCaptureSearchResult; import org.archive.wayback.resourceindex.LocalResourceIndex; import org.archive.wayback.resourceindex.filterfactory.ExclusionCaptureFilterGroup; import org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.resourceindex.filters.SelfRedirectFilter; import org.archive.wayback.util.ObjectFilter; import org.archive.wayback.util.Timestamp; import org.archive.wayback.util.url.UrlOperations; import org.archive.wayback.webapp.AccessPoint; /** * {@link CDXToSearchResultWriter} for producing {@link CaptureSearchResults}. * <p>Also resolves revisits and sets closest.</p> */ public class CDXToCaptureSearchResultsWriter extends CDXToSearchResultWriter { public final static String REVISIT_VALUE = "warc/revisit"; protected CaptureSearchResults results = null; protected String targetTimestamp; protected int flip = 1; protected boolean done = false; protected CaptureSearchResult closest = null; protected SelfRedirectFilter selfRedirFilter = null; protected ExclusionFilter exclusionFilter = null; protected CaptureSearchResult prevResult = null; protected CDXLine prevLine = null; protected HashMap<String, CaptureSearchResult> digestToOriginal; protected HashMap<String, LinkedList<CaptureSearchResult>> digestToRevisits; protected boolean resolveRevisits = false; protected boolean seekSingleCapture = false; protected boolean isReverse = false; protected String preferContains = null; // tentative protected boolean includeBlockedCaptures = false; /** * Initialize with CDXQuery and other options. * <p> * This class generates {@link CaptureSearchResult} in chronological * order, even when {@link CDXQuery#isReverse()} is {@code true}. * </p> * <p> * Note: {@code preferContains} parameter is specifically intended for * choosing one out of two copies of the identical capture record in different * storage locations. For example, If WARCs in staging area are made available * for replay through secondary index, there may be a period where one capture * is indexed in both main and secondary index, with different {@code filename} * field. If {@code preferContains} is set, CDX line that has {@code preferContains} * as substring in {@code filename} will be picked over others that does not. * It can be used, for example, to put higher preference on the archive in primary * storage area. * </p> * @param query CDXQuery * @param resolveRevisits Whether to resolve revisit captures * @param seekSingleCapture Whether just one capture is wanted. * (Only effective when {@code resolveRevisits} is also {@code true}.) * @param preferContains Preferred archive filename substring. If * non-{@code null}, It picks capture in the archive with a given substring * in its filename, out of multiple captures of the same timestamp, original * URL, length and offset (if any). */ public CDXToCaptureSearchResultsWriter(CDXQuery query, boolean resolveRevisits, boolean seekSingleCapture, String preferContains) { super(query); this.resolveRevisits = resolveRevisits; this.seekSingleCapture = seekSingleCapture; this.isReverse = query.isReverse(); this.preferContains = preferContains; } public void setTargetTimestamp(String timestamp) { targetTimestamp = timestamp; if (isReverse) { flip = -1; } } @Override public void begin() { results = new CaptureSearchResults(); if (resolveRevisits) { if (isReverse) { digestToRevisits = new HashMap<String, LinkedList<CaptureSearchResult>>(); } else { digestToOriginal = new HashMap<String, CaptureSearchResult>(); } } } @Override public int writeLine(CDXLine line) { String timestamp = line.getTimestamp(); String originalUrl = line.getOriginalUrl(); if ((prevResult != null) && (preferContains != null) && prevResult.getCaptureTimestamp().equals(timestamp) && prevResult.getOriginalUrl().equals(originalUrl) && prevLine.getLength().equals(line.getLength()) && prevLine.getOffset().equals(line.getOffset())) { String currFile = line.getFilename(); String prevFile = prevLine.getFilename(); if (currFile.contains(preferContains) && !prevFile.contains(preferContains)) { prevResult.setFile(currFile); } return 0; } FastCaptureSearchResult result = new FastCaptureSearchResult(); result.setUrlKey(line.getUrlKey()); result.setCaptureTimestamp(timestamp); result.setOriginalUrl(originalUrl); // Special case: filter out captures that have userinfo boolean hasUserInfo = (UrlOperations.urlToUserInfo(result .getOriginalUrl()) != null); if (hasUserInfo) { return 0; } result.setRedirectUrl(line.getRedirect()); result.setHttpCode(line.getStatusCode()); if (selfRedirFilter != null && !result.getRedirectUrl().equals(CDXLine.EMPTY_VALUE)) { if (selfRedirFilter.filterObject(result) != ObjectFilter.FILTER_INCLUDE) { return 0; } } // make these fields available to exclusionFilter. it may also modify some fields // (typically robotflags field). result.setMimeType(line.getMimeType()); result.setDigest(line.getDigest()); result.setFile(line.getFilename()); // ugly - move this check to FastCaptureSearchResult#setRobotFlags if (!"-".equals(line.getRobotFlags())) result.setRobotFlags(line.getRobotFlags()); if (exclusionFilter != null) { if (exclusionFilter.filterObject(result) != ObjectFilter.FILTER_INCLUDE) { return 0; } } result.setOffset(NumberUtils.toLong(line.getOffset(), -1)); result.setCompressedLength(NumberUtils.toLong(line.getLength(), -1)); boolean isRevisit = false; if (resolveRevisits) { isRevisit = result.getFile().equals(CDXLine.EMPTY_VALUE) || result.getMimeType().equals(REVISIT_VALUE); String digest = result.getDigest(); if (isRevisit) { if (!isReverse) { CaptureSearchResult payload = digestToOriginal.get(digest); if (payload != null) { result.flagDuplicateDigest(payload); } else { result.flagDuplicateDigest(); } } else { LinkedList<CaptureSearchResult> revisits = digestToRevisits .get(digest); if (revisits == null) { revisits = new LinkedList<CaptureSearchResult>(); digestToRevisits.put(digest, revisits); } revisits.add(result); } } else { if (!isReverse) { digestToOriginal.put(digest, result); } else { LinkedList<CaptureSearchResult> revisits = digestToRevisits .remove(digest); if (revisits != null) { for (CaptureSearchResult revisit : revisits) { revisit.flagDuplicateDigest(result); } } } } } // String payloadFile = line.getField(RevisitResolver.origfilename); // // if (!payloadFile.equals(CDXLine.EMPTY_VALUE)) { // FastCaptureSearchResult payload = new FastCaptureSearchResult(); // payload.setFile(payloadFile); // payload.setOffset(NumberUtils.toLong(line.getField(RevisitResolver.origoffset), -1)); // payload.setCompressedLength(NumberUtils.toLong(line.getField(RevisitResolver.origlength), -1)); // result.flagDuplicateDigest(payload); // } // Drop soft-blocked captures after resolving revisits. They are excluded // from regular replay, but available as the original of revisits. // It is disabled when AccessPoint is looking up the original for a // URL-agnostic revisit (indicated by includeBlockedCaptures flag). if (!includeBlockedCaptures && result.isRobotFlagSet(CaptureSearchResult.CAPTURE_ROBOT_BLOCKED)) { return 0; } if ((targetTimestamp != null) && (closest == null)) { closest = determineClosest(result); } results.addSearchResult(result, !isReverse); prevResult = result; prevLine = line; // Short circuit the load if seeking single capture if (seekSingleCapture && resolveRevisits) { if (closest != null) { // If not a revisit, we're done if (!isRevisit) { done = true; // Else make sure the revisit is resolved } else if (result.getDuplicatePayload() != null) { done = true; } } } return 1; } @Override public boolean isAborted() { return done; } protected CaptureSearchResult determineClosest( CaptureSearchResult nextResult) { int compare = targetTimestamp.compareTo(nextResult .getCaptureTimestamp()) * flip; if (compare == 0) { return nextResult; } else if (compare > 0) { // Too early to tell return null; } // First result that is greater/less than target if (results.isEmpty()) { return nextResult; } CaptureSearchResult lastResult = getLastAdded(); // Now compare date diff long nextTime = nextResult.getCaptureDate().getTime(); long lastTime = lastResult.getCaptureDate().getTime(); long targetTime = Timestamp.parseAfter(targetTimestamp).getDate() .getTime(); if (Math.abs(nextTime - targetTime) < Math.abs(lastTime - targetTime)) { return nextResult; } else { return lastResult; } } public void end() { results.setClosest(this.getClosest()); results.setReturnedCount(results.getResults().size()); results.setMatchingCount(results.getResults().size()); } public CaptureSearchResult getClosest() { if (closest != null) { return closest; } if (!results.isEmpty()) { // If no target timestamp, always return the latest capture, // otherwise first or last based on reverse state if (targetTimestamp != null) { return getLastAdded(); } else { return results.getResults().getLast(); } } return null; } protected CaptureSearchResult getLastAdded() { if (!isReverse) { return results.getResults().getLast(); } else { return results.getResults().getFirst(); } } @Override public CaptureSearchResults getSearchResults() { return results; } public SelfRedirectFilter getSelfRedirFilter() { return selfRedirFilter; } public void setSelfRedirFilter(SelfRedirectFilter selfRedirFilter) { this.selfRedirFilter = selfRedirFilter; } @Deprecated public ExclusionFilter getExclusionFilter() { return exclusionFilter; } /** * If non-{@code null}, the filter will be applied before revisit * resolution. * <p>Note: there is no class using this property in baseline Wayback. * You need to write a custom class to utilize this property. * See {@link CDXServer} and {@link LocalResourceIndex} * for other ways of configuring exclusion filters. * </p> * <p> * This method is deprecated because this can run exclusion after * timestamp deduplication, which results in undesirable capture * search results. Exclusion should happen in regular CDXServer * pipeline. This method was necessary to implement collection sensitive * exclusion filter. New exclusion filter factory addresses such needs * in ordinary CDX filtering pipeline. * </p> * @param exclusionFilter * @see CDXServer * @see LocalResourceIndex * @see AuthChecker#createAccessFilter(org.archive.cdxserver.auth.AuthToken) * @see ExclusionCaptureFilterGroup#ExclusionCaptureFilterGroup(org.archive.wayback.core.WaybackRequest, org.archive.wayback.UrlCanonicalizer) * @deprecated 2014-11-10 Use new implementation {@link AccessPoint#setExclusionFactory(org.archive.wayback.accesscontrol.ExclusionFilterFactory)} */ public void setExclusionFilter(ExclusionFilter exclusionFilter) { this.exclusionFilter = exclusionFilter; } public boolean isIncludeBlockedCaptures() { return includeBlockedCaptures; } /** * set to {@code true} if blocked captures are to be included * in the result. * <p>This is a tentative property and specifically intended for * looking up revisit original for URL-agnostic revisits. May change * in the future.</p> * @param includeBlockedCaptures */ public void setIncludeBlockedCaptures(boolean includeBlockedCaptures) { this.includeBlockedCaptures = includeBlockedCaptures; } }