package org.archive.wayback.resourceindex.cdxserver; import org.archive.cdxserver.auth.AuthToken; import org.archive.cdxserver.filter.CDXAccessFilter; import org.archive.cdxserver.filter.CDXFilter; import org.archive.format.cdx.CDXLine; import org.archive.util.io.RuntimeIOException; import org.archive.wayback.accesscontrol.oracleclient.CustomPolicyOracleFilter; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.FastCaptureSearchResult; import org.archive.wayback.exception.AdministrativeAccessControlException; import org.archive.wayback.exception.RobotAccessControlException; import org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.util.url.UrlOperations; /** * Standard {@link CDXAccessFilter} implementation useful for most cases. * <p> * 2014-11-06: Disabled per-{@code urlkey} caching of <i>include</i> result ( * {@code lastKey} and {@code cachedValue}). It assumes exclusion is per-URL * basis, which is not necessarily true (access control oracle allows for * excluding captures by date-range, for example). Such optimization should be * done in ExclusionFilter. * </p> * <p> * 2014-11-06: The second CDXFilter (@code cdxFilter2) is soon to be dropped. If * more than one {@code CDXFilter}s are needed, bundle them up in one composite * CDXFilter. * </p> */ public class AccessCheckFilter implements CDXAccessFilter { protected ExclusionFilter adminFilter; protected ExclusionFilter robotsFilter; protected CDXFilter cdxFilter; // being dropped protected CDXFilter cdxFilter2; // protected CaptureSearchResult resultTester; protected AuthToken authToken; protected String lastKey; protected boolean cachedValue = false; /** * Initialize with {@code AuthToken}, two {@code ExclusionFilter}s and just one {@link CDXFilter}. * @param token provides user privilege information * @param adminFilter administrative exclusion filter * @param robotsFilter robots exclusion filter * @param cdxFilter CDX filter for narrowing down the visible archive space. */ public AccessCheckFilter(AuthToken token, ExclusionFilter adminFilter, ExclusionFilter robotsFilter, CDXFilter cdxFilter) { this(token, adminFilter, robotsFilter, cdxFilter, null); } /** * Initializes with {@code AuthToken}, two {@code ExclusionFilter}s and two {@code CDXFilter}s. * Both {@code cdxFilter} and {@code cdxFilter2} must pass for capture to be included (i.e. they are AND). * @param token provides user privilege information * @param adminFilter administrative exclusion filter * @param robotsFilter robots exclusion filter * @param cdxFilter CDX filter for narrowing down the visible archive space. * @param cdxFilter2 Second CDX filter. * @obsolete 2014-11-06 Use one CDXFilter version. */ public AccessCheckFilter(AuthToken token, ExclusionFilter adminFilter, ExclusionFilter robotsFilter, CDXFilter cdxFilter, CDXFilter cdxFilter2) { this.authToken = token; this.adminFilter = adminFilter; this.robotsFilter = robotsFilter; this.cdxFilter = cdxFilter; this.cdxFilter2 = cdxFilter2; // this.resultTester = new FastCaptureSearchResult(); } public boolean include(CaptureSearchResult resultTester, boolean throwOnFail) { int status = ExclusionFilter.FILTER_INCLUDE; // Admin Excludes if (adminFilter != null) { status = adminFilter.filterObject(resultTester); } if (status != ExclusionFilter.FILTER_INCLUDE) { if (throwOnFail) { throw new RuntimeIOException(403, new AdministrativeAccessControlException( resultTester.getOriginalUrl() + " is not available in the Wayback Machine.")); } else { // lastKey = resultTester.getUrlKey(); // return cachedValue; return false; } } // Robot Excludes if (robotsFilter != null && !authToken.isIgnoreRobots()) { status = robotsFilter.filterObject(resultTester); } if (status != ExclusionFilter.FILTER_INCLUDE) { if (throwOnFail) { throw new RuntimeIOException(403, new RobotAccessControlException( resultTester.getOriginalUrl() + " is blocked by the sites robots.txt file")); } else { // lastKey = resultTester.getUrlKey(); // return cachedValue; return false; } } // lastKey = resultTester.getUrlKey(); // cachedValue = true; // // return cachedValue; return true; } // public boolean include(String urlKey, String originalUrl, // boolean throwOnFail) { // // if (lastKey != null && lastKey.equals(urlKey)) { // return cachedValue; // } // // cachedValue = false; // // if (UrlOperations.urlToScheme(originalUrl) == null) { // originalUrl = UrlOperations.HTTP_SCHEME + originalUrl; // } // // resultTester.setUrlKey(urlKey); // resultTester.setOriginalUrl(originalUrl); // // return include(resultTester, throwOnFail); // } @Override public boolean includeUrl(String urlKey, String originalUrl) { // return include(urlKey, originalUrl, true); if (UrlOperations.urlToScheme(originalUrl) == null) { originalUrl = UrlOperations.HTTP_SCHEME + originalUrl; } CaptureSearchResult resultTester = new FastCaptureSearchResult(); resultTester.setUrlKey(urlKey); resultTester.setOriginalUrl(originalUrl); // null captureTimestamp signifies per-URL access-check. resultTester.setCaptureTimestamp(null); return include(resultTester, true); } /** * Adapts CDXLine to CaptureSearchResult interface. Fetches * {@code originalUrl}, {@code captureTimestamp} and {@code robotFlags} from * the {@code CDXLine} adopted (minimum required for known existing filter * implementations). It also have {@code setRobotFlag} call modify * {@code robotflags} field in the underlining {@code CDXLine} (necessary * for soft-block feature). * <p> * TODO: Unfortunately this is not as lightweight as it should have been. * Only if CaptureSearchResult was an interface. * </p> * <p> * Caveat: it only overrides those methods used by known existing filter * implementations. * </p> * @see CustomPolicyOracleFilter */ protected static class CDXSearchResult extends FastCaptureSearchResult { final CDXLine cdxLine; public CDXSearchResult(CDXLine cdxLine) { this.cdxLine = cdxLine; } @Override public String getUrlKey() { return cdxLine.getUrlKey(); } @Override public final String getOriginalUrl() { return cdxLine.getOriginalUrl(); } // CustomPolicyOracleFilter calls getCaptureDate(), // which is implemented by CaptureSearchResult on top // of getCaptureTimestamp() @Override public final String getCaptureTimestamp() { return cdxLine.getTimestamp(); } @Override public final void setRobotFlag(char flag) { String robotFlags = cdxLine.getRobotFlags(); if (robotFlags == null || robotFlags.equals("-")) { setRobotFlags(Character.toString(flag)); } else { if (robotFlags.indexOf(flag) == -1) { setRobotFlags(robotFlags + flag); } } } @Override public final void setRobotFlags(String robotFlags) { // CDXLine does not have setter for robotFlags, // but setField method is available. cdxLine.setField(CDXLine.robotflags, robotFlags); } } @Override public boolean includeCapture(CDXLine line) { // if (!include(line.getUrlKey(), line.getOriginalUrl(), false)) { // return false; // } CDXSearchResult searchResult = new CDXSearchResult(line); if (!include(searchResult, false)) return false; // TODO: cdxFilter should be applied *before* exclusion filter. if (cdxFilter != null && !cdxFilter.include(line)) return false; if (cdxFilter2 != null && !cdxFilter2.include(line)) return false; return true; } }