/* * This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.resourceindex.filterfactory; import java.util.List; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.SearchResults; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.AccessControlException; import org.archive.wayback.exception.AdministrativeAccessControlException; import org.archive.wayback.exception.ResourceNotInArchiveException; import org.archive.wayback.exception.RobotAccessControlException; import org.archive.wayback.exception.RobotNotAvailableException; import org.archive.wayback.exception.RobotTimedOutAccessControlException; import org.archive.wayback.resourceindex.filters.CounterFilter; import org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.util.ObjectFilter; import org.archive.wayback.util.ObjectFilterChain; public class ExclusionCaptureFilterGroup implements CaptureFilterGroup { private ObjectFilterChain<CaptureSearchResult> chain = null; private CounterFilter preCounter = null; private CounterFilter postCounter = null; String requestUrl = null; private boolean sawRobots = false; private boolean passedRobots = false; private boolean robotTimedOut = false; private boolean liveWebGone = false; private boolean sawAdministrative = false; private boolean passedAdministrative = false; private UrlCanonicalizer canonicalizer = null; public ExclusionCaptureFilterGroup(WaybackRequest request, UrlCanonicalizer canonicalizer) { this.canonicalizer = canonicalizer; // checks an exclusion service for every matching record ExclusionFilter exclusion = request.getExclusionFilter(); chain = new ObjectFilterChain<CaptureSearchResult>(); if(exclusion != null) { exclusion.setFilterGroup(this); // preCounter = new CounterFilter(); // // count how many results got to the ExclusionFilter: // chain.addFilter(preCounter); chain.addFilter(exclusion); // count how many results got past the ExclusionFilter: requestUrl = request.getRequestUrl(); } // postCounter = new CounterFilter(); // chain.addFilter(postCounter); } public UrlCanonicalizer getCaptureFilterGroupCanonicalizer() { return canonicalizer; } public List<ObjectFilter<CaptureSearchResult>> getFilters() { return chain.getFilters(); } public void annotateResults(SearchResults results) throws AccessControlException, ResourceNotInArchiveException, RobotNotAvailableException { if(getRobotTimedOut()) { throw new RobotTimedOutAccessControlException("Unable to check" + " robots.txt for " + requestUrl); } if(getLiveWebGone()) { throw new RobotNotAvailableException("The URL " + requestUrl + " is blocked by the sites robots.txt file"); } if(isSawRobots() && !isPassedRobots()) { throw new RobotAccessControlException("The URL " + requestUrl + " is blocked by the sites robots.txt file"); } if(isSawAdministrative() && !isPassedAdministrative()) { throw new AdministrativeAccessControlException(requestUrl + " is not available in the Wayback Machine."); } } public void setPassedRobots() { passedRobots = true; } public void setSawRobots() { sawRobots = true; } public void setPassedAdministrative(boolean passed) { passedAdministrative = passed; } public void setPassedAdministrative() { passedAdministrative = true; } public void setSawAdministrative() { sawAdministrative = true; } public void setRobotTimedOut() { robotTimedOut = true; } public boolean getRobotTimedOut() { return robotTimedOut; } public void setLiveWebGone() { liveWebGone = true; } public boolean getLiveWebGone() { return liveWebGone; } public boolean isSawRobots() { return sawRobots; } public boolean isPassedRobots() { return passedRobots; } public boolean isSawAdministrative() { return sawAdministrative; } public boolean isPassedAdministrative() { return passedAdministrative; } }