/* This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.accesscontrol.oracleclient; import java.util.Date; import org.archive.accesscontrol.AccessControlClient; import org.archive.accesscontrol.RobotsUnavailableException; import org.archive.accesscontrol.RuleOracleUnavailableException; import org.archive.util.ArchiveUtils; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.resourceindex.filters.ExclusionFilter; /** * @author brad * */ public class OracleExclusionFilter extends ExclusionFilter { protected AccessControlClient client = null; protected String accessGroup = null; private final static String POLICY_ALLOW = "allow"; private final static String POLICY_BLOCK = "block"; private final static String POLICY_ROBOT = "robots"; private boolean notifiedRobotSeen = false; // private boolean notifiedRobotPassed = false; private boolean notifiedAdminSeen = false; private boolean notifiedAdminPassed = false; /** * @param oracleUrl String URL prefix for the Oracle HTTP server * @param accessGroup String group to use with requests to the Oracle */ public OracleExclusionFilter(String oracleUrl, String accessGroup) { this(oracleUrl,accessGroup,null); } /** * @param oracleUrl String URL prefix for the Oracle HTTP server * @param accessGroup String group to use with requests to the Oracle * @param proxyHostPort String proxyHost:proxyPort to use for robots.txt */ public OracleExclusionFilter(String oracleUrl, String accessGroup, String proxyHostPort) { client = new AccessControlClient(oracleUrl); if(proxyHostPort != null) { int colonIdx = proxyHostPort.indexOf(':'); if(colonIdx > 0) { String host = proxyHostPort.substring(0,colonIdx); int port = Integer.valueOf(proxyHostPort.substring(colonIdx+1)); client.setRobotProxy(host, port); } } this.accessGroup = accessGroup; } protected int handleAllow() { if(!notifiedAdminSeen) { notifiedAdminSeen = true; if(filterGroup != null) { filterGroup.setSawAdministrative(); } } if(!notifiedAdminPassed) { notifiedAdminPassed = true; if(filterGroup != null) { filterGroup.setPassedAdministrative(); } } return FILTER_INCLUDE; } protected int handleBlock() { if(!notifiedAdminSeen) { notifiedAdminSeen = true; if(filterGroup != null) { filterGroup.setSawAdministrative(); } } return FILTER_EXCLUDE; } protected int handleRobots() { if(!notifiedRobotSeen) { notifiedRobotSeen = true; if(filterGroup != null) { filterGroup.setSawRobots(); } } return FILTER_INCLUDE; // if(robotFilter != null) { // if(!notifiedRobotPassed) { // notifiedRobotPassed = true; // if(filterGroup != null) { // filterGroup.setPassedRobot(); // } // } // return robotFilter.filterObject(o); // } else { // return FILTER_EXCLUDE; // } } public int filterObject(CaptureSearchResult o) { String url = o.getOriginalUrl(); Date captureDate = o.getCaptureDate(); Date retrievalDate = new Date(); String policy; try { policy = client.getPolicy(ArchiveUtils.addImpliedHttpIfNecessary(url), captureDate, retrievalDate, accessGroup); if(policy != null) { if(policy.equals(POLICY_ALLOW)) { return handleAllow(); } else if(policy.equals(POLICY_BLOCK)) { return handleBlock(); } else if(policy.equals(POLICY_ROBOT)) { return handleRobots(); } } } catch (RobotsUnavailableException e) { e.printStackTrace(); } catch (RuleOracleUnavailableException e) { e.printStackTrace(); } return FILTER_EXCLUDE; } }