/* * This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.resourceindex.filters; import java.util.HashMap; import java.util.logging.Level; import java.util.logging.Logger; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.ObjectFilter; /** * Filter class that observes a stream of SearchResults tracking for each * complete record, a mapping of that records Digest to: * Arc/Warc Filename * Arc/Warc offset * HTTP Response * MIME-Type * Redirect URL * * If subsequent SearchResults are missing these fields ("-") and the Digest * field is in the map, then the SearchResults missing fields are replaced with * the values from the previously seen record with the same digest, and an * additional annotation field is added. * * @author brad * @version $Date: 2011-11-28 22:03:59 -0800 (Mon, 28 Nov 2011) $, $Revision: 3574 $ */ public class WARCRevisitAnnotationFilter implements ObjectFilter<CaptureSearchResult> { private final static String EMPTY_VALUE = "-"; private final static String REVISIT_VALUE = "warc/revisit"; private static final Logger LOGGER = Logger.getLogger( WARCRevisitAnnotationFilter.class.getName()); private HashMap<String,CaptureSearchResult> memory = null; public WARCRevisitAnnotationFilter() { memory = new HashMap<String,CaptureSearchResult>(); } private int annotate(CaptureSearchResult o) { o.flagDuplicateDigest(); String thisDigest = o.getDigest(); CaptureSearchResult last = memory.get(thisDigest); if (last == null) { if (LOGGER.isLoggable(Level.FINER)) { LOGGER.finer("did not find matching digest in previous fetch of url, hopefully it's a new-style revisit - " + o.getCaptureTimestamp() + " " + o.getOriginalUrl()); } return FILTER_INCLUDE; } o.flagDuplicateDigest(last); return FILTER_INCLUDE; } private int remember(CaptureSearchResult o) { memory.put(o.getDigest(),o); return FILTER_INCLUDE; } // public CaptureSearchResult adapt(CaptureSearchResult o) { // if(o.getFile().equals(EMPTY_VALUE) // || o.getMimeType().equals(REVISIT_VALUE)) { // return annotate(o); // } // return remember(o); // } public int filterObject(CaptureSearchResult o) { if(o.getFile().equals(EMPTY_VALUE) || o.getMimeType().equals(REVISIT_VALUE)) { return annotate(o); } return remember(o); } }