/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.resourceindex;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.httpclient.URIException;
import org.archive.util.io.RuntimeIOException;
import org.archive.util.iterator.CloseableIterator;
import org.archive.wayback.ResourceIndex;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.SearchResult;
import org.archive.wayback.core.SearchResults;
import org.archive.wayback.core.UrlSearchResult;
import org.archive.wayback.core.UrlSearchResults;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.AccessControlException;
import org.archive.wayback.exception.BadQueryException;
import org.archive.wayback.exception.ResourceIndexNotAvailableException;
import org.archive.wayback.exception.ResourceNotInArchiveException;
import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultIterator;
import org.archive.wayback.resourceindex.filterfactory.AccessPointCaptureFilterGroupFactory;
import org.archive.wayback.resourceindex.filterfactory.AnnotatingCaptureFilterGroupFactory;
import org.archive.wayback.resourceindex.filterfactory.CaptureFilterGroup;
import org.archive.wayback.resourceindex.filterfactory.ClosestTrackingCaptureFilterGroupFactory;
import org.archive.wayback.resourceindex.filterfactory.CoreCaptureFilterGroupFactory;
import org.archive.wayback.resourceindex.filterfactory.ExclusionCaptureFilterGroupFactory;
import org.archive.wayback.resourceindex.filterfactory.FilterGroupFactory;
import org.archive.wayback.resourceindex.filterfactory.QueryCaptureFilterGroupFactory;
import org.archive.wayback.resourceindex.filterfactory.WindowFilterGroup;
import org.archive.wayback.util.ObjectFilter;
import org.archive.wayback.util.ObjectFilterChain;
import org.archive.wayback.util.ObjectFilterIterator;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
import org.archive.wayback.webapp.PerfStats;
/**
* ResourceIndex implementation which assumes a "local" SearchResultSource.
*
* Extracting SearchResults from the source involves several layered steps:
*
* 1) extraction of results based on a prefix into the index
* 2) passing each result through a series of adapters
* these adapters can create new fields based on existing fields, or can
* annotate fields as they are scanned in order
* 3) filtering results based on request filters, which may come from
* * WaybackRequest-specific parameters.
* Ex. exact host match only, exact scheme match only, ...
* * AccessPoint-specific configuration
* Ex. only return records with (ARC/WARC) filename prefixed with XXX
* Ex. block any dates not older than 6 months
* 4) filtering based on AccessControl configurations
* Ex. block any urls with prefixes in file X
* 5) windowing filters, which provide pagination of the results, allowing
* requests to specify "show results between 10 and 20"
* 6) post filter adapters, which may annotate final results with other
* information
* Ex. for each result, consult DB to see if user-contributed messages
* apply to the results
*
* After all results have been processed, we annotate the final SearchResultS
* object with summary information about the results included. As we set up the
* chain of filters, we instrument the chain with counters that observe the
* number of results that went into, and came out of the Exclusion filters.
*
* If there were results presented to the Exclusion filter, but none were
* emitted from it, an AccessControlException is thrown.
*
* @author brad
* @version $Date$, $Revision$
*/
public class LocalResourceIndex implements ResourceIndex {
public final static int TYPE_REPLAY = 0;
public final static int TYPE_CAPTURE = 1;
public final static int TYPE_URL = 2;
/**
* maximum number of records to return
*/
private final static int MAX_RECORDS = 1000;
enum PerfStat
{
IndexLoad;
}
private int maxRecords = MAX_RECORDS;
protected SearchResultSource source;
private UrlCanonicalizer canonicalizer = null;
private boolean dedupeRecords = false;
private boolean timestampSearch = false;
private boolean markPrefixQueries = false;
private ObjectFilter<CaptureSearchResult> annotater = null;
private ObjectFilter<CaptureSearchResult> filter = null;
protected List<FilterGroupFactory> fgFactories = null;
public LocalResourceIndex() {
canonicalizer = new AggressiveUrlCanonicalizer();
fgFactories = new ArrayList<FilterGroupFactory>();
fgFactories.add(new AccessPointCaptureFilterGroupFactory());
fgFactories.add(new CoreCaptureFilterGroupFactory());
fgFactories.add(new QueryCaptureFilterGroupFactory());
fgFactories.add(new AnnotatingCaptureFilterGroupFactory());
fgFactories.add(new ExclusionCaptureFilterGroupFactory());
fgFactories.add(new ClosestTrackingCaptureFilterGroupFactory());
}
private void cleanupIterator(CloseableIterator<? extends SearchResult> itr)
throws ResourceIndexNotAvailableException {
try {
itr.close();
} catch (IOException e) {
e.printStackTrace();
throw new ResourceIndexNotAvailableException(
e.getLocalizedMessage());
}
}
protected List<CaptureFilterGroup> getRequestFilterGroups(WaybackRequest r)
throws BadQueryException {
ArrayList<CaptureFilterGroup> groups =
new ArrayList<CaptureFilterGroup>();
for(FilterGroupFactory f : fgFactories) {
groups.add(f.getGroup(r, canonicalizer, this));
}
return groups;
}
public CaptureSearchResults doCaptureQuery(WaybackRequest wbRequest,
int type) throws ResourceIndexNotAvailableException,
ResourceNotInArchiveException, BadQueryException,
AccessControlException {
String urlKey;
try {
urlKey = canonicalizer.urlStringToKey(wbRequest.getRequestUrl());
} catch (IOException e) {
throw new BadQueryException("Bad URL(" +
wbRequest.getRequestUrl() + ")");
}
// Special handling for index where the key is url<space>timestamp
// for faster binary search lookup
if (timestampSearch && wbRequest.isTimestampSearchKey()) {
String replayTimestamp = wbRequest.getReplayTimestamp();
if (replayTimestamp != null) {
urlKey += " " + replayTimestamp;
}
}
// the CaptureSearchResults we are about to return:
CaptureSearchResults results = new CaptureSearchResults();
// the various filters to apply to the results:
ObjectFilterChain<CaptureSearchResult> filters =
new ObjectFilterChain<CaptureSearchResult>();
// Groupings of filters for... sanity and summary annotation of results:
// Windows:
WindowFilterGroup<CaptureSearchResult> window =
new WindowFilterGroup<CaptureSearchResult>(wbRequest,this);
List<CaptureFilterGroup> groups = getRequestFilterGroups(wbRequest);
if(filter != null) {
filters.addFilter(filter);
}
for(CaptureFilterGroup cfg : groups) {
filters.addFilters(cfg.getFilters());
}
filters.addFilters(window.getFilters());
CloseableIterator<CaptureSearchResult> itr = null;
try {
PerfStats.timeStart(PerfStat.IndexLoad);
itr = new ObjectFilterIterator<CaptureSearchResult>(source.getPrefixIterator(urlKey),filters);
while(itr.hasNext()) {
results.addSearchResult(itr.next());
}
} catch(RuntimeIOException e) {
throw new ResourceIndexNotAvailableException(e.getLocalizedMessage());
} finally {
if (itr != null) {
cleanupIterator(itr);
}
PerfStats.timeEnd(PerfStat.IndexLoad);
}
for(CaptureFilterGroup cfg : groups) {
cfg.annotateResults(results);
}
window.annotateResults(results);
return results;
}
public UrlSearchResults doUrlQuery(WaybackRequest wbRequest)
throws ResourceIndexNotAvailableException,
ResourceNotInArchiveException, BadQueryException,
AccessControlException {
String urlKey;
try {
urlKey = canonicalizer.urlStringToKey(wbRequest.getRequestUrl());
} catch (URIException e) {
throw new BadQueryException("Bad URL(" +
wbRequest.getRequestUrl() + ")");
}
if (markPrefixQueries) {
urlKey += "*\t";
}
UrlSearchResults results = new UrlSearchResults();
// the various CAPTURE filters to apply to the results:
ObjectFilterChain<CaptureSearchResult> cFilters =
new ObjectFilterChain<CaptureSearchResult>();
// Groupings of filters for clarity(?) and summary annotation of
// results:
List<CaptureFilterGroup> groups = getRequestFilterGroups(wbRequest);
for(CaptureFilterGroup cfg : groups) {
cFilters.addFilters(cfg.getFilters());
}
if (filter != null) {
cFilters.addFilter(filter);
}
// we've filtered the appropriate CaptureResult objects within the
// iterator, now we're going to convert whatever records make it past
// the filters into UrlSearchResults, and then do further window
// filtering on those results:
// Windows:
// the window URL filters to apply to the results, once they're
// UrlSearchResult objects
ObjectFilterChain<UrlSearchResult> uFilters =
new ObjectFilterChain<UrlSearchResult>();
WindowFilterGroup<UrlSearchResult> window =
new WindowFilterGroup<UrlSearchResult>(wbRequest,this);
uFilters.addFilters(window.getFilters());
CloseableIterator<CaptureSearchResult> itrC = null;
CloseableIterator<UrlSearchResult> itrU = null;
try {
PerfStats.timeStart(PerfStat.IndexLoad);
itrC = new ObjectFilterIterator<CaptureSearchResult>(
source.getPrefixIterator(urlKey),cFilters);
itrU = new ObjectFilterIterator<UrlSearchResult>(
new CaptureToUrlSearchResultIterator(itrC),
uFilters);
while(itrU.hasNext()) {
results.addSearchResult(itrU.next());
}
} finally {
if (itrU != null) {
cleanupIterator(itrU);
}
PerfStats.timeEnd(PerfStat.IndexLoad);
}
for(CaptureFilterGroup cfg : groups) {
cfg.annotateResults(results);
}
window.annotateResults(results);
return results;
}
/*
* (non-Javadoc)
*
* @see org.archive.wayback.ResourceIndex#query(org.archive.wayback.core.WaybackRequest)
*/
public SearchResults query(WaybackRequest wbRequest)
throws ResourceIndexNotAvailableException,
ResourceNotInArchiveException, BadQueryException,
AccessControlException {
SearchResults results = null; // return value placeholder
if (wbRequest.isReplayRequest()) {
results = doCaptureQuery(wbRequest, TYPE_REPLAY);
results.putFilter(WaybackRequest.REQUEST_TYPE,
WaybackRequest.REQUEST_REPLAY_QUERY);
} else if (wbRequest.isCaptureQueryRequest()) {
results = doCaptureQuery(wbRequest, TYPE_CAPTURE);
results.putFilter(WaybackRequest.REQUEST_TYPE,
WaybackRequest.REQUEST_CAPTURE_QUERY);
} else if (wbRequest.isUrlQueryRequest()) {
results = doUrlQuery(wbRequest);
results.putFilter(WaybackRequest.REQUEST_TYPE,
WaybackRequest.REQUEST_URL_QUERY);
} else {
throw new BadQueryException("Unknown query type, must be "
+ WaybackRequest.REQUEST_REPLAY_QUERY
+ ", " + WaybackRequest.REQUEST_CAPTURE_QUERY
+ ", or " + WaybackRequest.REQUEST_URL_QUERY);
}
return results;
}
public void addSearchResults(Iterator<CaptureSearchResult> itr) throws IOException,
UnsupportedOperationException {
if(source instanceof UpdatableSearchResultSource) {
UpdatableSearchResultSource updatable =
(UpdatableSearchResultSource) source;
updatable.addSearchResults(itr,canonicalizer);
} else {
throw new UnsupportedOperationException("Underlying " +
"SearchResultSource is not Updatable.");
}
}
public boolean isUpdatable() {
return (source instanceof UpdatableSearchResultSource);
}
/**
* @param maxRecords the maxRecords to set
*/
public void setMaxRecords(int maxRecords) {
this.maxRecords = maxRecords;
}
public int getMaxRecords() {
return maxRecords;
}
/**
* @param source the source to set
*/
public void setSource(SearchResultSource source) {
this.source = source;
}
public boolean isDedupeRecords() {
return dedupeRecords;
}
public void setDedupeRecords(boolean dedupeRecords) {
this.dedupeRecords = dedupeRecords;
}
public UrlCanonicalizer getCanonicalizer() {
return canonicalizer;
}
public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
this.canonicalizer = canonicalizer;
}
public void shutdown() throws IOException {
source.shutdown();
}
public ObjectFilter<CaptureSearchResult> getAnnotater() {
return annotater;
}
public void setAnnotater(ObjectFilter<CaptureSearchResult> annotater) {
this.annotater = annotater;
}
public ObjectFilter<CaptureSearchResult> getFilter() {
return filter;
}
public void setFilter(ObjectFilter<CaptureSearchResult> filter) {
this.filter = filter;
}
public boolean isTimestampSearch() {
return timestampSearch;
}
public void setTimestampSearch(boolean timestampSearch) {
this.timestampSearch = timestampSearch;
}
public boolean isMarkPrefixQueries() {
return markPrefixQueries;
}
public void setMarkPrefixQueries(boolean markPrefixQueries) {
this.markPrefixQueries = markPrefixQueries;
}
}