/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.replay;
import java.util.List;
import org.archive.wayback.ReplayDispatcher;
import org.archive.wayback.ReplayRenderer;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.Resource;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.replay.mimetype.MimeTypeDetector;
import org.archive.wayback.resourcestore.indexer.IndexWorker;
import org.archive.wayback.webapp.AccessPoint;
/**
* ReplayDispatcher instance which uses a configurable ClosestResultSelector
* to find the best result to show from a given set, and a list of
* ReplayRendererSelector to determine how best to replay that result to a user.
*
* <p>Optionally it can be configured with {@link MimeTypeDetector}s used for
* overriding unknown ({@code "unk"}) or often-misused ({@code "text/html"})
* value of {@link CaptureSearchResult#getMimeType()}.</p>
*
* @author brad
*/
public class SelectorReplayDispatcher implements ReplayDispatcher {
private List<ReplayRendererSelector> selectors = null;
private List<MimeTypeDetector> mimeTypeDetectors = null;
private ClosestResultSelector closestSelector = null;
public static final String DEFAULT_MISSING_MIMETYPE = "unk";
private String missingMimeType = DEFAULT_MISSING_MIMETYPE;
/**
* default value for {@link #untrustfulMimeTypes}
*/
public static final String[] DEFAULT_UNTRUSTFUL_MIMETYPES = {
// Found many occurrence of "www/unknown" and "*/*" in IA's archive.
"text/html", "www/unknown", "*/"
};
private String[] untrustfulMimeTypes = DEFAULT_UNTRUSTFUL_MIMETYPES;
/**
* A list of {@code mimetype} values that cannot be fully trusted.
* For captures whose {@code mimetype} prefix-matches any of these,
* SelectorReplayDispatcher will attempt to detect actual mime-type
* with {@code mimeTypeDetector} (if configured).
* <p>Value set to {@link #missingMimeType} is always considered
* <i>untrustful</i>. You don't need to include it in this list.</p>
* <p>If passed {@code null}, default {@link #DEFAULT_UNTRUSTFUL_MIMETYPES}
* will be used. If set to an empty array, detection is applied only to
* captures without {@code Content-Type} header.</p>
* @param untrustfulMimeTypes list of mime-type prefixes
*/
public void setUntrustfulMimeTypes(List<String> untrustfulMimeTypes) {
if (untrustfulMimeTypes == null)
this.untrustfulMimeTypes = DEFAULT_UNTRUSTFUL_MIMETYPES;
else
this.untrustfulMimeTypes = untrustfulMimeTypes
.toArray(new String[untrustfulMimeTypes.size()]);
}
/**
* Value of {@code mimetype} field indicating {@code Content-Type}
* is unavailable in the response.
* Default is {@code unk} (compatible with CDX-Writer).
* {@link IndexWorker} puts {@code application/http}, apparently.
* @param missingMimeType
*/
public void setMissingMimeType(String missingMimeType) {
if (missingMimeType == null || missingMimeType.isEmpty())
this.missingMimeType = DEFAULT_MISSING_MIMETYPE;
else
this.missingMimeType = missingMimeType;
}
public String getMissingMimeType() {
return missingMimeType;
}
/**
* check if mime-type detection is suggested for mimeType.
* @param mimeType mime-type to test (must not be null/empty/"unk")
* @return {@code true} if mime-type should be determined
* by looking into Resource.
*/
protected boolean shouldDetectMimeType(String mimeType) {
for (String prefix : untrustfulMimeTypes) {
if (mimeType.startsWith(prefix)) return true;
}
return false;
}
@Override
public ReplayRenderer getRenderer(WaybackRequest wbRequest,
CaptureSearchResult result, Resource resource) {
// if content-type is already specified, don't override it.
if (wbRequest.getForcedContentType() == null) {
String mimeType = result.getMimeType();
// TODO: this code should be encapsulated in CaptureSearchResult.getMimeType()
if (AccessPoint.REVISIT_STR.equals(mimeType)) {
if (result.getDuplicatePayload() != null) {
mimeType = result.getDuplicatePayload().getMimeType();
} else {
// let following code get it from resource
mimeType = null;
}
}
// Many old ARCs have "unk" or "no-type" in ARC header even though
// HTTP response has valid Content-Type header. CDX writer does not fix
// it (although it's capable of fixing it internally). If CaptureSearchResult
// says mimeType is "unk", try reading Content-Type header from the resource.
if (mimeType == null || mimeType.isEmpty() || missingMimeType.equals(mimeType)) {
mimeType = resource.getHeader("Content-Type");
}
// "unk" and "" are changed to Content-Type header value (or null if in fact missing)
// so null test is enough.
if (mimeType == null || shouldDetectMimeType(mimeType)) {
if (mimeTypeDetectors != null) {
for (MimeTypeDetector detector : mimeTypeDetectors) {
String detected = detector.sniff(resource);
if (detected != null) {
// detected mimeType is communicated to Selectors
// through forcedContentType. better way? replace
// CaptureSearchResult.mimeType?
wbRequest.setForcedContentType(detected);
}
}
}
} else {
// hmm, now CaptureSearchResult.mimeType can be set to
// forcedContentType - it should work, but this may
// be a bad design.
wbRequest.setForcedContentType(mimeType);
}
}
if (selectors != null) {
for (ReplayRendererSelector selector : selectors) {
if (selector.canHandle(wbRequest, result, resource, resource)) {
return selector.getRenderer();
}
}
}
return null;
}
@Override
public ReplayRenderer getRenderer(WaybackRequest wbRequest,
CaptureSearchResult result, Resource httpHeadersResource,
Resource payloadResource) {
if (httpHeadersResource == payloadResource)
return getRenderer(wbRequest, result, httpHeadersResource);
else {
Resource resource = new CompositeResource(httpHeadersResource, payloadResource);
return getRenderer(wbRequest, result, resource);
}
}
public CaptureSearchResult getClosest(WaybackRequest wbRequest,
CaptureSearchResults results) {
return closestSelector.getClosest(wbRequest, results);
}
/**
* @return the List of ReplayRendererSelector objects configured
*/
public List<ReplayRendererSelector> getSelectors() {
return selectors;
}
/**
* @param selectors the List of ReplayRendererSelector to use
*/
public void setSelectors(List<ReplayRendererSelector> selectors) {
this.selectors = selectors;
}
public List<MimeTypeDetector> getMimeTypeDetectors() {
return mimeTypeDetectors;
}
public void setMimeTypeDetectors(List<MimeTypeDetector> sniffers) {
this.mimeTypeDetectors = sniffers;
}
/**
* @param closestSelector the closestSelector to set
*/
public void setClosestSelector(ClosestResultSelector closestSelector) {
this.closestSelector = closestSelector;
}
/**
* @return the closestSelector
*/
public ClosestResultSelector getClosestSelector() {
return closestSelector;
}
}