/*
* This file is part of the Wayback archival access software
* (http://archive-access.sourceforge.net/projects/wayback/).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.wayback.webapp;
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Date;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.archive.io.arc.ARCRecord;
import org.archive.wayback.accesscontrol.robotstxt.RobotExclusionFilterFactory;
import org.archive.wayback.accesscontrol.staticmap.StaticMapExclusionFilterFactory;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.CaptureSearchResults;
import org.archive.wayback.core.FastCaptureSearchResult;
import org.archive.wayback.core.WaybackRequest;
import org.archive.wayback.exception.AccessControlException;
import org.archive.wayback.exception.AdministrativeAccessControlException;
import org.archive.wayback.exception.BadQueryException;
import org.archive.wayback.exception.ConfigurationException;
import org.archive.wayback.exception.LiveDocumentNotAvailableException;
import org.archive.wayback.exception.ResourceIndexNotAvailableException;
import org.archive.wayback.exception.ResourceNotInArchiveException;
import org.archive.wayback.exception.RobotAccessControlException;
import org.archive.wayback.exception.WaybackException;
import org.archive.wayback.liveweb.LiveWebCache;
import org.archive.wayback.resourceindex.filters.ExclusionFilter;
import org.archive.wayback.resourcestore.resourcefile.ArcResource;
import org.archive.wayback.util.url.UrlOperations;
/**
* RequestHandler which satisfies all incoming requests through a LiveWebCache,
* using an internal AccessPoint to rewrite replayed documents.
*
* @author brad
*/
public class LiveWebAccessPoint extends LiveWebRequestHandler {

    private static final Logger LOGGER = Logger.getLogger(
            LiveWebAccessPoint.class.getName());

    /** Timer keys this class records through PerfStats. */
    enum PerfStat
    {
        LiveWeb;
    }

    /** AccessPoint used for index lookups, rendering, error reporting and URI conversion. */
    private AccessPoint inner = null;

    /** Source of live web documents. */
    private LiveWebCache cache = null;

    /** Optional robots.txt exclusion check; skipped when null. */
    private RobotExclusionFilterFactory robotFactory = null;

    /** Optional administrative exclusion check; skipped when null. */
    private StaticMapExclusionFilterFactory adminFactory = null;

    /**
     * Optional pattern, applied with {@code find()}, tested against both the
     * Referer header and the target host. Null means "match nothing".
     */
    private Pattern skipHost = null;

    /**
     * Timeout handed to {@link InetAddress#isReachable(int)} (milliseconds).
     * Values {@code <= 0} disable the DNS/reachability check entirely.
     */
    private int dnsCheckTimeout = 0;

    /** Stored for configuration only; no code in this class reads it. */
    private String requireReferrer = null;

    /** Header name passed to the inner AccessPoint when logging live web failures. */
    public final static String LIVEWEB_RUNTIME_ERROR_HEADER = "X-Archive-Wayback-Runtime-Liveweb-Error";

    /**
     * Second argument to {@code cache.getCachedResource}; 86400000 ms is 24 hours.
     * NOTE(review): presumably the maximum acceptable age of a cached copy - confirm
     * against LiveWebCache.
     */
    private long maxCacheMS = 86400000;

    /**
     * Handles a request by first probing the inner AccessPoint's index (unless
     * the Referer matches {@code skipHost}) and redirecting to the replay URL
     * when that lookup succeeds; otherwise the document is fetched through the
     * live web cache and rendered with the inner AccessPoint's renderer.
     *
     * WaybackExceptions are logged and rendered through the inner AccessPoint;
     * any other exception is logged only.
     *
     * @param httpRequest the incoming request
     * @param httpResponse the response to write or redirect
     * @return always {@code true}
     * @throws ServletException declared for the handler contract
     * @throws IOException if closing the fetched resource fails
     */
    public boolean handleRequest(HttpServletRequest httpRequest,
            HttpServletResponse httpResponse)
            throws ServletException, IOException {

        String urlString = translateRequestPathQuery(httpRequest);
        urlString = UrlOperations.fixupHTTPUrlWithOneSlash(urlString);

        ArcResource r = null;

        WaybackRequest wbRequest = new WaybackRequest();
        wbRequest.setAccessPoint(inner);
        wbRequest.setRequestUrl(urlString);

        try {
            String ref = httpRequest.getHeader("Referer");
            PerfStats.clearAll();

            // skipHost is optional configuration: when it is unset no Referer
            // can match it, so the index probe below always runs.
            boolean refererMatchesSkipHost = (ref != null)
                    && (skipHost != null)
                    && skipHost.matcher(ref).find();

            if (!refererMatchesSkipHost) {
                wbRequest.setTimestampSearchKey(true);
                wbRequest.setReplayDate(new Date());
                wbRequest.setReplayRequest();

                try {
                    inner.queryIndex(wbRequest);
                    // The index lookup succeeded, so send the client to the
                    // replay URL instead of fetching from the live web.
                    httpResponse.sendRedirect(inner.getReplayPrefix() + urlString);
                    return true;
                } catch (ResourceIndexNotAvailableException e) {
                    throw new LiveDocumentNotAvailableException(e.toString());
                } catch (ResourceNotInArchiveException e) {
                    // Not archived: fall through to the live web fetch.
                } catch (BadQueryException e) {
                    throw new LiveDocumentNotAvailableException(e.toString());
                } catch (AccessControlException e) {
                    // Fall through: getLiveWebResource applies the admin and
                    // robots exclusion checks configured on this handler.
                } catch (ConfigurationException e) {
                    throw new LiveDocumentNotAvailableException(e.toString());
                }
            }

            wbRequest.setLiveWebRequest(true);

            if (inner.isEnablePerfStatsHeader()) {
                PerfStats.timeStart(AccessPoint.PerfStat.Total);
                httpResponse = new PerfWritingHttpServletResponse(httpRequest, httpResponse, AccessPoint.PerfStat.Total, inner.getPerfStatsHeader());
            }

            Thread.currentThread().setName("Thread " +
                    Thread.currentThread().getId() + " " + getBeanName() +
                    " handling: " + urlString);

            CaptureSearchResult result = new FastCaptureSearchResult();

            r = this.getLiveWebResource(result, urlString);

            if (r == null) {
                throw new LiveDocumentNotAvailableException(urlString);
            }

            CaptureSearchResults results = new CaptureSearchResults();
            results.addSearchResult(result);

            wbRequest.setReplayTimestamp(result.getCaptureTimestamp());

            inner.getReplay().getRenderer(wbRequest, result, r).renderResource(httpRequest, httpResponse, wbRequest, result, r,
                    inner.getUriConverter(), results);

        } catch (WaybackException e) {
            inner.logError(httpResponse, LIVEWEB_RUNTIME_ERROR_HEADER, e, wbRequest);
            inner.getException().renderException(httpRequest, httpResponse, wbRequest, e, inner.getUriConverter());
        } catch (Exception e) {
            inner.logError(httpResponse, LIVEWEB_RUNTIME_ERROR_HEADER, e, wbRequest);
        } finally {
            if (r != null) {
                r.close();
            }
        }

        return true;
    }

    /**
     * Fetches {@code urlString} through the live web cache after applying the
     * skip-host, DNS, administrative and robots checks that are configured.
     *
     * On success {@code result} has its original URL, URL key, capture
     * timestamp and MIME type populated, and the caller owns (and must close)
     * the returned resource.
     *
     * @param result capture record to populate
     * @param urlString requested URL; a leading "*&#47;" is dropped and "http"
     *        is assumed when no http/https scheme is present
     * @return the fetched resource when its status is 200 or 3xx; {@code null}
     *         when the host matches {@code skipHost}, the DNS check fails, the
     *         cache yields nothing, or the status is anything else
     * @throws BadQueryException if the URL cannot be parsed or canonicalized
     * @throws AdministrativeAccessControlException if the admin filter is
     *         unavailable or excludes the URL
     * @throws RobotAccessControlException if the robots filter excludes the URL
     * @throws WaybackException for other failures raised by the filters or cache
     * @throws IOException if the cache fetch fails with an I/O error
     */
    protected ArcResource getLiveWebResource(CaptureSearchResult result, String urlString) throws WaybackException, IOException
    {
        if (!urlString.startsWith(UrlOperations.HTTP_SCHEME) &&
                !urlString.startsWith(UrlOperations.HTTPS_SCHEME)) {
            // Remove accidental calendar page requests
            if (urlString.startsWith("*/")) {
                urlString = urlString.substring(2);
            }
            // Assume http
            urlString = UrlOperations.HTTP_SCHEME + urlString;
        }

        URL url;
        try {
            url = new URL(urlString);
        } catch (MalformedURLException e) {
            throw new BadQueryException("Bad URL(" + urlString + ")");
        }

        if ((skipHost != null) && skipHost.matcher(url.getHost()).find()) {
            return null;
        }

        if ((dnsCheckTimeout > 0) && !checkUrlDns(url, dnsCheckTimeout)) {
            return null;
        }

        result.setOriginalUrl(urlString);

        String canonUrl = urlString;
        if (inner.getSelfRedirectCanonicalizer() != null) {
            try {
                canonUrl = inner.getSelfRedirectCanonicalizer().urlStringToKey(urlString);
            } catch (IOException io) {
                throw new BadQueryException("Bad URL(" + urlString + ")");
            }
        }
        result.setUrlKey(canonUrl);

        // check admin excludes first, if configured:
        if (adminFactory != null) {
            ExclusionFilter f = adminFactory.get();
            if (f == null) {
                // Fail closed: without the filter we cannot tell whether the
                // URL is administratively blocked.
                LOGGER.severe("Unable to get administrative exclusion filter!");
                throw new AdministrativeAccessControlException(urlString + " is blocked.");
            }
            int ruling = f.filterObject(result);
            if (ruling == ExclusionFilter.FILTER_EXCLUDE) {
                throw new AdministrativeAccessControlException(urlString + " is blocked.");
            }
        }

        // check robots next, if configured
        if (robotFactory != null) {
            int ruling = robotFactory.get().filterObject(result);
            if (ruling == ExclusionFilter.FILTER_EXCLUDE) {
                throw new RobotAccessControlException(urlString + " is blocked by robots.txt");
            }
        }

        // no robots check, or robots.txt says GO:
        ArcResource r = null;
        try {
            PerfStats.timeStart(PerfStat.LiveWeb);
            r = (ArcResource) cache.getCachedResource(url, maxCacheMS, false);
        } finally {
            PerfStats.timeEnd(PerfStat.LiveWeb);
        }

        if (r == null) {
            return null;
        }

        // Ownership passes to the caller only when the resource is returned;
        // on every other exit (rejected status or an exception while reading
        // the record) it is closed here because no caller can reach it.
        boolean handedToCaller = false;
        try {
            ARCRecord ar = (ARCRecord) r.getArcRecord();
            int status = ar.getStatusCode();

            if ((status == 200) || ((status >= 300) && (status < 400))) {
                result.setCaptureTimestamp(ar.getMetaData().getDate());
                result.setMimeType(ar.getMetaData().getMimetype());
                handedToCaller = true;
                return r;
            }
            return null;
        } finally {
            if (!handedToCaller) {
                closeLiveWebResourceQuietly(r);
            }
        }
    }

    /**
     * Returns {@link LiveWebRedirector#DEFAULT} when the requested URL can be
     * fetched from the live web, otherwise {@code null}.
     *
     * @param request the incoming request
     * @param wbRequest supplies the request URL to probe
     * @param we the exception that triggered the lookup (not inspected here)
     */
    @Override
    public String getLiveWebRedirect(HttpServletRequest request, WaybackRequest wbRequest, WaybackException we)
    {
        if (isLiveWebFound(request, wbRequest)) {
            return LiveWebRedirector.DEFAULT;
        }

        return null;
    }

    /**
     * Checks that the URL's host resolves to an address that is not the
     * wildcard, link-local or loopback address and, when {@code timeout} is
     * non-zero, that the address answers within the timeout.
     *
     * NOTE(review): site-local (private range) addresses are not rejected.
     *
     * @param url URL whose host is checked
     * @param timeout milliseconds passed to {@link InetAddress#isReachable(int)};
     *        0 skips the reachability probe
     * @return {@code true} if the host passes every check performed
     */
    protected boolean checkUrlDns(URL url, int timeout)
    {
        InetAddress addr;

        try {
            addr = InetAddress.getByName(url.getHost());
        } catch (UnknownHostException e) {
            return false;
        }

        if (addr.isAnyLocalAddress() || addr.isLinkLocalAddress() || addr.isLoopbackAddress()) {
            return false;
        }

        if (timeout == 0) {
            return true;
        }

        try {
            return addr.isReachable(timeout);
        } catch (IOException e) {
            // Best effort: a network error while probing counts as unreachable.
            LOGGER.fine("Reachability check failed for " + url.getHost() + ": " + e);
            return false;
        }
    }

    /**
     * Probes whether the request URL can be fetched from the live web. Any
     * failure is treated as "not found"; the probed resource is always closed.
     */
    private boolean isLiveWebFound(HttpServletRequest request, WaybackRequest wbRequest)
    {
        ArcResource r = null;
        String urlString = wbRequest.getRequestUrl();

        try {
            r = getLiveWebResource(new FastCaptureSearchResult(), urlString);
            return (r != null);
        } catch (Exception e) {
            // Best effort: exclusions and fetch errors both mean "not found".
            LOGGER.fine("Live web probe failed for " + urlString + ": " + e);
            return false;
        } finally {
            closeLiveWebResourceQuietly(r);
        }
    }

    /** Closes {@code resource} if non-null, logging rather than propagating an IOException. */
    private static void closeLiveWebResourceQuietly(ArcResource resource)
    {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            LOGGER.fine("Failed to close live web resource: " + e);
        }
    }

    /**
     * @return the cache
     */
    public LiveWebCache getCache() {
        return cache;
    }

    /**
     * @param cache the cache to set
     */
    public void setCache(LiveWebCache cache) {
        this.cache = cache;
    }

    /**
     * @return the robotFactory
     */
    public RobotExclusionFilterFactory getRobotFactory() {
        return robotFactory;
    }

    /**
     * @param robotFactory the robotFactory to set
     */
    public void setRobotFactory(RobotExclusionFilterFactory robotFactory) {
        this.robotFactory = robotFactory;
    }

    /**
     * @return the inner
     */
    public AccessPoint getInner() {
        return inner;
    }

    /**
     * @param inner the inner to set
     */
    public void setInner(AccessPoint inner) {
        this.inner = inner;
    }

    /**
     * @return the adminFactory
     */
    public StaticMapExclusionFilterFactory getAdminFactory() {
        return adminFactory;
    }

    /**
     * @param adminFactory the adminFactory to set
     */
    public void setAdminFactory(StaticMapExclusionFilterFactory adminFactory) {
        this.adminFactory = adminFactory;
    }

    /**
     * @return the skipHost regular expression, or {@code null} if none is set
     */
    public String getSkipHost() {
        return (skipHost == null) ? null : skipHost.pattern();
    }

    /**
     * @param skipHost regular expression to compile, or {@code null} to clear it
     */
    public void setSkipHost(String skipHost) {
        this.skipHost = (skipHost == null) ? null : Pattern.compile(skipHost);
    }

    /**
     * @return the dnsCheckTimeout
     */
    public int getDnsCheckTimeout() {
        return dnsCheckTimeout;
    }

    /**
     * @param dnsCheckTimeout the dnsCheckTimeout to set; values {@code <= 0}
     *        disable the DNS check
     */
    public void setDnsCheckTimeout(int dnsCheckTimeout) {
        this.dnsCheckTimeout = dnsCheckTimeout;
    }

    /**
     * @return the requireReferrer
     */
    public String getRequireReferrer() {
        return requireReferrer;
    }

    /**
     * @param requireReferrer the requireReferrer to set
     */
    public void setRequireReferrer(String requireReferrer) {
        this.requireReferrer = requireReferrer;
    }
}