/* * This file is part of the Wayback archival access software * (http://archive-access.sourceforge.net/projects/wayback/). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.wayback.liveweb; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.ConnectException; import java.net.NoRouteToHostException; import java.net.SocketException; import java.net.SocketTimeoutException; import java.net.UnknownHostException; import java.util.Date; import java.util.logging.Logger; import org.apache.commons.httpclient.ConnectTimeoutException; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HostConfiguration; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpConnection; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpMethodRetryHandler; import org.apache.commons.httpclient.HttpState; import org.apache.commons.httpclient.SimpleHttpConnectionManager; import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.cookie.CookiePolicy; import org.apache.commons.httpclient.params.HttpClientParams; import org.apache.commons.httpclient.protocol.Protocol; import org.apache.commons.io.IOUtils; import org.archive.httpclient.HttpRecorderGetMethod; import org.archive.io.RecordingInputStream; import org.archive.io.ReplayInputStream; import org.archive.io.arc.ARCWriter; import org.archive.url.LaxURI; import org.archive.util.Recorder; import org.archive.wayback.util.ByteOp; /** * * Takes an input URL String argument, downloads, stores in an ARCWriter, * and returns a FileRegion consisting of the compressed ARCRecord containing * the response, or a forged, "fake error response" ARCRecord which can be * used to send the content to an OutputStream. * * @author brad * */ public class URLtoARCCacher { private static final Logger LOGGER = Logger.getLogger( URLtoARCCacher.class.getName()); private static String CONTENT_TYPE_HEADER = "Content-Type".toLowerCase(); private static String GET_METHOD_NAME = "GET"; private static String DEFAULT_RECORDER_DIR = System.getProperty("java.io.tmpdir"); private File recorderCacheDir = new File(DEFAULT_RECORDER_DIR); private static String DEFAULT_BACKING_FILE_BASE = "recorder-tmp"; private String backingFileBase = DEFAULT_BACKING_FILE_BASE; private String userAgent = "genericUserAgent"; private int connectionTimeoutMS = 10000; private int socketTimeoutMS = 10000; private int outBufferSize = 1024 * 100; private int inBufferSize = 1024 * 100; // private int outBufferSize = 10; // private int inBufferSize = 100; private final static HttpMethodRetryHandler noRetryHandler = new NoRetryHandler(); private final ThreadLocal<HttpClient> tl = new ThreadLocal<HttpClient>() { protected synchronized HttpClient initialValue() { HttpClientParams params = new HttpClientParams(); params.setParameter(HttpClientParams.RETRY_HANDLER, noRetryHandler); IPHttpConnectionManager manager = new IPHttpConnectionManager(); Protocol dnsTimedProtocol = new Protocol("http", new DNSTimingProtocolSocketFactory(), 80); Protocol.registerProtocol("http", dnsTimedProtocol); manager.getParams().setConnectionTimeout(connectionTimeoutMS); manager.getParams().setSoTimeout(socketTimeoutMS); return new HttpClient(params, manager); } }; private HttpClient getHttpClient() { return tl.get(); } private static byte[] ERROR_BYTES = "HTTP 502 Bad Gateway\n\n".getBytes(); private static String ERROR_MIME = "unk"; private static String ERROR_IP = "0.0.0.0"; private static byte[] TIMEOUT_BYTES = "HTTP 504 Gateway Timeout\n\n".getBytes(); private static String TIMEOUT_MIME = "unk"; private static String TIMEOUT_IP = "0.0.0.0"; /** * @param url to cache * @param cache ARCCacheDirectory for storing result or faked result * @return FileRegion of compressed byte range for ARCRecord. * @throws IOException for the usual reasons * @throws URIException if url argument isn't really an URL.. */ public FileRegion cacheURL(String url, ARCCacheDirectory cache) throws IOException, URIException { FileRegion region = null; // to track if we got a response (any response) or an exception. boolean gotUrl = false; boolean isTimeout = false; String fName = backingFileBase + "-" + Thread.currentThread().getId(); Recorder recorder = new Recorder(recorderCacheDir,fName, outBufferSize, inBufferSize); ExtendedGetMethod getMethod = null; // TWO STEPS: // first do the GET, using a Recorder to get the response. // then, if that worked, save the recorded value into an ARC // and return it's region // if we didn't get a response, forge a fake record and return that. try { Recorder.setHttpRecorder(recorder); LaxURI lURI = new LaxURI(url,true); getMethod = new ExtendedGetMethod(url,recorder); getMethod.setURI(lURI); HttpClient client = getHttpClient(); getMethod.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES); getMethod.setFollowRedirects(false); getMethod.setRequestHeader("User-Agent", userAgent); int code = client.executeMethod(getMethod); LOGGER.info("URL(" + url + ") HTTP:" + code); InputStream responseIS = getMethod.getResponseBodyAsStream(); if(responseIS != null) { ByteOp.discardStream(responseIS); responseIS.close(); } gotUrl = true; } catch (URIException e) { e.printStackTrace(); } catch (UnknownHostException e) { LOGGER.warning("Unknown host for " + url); } catch (ConnectTimeoutException e) { // TODO: should we act like it's a full block? LOGGER.warning("Timeout out connecting to " + url); isTimeout = true; } catch(SocketTimeoutException e) { LOGGER.warning("Timeout out socket for " + url); isTimeout = true; } catch (ConnectException e) { LOGGER.warning("ConnectionRefused to " + url); } catch (NoRouteToHostException e) { LOGGER.warning("NoRouteToHost for " + url); } catch (SocketException e) { // should only be things like "Connection Reset", etc.. LOGGER.warning("SocketException for " + url); } catch (HttpException e) { e.printStackTrace(); // we have to let IOExceptions out, problems caused by local disk // NEED to return errors, indicating that there is not an // authoritative answer, and thus... NOTHING can be shown. // } catch (IOException e) { // e.printStackTrace(); } finally { recorder.closeRecorders(); Recorder.setHttpRecorder(null); if(getMethod != null) { getMethod.releaseConnection(); } } // now write the content, or a fake record: ARCWriter writer = null; ReplayInputStream replayIS = null; try { writer = cache.getWriter(); if(gotUrl) { RecordingInputStream ris = recorder.getRecordedInput(); replayIS = ris.getReplayInputStream(); region = storeInputStreamARCRecord(writer, url, getMethod.getMime(), getMethod.getRemoteIP(), getMethod.getCaptureDate(), replayIS, (int) ris.getSize()); } else if(isTimeout) { region = storeTimeout(writer,url); } else { region = storeNotAvailable(writer, url); } } finally { IOUtils.closeQuietly(replayIS); if(writer != null) { cache.returnWriter(writer); } } recorder.close(); return region; } private FileRegion storeInputStreamARCRecord(ARCWriter writer, String url, String mime, String ip, Date captureDate, InputStream is, int length) throws IOException { writer.checkSize(); final long arcOffset = writer.getPosition(); final String arcPath = writer.getFile().getAbsolutePath(); writer.write(url,mime,ip,captureDate.getTime(),length,is); writer.checkSize(); // long newSize = writer.getPosition(); long oSize = writer.getFile().length(); // final long arcEndOffset = oSize; LOGGER.info("Wrote " + url + ": " + arcPath + "(" + arcOffset + "-" + oSize + ")"); FileRegion fr = new FileRegion(); fr.file = writer.getFile(); fr.start = arcOffset; fr.end = oSize; fr.isFake = false; return fr; } private FileRegion storeNotAvailable(ARCWriter writer, String url) throws IOException { ByteArrayInputStream bais = new ByteArrayInputStream(ERROR_BYTES); FileRegion fr = storeInputStreamARCRecord(writer, url, ERROR_MIME, ERROR_IP, new Date(), bais, ERROR_BYTES.length); fr.isFake = true; return fr; } private FileRegion storeTimeout(ARCWriter writer, String url) throws IOException { ByteArrayInputStream bais = new ByteArrayInputStream(TIMEOUT_BYTES); FileRegion fr = storeInputStreamARCRecord(writer, url, TIMEOUT_MIME, TIMEOUT_IP, new Date(), bais, TIMEOUT_BYTES.length); fr.isFake = true; return fr; } /* * Get method which ferrets away the Content-Type header, the remote IP * and remembers when the HTTP Message header was received. */ private class ExtendedGetMethod extends HttpRecorderGetMethod { /** * @param uri to be fetched * @param recorder which is not currently used by base class, but * we're going to require and send it on anyways. */ public ExtendedGetMethod(String uri, Recorder recorder) { super(uri, recorder); } private String remoteIP = ""; private Date captureDate = null; private String mime = "unk"; public String getName() { return GET_METHOD_NAME; } protected void processStatusLine(HttpState state, HttpConnection conn) { // grab the remote IP, and record when we started getting bytes.. // Sam thinks we should somehow record how fast we got it back.. // and then replay it at the same rate we received it. captureDate = new Date(); IPStoringHttpConnection bhc = (IPStoringHttpConnection) conn; remoteIP = bhc.getRemoteIP(); } protected void processResponseBody(HttpState state, HttpConnection conn) { // grab the mime.. Header headers[] = this.getResponseHeaders(); for (int i = 0; i < headers.length; i++) { String lcHeader = headers[i].getName().toLowerCase(); if(lcHeader.compareTo(CONTENT_TYPE_HEADER) == 0) { mime = headers[i].getValue(); } } } /** * @return Returns the captureDate. */ public Date getCaptureDate() { return captureDate; } /** * @return Returns the mime. */ public String getMime() { return mime; } /** * @return Returns the remoteIP. */ public String getRemoteIP() { return remoteIP; } } /** * HttpConnectionManager that returns IPHttpConnection objects, for * accessing the IP address */ private class IPHttpConnectionManager extends SimpleHttpConnectionManager { public HttpConnection getConnection(HostConfiguration hostConfiguration) { IPStoringHttpConnection conn = new IPStoringHttpConnection(hostConfiguration); conn.setHttpConnectionManager(this); conn.getParams().setDefaults(this.getParams()); return conn; } public HttpConnection getConnectionWithTimeout( HostConfiguration hostConfiguration, long timeout) { // TODO: is this lying? have we really set the time out? IPStoringHttpConnection conn = new IPStoringHttpConnection(hostConfiguration); conn.setHttpConnectionManager(this); conn.getParams().setDefaults(this.getParams()); return conn; } public HttpConnection getConnection( HostConfiguration hostConfiguration, long timeout) { return new IPStoringHttpConnection(hostConfiguration); } public void releaseConnection(HttpConnection conn) { // ensure connection is closed conn.close(); InputStream lastResponse = conn.getLastResponseInputStream(); if (lastResponse != null) { conn.setLastResponseInputStream(null); try { lastResponse.close(); } catch (IOException ioe) { //FIX ME: badness - close to force reconnect. conn.close(); } } } } /** * HttpConnection that allows access to the IP address which was * used for the connection. */ private class IPStoringHttpConnection extends HttpConnection { /** * @param hc HostConfiguration */ public IPStoringHttpConnection(HostConfiguration hc) { super(hc); } /** * @return the remote IP address that was connected to, as a String */ public String getRemoteIP() { return getSocket().getInetAddress().getHostAddress(); } } /** * @return the recorderCacheDir */ public String getRecorderCacheDir() { return recorderCacheDir.getAbsolutePath(); } /** * @param recorderCacheDirPath the recorderCacheDir to set */ public void setRecorderCacheDir(String recorderCacheDirPath) { this.recorderCacheDir = new File(recorderCacheDirPath); } /** * @return the backingFileBase */ public String getBackingFileBase() { return backingFileBase; } /** * @param backingFileBase the backingFileBase to set */ public void setBackingFileBase(String backingFileBase) { this.backingFileBase = backingFileBase; } /** * @return the userAgent */ public String getUserAgent() { return userAgent; } /** * @param userAgent the userAgent to set */ public void setUserAgent(String userAgent) { this.userAgent = userAgent; } /** * @return the connectionTimeoutMS */ public int getConnectionTimeoutMS() { return connectionTimeoutMS; } /** * @param connectionTimeoutMS the connectionTimeoutMS to set */ public void setConnectionTimeoutMS(int connectionTimeoutMS) { this.connectionTimeoutMS = connectionTimeoutMS; } /** * @return the socketTimeoutMS */ public int getSocketTimeoutMS() { return socketTimeoutMS; } /** * @param socketTimeoutMS the socketTimeoutMS to set */ public void setSocketTimeoutMS(int socketTimeoutMS) { this.socketTimeoutMS = socketTimeoutMS; } }