/*
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Licensed to the Internet Archive (IA) by one or more individual
 * contributors.
 *
 * The IA licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.util;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.DeflaterInputStream;
import java.util.zip.GZIPInputStream;
import java.util.zip.InflaterInputStream;

import org.apache.commons.httpclient.ChunkedInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.archive.io.GenericReplayCharSequence;
import org.archive.io.RecordingInputStream;
import org.archive.io.RecordingOutputStream;
import org.archive.io.ReplayCharSequence;
import org.archive.io.ReplayInputStream;

import com.google.common.base.Charsets;

/**
 * Pairs together a RecordingInputStream and RecordingOutputStream
 * to capture exactly a single HTTP transaction.
 *
 * Initially only supports HTTP/1.0 (one request, one response per stream)
 *
 * Call {@link #markContentBegin()} to demarcate the transition between HTTP
 * header and body.
 *
 * @author gojomo
 */
public class Recorder {
    protected static Logger logger =
        Logger.getLogger("org.archive.util.HttpRecorder");

    private static final int DEFAULT_OUTPUT_BUFFER_SIZE = 16384;
    private static final int DEFAULT_INPUT_BUFFER_SIZE = 524288;

    private RecordingInputStream ris = null;
    private RecordingOutputStream ros = null;

    /**
     * Backing file basename.
     *
     * Keep it around so can clean up backing files left on disk.
     */
    private String backingFileBasename = null;

    /**
     * Backing file output stream suffix.
     */
    private static final String RECORDING_OUTPUT_STREAM_SUFFIX = ".ros";

    /**
     * Backing file input stream suffix.
     */
    private static final String RECORDING_INPUT_STREAM_SUFFIX = ".ris";

    /**
     * recording-input (ris) content character encoding.
     */
    protected String characterEncoding = null;

    /**
     * Charset to use for CharSequence provision. Will be UTF-8 if no
     * encoding ever requested; a Charset matching above characterEncoding
     * if possible; ISO_8859 if above characterEncoding is unsatisfiable.
     * TODO: unify to UTF-8 for unspecified and bad-specified cases?
     * (current behavior is for consistency with our prior but perhaps not
     * optimal behavior)
     */
    protected Charset charset = Charsets.UTF_8;

    /** whether recording-input (ris) message-body is chunked */
    protected boolean inputIsChunked = false;

    /** recording-input (ris) entity content-encoding (eg gzip, deflate), if any */
    protected String contentEncoding = null;

    private ReplayCharSequence replayCharSequence;

    /**
     * Create an HttpRecorder.
     *
     * @param tempDir Directory into which we drop backing files for
     * recorded input and output.
     * @param backingFilenameBase Backing filename base to which we'll append
     * suffices <code>ris</code> for recorded input stream and
     * <code>ros</code> for recorded output stream.
     * @param outBufferSize Size of output buffer to use.
     * @param inBufferSize Size of input buffer to use.
     * @throws IllegalStateException if tempDir cannot be made a writeable
     * directory
     */
    public Recorder(File tempDir, String backingFilenameBase,
            int outBufferSize, int inBufferSize) {
        this(new File(ensure(tempDir), backingFilenameBase),
                outBufferSize, inBufferSize);
    }

    /**
     * Ensure the given directory exists and is writeable, converting the
     * checked IOException into an IllegalStateException so the check can be
     * used inside a constructor-chaining expression.
     *
     * @param tempDir directory to check
     * @return the same directory, for call chaining
     */
    private static File ensure(File tempDir) {
        try {
            org.archive.util.FileUtils.ensureWriteableDirectory(tempDir);
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
        return tempDir;
    }

    /**
     * Create an HttpRecorder whose backing files are named after the
     * absolute path of the given file, plus the <code>.ris</code> and
     * <code>.ros</code> suffixes.
     *
     * @param file File whose absolute path is the backing file basename.
     * @param outBufferSize Size of output buffer to use.
     * @param inBufferSize Size of input buffer to use.
     */
    public Recorder(File file, int outBufferSize, int inBufferSize) {
        this.backingFileBasename = file.getAbsolutePath();
        this.ris = new RecordingInputStream(inBufferSize,
                this.backingFileBasename + RECORDING_INPUT_STREAM_SUFFIX);
        this.ros = new RecordingOutputStream(outBufferSize,
                this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX);
    }

    /**
     * Create an HttpRecorder using the default buffer sizes.
     *
     * @param tempDir
     *            Directory into which we drop backing files for recorded input
     *            and output.
     * @param backingFilenameBase
     *            Backing filename base to which we'll append suffices
     *            <code>ris</code> for recorded input stream and
     *            <code>ros</code> for recorded output stream.
     */
    public Recorder(File tempDir, String backingFilenameBase) {
        // The four-argument constructor takes (out, in) in that order; the
        // defaults must be passed in the same order or they end up swapped.
        this(tempDir, backingFilenameBase,
                DEFAULT_OUTPUT_BUFFER_SIZE, DEFAULT_INPUT_BUFFER_SIZE);
    }

    /**
     * Wrap the provided stream with the internal RecordingInputStream
     *
     * open() throws an exception if RecordingInputStream is already open.
     *
     * @param is InputStream to wrap.
     *
     * @return The input stream wrapper which itself is an input stream.
     * Pass this in place of the passed stream so input can be recorded.
     *
     * @throws IOException
     */
    public InputStream inputWrap(InputStream is) throws IOException {
        logger.fine(Thread.currentThread().getName() + " wrapping input");

        // discard any state from previously-recorded input
        this.characterEncoding = null;
        this.inputIsChunked = false;
        this.contentEncoding = null;

        this.ris.open(is);
        return this.ris;
    }

    /**
     * Wrap the provided stream with the internal RecordingOutputStream
     *
     * open() throws an exception if RecordingOutputStream is already open.
     *
     * @param os The output stream to wrap.
     *
     * @return The output stream wrapper which is itself an output stream.
     * Pass this in place of the passed stream so output can be recorded.
     *
     * @throws IOException
     */
    public OutputStream outputWrap(OutputStream os) throws IOException {
        this.ros.open(os);
        return this.ros;
    }

    /**
     * Close all streams.
     *
     * Failures closing either stream are logged rather than thrown, so a
     * failure on the input side does not prevent closing the output side.
     */
    public void close() {
        logger.fine(Thread.currentThread().getName() + " closing");
        try {
            this.ris.close();
        } catch (IOException e) {
            // TODO: Can we not let the exception out of here and report it
            // higher up in the caller?
            DevUtils.logger.log(Level.SEVERE, "close() ris" +
                DevUtils.extraInfo(), e);
        }
        try {
            this.ros.close();
        } catch (IOException e) {
            DevUtils.logger.log(Level.SEVERE, "close() ros" +
                DevUtils.extraInfo(), e);
        }
    }

    /**
     * Return the internal RecordingInputStream
     *
     * @return A RIS.
     */
    public RecordingInputStream getRecordedInput() {
        return this.ris;
    }

    /**
     * @return The RecordingOutputStream.
     */
    public RecordingOutputStream getRecordedOutput() {
        return this.ros;
    }

    /**
     * Mark current position as the point where the HTTP headers end.
     */
    public void markContentBegin() {
        this.ris.markContentBegin();
    }

    /**
     * @return the response content length as reported by the
     * RecordingInputStream.
     */
    public long getResponseContentLength() {
        return this.ris.getResponseContentLength();
    }

    /**
     * Close both input and output recorders.
     *
     * Recorders are the output streams to which we are recording.
     * {@link #close()} closes the stream that is being recorded and the
     * recorder. This method explicitly closes the recorder only.
     *
     * Each recorder is closed independently, so a failure closing the input
     * recorder does not leave the output recorder open.
     */
    public void closeRecorders() {
        try {
            this.ris.closeRecorder();
        } catch (IOException e) {
            DevUtils.warnHandle(e, "Convert to runtime exception?");
        }
        try {
            this.ros.closeRecorder();
        } catch (IOException e) {
            DevUtils.warnHandle(e, "Convert to runtime exception?");
        }
    }

    /**
     * Cleanup backing files.
     *
     * Call when completely done w/ recorder. Removes any backing files that
     * may have been dropped.
     */
    public void cleanup() {
        this.close();
        this.delete(this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX);
        this.delete(this.backingFileBasename + RECORDING_INPUT_STREAM_SUFFIX);
    }

    /**
     * Delete file if exists. A failed deletion is logged as a warning so
     * that leftover backing files do not go unnoticed.
     *
     * @param name Filename to delete.
     */
    private void delete(String name) {
        File f = new File(name);
        if (f.exists() && !f.delete()) {
            logger.warning("unable to delete backing file " + name);
        }
    }

    protected static ThreadLocal<Recorder> currentRecorder =
        new ThreadLocal<Recorder>();

    /**
     * Set the current thread's HttpRecorder.
     *
     * @param httpRecorder recorder to associate with the calling thread
     */
    public static void setHttpRecorder(Recorder httpRecorder) {
        currentRecorder.set(httpRecorder);
    }

    /**
     * Get the current thread's HttpRecorder.
     *
     * @return This thread's HttpRecorder. Returns null if none has been set
     * for the calling thread.
     */
    public static Recorder getHttpRecorder() {
        return currentRecorder.get();
    }

    /**
     * @param cs Charset to use when decoding the input recording.
     */
    public void setCharset(Charset cs) {
        this.charset = cs;
    }

    /**
     * @return effective Charset of input recording
     */
    public Charset getCharset() {
        return this.charset;
    }

    /**
     * @param chunked whether the recorded input's message-body uses chunked
     * transfer-encoding.
     */
    public void setInputIsChunked(boolean chunked) {
        this.inputIsChunked = chunked;
    }

    protected static Set<String> SUPPORTED_ENCODINGS = new HashSet<String>();
    static {
        SUPPORTED_ENCODINGS.add("gzip");
        SUPPORTED_ENCODINGS.add("x-gzip");
        SUPPORTED_ENCODINGS.add("deflate");
        SUPPORTED_ENCODINGS.add("identity");
        SUPPORTED_ENCODINGS.add("none"); // unofficial but common
    }

    /**
     * Declare the content-encoding of the input recording. The value is
     * matched case-insensitively and stored in lower case.
     *
     * @param contentEncoding declared content-encoding of input recording;
     * must not be null.
     * @throws IllegalArgumentException if the encoding is not one of
     * {@link #SUPPORTED_ENCODINGS}
     */
    public void setContentEncoding(String contentEncoding) {
        // Fixed locale: default-locale lowercasing (e.g. Turkish dotless i)
        // would otherwise turn "GZIP" into a string that matches nothing.
        String lowerCoding = contentEncoding.toLowerCase(Locale.ENGLISH);
        if (!SUPPORTED_ENCODINGS.contains(lowerCoding)) {
            throw new IllegalArgumentException("contentEncoding unsupported: "+contentEncoding);
        }
        this.contentEncoding = lowerCoding;
    }

    /**
     * @return Returns the declared content-encoding of the input recording,
     * or null if none has been set.
     */
    public String getContentEncoding() {
        return this.contentEncoding;
    }

    /**
     * @return A ReplayCharSequence; see
     * {@link #getContentReplayCharSequence()}.
     * @throws IOException
     * @deprecated use getContentReplayCharSequence
     */
    @Deprecated
    public ReplayCharSequence getReplayCharSequence() throws IOException {
        return getContentReplayCharSequence();
    }

    /**
     * @return A ReplayCharSequence. Caller may call
     *         {@link ReplayCharSequence#close()} when finished. However, in
     *         heritrix, the ReplayCharSequence is closed automatically when url
     *         processing has finished; in that context it's preferable not
     *         to close, so that processors can reuse the same instance.
     * @throws IOException
     * @see #endReplays()
     */
    public ReplayCharSequence getContentReplayCharSequence() throws IOException {
        if (replayCharSequence == null || !replayCharSequence.isOpen()
                || !replayCharSequence.getCharset().equals(charset)) {
            if (replayCharSequence != null && replayCharSequence.isOpen()) {
                // existing sequence must not have matched now-configured Charset; close
                replayCharSequence.close();
            }
            replayCharSequence = getContentReplayCharSequence(this.charset);
        }
        return replayCharSequence;
    }

    /**
     * Build a fresh ReplayCharSequence over the recorded content, decoded
     * with the given charset. The intermediate content stream is always
     * closed, whether or not construction succeeds.
     *
     * @param requestedCharset Charset with which to decode the recorded
     * content.
     * @return A ReplayCharSequence. Call close on returned RCS when done.
     * @throws IOException
     */
    public ReplayCharSequence getContentReplayCharSequence(Charset requestedCharset) throws IOException {
        // raw data overflows to disk; use temp file
        InputStream contentStream = getContentReplayInputStream();
        try {
            return new GenericReplayCharSequence(
                    contentStream,
                    calcRecommendedCharBufferSize(this.getRecordedInput()),
                    this.backingFileBasename + RECORDING_OUTPUT_STREAM_SUFFIX,
                    requestedCharset);
        } finally {
            // close even when construction fails, so the replay stream
            // does not leak
            IOUtils.closeQuietly(contentStream);
        }
    }

    /**
     * Calculate a recommended size for an in-memory decoded-character buffer
     * of this content. We seek a size that is itself no larger (in 2-byte chars)
     * than the memory already used by the RecordingInputStream's internal raw
     * byte buffer, and also no larger than likely necessary. So, we take the
     * minimum of half the RecordingInputStream's recorded buffer length and
     * its recorded size.
     *
     * @param inStream recording whose buffer length and size are consulted
     * @return int length for in-memory decoded-character buffer
     */
    static protected int calcRecommendedCharBufferSize(RecordingInputStream inStream) {
        return (int) Math.min(inStream.getRecordedBufferLength()/2, inStream.getSize());
    }

    /**
     * Get a raw replay of all recorded data (including, for example, HTTP
     * protocol headers)
     *
     * @return A replay input stream.
     * @throws IOException
     */
    public ReplayInputStream getReplayInputStream() throws IOException {
        return getRecordedInput().getReplayInputStream();
    }

    /**
     * Get a raw replay of the 'message-body'. For the common case of
     * HTTP, this is the raw, possibly chunked-transfer-encoded message
     * contents not including the leading headers.
     *
     * @return A replay input stream.
     * @throws IOException
     */
    public ReplayInputStream getMessageBodyReplayInputStream() throws IOException {
        return getRecordedInput().getMessageBodyReplayInputStream();
    }

    /**
     * Get a raw replay of the 'entity'. For the common case of
     * HTTP, this is the message-body after any (usually-unnecessary)
     * transfer-decoding but before any content-encoding (eg gzip) decoding
     *
     * @return A replay input stream.
     * @throws IOException
     */
    public InputStream getEntityReplayInputStream() throws IOException {
        if (inputIsChunked) {
            return new ChunkedInputStream(getRecordedInput().getMessageBodyReplayInputStream());
        } else {
            return getRecordedInput().getMessageBodyReplayInputStream();
        }
    }

    /**
     * Get a replay cued up for the 'content' (after all leading headers),
     * with any declared gzip/x-gzip/deflate content-encoding decoded.
     *
     * If a gzip stream cannot be opened, the raw entity is returned instead.
     *
     * @return A replay input stream.
     * @throws IOException
     */
    public InputStream getContentReplayInputStream() throws IOException {
        InputStream entityStream = getEntityReplayInputStream();
        if (StringUtils.isEmpty(contentEncoding)) {
            return entityStream;
        } else if ("gzip".equalsIgnoreCase(contentEncoding) || "x-gzip".equalsIgnoreCase(contentEncoding)) {
            try {
                return new GZIPInputStream(entityStream);
            } catch (IOException ioe) {
                logger.log(Level.WARNING,"gzip problem; using raw entity instead",ioe);
                IOUtils.closeQuietly(entityStream); // close partially-read stream
                return getEntityReplayInputStream();
            }
        } else if ("deflate".equalsIgnoreCase(contentEncoding)) {
            // InflaterInputStream decompresses; DeflaterInputStream would
            // compress the already-compressed entity a second time.
            return new InflaterInputStream(entityStream);
        } else if ("identity".equalsIgnoreCase(contentEncoding) || "none".equalsIgnoreCase(contentEncoding)) {
            return entityStream;
        } else {
            // shouldn't be reached given check on setContentEncoding
            logger.log(Level.INFO,"Unknown content-encoding '"+contentEncoding+"' declared; using raw entity instead");
            return entityStream;
        }
    }

    /**
     * Return a short prefix of the presumed-textual content as a String,
     * decoded with this recorder's current charset.
     *
     * @param size max length of String to return
     * @return String prefix, or empty String (with logged exception) on any error
     */
    public String getContentReplayPrefixString(int size) {
        return getContentReplayPrefixString(size, this.charset);
    }

    /**
     * Return a short prefix of the presumed-textual content as a String.
     *
     * Reads until <code>size</code> characters have been decoded or the
     * content ends, so a short read from the underlying stream does not
     * truncate the prefix. The stream is always closed.
     *
     * @param size max length of String to return; non-positive values yield
     * an empty String
     * @param cs Charset with which to decode the content
     * @return String prefix. On an IO error the exception is logged and
     * whatever was decoded before the error (possibly the empty String) is
     * returned.
     */
    public String getContentReplayPrefixString(int size, Charset cs) {
        if (size <= 0) {
            return "";
        }
        char[] chars = new char[size];
        int filled = 0;
        InputStream contentStream = null;
        InputStreamReader isr = null;
        try {
            contentStream = getContentReplayInputStream();
            isr = new InputStreamReader(contentStream, cs);
            while (filled < size) {
                int count = isr.read(chars, filled, size - filled);
                if (count <= 0) {
                    break; // end of content
                }
                filled += count;
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE,"unable to get replay prefix string", e);
        } finally {
            IOUtils.closeQuietly(isr);
            // also covers the case where the reader was never constructed
            IOUtils.closeQuietly(contentStream);
        }
        return new String(chars, 0, filled);
    }

    /**
     * Copy the decoded content body (see
     * {@link #getContentReplayInputStream()}) to the given file.
     *
     * @param tempFile file to write the content to
     * @throws IOException
     */
    public void copyContentBodyTo(File tempFile) throws IOException {
        InputStream inStream = null;
        OutputStream outStream = null;
        try {
            inStream = getContentReplayInputStream();
            outStream = FileUtils.openOutputStream(tempFile);
            IOUtils.copy(inStream, outStream);
        } finally {
            IOUtils.closeQuietly(inStream);
            IOUtils.closeQuietly(outStream);
        }
    }

    /**
     * Record the input stream for later playback by an extractor, etc.
     * This is convenience method used to setup an artificial HttpRecorder
     * scenario used in unit tests, etc.
     * @param dir Directory to write backing file to.
     * @param basename of what we're recording.
     * @param in Stream to read.
     * @param encoding Stream encoding.
     * @throws IOException
     * @return An {@link org.archive.util.Recorder}.
     */
    public static Recorder wrapInputStreamWithHttpRecord(File dir,
            String basename, InputStream in, String encoding)
            throws IOException {
        Recorder rec = new Recorder(dir, basename);
        if (encoding != null && encoding.length() > 0) {
            rec.setCharset(Charset.forName(encoding));
        }
        // Do not use FastBufferedInputStream here. It does not
        // support mark.
        InputStream is = rec.inputWrap(new BufferedInputStream(in));
        final int BUFFER_SIZE = 1024 * 4;
        byte [] buffer = new byte[BUFFER_SIZE];
        boolean drained = false;
        try {
            // Just read it all down.
            while (is.read(buffer) != -1) {
                // recording happens as a side effect of reading
            }
            drained = true;
        } finally {
            if (drained) {
                is.close();
            } else {
                // a read failed; close without masking the read's exception
                IOUtils.closeQuietly(is);
            }
        }
        return rec;
    }

    /**
     * Close any cached ReplayCharSequence and clear both recording streams
     * for reuse. Best-effort: IO failures are deliberately ignored.
     */
    public void endReplays() {
        ArchiveUtils.closeQuietly(replayCharSequence);
        replayCharSequence = null;
        // like closeQuietly
        try {
            ris.clearForReuse();
        } catch (IOException ignored) {
        }
        // like closeQuietly
        try {
            ros.clearForReuse();
        } catch (IOException ignored) {
        }
    }
}