/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.utils; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; /** * Wraps an input stream, reading it only once, but making it available * for rereading an arbitrary number of times. The stream's bytes are * stored in memory up to a user specified maximum, and then stored in a * temporary file which is deleted when this class' close() method is called. */ public class RereadableInputStream extends InputStream { /** * Input stream originally passed to the constructor. */ private InputStream originalInputStream; /** * The inputStream currently being used by this object to read contents; * may be the original stream passed in, or a stream that reads * the saved copy. */ private InputStream inputStream; /** * Maximum number of bytes that can be stored in memory before * storage will be moved to a temporary file. */ private int maxBytesInMemory; /** * True when the original stream is being read; set to false when * reading is set to use the stored data instead. */ private boolean firstPass = true; /** * Whether or not the stream's contents are being stored in a file * as opposed to memory. */ private boolean bufferIsInFile; /** * The buffer used to store the stream's content; this storage is moved * to a file when the stored data's size exceeds maxBytesInMemory. */ private byte[] byteBuffer; /** * The total number of bytes read from the original stream at the time. */ private int size; /** * File used to store the stream's contents; is null until the stored * content's size exceeds maxBytesInMemory. */ private File storeFile; /** * OutputStream used to save the content of the input stream in a * temporary file. */ private OutputStream storeOutputStream; /** * Specifies whether or not to read to the end of stream on first * rewind. This defaults to true. If this is set to false, * then the first time when rewind() is called, only those bytes * already read from the original stream will be available from then on. */ private boolean readToEndOfStreamOnFirstRewind = true; /** * Specifies whether or not to close the original input stream * when close() is called. Defaults to true. */ private boolean closeOriginalStreamOnClose = true; // TODO: At some point it would be better to replace the current approach // (specifying the above) with more automated behavior. The stream could // keep the original stream open until EOF was reached. For example, if: // // the original stream is 10 bytes, and // only 2 bytes are read on the first pass // rewind() is called // 5 bytes are read // // In this case, this instance gets the first 2 from its store, // and the next 3 from the original stream, saving those additional 3 // bytes in the store. In this way, only the maximum number of bytes // ever needed must be saved in the store; unused bytes are never read. // The original stream is closed when EOF is reached, or when close() // is called, whichever comes first. Using this approach eliminates // the need to specify the flag (though makes implementation more complex). /** * Creates a rereadable input stream. * * @param inputStream stream containing the source of data * @param maxBytesInMemory maximum number of bytes to use to store * the stream's contents in memory before switching to disk; note that * the instance will preallocate a byte array whose size is * maxBytesInMemory. This byte array will be made available for * garbage collection (i.e. its reference set to null) when the * content size exceeds the array's size, when close() is called, or * when there are no more references to the instance. * @param readToEndOfStreamOnFirstRewind Specifies whether or not to * read to the end of stream on first rewind. If this is set to false, * then when rewind() is first called, only those bytes already read * from the original stream will be available from then on. */ public RereadableInputStream(InputStream inputStream, int maxBytesInMemory, boolean readToEndOfStreamOnFirstRewind, boolean closeOriginalStreamOnClose) { this.inputStream = inputStream; this.originalInputStream = inputStream; this.maxBytesInMemory = maxBytesInMemory; byteBuffer = new byte[maxBytesInMemory]; this.readToEndOfStreamOnFirstRewind = readToEndOfStreamOnFirstRewind; this.closeOriginalStreamOnClose = closeOriginalStreamOnClose; } /** * Reads a byte from the stream, saving it in the store if it is being * read from the original stream. Implements the abstract * InputStream.read(). * * @return the read byte, or -1 on end of stream. * @throws IOException */ public int read() throws IOException { int inputByte = inputStream.read(); if (firstPass) { saveByte(inputByte); } return inputByte; } /** * "Rewinds" the stream to the beginning for rereading. * @throws IOException */ public void rewind() throws IOException { if (firstPass && readToEndOfStreamOnFirstRewind) { // Force read to end of stream to fill store with any // remaining bytes from original stream. while(read() != -1) { // empty loop } } closeStream(); if (storeOutputStream != null) { storeOutputStream.close(); storeOutputStream = null; } firstPass = false; boolean newStreamIsInMemory = (size < maxBytesInMemory); inputStream = newStreamIsInMemory ? new ByteArrayInputStream(byteBuffer) : new BufferedInputStream(new FileInputStream(storeFile)); } /** * Closes the input stream currently used for reading (may either be * the original stream or a memory or file stream after the first pass). * * @throws IOException */ // Does anyone need/want for this to be public? private void closeStream() throws IOException { if (inputStream != null && (inputStream != originalInputStream || closeOriginalStreamOnClose)) { inputStream.close(); inputStream = null; } } /** * Closes the input stream and removes the temporary file if one was * created. * * @throws IOException */ public void close() throws IOException { closeStream(); super.close(); if (storeFile != null) { storeFile.delete(); } } /** * Returns the number of bytes read from the original stream. * * @return number of bytes read */ public int getSize() { return size; } /** * Saves the byte read from the original stream to the store. * * @param inputByte byte read from original stream * @throws IOException */ private void saveByte(int inputByte) throws IOException { if (!bufferIsInFile) { boolean switchToFile = (size == (maxBytesInMemory)); if (switchToFile) { storeFile = File.createTempFile("TIKA_streamstore_", ".tmp"); bufferIsInFile = true; storeOutputStream = new BufferedOutputStream( new FileOutputStream(storeFile)); storeOutputStream.write(byteBuffer, 0, size); storeOutputStream.write(inputByte); byteBuffer = null; // release for garbage collection } else { byteBuffer[size] = (byte) inputByte; } } else { storeOutputStream.write(inputByte); } ++size; } }