/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.io;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Blob;
import java.sql.SQLException;
import org.apache.tika.metadata.Metadata;
/**
* Input stream with extended capabilities. The purpose of this class is
* to allow files and other resources and information to be associated with
* the {@link InputStream} instance passed through the
* {@link org.apache.tika.parser.Parser} interface and other similar APIs.
* <p>
* TikaInputStream instances can be created using the various static
* <code>get()</code> factory methods. Most of these methods take an optional
* {@link Metadata} argument that is then filled with the available input
* metadata from the given resource. The created TikaInputStream instance
* keeps track of the original resource used to create it, while behaving
* otherwise just like a normal, buffered {@link InputStream}.
* A TikaInputStream instance is also guaranteed to support the
* {@link #mark(int)} feature.
* <p>
* Code that wants to access the underlying file or other resources
* associated with a TikaInputStream should first use the
* {@link #get(InputStream)} factory method to cast or wrap a given
* {@link InputStream} into a TikaInputStream instance.
*
* @since Apache Tika 0.8
*/
public class TikaInputStream extends ProxyInputStream {
/**
 * Tells whether the given stream already is a TikaInputStream.
 * A <code>null</code> argument is accepted and yields <code>false</code>.
 *
 * @param stream input stream to test, possibly <code>null</code>
 * @return <code>true</code> if the stream is a TikaInputStream instance,
 *         <code>false</code> otherwise
 */
public static boolean isTikaInputStream(InputStream stream) {
    // Class.isInstance() behaves like instanceof, including false for null
    return TikaInputStream.class.isInstance(stream);
}
/**
 * Returns a TikaInputStream view of the given stream. A stream that
 * already is a TikaInputStream is returned as-is; any other stream is
 * wrapped in a {@link BufferedInputStream} with no associated file and
 * an unknown (-1) length.
 *
 * @param stream normal input stream
 * @return a TikaInputStream instance
 */
public static TikaInputStream get(InputStream stream) {
    if (!(stream instanceof TikaInputStream)) {
        InputStream buffered = new BufferedInputStream(stream);
        return new TikaInputStream(buffered, null, -1);
    }
    return (TikaInputStream) stream;
}
/**
 * Creates a TikaInputStream over the given array of bytes, using a
 * throw-away {@link Metadata} instance for the input metadata.
 *
 * @param data input data
 * @return a TikaInputStream instance
 */
public static TikaInputStream get(byte[] data) {
    Metadata scratch = new Metadata();
    return get(data, scratch);
}
/**
 * Creates a TikaInputStream over the given array of bytes. The array
 * length is recorded under {@link Metadata#CONTENT_LENGTH} in the given
 * metadata instance and is also used as the known stream length.
 *
 * @param data input data
 * @param metadata metadata instance
 * @return a TikaInputStream instance
 */
public static TikaInputStream get(byte[] data, Metadata metadata) {
    int size = data.length;
    metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(size));
    InputStream bytes = new ByteArrayInputStream(data);
    return new TikaInputStream(bytes, null, size);
}
/**
 * Creates a TikaInputStream that reads the given file, using a
 * throw-away {@link Metadata} instance for the input metadata.
 *
 * @param file input file
 * @return a TikaInputStream instance
 * @throws FileNotFoundException if the file does not exist
 */
public static TikaInputStream get(File file) throws FileNotFoundException {
    Metadata scratch = new Metadata();
    return get(file, scratch);
}
/**
 * Creates a TikaInputStream that reads the given file. The file name is
 * recorded under {@link Metadata#RESOURCE_NAME_KEY} and the file length
 * under {@link Metadata#CONTENT_LENGTH} in the given metadata instance.
 * The returned stream keeps a reference to the file and is not flagged
 * as owning a temporary file.
 *
 * @param file input file
 * @param metadata metadata instance
 * @return a TikaInputStream instance
 * @throws FileNotFoundException if the file does not exist
 */
public static TikaInputStream get(File file, Metadata metadata)
        throws FileNotFoundException {
    String name = file.getName();
    metadata.set(Metadata.RESOURCE_NAME_KEY, name);

    String size = Long.toString(file.length());
    metadata.set(Metadata.CONTENT_LENGTH, size);

    InputStream content = new BufferedInputStream(new FileInputStream(file));
    return new TikaInputStream(content, file, file.length());
}
/**
 * Creates a TikaInputStream from the given database BLOB, using a
 * throw-away {@link Metadata} instance for the input metadata.
 * <p>
 * Note that the result set containing the BLOB may need to be kept open
 * until the returned TikaInputStream has been processed and closed.
 *
 * @param blob database BLOB
 * @return a TikaInputStream instance
 * @throws SQLException if BLOB data can not be accessed
 */
public static TikaInputStream get(Blob blob) throws SQLException {
    Metadata scratch = new Metadata();
    return get(blob, scratch);
}
/**
 * Largest BLOB size (in bytes) that the {@link #get(Blob, Metadata)}
 * method will buffer fully in memory; larger BLOBs, and BLOBs whose
 * length is unknown, are streamed instead.
 */
private static final int BLOB_SIZE_THRESHOLD = 1024 * 1024;

/**
 * Creates a TikaInputStream from the given database BLOB. The BLOB
 * length, when it can be determined, is recorded under
 * {@link Metadata#CONTENT_LENGTH} in the given metadata instance.
 * <p>
 * BLOBs of up to {@link #BLOB_SIZE_THRESHOLD} bytes are copied into
 * memory; anything else is read through the BLOB's binary stream, in
 * which case the result set containing the BLOB may need to be kept open
 * until the returned TikaInputStream has been processed and closed.
 *
 * @param blob database BLOB
 * @param metadata metadata instance
 * @return a TikaInputStream instance
 * @throws SQLException if BLOB data can not be accessed
 */
public static TikaInputStream get(Blob blob, Metadata metadata)
        throws SQLException {
    long length = -1;
    try {
        length = blob.length();
        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
    } catch (SQLException ignored) {
        // Best effort only: the length simply stays unknown (-1)
    }

    // Prefer an in-memory buffer for reasonably sized blobs to reduce
    // the likelihood of problems caused by long-lived database accesses
    boolean fitsInMemory = length >= 0 && length <= BLOB_SIZE_THRESHOLD;
    if (!fitsInMemory) {
        InputStream streamed = new BufferedInputStream(blob.getBinaryStream());
        return new TikaInputStream(streamed, null, length);
    }

    // Blob.getBytes() uses 1-based offsets
    byte[] data = blob.getBytes(1, (int) length);
    return get(data, metadata);
}
/**
 * Creates a TikaInputStream from the resource at the given URI, using a
 * throw-away {@link Metadata} instance for the input metadata.
 *
 * @param uri resource URI
 * @return a TikaInputStream instance
 * @throws IOException if the resource can not be accessed
 */
public static TikaInputStream get(URI uri) throws IOException {
    Metadata scratch = new Metadata();
    return get(uri, scratch);
}
/**
 * Creates a TikaInputStream from the resource at the given URI. The
 * available input metadata is stored in the given metadata instance.
 * <p>
 * A URI with the <code>file</code> scheme that maps to an existing
 * regular file is opened directly as a {@link File}. Every other URI,
 * including a <code>file</code> URI that can not be converted to a local
 * path, is converted to a {@link URL} and handled by
 * {@link #get(URL, Metadata)}.
 *
 * @param uri resource URI
 * @param metadata metadata instance
 * @return a TikaInputStream instance
 * @throws IOException if the resource can not be accessed
 */
public static TikaInputStream get(URI uri, Metadata metadata)
        throws IOException {
    // Special handling for file:// URIs
    if ("file".equalsIgnoreCase(uri.getScheme())) {
        File file = null;
        try {
            file = new File(uri);
        } catch (IllegalArgumentException e) {
            // File(URI) rejects URIs with an authority, query or fragment
            // and opaque URIs; treat those as "not a local file" and fall
            // through to the generic URL handling below
        }
        if (file != null && file.isFile()) {
            return get(file, metadata);
        }
    }
    return get(uri.toURL(), metadata);
}
/**
 * Creates a TikaInputStream from the resource at the given URL, using a
 * throw-away {@link Metadata} instance for the input metadata.
 *
 * @param url resource URL
 * @return a TikaInputStream instance
 * @throws IOException if the resource can not be accessed
 */
public static TikaInputStream get(URL url) throws IOException {
    Metadata scratch = new Metadata();
    return get(url, scratch);
}
/**
 * Creates a TikaInputStream from the resource at the given URL. The
 * available input metadata is stored in the given metadata instance.
 * <p>
 * A URL with the <code>file</code> protocol that maps to an existing
 * regular file is opened directly as a {@link File}. Otherwise a
 * connection is opened and the following metadata is recorded when
 * available: the last path segment as
 * {@link Metadata#RESOURCE_NAME_KEY}, the content type as
 * {@link Metadata#CONTENT_TYPE}, the content encoding as
 * {@link Metadata#CONTENT_ENCODING} and the content length as
 * {@link Metadata#CONTENT_LENGTH}.
 *
 * @param url resource URL
 * @param metadata metadata instance
 * @return a TikaInputStream instance
 * @throws IOException if the resource can not be accessed
 */
public static TikaInputStream get(URL url, Metadata metadata)
        throws IOException {
    // Special handling for file:// URLs
    if ("file".equalsIgnoreCase(url.getProtocol())) {
        File file = null;
        try {
            file = new File(url.toURI());
        } catch (URISyntaxException e) {
            // not a valid URI; fall through to the URL connection
        } catch (IllegalArgumentException e) {
            // File(URI) rejects URIs with an authority, query or fragment
            // and opaque URIs; fall through to the URL connection
        }
        if (file != null && file.isFile()) {
            return get(file, metadata);
        }
    }

    URLConnection connection = url.openConnection();

    // Use the last path segment as the resource name. lastIndexOf()
    // returns -1 when there is no slash, in which case slash + 1 == 0
    // and the whole path is used.
    String path = url.getPath();
    int slash = path.lastIndexOf('/');
    if (slash + 1 < path.length()) {
        metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash + 1));
    }

    String type = connection.getContentType();
    if (type != null) {
        metadata.set(Metadata.CONTENT_TYPE, type);
    }

    String encoding = connection.getContentEncoding();
    if (encoding != null) {
        metadata.set(Metadata.CONTENT_ENCODING, encoding);
    }

    // A negative length is not recorded as metadata but is still passed
    // on as the stream length
    int length = connection.getContentLength();
    if (length >= 0) {
        metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
    }

    return new TikaInputStream(
            new BufferedInputStream(connection.getInputStream()),
            null, length);
}
/**
* The file that contains the contents of this stream. This is either
* the original file passed to the
* {@link #TikaInputStream(InputStream, File, long)} constructor or a
* temporary file created by a call to the {@link #getFile()} method.
* If no file was passed and {@link #getFile()} has not been called,
* then the value is <code>null</code>.
*/
private File file;
/**
* Flag to indicate that {@link #file} is a temporary file that should
* be removed when this stream is {@link #close() closed}.
*/
private boolean temporary;
/**
* Total length of the stream, or -1 if unknown.
*/
private long length;
/**
* Current read position within this stream.
*/
private long position = 0;
/**
* Marked position, or -1 if there is no current mark.
*/
private long mark = -1;
/**
* An opened container, such as a POIFS FileSystem
* for an OLE2 document, or a Zip file for a
* zip based (eg ooxml, odf) document.
*/
private Object openContainer;
/**
* Creates a TikaInputStream instance. This private constructor is used
* by the static factory methods based on the available information.
*
* @param stream <em>buffered</em> stream (must support the mark feature)
* @param file the file that contains the stream, or <code>null</code>
* @param length total length of the stream, or -1 if unknown
*/
private TikaInputStream(InputStream stream, File file, long length) {
super(stream);
this.file = file;
this.temporary = (file == null);
this.length = length;
}
/**
 * Copies upcoming bytes from this stream into the given buffer without
 * advancing the current stream position. The stream is marked, read
 * until the buffer is full or the end of the stream is reached, and
 * then reset to the mark. This method will block if not enough bytes
 * are immediately available.
 *
 * @param buffer byte buffer
 * @return number of bytes written to the buffer
 * @throws IOException if the stream can not be read
 */
public int peek(byte[] buffer) throws IOException {
    final int capacity = buffer.length;
    int filled = 0;

    mark(capacity);
    int count = read(buffer);
    while (count != -1) {
        filled += count;
        // Stop once the buffer is full, otherwise keep reading the rest
        count = (filled < capacity)
                ? read(buffer, filled, capacity - filled)
                : -1;
    }
    reset();

    return filled;
}
/**
 * Returns the open container object stored against this stream, such
 * as a POIFS FileSystem in the event of an OLE2 document being detected
 * and processed by the OLE2 detector.
 *
 * @return the stored container object, or <code>null</code> if none
 *         has been set
 */
public Object getOpenContainer() {
    return this.openContainer;
}
/**
 * Stores the open container object against this stream, e.g. after a
 * Zip contents detector has loaded the file to decide what it contains.
 * The reference is dropped again when the stream is closed.
 *
 * @param container the container object to remember
 */
public void setOpenContainer(Object container) {
    this.openContainer = container;
}
/**
 * Tells whether a file is currently associated with this stream, i.e.
 * whether {@link #getFile()} can return without spooling the stream.
 *
 * @return <code>true</code> if a file reference is held,
 *         <code>false</code> otherwise
 */
public boolean hasFile() {
    return !(this.file == null);
}
/**
 * Returns a file that contains the contents of this stream. If a file
 * is already associated with this stream, it is returned directly.
 * Otherwise the whole stream is spooled to a newly created temporary
 * file, the original stream is closed, and this stream is re-pointed at
 * the temporary file.
 * <p>
 * The temporary file only becomes associated with this stream once it
 * has been completely written. If spooling fails, the partially written
 * file is deleted and the exception is propagated, so a truncated file
 * is never returned by a later call.
 *
 * @return the file containing the stream contents
 * @throws IOException if the stream has already been closed or partially
 *         read, or if the contents can not be spooled to a file
 */
public File getFile() throws IOException {
    if (file != null) {
        return file;
    }
    if (in == null) {
        throw new IOException("Stream has already been read");
    }
    if (position > 0) {
        throw new IOException("Stream is already being read");
    }

    File spool = File.createTempFile("apache-tika-", ".tmp");
    boolean complete = false;
    try {
        OutputStream out = new FileOutputStream(spool);
        try {
            IOUtils.copy(in, out);
        } finally {
            out.close();
        }
        in.close();
        // Re-point the stream at the file now we have it
        in = new BufferedInputStream(new FileInputStream(spool));
        file = spool;
        complete = true;
    } finally {
        if (!complete) {
            // Don't leave a truncated temporary file behind
            spool.delete();
        }
    }
    return file;
}
/**
 * Tells whether the total length of this stream is already known, i.e.
 * whether {@link #getLength()} can return without spooling the stream.
 *
 * @return <code>true</code> if the length is known, <code>false</code>
 *         if it is still the "unknown" marker -1
 */
public boolean hasLength() {
    return !(this.length == -1);
}
/**
 * Returns the length (in bytes) of this stream. If the length was not
 * available when this stream was instantiated, the {@link #getFile()}
 * method is used to buffer the entire stream to a temporary file and
 * the length of that file is remembered and returned. That fallback
 * only works if the stream has not yet been consumed.
 *
 * @return stream length
 * @throws IOException if the length can not be determined
 */
public long getLength() throws IOException {
    if (length != -1) {
        return length;
    }
    length = getFile().length();
    return length;
}
/**
 * Returns the number of bytes available from the underlying stream, or
 * zero once both the stream and the file reference have been cleared
 * (the state left behind by {@link #close()}).
 */
@Override
public int available() throws IOException {
    boolean released = (in == null) && (file == null);
    return released ? 0 : super.available();
}
/**
 * Skips bytes in the underlying stream and advances the tracked read
 * position by the number of bytes actually skipped. Nothing is skipped
 * once both the stream and the file reference have been cleared.
 */
@Override
public long skip(long ln) throws IOException {
    if (in != null || file != null) {
        long skipped = super.skip(ln);
        position += skipped;
        return skipped;
    }
    return 0;
}
/**
 * Reads a single byte, or reports end of stream (-1) once both the
 * stream and the file reference have been cleared.
 */
@Override
public int read() throws IOException {
    if (in != null || file != null) {
        return super.read();
    }
    return -1;
}
/**
 * Reads up to <code>len</code> bytes into the given array, or reports
 * end of stream (-1) once both the stream and the file reference have
 * been cleared.
 */
@Override
public int read(byte[] bts, int off, int len) throws IOException {
    boolean released = (in == null) && (file == null);
    return released ? -1 : super.read(bts, off, len);
}
/**
 * Reads bytes into the whole of the given array by delegating to
 * {@link #read(byte[], int, int)}.
 */
@Override
public int read(byte[] bts) throws IOException {
    final int capacity = bts.length;
    return read(bts, 0, capacity);
}
/**
 * Marks the underlying stream and remembers the current read position
 * as the marked position.
 */
@Override
public void mark(int readlimit) {
    super.mark(readlimit);
    this.mark = this.position;
}
/**
 * Always reports that the mark feature is supported.
 *
 * @return <code>true</code>
 */
@Override
public boolean markSupported() {
    return true;
}
/**
 * Resets the underlying stream, restores the tracked read position to
 * the marked position and then forgets the mark.
 * <p>
 * NOTE(review): if the underlying stream accepts a reset while no mark
 * is tracked here (marked position -1), the tracked position is set
 * to -1 as well; confirm whether callers can reach that state.
 */
@Override
public void reset() throws IOException {
    super.reset();
    this.position = this.mark;
    this.mark = -1;
}
/**
 * Closes this stream and releases everything associated with it: the
 * open container reference is dropped, the underlying stream is closed,
 * and the associated file is deleted if it is flagged as temporary.
 * <p>
 * The file clean-up runs in a <code>finally</code> block so that a
 * failure to close the underlying stream does not leave a temporary
 * file behind or stale references in this object.
 *
 * @throws IOException if the underlying stream can not be closed
 */
@Override
public void close() throws IOException {
    openContainer = null;
    try {
        if (in != null) {
            in.close();
        }
    } finally {
        in = null;
        if (file != null) {
            if (temporary) {
                // Best effort: a failed delete is not reported
                file.delete();
            }
            file = null;
        }
    }
}
/**
 * Makes sure there is an underlying stream to read from. If the stream
 * reference has been cleared but a file is still associated, the file
 * is reopened through a {@link BufferedInputStream} so that the mark
 * feature promised by {@link #markSupported()} keeps working, matching
 * how {@link #getFile()} re-points the stream.
 *
 * @param n number of bytes about to be read (not used here)
 * @throws IOException if neither a stream nor a file is available, or
 *         if the file can not be opened
 */
@Override
protected void beforeRead(int n) throws IOException {
    if (in != null) {
        return;
    }
    if (file == null) {
        throw new IOException("End of the stream reached");
    }
    in = new BufferedInputStream(new FileInputStream(file));
}
/**
 * Updates the stream state after a read. A successful read advances the
 * tracked position by the number of bytes read. When the end of the
 * stream is signalled (-1) and no mark is currently set, the stream is
 * closed; with a mark set it is left open.
 *
 * @param n number of bytes read, or -1 at the end of the stream
 * @throws IOException if closing the stream fails
 */
@Override
protected void afterRead(int n) throws IOException {
    if (n == -1) {
        if (mark == -1) {
            close();
        }
        return;
    }
    position += n;
}
}