package eu.fbk.knowledgestore.data;
import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Date;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.io.ByteStreams;
import com.google.common.io.CharSource;
import com.google.common.io.CharStreams;
import com.google.common.net.MediaType;
import org.openrdf.model.URI;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.fbk.knowledgestore.vocabulary.KS;
import eu.fbk.knowledgestore.vocabulary.NFO;
import eu.fbk.knowledgestore.vocabulary.NIE;
import eu.fbk.rdfpro.util.IO;
/**
* A digital representation of a resource.
* <p>
* A {@code Representation} object provides access to the binary or character representation of a
* resource, including associated (mutable) representation metadata ({@link #getMetadata()}) and
* the resource ID ( {@link #getResourceID()}).
* </p>
* <p>
* Representation data can be consumed either as a stream of bytes or as a stream of characters.
* Conversion from one stream to another is performed if necessary using the charset encoded by
* metadata attribute {@link NIE#MIME_TYPE}.
* </p>
* <p>
* A representation encapsulates an open {@code InputStream} or {@code Reader} that provide access
* to the representation data (see methods {@link #getInputStream()} and {@link #getReader()}),
* hence it is a {@code Closeable} object that MUST be closed after use. In addition to these
* methods, a number of {@code writeToXXX()} helper methods allow to consume the representation
* data in different ways:
* </p>
* <ul>
* <li>{@link #writeToByteArray()} returns a byte array with all the representation data;</li>
* <li>{@link #writeToString()} returns a string with all the representation characater data.</li>
* <li>{@link #writeTo(OutputStream)} writes all the binary representation data to a supplied
* {@code OutputStream};</li>
* <li>{@link #writeTo(Appendable)} writes the character representation data to a supplied
* {@code Appendable};</li>
* </ul>
* <p>
* Note that these methods exhaust the {@code InputStream} / {@code Reader} associated to the
* representation. Moreover, in case some data has already been read, it will not be written by
* those methods.
* </p>
* <p>
* Representation objects are created via {@code create()} factory methods that take care of
* acquiring both an {@code InputStream} / {@code Reader} over the data and the associated
* metadata starting from a number of sources:
* </p>
* <ul>
* <li>{@link #create(URI, InputStream)} builds a representation out of an {@code InputStream};</li>
* <li>{@link #create(URI, Reader)} builds a representation out of a {@code Reader} ;</li>
* <li>{@link #create(URI, byte[])} builds a representation out of a byte array, including
* metadata about the representation length;</li>
* <li>{@link #create(URI, CharSequence)} builds a representation out of a {@code CharSequence}
* (e.g., a {@code String});</li>
* <li>{@link #create(URI, File)} builds a representation out of a file, including metadata about
* the file name, size, mime type (from the extension) and last modified time;</li>
* <li>{@link #create(URI, URL)} builds a representation out of a resolvable URL, including
* metadata about the file name, file size, mime type (from the extension), MD5 hash, last
* modified time.</li>
* </ul>
* <p>
* Representation objects are mutable but thread safe. Object equality is used for
* {@code equals()} and {@code hashCode()}.
* </p>
*/
public final class Representation implements Closeable {
private static final Logger LOGGER = LoggerFactory.getLogger(Representation.class);
private final Closeable data; // InputStream or Reader
private final Record metadata;
private Representation(final Closeable data) {
this.data = Preconditions.checkNotNull(data);
this.metadata = Record.create(null, KS.REPRESENTATION);
}
private Charset getCharset() {
final String mimeType = this.metadata.getUnique(NIE.MIME_TYPE, String.class);
if (mimeType == null) {
return Charsets.UTF_8;
}
try {
return MediaType.parse(mimeType).charset().or(Charsets.UTF_8);
} catch (final Throwable ex) {
throw new IllegalArgumentException("Invalid mime type in metadata: " + mimeType, ex);
}
}
@Override
protected void finalize() throws Throwable {
try {
close();
} finally {
super.finalize();
}
}
/**
* Creates a representation based on the {@code InputStream} specified. Note that the supplied
* {@code InputStream} is never closed by this class: it MUST be closed externally under the
* responsibility of the caller.
*
* @param stream
* the {@code InputStream}, not null
* @return the created representation
*/
public static Representation create(final InputStream stream) {
return new Representation(stream);
}
/**
* Creates a representation based on the byte array specified. The length of the byte array
* will be reflected in the returned representation metadata (property {@link NFO#FILE_SIZE}).
* Note that the byte array should not be changed after calling this method, as modification
* could be (partially) reflected in the returned representation.
*
* @param bytes
* the byte array containing the binary data of the representation
* @return the created representation
*/
public static Representation create(final byte[] bytes) {
final Representation representation = new Representation(new ByteArrayInputStream(bytes));
representation.metadata.set(NFO.FILE_SIZE, (long) bytes.length);
return representation;
}
/**
* Creates a representation based on the {@code File} specified. The file length, size,
* creation time and MIME type will be reflected in the returned representation metadata
* (respectively, properties {@link NFO#FILE_SIZE}, {@link NFO#FILE_NAME},
* {@link NFO#FILE_LAST_MODIFIED}, {@link NIE#MIME_TYPE}). Note that this method causes the
* file to be opened for reading.
*
* @param file
* the file containing the binary data of the representation
* @param autoDecompress
* automatically decompress the file, if compressed with gzip, bzip2, xz, 7z or lz4
* @return the created representation
* @throws IllegalArgumentException
* in case the file does not exist
*/
public static Representation create(final File file, final boolean autoDecompress)
throws IllegalArgumentException {
try {
String name = file.getName();
final Representation representation;
if (autoDecompress) {
byte[] bytes = ByteStreams.toByteArray(IO.read(file.getAbsolutePath()));
representation = new Representation(new ByteArrayInputStream(bytes));
if (name.endsWith(".gz") || name.endsWith(".xz") || name.endsWith(".7z")) {
name = name.substring(0, name.length() - 3);
} else if (name.endsWith(".bz2") || name.endsWith(".lz4")) {
name = name.substring(0, name.length() - 4);
}
} else {
representation = new Representation(IO.buffer(new FileInputStream(file)));
}
representation.metadata.set(NFO.FILE_SIZE, file.length());
representation.metadata.set(NFO.FILE_NAME, name);
representation.metadata.set(NFO.FILE_LAST_MODIFIED, new Date(file.lastModified()));
representation.metadata.set(NIE.MIME_TYPE, Data.extensionToMimeType(name));
return representation;
} catch (final FileNotFoundException ex) {
throw new IllegalArgumentException("Not a file: " + file.getAbsolutePath());
} catch (final IOException e) {
throw new IllegalArgumentException("IOException on file: " + file.getAbsolutePath());
}
}
/**
* Creates a representation based on the resolvable URL specified. This method has the effect
* of acquiring a connection to the supplied URL, from which the representation stream and a
* number of metadata attributes are extracted. These attributes include the last modified
* timestamp ({@link NFO#FILE_LAST_MODIFIED}), the MIME type ({@link NIE#MIME_TYPE}), the file
* size ({@link NFO#FILE_SIZE}), the file name ({@link NFO#FILE_NAME}) and the MD5 hash (
* {@link NFO#HAS_HASH}); all of these attributes are optional and are extracted only if
* available.
*
* @param url
* the URL that, resolved, will produced the binary data of the representation
* @return the created representation
* @throws IllegalArgumentException
* in case acquiring a connection to the supplied URL fails
*/
public static Representation create(final URL url) throws IllegalArgumentException {
// Acquire a connection and open an InputStream over its entity content.
URLConnection connection;
InputStream stream;
try {
connection = url.openConnection();
connection.connect();
stream = connection.getInputStream();
} catch (final IOException ex) {
throw new IllegalArgumentException("Cannot acquire a connection to URL " + url, ex);
}
// Wrap the stream in a Representation object.
final Representation representation = new Representation(stream);
try {
// Extract last modified.
final long lastModified = connection.getLastModified();
if (lastModified != 0) {
representation.metadata.set(NFO.FILE_LAST_MODIFIED, new Date(lastModified));
}
// Extract MIME type from "Content-Type".
String mimeType = connection.getContentType();
if (mimeType == null) {
mimeType = Data.extensionToMimeType(url.getFile());
}
representation.metadata.set(NIE.MIME_TYPE, mimeType);
// Extract length from "Content-Length";
final int length = connection.getContentLength();
if (length >= 0) {
representation.metadata.set(NFO.FILE_SIZE, length);
}
// Extract the filename either from "Content-Disposition" header or from URL.
String filename = null;
final String disposition = connection.getHeaderField("Content-Disposition");
if (disposition != null && disposition.contains("filename")) {
final int start = Math.max(disposition.indexOf('\"'), disposition.indexOf('\''));
if (start > 0) {
final int end = Math.max(disposition.lastIndexOf('\"'),
disposition.lastIndexOf('\''));
if (end > 0) {
filename = disposition.substring(start + 1, end);
}
}
}
if (filename == null) {
final String path = url.getPath();
final int index = path.lastIndexOf('/');
if (index >= 0) {
filename = path.substring(index + 1);
}
}
representation.metadata.set(NFO.FILE_NAME, filename);
// Extract the MD5 hash from "Content-MD5".
final String md5 = connection.getHeaderField("Content-MD5");
if (md5 != null) {
final Record hash = Record.create();
hash.set(NFO.HASH_ALGORITHM, "MD5");
hash.set(NFO.HASH_VALUE, md5);
representation.metadata.set(NFO.HAS_HASH, hash);
}
// Return the representation built.
return representation;
} catch (final Throwable ex) {
// Ensure to close the connection if something goes wrong.
try {
connection.getInputStream().close();
} catch (final Throwable ex2) {
// ignore
}
throw Throwables.propagate(ex);
}
}
/**
* Creates a representation based on the {@code Reader} specified. Note that the supplied
* {@code Reader} is never closed by this class: it MUST be closed externally under the
* responsibility of the caller. Upon request (e.g., invocation of {@link #getInputStream()}),
* character data produced by the {@code Reader} will be translated into byte data either
* using the charset specified in the representation metadata (property {@link NIE#MIME_TYPE})
* or by using UTF-8.
*
* @param reader
* the reader producing the character data of the representation
* @return the created representation
*/
public static Representation create(final Reader reader) {
Preconditions.checkNotNull(reader);
return new Representation(reader);
}
/**
* Creates a representation based on the {@code CharSequence} specified. Upon request (e.g.,
* invocation of {@link #getInputStream()}), character data produced by the {@code Reader}
* will be translated into byte data either using the charset specified in the representation
* metadata (property {@link NIE#MIME_TYPE}) or by using UTF-8.
*
* @param sequence
* the {@code CharSequence} with the character data of the representation
* @return the created representation
*/
public static Representation create(final CharSequence sequence) {
try {
return new Representation(CharSource.wrap(sequence).openStream());
} catch (final IOException ex) {
throw new Error("Unexpected exception (!): " + ex.getMessage(), ex);
}
}
/**
* Returns the metadata about this representation.
*
* @return the representation metadata, not null
*/
public Record getMetadata() {
return this.metadata;
}
/**
* Returns an {@code InputStream} over the binary data of this representation object.
* Conversion from character to byte data, if required, is performed according to the charset
* specified by the MIME type metadata property ({@link NIE#MIME_TYPE}).
*
* @return an {@code InputStream} over the binary content of this representation
*/
public InputStream getInputStream() {
if (this.data instanceof InputStream) {
return (InputStream) this.data;
} else {
final Reader reader = (Reader) this.data;
return new ReaderInputStream(reader, getCharset());
}
}
/**
* Returns a {@code Reader} over the character data of this representation object. Conversion
* from byte to character data, if required, is performed according to the charset specified
* by the MIME type metadata property ({@link NIE#MIME_TYPE}).
*
* @return a {@code Reader} providing access to the character data of the representation.
*/
public Reader getReader() {
if (this.data instanceof Reader) {
return (Reader) this.data;
} else {
final InputStream stream = (InputStream) this.data;
return new InputStreamReader(stream, getCharset());
}
}
/**
* Writes all the binary data of this representation to a {@code byte[]} object. Conversion
* from character to byte data, if required, is performed according to the charset specified
* by the MIME type metadata property ({@link NIE#MIME_TYPE}). If some data has been already
* read via {@code getInputStream()} or {@code getReaer()}, it will not be returned in the
* result.
*
* @return a byte array with the binary content of this representation
* @throws IOException
* in case access to binary data fails
*/
public byte[] writeToByteArray() throws IOException {
final InputStream stream = getInputStream();
try {
return ByteStreams.toByteArray(stream);
} finally {
stream.close();
}
}
/**
* Writes all the character data of this representation to a {@code String} object. Conversion
* from byte to character data, if required, is performed according to the charset specified
* by the MIME type metadata property ({@link NIE#MIME_TYPE}). If some data has been already
* read via {@code getInputStream()} or {@code getReaer()}, it will not be returned in the
* result.
*
* @return a {@code String} containg the full character-based content of this representation
* @throws IOException
* in case access to binary data fails
*/
public String writeToString() throws IOException {
final Reader reader = getReader();
try {
return CharStreams.toString(reader);
} finally {
reader.close();
}
}
/**
* Writes all the binary data of this representation to the {@code OutputStream} sink
* specified. Conversion from character to byte data, if required, is performed according to
* the charset specified by the MIME type metadata property ({@link NIE#MIME_TYPE}). If some
* data has been already read via {@code getInputStream()} or {@code getReaer()}, it will not
* be written to the supplied sink.
*
* @param sink
* the sink where to write binary data to
* @throws IOException
* in case access to binary data fails
*/
public void writeTo(final OutputStream sink) throws IOException {
final InputStream in = getInputStream();
try {
ByteStreams.copy(in, sink);
} finally {
in.close();
}
}
/**
* Writes all the character data of this representation to the {@code Appendable} sink
* specified. Conversion from byte to character data, if required, is performed according to
* the charset specified by the MIME type metadata property ({@link NIE#MIME_TYPE}). If some
* data has been already read via {@code getInputStream()} or {@code getReaer()}, it will not
* be written to the supplied sink.
*
* @param sink
* the sink where to write character data to
* @throws IOException
* in case access to binary data fails
*/
public void writeTo(final Appendable sink) throws IOException {
final Reader reader = getReader();
try {
CharStreams.copy(reader, sink);
} finally {
reader.close();
}
}
@Override
public void close() {
try {
this.data.close();
} catch (final Exception ex) {
LOGGER.warn("Exception caught while closing representation", ex);
}
}
/**
* {@inheritDoc} The returned representation contains the associated resource ID.
*/
@Override
public String toString() {
final String file = this.metadata.getUnique(NFO.FILE_NAME, String.class, "unnamed file");
final String type = this.metadata.getUnique(NIE.MIME_TYPE, String.class, "unknown type");
final long size = this.metadata.getUnique(NFO.FILE_SIZE, Long.class, -1L);
return file + ", " + type + ", " + (size >= 0 ? size + " bytes" : "unknown size");
}
// Source: org.apache.commons.io.input.ReaderInputStream
private class ReaderInputStream extends InputStream {
private static final int BUFFER_SIZE = 1024;
private final Reader reader;
private final CharsetEncoder enc;
private final CharBuffer encIn;
private final ByteBuffer encOut;
private CoderResult lastCoderResult;
private boolean eof;
ReaderInputStream(final Reader reader, final Charset charset) {
this.reader = reader;
this.enc = charset.newEncoder().onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE);
this.encIn = CharBuffer.allocate(BUFFER_SIZE);
this.encIn.flip();
this.encOut = ByteBuffer.allocate(128);
this.encOut.flip();
}
private void fillBuffer() throws IOException {
if (!this.eof && (this.lastCoderResult == null || this.lastCoderResult.isUnderflow())) {
this.encIn.compact();
final int p = this.encIn.position();
final int c = this.reader.read(this.encIn.array(), p, this.encIn.remaining());
if (c == -1) {
this.eof = true;
} else {
this.encIn.position(p + c);
}
this.encIn.flip();
}
this.encOut.compact();
this.lastCoderResult = this.enc.encode(this.encIn, this.encOut, this.eof);
this.encOut.flip();
}
@Override
public int read(final byte[] b, final int offset, final int length) throws IOException {
Preconditions.checkNotNull(b);
Preconditions.checkPositionIndex(offset, b.length);
Preconditions.checkPositionIndex(offset + length, b.length);
int read = 0;
int o = offset;
int l = length;
while (l > 0) {
if (this.encOut.hasRemaining()) {
final int c = Math.min(this.encOut.remaining(), l);
this.encOut.get(b, o, c);
o += c;
l -= c;
read += c;
} else {
fillBuffer();
if (this.eof && !this.encOut.hasRemaining()) {
break;
}
}
}
return read > 0 || !this.eof ? read : l == 0 ? 0 : -1;
}
@Override
public int read() throws IOException {
for (;;) {
if (this.encOut.hasRemaining()) {
return this.encOut.get() & 0xFF;
} else {
fillBuffer();
if (this.eof && !this.encOut.hasRemaining()) {
return -1;
}
}
}
}
@Override
public void close() throws IOException {
this.reader.close();
}
}
}