package org.apache.tika.parser.pkg;
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.CompressorOutputStream;
import org.apache.commons.compress.compressors.CompressorStreamProvider;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.compress.compressors.lzma.LZMAUtils;
import org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream;
import org.apache.commons.compress.compressors.snappy.FramedSnappyCompressorInputStream;
import org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZUtils;
import org.apache.commons.compress.compressors.z.ZCompressorInputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.compress.utils.Lists;
import org.apache.commons.compress.utils.ServiceLoaderIterator;
import org.apache.commons.compress.utils.Sets;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.tukaani.xz.LZMAInputStream;
import org.tukaani.xz.MemoryLimitException;
/**
* This is a temporary copy/paste hack from commons-compress for Tika 1.15
* that 1) allows detection without initialization of a stream and
* 2) prevents easily preventable OOM on two file formats.
*
* Once commons-compress 1.14 is released, we will delete this class
* and go back to commons-compress's CompressorStreamFactory.
*/
@Deprecated
class TikaCompressorStreamFactory implements CompressorStreamProvider {
private static final TikaCompressorStreamFactory SINGLETON = new TikaCompressorStreamFactory(true, -1);
/**
* Constant (value {@value}) used to identify the BZIP2 compression
* algorithm.
*
* @since 1.1
*/
public static final String BZIP2 = "bzip2";
/**
* Constant (value {@value}) used to identify the GZIP compression
* algorithm.
*
* @since 1.1
*/
public static final String GZIP = "gz";
/**
* Constant (value {@value}) used to identify the PACK200 compression
* algorithm.
*
* @since 1.3
*/
public static final String PACK200 = "pack200";
/**
* Constant (value {@value}) used to identify the XZ compression method.
*
* @since 1.4
*/
public static final String XZ = "xz";
/**
* Constant (value {@value}) used to identify the LZMA compression method.
*
* @since 1.6
*/
public static final String LZMA = "lzma";
/**
* Constant (value {@value}) used to identify the "framed" Snappy
* compression method.
*
* @since 1.7
*/
public static final String SNAPPY_FRAMED = "snappy-framed";
/**
* Constant (value {@value}) used to identify the "raw" Snappy compression
* method. Not supported as an output stream type.
*
* @since 1.7
*/
public static final String SNAPPY_RAW = "snappy-raw";
/**
* Constant (value {@value}) used to identify the traditional Unix compress
* method. Not supported as an output stream type.
*
* @since 1.7
*/
public static final String Z = "z";
/**
* Constant (value {@value}) used to identify the Deflate compress method.
*
* @since 1.9
*/
public static final String DEFLATE = "deflate";
private final int memoryLimitInKb;
private SortedMap<String, CompressorStreamProvider> compressorInputStreamProviders;
public static String getBzip2() {
return BZIP2;
}
public static String getDeflate() {
return DEFLATE;
}
public static String getGzip() {
return GZIP;
}
public static String getLzma() {
return LZMA;
}
public static String getPack200() {
return PACK200;
}
public static TikaCompressorStreamFactory getSingleton() {
return SINGLETON;
}
public static String getSnappyFramed() {
return SNAPPY_FRAMED;
}
public static String getSnappyRaw() {
return SNAPPY_RAW;
}
public static String getXz() {
return XZ;
}
public static String getZ() {
return Z;
}
static void putAll(final Set<String> names, final CompressorStreamProvider provider,
final TreeMap<String, CompressorStreamProvider> map) {
for (final String name : names) {
map.put(toKey(name), provider);
}
}
private static String toKey(final String name) {
return name.toUpperCase(Locale.ROOT);
}
/**
* If true, decompress until the end of the input. If false, stop after the
* first stream and leave the input position to point to the next byte after
* the stream
*/
private final Boolean decompressUntilEOF;
/**
* If true, decompress until the end of the input. If false, stop after the
* first stream and leave the input position to point to the next byte after
* the stream
*/
private volatile boolean decompressConcatenated = false;
/**
* Create an instance with the provided decompress Concatenated option.
*
* @param decompressUntilEOF
* if true, decompress until the end of the input; if false, stop
* after the first stream and leave the input position to point
* to the next byte after the stream. This setting applies to the
* gzip, bzip2 and xz formats only.
* @since 1.10
*/
public TikaCompressorStreamFactory(final boolean decompressUntilEOF, final int memoryLimitInKb) {
this.decompressUntilEOF = Boolean.valueOf(decompressUntilEOF);
// Also copy to existing variable so can continue to use that as the
// current value
this.decompressConcatenated = decompressUntilEOF;
this.memoryLimitInKb = memoryLimitInKb;
}
/**
* Try to detect the type of compressor stream.
*
* @param in input stream
* @return type of compressor stream detected
* @throws CompressorException if no compressor stream type was detected
* or if something else went wrong
* @throws IllegalArgumentException if stream is null or does not support mark
*
* @since 1.14
*/
public static String detect(final InputStream in) throws CompressorException {
if (in == null) {
throw new IllegalArgumentException("Stream must not be null.");
}
if (!in.markSupported()) {
throw new IllegalArgumentException("Mark is not supported.");
}
final byte[] signature = new byte[12];
in.mark(signature.length);
int signatureLength = -1;
try {
signatureLength = IOUtils.readFully(in, signature);
in.reset();
} catch (IOException e) {
throw new CompressorException("IOException while reading signature.", e);
}
if (BZip2CompressorInputStream.matches(signature, signatureLength)) {
return BZIP2;
}
if (GzipCompressorInputStream.matches(signature, signatureLength)) {
return GZIP;
}
if (Pack200CompressorInputStream.matches(signature, signatureLength)) {
return PACK200;
}
if (FramedSnappyCompressorInputStream.matches(signature, signatureLength)) {
return SNAPPY_FRAMED;
}
if (ZCompressorInputStream.matches(signature, signatureLength)) {
return Z;
}
if (DeflateCompressorInputStream.matches(signature, signatureLength)) {
return DEFLATE;
}
if (XZUtils.matches(signature, signatureLength)) {
return XZ;
}
if (LZMAUtils.matches(signature, signatureLength)) {
return LZMA;
}
/* if (FramedLZ4CompressorInputStream.matches(signature, signatureLength)) {
return LZ4_FRAMED;
}*/
throw new CompressorException("No Compressor found for the stream signature.");
}
public SortedMap<String, CompressorStreamProvider> getCompressorInputStreamProviders() {
if (compressorInputStreamProviders == null) {
compressorInputStreamProviders = Collections
.unmodifiableSortedMap(findAvailableCompressorInputStreamProviders());
}
return compressorInputStreamProviders;
}
public static SortedMap<String, CompressorStreamProvider> findAvailableCompressorInputStreamProviders() {
return AccessController.doPrivileged(new PrivilegedAction<SortedMap<String, CompressorStreamProvider>>() {
@Override
public SortedMap<String, CompressorStreamProvider> run() {
final TreeMap<String, CompressorStreamProvider> map = new TreeMap<>();
putAll(SINGLETON.getInputStreamCompressorNames(), SINGLETON, map);
for (final CompressorStreamProvider provider : findCompressorStreamProviders()) {
putAll(provider.getInputStreamCompressorNames(), provider, map);
}
return map;
}
});
}
private static ArrayList<CompressorStreamProvider> findCompressorStreamProviders() {
return Lists.newArrayList(serviceLoaderIterator());
}
private static Iterator<CompressorStreamProvider> serviceLoaderIterator() {
return new ServiceLoaderIterator<>(CompressorStreamProvider.class);
}
/**
* Create an compressor input stream from an input stream, autodetecting the
* compressor type from the first few bytes of the stream. The InputStream
* must support marks, like BufferedInputStream.
*
* @param in
* the input stream
* @return the compressor input stream
* @throws CompressorException
* if the compressor name is not known
* @throws IllegalArgumentException
* if the stream is null or does not support mark
* @since 1.1
*/
public CompressorInputStream createCompressorInputStream(final InputStream in) throws CompressorException,
TikaMemoryLimitException {
return createCompressorInputStream(detect(in), in);
}
/**
* Creates a compressor input stream from a compressor name and an input
* stream.
*
* @param name
* of the compressor, i.e. {@value #GZIP}, {@value #BZIP2},
* {@value #XZ}, {@value #LZMA}, {@value #PACK200},
* {@value #SNAPPY_RAW}, {@value #SNAPPY_FRAMED}, {@value #Z},
* or {@value #DEFLATE}
* @param in
* the input stream
* @return compressor input stream
* @throws CompressorException
* if the compressor name is not known or not available
* @throws IllegalArgumentException
* if the name or input stream is null
*/
public CompressorInputStream createCompressorInputStream(final String name, final InputStream in)
throws CompressorException, TikaMemoryLimitException {
return createCompressorInputStream(name, in, decompressConcatenated);
}
public CompressorInputStream createCompressorInputStream(final String name, final InputStream in,
final boolean actualDecompressConcatenated) throws CompressorException {
if (name == null || in == null) {
throw new IllegalArgumentException("Compressor name and stream must not be null.");
}
try {
if (GZIP.equalsIgnoreCase(name)) {
return new GzipCompressorInputStream(in, actualDecompressConcatenated);
}
if (BZIP2.equalsIgnoreCase(name)) {
return new BZip2CompressorInputStream(in, actualDecompressConcatenated);
}
if (XZ.equalsIgnoreCase(name)) {
if (!XZUtils.isXZCompressionAvailable()) {
throw new CompressorException("XZ compression is not available.");
}
return new XZCompressorInputStream(in, actualDecompressConcatenated);
}
if (LZMA.equalsIgnoreCase(name)) {
if (!LZMAUtils.isLZMACompressionAvailable()) {
throw new CompressorException("LZMA compression is not available");
}
try {
return new SaferLZMACompressorInputStream(in);
} catch (MemoryLimitException e) {
throw new CompressorException("MemoryLimitException: " + e.getMessage(), e);
}
}
if (PACK200.equalsIgnoreCase(name)) {
return new Pack200CompressorInputStream(in);
}
if (SNAPPY_RAW.equalsIgnoreCase(name)) {
return new SnappyCompressorInputStream(in);
}
if (SNAPPY_FRAMED.equalsIgnoreCase(name)) {
return new FramedSnappyCompressorInputStream(in);
}
if (Z.equalsIgnoreCase(name)) {
try {
return new SaferZCompressorInputStream(in);
} catch (TikaRuntimeMemoryLimitException e) {
throw new CompressorException("MemoryLimitException: " + e.getMessage(), e);
}
}
if (DEFLATE.equalsIgnoreCase(name)) {
return new DeflateCompressorInputStream(in);
}
/*
not currently supported
if (LZ4_BLOCK.equalsIgnoreCase(name)) {
return new BlockLZ4CompressorInputStream(in);
}
if (LZ4_FRAMED.equalsIgnoreCase(name)) {
return new FramedLZ4CompressorInputStream(in, actualDecompressConcatenated);
}
*/
} catch (final IOException e) {
throw new CompressorException("Could not create CompressorInputStream.", e);
}
final CompressorStreamProvider compressorStreamProvider = getCompressorInputStreamProviders().get(toKey(name));
if (compressorStreamProvider != null) {
return compressorStreamProvider.createCompressorInputStream(name, in, actualDecompressConcatenated);
}
throw new CompressorException("Compressor: " + name + " not found.");
}
@Override
public CompressorOutputStream createCompressorOutputStream(String s, OutputStream outputStream) throws CompressorException {
throw new UnsupportedOperationException();
}
// For Unit tests
boolean getDecompressConcatenated() {
return decompressConcatenated;
}
public Set<String> getInputStreamCompressorNames() {
return Sets.newHashSet(GZIP, BZIP2, XZ, LZMA, PACK200, DEFLATE, SNAPPY_RAW, SNAPPY_FRAMED, Z);
}
@Override
public Set<String> getOutputStreamCompressorNames() {
throw new UnsupportedOperationException();
}
public Boolean getDecompressUntilEOF() {
return decompressUntilEOF;
}
private class SaferZCompressorInputStream extends ZCompressorInputStream {
public SaferZCompressorInputStream(InputStream inputStream) throws IOException {
super(inputStream);
}
@Override
protected void initializeTables(int maxCodeSize) {
int maxTableSize = 1 << maxCodeSize;
if (memoryLimitInKb > -1 && maxTableSize > (memoryLimitInKb*1024)) {
throw new TikaRuntimeMemoryLimitException("Calculated maxCodeSize ("+maxCodeSize+" bytes) is greater "+
"than the maximum allowable ("+ (memoryLimitInKb*1024) +" bytes).\n"+
"If the file is not corrupt, consider increasing " +
"the memoryLimitInKb parameter in the CompressorParser");
}
super.initializeTables(maxCodeSize);
}
}
private static class TikaRuntimeMemoryLimitException extends RuntimeException {
public TikaRuntimeMemoryLimitException(String msg) {
super(msg);
}
}
private class SaferLZMACompressorInputStream extends CompressorInputStream {
private final InputStream in;
/**
* Creates a new input stream that decompresses LZMA-compressed data
* from the specified input stream.
*
* @param inputStream where to read the compressed data
*
* @throws IOException if the input is not in the .lzma format,
* the input is corrupt or truncated, the .lzma
* headers specify sizes that are not supported
* by this implementation, or the underlying
* <code>inputStream</code> throws an exception
*/
public SaferLZMACompressorInputStream(final InputStream inputStream) throws IOException {
in = new LZMAInputStream(inputStream, memoryLimitInKb);
}
/** {@inheritDoc} */
@Override
public int read() throws IOException {
final int ret = in.read();
count(ret == -1 ? 0 : 1);
return ret;
}
/** {@inheritDoc} */
@Override
public int read(final byte[] buf, final int off, final int len) throws IOException {
final int ret = in.read(buf, off, len);
count(ret);
return ret;
}
/** {@inheritDoc} */
@Override
public long skip(final long n) throws IOException {
return in.skip(n);
}
/** {@inheritDoc} */
@Override
public int available() throws IOException {
return in.available();
}
/** {@inheritDoc} */
@Override
public void close() throws IOException {
in.close();
}
}
}