package org.apache.tika.parser.pkg;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveOutputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.ArchiveStreamProvider;
import org.apache.commons.compress.archivers.StreamingNotSupportedException;
import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
import org.apache.commons.compress.archivers.ar.ArArchiveOutputStream;
import org.apache.commons.compress.archivers.arj.ArjArchiveInputStream;
import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
import org.apache.commons.compress.archivers.cpio.CpioArchiveOutputStream;
import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
import org.apache.commons.compress.archivers.jar.JarArchiveOutputStream;
import org.apache.commons.compress.archivers.sevenz.SevenZFile;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.compress.utils.Lists;
import org.apache.commons.compress.utils.ServiceLoaderIterator;
import org.apache.commons.compress.utils.Sets;

/**
 * This is a temporary copy/paste hack from commons-compress for Tika 1.15
 * that 1) allows detection without initialization of a stream and
 * 2) prevents easily preventable OOM on three file formats.
 *
 * Once commons-compress 1.14 is released, we will delete this class
 * and go back to commons-compress's ArchiveStreamFactory.
 */
@Deprecated
class TikaArchiveStreamFactory implements ArchiveStreamProvider {

    /** Number of bytes read when probing for a TAR header (one block). */
    private static final int TAR_HEADER_SIZE = 512;

    /** Number of bytes read when probing for a Unix DUMP signature. */
    private static final int DUMP_SIGNATURE_SIZE = 32;

    /** Number of bytes read for the first, short signature probe. */
    private static final int SIGNATURE_SIZE = 12;

    /** Stock commons-compress factory, registered as the built-in provider in the lookup maps. */
    private static final ArchiveStreamFactory SINGLETON = new ArchiveStreamFactory();

    /**
     * Constant (value {@value}) used to identify the AR archive format.
     * @since 1.1
     */
    public static final String AR = "ar";

    /**
     * Constant (value {@value}) used to identify the ARJ archive format.
     * Not supported as an output stream type.
     * @since 1.6
     */
    public static final String ARJ = "arj";

    /**
     * Constant (value {@value}) used to identify the CPIO archive format.
     * @since 1.1
     */
    public static final String CPIO = "cpio";

    /**
     * Constant (value {@value}) used to identify the Unix DUMP archive format.
     * Not supported as an output stream type.
     * @since 1.3
     */
    public static final String DUMP = "dump";

    /**
     * Constant (value {@value}) used to identify the JAR archive format.
     * @since 1.1
     */
    public static final String JAR = "jar";

    /**
     * Constant (value {@value}) used to identify the TAR archive format.
     * @since 1.1
     */
    public static final String TAR = "tar";

    /**
     * Constant (value {@value}) used to identify the ZIP archive format.
     * @since 1.1
     */
    public static final String ZIP = "zip";

    /**
     * Constant (value {@value}) used to identify the 7z archive format.
     * @since 1.8
     */
    public static final String SEVEN_Z = "7z";

    /**
     * Entry encoding, null for the platform default.
     */
    private final String encoding;

    /**
     * Entry encoding, null for the default.
     */
    private volatile String entryEncoding;

    /** Lazily built, unmodifiable view of the available input stream providers. */
    private SortedMap<String, ArchiveStreamProvider> archiveInputStreamProviders;

    /** Lazily built, unmodifiable view of the available output stream providers. */
    private SortedMap<String, ArchiveStreamProvider> archiveOutputStreamProviders;

    /**
     * Collects every {@link ArchiveStreamProvider} exposed by
     * {@link #serviceLoaderIterator()} into a list.
     */
    private static ArrayList<ArchiveStreamProvider> findArchiveStreamProviders() {
        return Lists.newArrayList(serviceLoaderIterator());
    }

    /**
     * Registers {@code provider} in {@code map} under each of the given names.
     * Names are normalized with {@link #toKey(String)}; an existing entry with
     * the same key is replaced.
     *
     * @param names archive names handled by the provider
     * @param provider the provider to register
     * @param map the map to add the entries to
     */
    static void putAll(Set<String> names, ArchiveStreamProvider provider,
                       TreeMap<String, ArchiveStreamProvider> map) {
        for (String name : names) {
            map.put(toKey(name), provider);
        }
    }

    /** Creates the service-loader backed iterator over {@link ArchiveStreamProvider}s. */
    private static Iterator<ArchiveStreamProvider> serviceLoaderIterator() {
        return new ServiceLoaderIterator<>(ArchiveStreamProvider.class);
    }

    /** Normalizes an archiver name to the (locale independent, upper case) map key. */
    private static String toKey(final String name) {
        return name.toUpperCase(Locale.ROOT);
    }

    /**
     * Constructs a new sorted map from input stream provider names to provider
     * objects.
     *
     * <p>
     * The map returned by this method will have one entry for each provider for
     * which support is available in the current Java virtual machine. If two or
     * more supported provider have the same name then the resulting map will
     * contain just one of them; which one it will contain is not specified.
     * </p>
     *
     * <p>
     * The invocation of this method, and the subsequent use of the resulting
     * map, may cause time-consuming disk or network I/O operations to occur.
     * This method is provided for applications that need to enumerate all of
     * the available providers, for example to allow user provider selection.
     * </p>
     *
     * <p>
     * This method may return different results at different times if new
     * providers are dynamically made available to the current Java virtual
     * machine.
     * </p>
     *
     * @return a newly created sorted map from (upper-cased) names to provider objects
     * @since 1.13
     */
    public static SortedMap<String, ArchiveStreamProvider> findAvailableArchiveInputStreamProviders() {
        return AccessController.doPrivileged(new PrivilegedAction<SortedMap<String, ArchiveStreamProvider>>() {
            @Override
            public SortedMap<String, ArchiveStreamProvider> run() {
                final TreeMap<String, ArchiveStreamProvider> map = new TreeMap<>();
                // Built-in formats first, then whatever the service loader finds.
                putAll(SINGLETON.getInputStreamArchiveNames(), SINGLETON, map);
                for (ArchiveStreamProvider provider : findArchiveStreamProviders()) {
                    putAll(provider.getInputStreamArchiveNames(), provider, map);
                }
                return map;
            }
        });
    }

    /**
     * Constructs a new sorted map from output stream provider names to provider
     * objects.
     *
     * <p>
     * The map returned by this method will have one entry for each provider for
     * which support is available in the current Java virtual machine. If two or
     * more supported provider have the same name then the resulting map will
     * contain just one of them; which one it will contain is not specified.
     * </p>
     *
     * <p>
     * The invocation of this method, and the subsequent use of the resulting
     * map, may cause time-consuming disk or network I/O operations to occur.
     * This method is provided for applications that need to enumerate all of
     * the available providers, for example to allow user provider selection.
     * </p>
     *
     * <p>
     * This method may return different results at different times if new
     * providers are dynamically made available to the current Java virtual
     * machine.
     * </p>
     *
     * @return a newly created sorted map from (upper-cased) names to provider objects
     * @since 1.13
     */
    public static SortedMap<String, ArchiveStreamProvider> findAvailableArchiveOutputStreamProviders() {
        return AccessController.doPrivileged(new PrivilegedAction<SortedMap<String, ArchiveStreamProvider>>() {
            @Override
            public SortedMap<String, ArchiveStreamProvider> run() {
                final TreeMap<String, ArchiveStreamProvider> map = new TreeMap<>();
                // Built-in formats first, then whatever the service loader finds.
                putAll(SINGLETON.getOutputStreamArchiveNames(), SINGLETON, map);
                for (ArchiveStreamProvider provider : findArchiveStreamProviders()) {
                    putAll(provider.getOutputStreamArchiveNames(), provider, map);
                }
                return map;
            }
        });
    }

    /**
     * Create an instance using the specified encoding.
     *
     * @param encoding the encoding to be used.
     *
     * @since 1.10
     */
    public TikaArchiveStreamFactory(final String encoding) {
        super();
        this.encoding = encoding;
        // Also set the original field so can continue to use it.
        this.entryEncoding = encoding;
    }

    /**
     * Returns the encoding to use for arj, jar, zip, dump, cpio and tar
     * files, or null for the archiver default.
     *
     * @return entry encoding, or null for the archiver default
     * @since 1.5
     */
    public String getEntryEncoding() {
        return entryEncoding;
    }

    /**
     * Sets the encoding to use for arj, jar, zip, dump, cpio and tar files. Use null for the archiver default.
     *
     * @param entryEncoding the entry encoding, null uses the archiver default.
     * @since 1.5
     * @deprecated 1.10 use {@link #TikaArchiveStreamFactory(String)} to specify the encoding
     * @throws IllegalStateException if the constructor {@link #TikaArchiveStreamFactory(String)}
     * was used to specify the factory encoding.
     */
    @Deprecated
    public void setEntryEncoding(final String entryEncoding) {
        // Note: this does not detect new TikaArchiveStreamFactory(null) but that does not set the encoding anyway
        if (encoding != null) {
            throw new IllegalStateException("Cannot overide encoding set by the constructor");
        }
        this.entryEncoding = entryEncoding;
    }

    /**
     * Creates an archive input stream from an archiver name and an input stream,
     * using the encoding returned by {@link #getEntryEncoding()}.
     *
     * @param archiverName the archive name,
     * i.e. {@value #AR}, {@value #ARJ}, {@value #ZIP}, {@value #TAR}, {@value #JAR}, {@value #CPIO}, {@value #DUMP} or {@value #SEVEN_Z}
     * @param in the input stream
     * @return the archive input stream
     * @throws ArchiveException if the archiver name is not known
     * @throws StreamingNotSupportedException if the format cannot be
     * read from a stream
     * @throws IllegalArgumentException if the archiver name or stream is null
     */
    public ArchiveInputStream createArchiveInputStream(final String archiverName, final InputStream in)
            throws ArchiveException {
        return createArchiveInputStream(archiverName, in, entryEncoding);
    }

    /**
     * Creates an archive input stream from an archiver name, an input stream
     * and an explicit entry encoding. Names are matched case-insensitively
     * against the built-in formats first; unknown names are looked up in
     * {@link #getArchiveInputStreamProviders()}.
     *
     * @param archiverName the archive name
     * @param in the input stream
     * @param actualEncoding the entry encoding, or null to use the archiver's
     * single-argument constructor (AR never takes an encoding)
     * @return the archive input stream
     * @throws ArchiveException if the archiver name is not known
     * @throws StreamingNotSupportedException if {@value #SEVEN_Z} is requested
     * @throws IllegalArgumentException if the archiver name or stream is null
     */
    @Override
    public ArchiveInputStream createArchiveInputStream(final String archiverName, final InputStream in,
                                                       final String actualEncoding) throws ArchiveException {
        if (archiverName == null) {
            throw new IllegalArgumentException("Archivername must not be null.");
        }
        if (in == null) {
            throw new IllegalArgumentException("InputStream must not be null.");
        }

        if (AR.equalsIgnoreCase(archiverName)) {
            return new ArArchiveInputStream(in);
        }
        if (ARJ.equalsIgnoreCase(archiverName)) {
            if (actualEncoding != null) {
                return new ArjArchiveInputStream(in, actualEncoding);
            }
            return new ArjArchiveInputStream(in);
        }
        if (ZIP.equalsIgnoreCase(archiverName)) {
            if (actualEncoding != null) {
                return new ZipArchiveInputStream(in, actualEncoding);
            }
            return new ZipArchiveInputStream(in);
        }
        if (TAR.equalsIgnoreCase(archiverName)) {
            if (actualEncoding != null) {
                return new TarArchiveInputStream(in, actualEncoding);
            }
            return new TarArchiveInputStream(in);
        }
        if (JAR.equalsIgnoreCase(archiverName)) {
            if (actualEncoding != null) {
                return new JarArchiveInputStream(in, actualEncoding);
            }
            return new JarArchiveInputStream(in);
        }
        if (CPIO.equalsIgnoreCase(archiverName)) {
            if (actualEncoding != null) {
                return new CpioArchiveInputStream(in, actualEncoding);
            }
            return new CpioArchiveInputStream(in);
        }
        if (DUMP.equalsIgnoreCase(archiverName)) {
            if (actualEncoding != null) {
                return new DumpArchiveInputStream(in, actualEncoding);
            }
            return new DumpArchiveInputStream(in);
        }
        if (SEVEN_Z.equalsIgnoreCase(archiverName)) {
            throw new StreamingNotSupportedException(SEVEN_Z);
        }

        // Not a built-in format: fall back to the registered providers.
        final ArchiveStreamProvider archiveStreamProvider =
                getArchiveInputStreamProviders().get(toKey(archiverName));
        if (archiveStreamProvider != null) {
            return archiveStreamProvider.createArchiveInputStream(archiverName, in, actualEncoding);
        }

        throw new ArchiveException("Archiver: " + archiverName + " not found.");
    }

    /**
     * Creates an archive output stream from an archiver name and an output stream,
     * using the encoding returned by {@link #getEntryEncoding()}.
     *
     * @param archiverName the archive name,
     * i.e. {@value #AR}, {@value #ZIP}, {@value #TAR}, {@value #JAR} or {@value #CPIO}
     * @param out the output stream
     * @return the archive output stream
     * @throws ArchiveException if the archiver name is not known
     * @throws StreamingNotSupportedException if the format cannot be
     * written to a stream
     * @throws IllegalArgumentException if the archiver name or stream is null
     */
    public ArchiveOutputStream createArchiveOutputStream(final String archiverName, final OutputStream out)
            throws ArchiveException {
        return createArchiveOutputStream(archiverName, out, entryEncoding);
    }

    /**
     * Creates an archive output stream from an archiver name, an output stream
     * and an explicit entry encoding. Names are matched case-insensitively
     * against the built-in formats first; unknown names are looked up in
     * {@link #getArchiveOutputStreamProviders()}.
     *
     * @param archiverName the archive name
     * @param out the output stream
     * @param actualEncoding the entry encoding, or null to leave the archiver's
     * own default in place (AR never takes an encoding)
     * @return the archive output stream
     * @throws ArchiveException if the archiver name is not known
     * @throws StreamingNotSupportedException if {@value #SEVEN_Z} is requested
     * @throws IllegalArgumentException if the archiver name or stream is null
     */
    @Override
    public ArchiveOutputStream createArchiveOutputStream(
            final String archiverName, final OutputStream out, final String actualEncoding)
            throws ArchiveException {
        if (archiverName == null) {
            throw new IllegalArgumentException("Archivername must not be null.");
        }
        if (out == null) {
            throw new IllegalArgumentException("OutputStream must not be null.");
        }

        if (AR.equalsIgnoreCase(archiverName)) {
            return new ArArchiveOutputStream(out);
        }
        if (ZIP.equalsIgnoreCase(archiverName)) {
            final ZipArchiveOutputStream zip = new ZipArchiveOutputStream(out);
            if (actualEncoding != null) {
                zip.setEncoding(actualEncoding);
            }
            return zip;
        }
        if (TAR.equalsIgnoreCase(archiverName)) {
            if (actualEncoding != null) {
                return new TarArchiveOutputStream(out, actualEncoding);
            }
            return new TarArchiveOutputStream(out);
        }
        if (JAR.equalsIgnoreCase(archiverName)) {
            if (actualEncoding != null) {
                return new JarArchiveOutputStream(out, actualEncoding);
            }
            return new JarArchiveOutputStream(out);
        }
        if (CPIO.equalsIgnoreCase(archiverName)) {
            if (actualEncoding != null) {
                return new CpioArchiveOutputStream(out, actualEncoding);
            }
            return new CpioArchiveOutputStream(out);
        }
        if (SEVEN_Z.equalsIgnoreCase(archiverName)) {
            throw new StreamingNotSupportedException(SEVEN_Z);
        }

        // Not a built-in format: fall back to the registered providers.
        final ArchiveStreamProvider archiveStreamProvider =
                getArchiveOutputStreamProviders().get(toKey(archiverName));
        if (archiveStreamProvider != null) {
            return archiveStreamProvider.createArchiveOutputStream(archiverName, out, actualEncoding);
        }

        throw new ArchiveException("Archiver: " + archiverName + " not found.");
    }

    /**
     * Create an archive input stream from an input stream, autodetecting
     * the archive type from the first few bytes of the stream. The InputStream
     * must support marks, like BufferedInputStream.
     *
     * @param in the input stream
     * @return the archive input stream
     * @throws ArchiveException if the archiver name is not known
     * @throws StreamingNotSupportedException if the format cannot be
     * read from a stream
     * @throws IllegalArgumentException if the stream is null or does not support mark
     */
    public ArchiveInputStream createArchiveInputStream(final InputStream in)
            throws ArchiveException {
        return createArchiveInputStream(detect(in), in);
    }

    /**
     * Try to determine the type of Archiver.
     *
     * <p>
     * The stream is probed with progressively larger look-ahead buffers
     * ({@value #SIGNATURE_SIZE}, {@value #DUMP_SIGNATURE_SIZE} and
     * {@value #TAR_HEADER_SIZE} bytes); after each probe the stream is reset
     * to the position it had on entry.
     * </p>
     *
     * @param in input stream; must support mark/reset
     * @return type of archiver if found
     * @throws ArchiveException if an archiver cannot be detected in the stream,
     * or if reading/resetting the stream fails (the {@link IOException} is
     * attached as the cause)
     * @throws IllegalArgumentException if the stream is null or does not support mark
     * @since 1.14
     */
    public static String detect(InputStream in) throws ArchiveException {
        if (in == null) {
            throw new IllegalArgumentException("Stream must not be null.");
        }
        if (!in.markSupported()) {
            throw new IllegalArgumentException("Mark is not supported.");
        }

        final byte[] signature = new byte[SIGNATURE_SIZE];
        int signatureLength = readSignature(in, signature, "IOException while reading signature.");

        if (ZipArchiveInputStream.matches(signature, signatureLength)) {
            return ZIP;
        }
        if (JarArchiveInputStream.matches(signature, signatureLength)) {
            return JAR;
        }
        if (ArArchiveInputStream.matches(signature, signatureLength)) {
            return AR;
        }
        if (CpioArchiveInputStream.matches(signature, signatureLength)) {
            return CPIO;
        }
        if (ArjArchiveInputStream.matches(signature, signatureLength)) {
            return ARJ;
        }
        if (SevenZFile.matches(signature, signatureLength)) {
            return SEVEN_Z;
        }

        // Dump needs a bigger buffer to check the signature;
        final byte[] dumpsig = new byte[DUMP_SIGNATURE_SIZE];
        signatureLength = readSignature(in, dumpsig, "IOException while reading dump signature");
        if (DumpArchiveInputStream.matches(dumpsig, signatureLength)) {
            return DUMP;
        }

        // Tar needs an even bigger buffer to check the signature; read the first block
        final byte[] tarHeader = new byte[TAR_HEADER_SIZE];
        signatureLength = readSignature(in, tarHeader, "IOException while reading tar signature");
        if (TarArchiveInputStream.matches(tarHeader, signatureLength)) {
            return TAR;
        }

        // COMPRESS-117 - improve auto-recognition
        if (signatureLength >= TAR_HEADER_SIZE) {
            TarArchiveInputStream tais = null;
            try {
                tais = new TarArchiveInputStream(new ByteArrayInputStream(tarHeader));
                // COMPRESS-191 - verify the header checksum
                if (tais.getNextTarEntry().isCheckSumOK()) {
                    return TAR;
                }
            } catch (final Exception ignored) { // NOPMD
                // can generate IllegalArgumentException as well
                // as IOException
                // autodetection, simply not a TAR
                // ignored
            } finally {
                IOUtils.closeQuietly(tais);
            }
        }
        throw new ArchiveException("No Archiver found for the stream signature");
    }

    /**
     * Marks the stream, fills {@code buffer} as far as the stream allows and
     * resets the stream back to the mark.
     *
     * @param in the stream to peek into; must support mark/reset
     * @param buffer destination buffer; its length is also used as the mark read limit
     * @param failureMessage message of the {@link ArchiveException} raised on I/O failure
     * @return the number of bytes actually read into {@code buffer}
     * @throws ArchiveException if reading or resetting fails; the
     * {@link IOException} is preserved as the cause
     */
    private static int readSignature(final InputStream in, final byte[] buffer,
                                     final String failureMessage) throws ArchiveException {
        in.mark(buffer.length);
        try {
            final int length = IOUtils.readFully(in, buffer);
            in.reset();
            return length;
        } catch (IOException e) {
            // Keep the root cause so callers can tell why probing failed.
            throw new ArchiveException(failureMessage, e);
        }
    }

    /**
     * Returns the available input stream providers, keyed by upper-cased
     * archiver name. The map is looked up on first use via
     * {@link #findAvailableArchiveInputStreamProviders()} and then cached in
     * an unmodifiable wrapper.
     *
     * @return an unmodifiable sorted map from names to providers
     */
    public SortedMap<String, ArchiveStreamProvider> getArchiveInputStreamProviders() {
        if (archiveInputStreamProviders == null) {
            archiveInputStreamProviders = Collections
                    .unmodifiableSortedMap(findAvailableArchiveInputStreamProviders());
        }
        return archiveInputStreamProviders;
    }

    /**
     * Returns the available output stream providers, keyed by upper-cased
     * archiver name. The map is looked up on first use via
     * {@link #findAvailableArchiveOutputStreamProviders()} and then cached in
     * an unmodifiable wrapper.
     *
     * @return an unmodifiable sorted map from names to providers
     */
    public SortedMap<String, ArchiveStreamProvider> getArchiveOutputStreamProviders() {
        if (archiveOutputStreamProviders == null) {
            archiveOutputStreamProviders = Collections
                    .unmodifiableSortedMap(findAvailableArchiveOutputStreamProviders());
        }
        return archiveOutputStreamProviders;
    }

    /**
     * Lists the archiver names this factory recognizes for input.
     *
     * @return a new set containing the built-in input archive names
     */
    @Override
    public Set<String> getInputStreamArchiveNames() {
        return Sets.newHashSet(AR, ARJ, ZIP, TAR, JAR, CPIO, DUMP, SEVEN_Z);
    }

    /**
     * Lists the archiver names this factory recognizes for output.
     *
     * @return a new set containing the built-in output archive names
     */
    @Override
    public Set<String> getOutputStreamArchiveNames() {
        return Sets.newHashSet(AR, ZIP, TAR, JAR, CPIO, SEVEN_Z);
    }
}