/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.utils;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester}
 * that relies on commons.codec.digest.DigestUtils to calculate digest hashes.
 * <p>
 * This digester tries to use the regular mark/reset protocol on the InputStream.
 * However, this wraps an internal BoundedInputStream, and if the InputStream
 * is not fully read, then this will reset the stream and
 * spool the InputStream to disk (via TikaInputStream) and then digest the file.
 * <p>
 * If a TikaInputStream is passed in and it has an underlying file that is longer
 * than the {@link #markLimit}, then this digester digests the file directly.
 */
public class CommonsDigester implements DigestingParser.Digester {

    private static final Logger LOG = LoggerFactory.getLogger(CommonsDigester.class);

    public enum DigestAlgorithm {
        //those currently available in commons.digest
        MD2, MD5, SHA1, SHA256, SHA384, SHA512;

        String getMetadataKey() {
            return TikaCoreProperties.TIKA_META_PREFIX +
                    "digest" + Metadata.NAMESPACE_PREFIX_DELIMITER + this.toString();
        }
    }

    private final List<DigestAlgorithm> algorithms = new ArrayList<DigestAlgorithm>();
    private final int markLimit;

    public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) {
        Collections.addAll(this.algorithms, algorithms);
        if (markLimit < 0) {
            throw new IllegalArgumentException("markLimit must be >= 0");
        }
        this.markLimit = markLimit;
    }

    @Override
    public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
        //if this is already a TikaInputStream, rely on the caller to close
        //the stream and free the tmp file.
        TikaInputStream tis = TikaInputStream.cast(is);
        TemporaryResources tmp = null;
        if (tis == null) {
            //if this isn't a TikaInputStream, create a new TemporaryResources
            //and make sure to release it!!!
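            //CloseShieldInputStream keeps the temporary TikaInputStream wrapper
            //from closing the caller's original stream when it is released below.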
            tmp = new TemporaryResources();
            tis = TikaInputStream.get(new CloseShieldInputStream(is), tmp);
        }
        try {
            long sz = -1;
            if (tis.hasFile()) {
                sz = tis.getLength();
            }
            //if the stream is definitely backed by a file,
            //and its size is greater than the mark limit,
            //just digest the underlying file.
            if (sz > markLimit) {
                digestFile(tis.getFile(), m);
                return;
            }

            //try the usual mark/reset stuff.
            //however, if you actually hit the bound,
            //then stop and spool to file via TikaInputStream
            SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis);
            boolean finishedStream = false;
            for (DigestAlgorithm algorithm : algorithms) {
                bis.mark(markLimit + 1);
                finishedStream = digestEach(algorithm, bis, m);
                bis.reset();
                if (!finishedStream) {
                    break;
                }
            }
            if (!finishedStream) {
                digestFile(tis.getFile(), m);
            }
        } finally {
            try {
                if (tmp != null) {
                    tmp.dispose();
                }
            } catch (TikaException e) {
                throw new IOExceptionWithCause(e);
            }
        }
    }

    private void digestFile(File f, Metadata m) throws IOException {
        for (DigestAlgorithm algorithm : algorithms) {
            InputStream is = new FileInputStream(f);
            try {
                digestEach(algorithm, is, m);
            } finally {
                IOUtils.closeQuietly(is);
            }
        }
    }

    /**
     *
     * @param algorithm algo to use
     * @param is input stream to read from
     * @param metadata metadata for reporting the digest
     * @return whether or not this finished the input stream
     * @throws IOException if an I/O error occurs while reading the stream
     */
    private boolean digestEach(DigestAlgorithm algorithm, InputStream is,
                               Metadata metadata) throws IOException {
        String digest = null;
        try {
            switch (algorithm) {
                case MD2:
                    digest = DigestUtils.md2Hex(is);
                    break;
                case MD5:
                    digest = DigestUtils.md5Hex(is);
                    break;
                case SHA1:
                    digest = DigestUtils.sha1Hex(is);
                    break;
                case SHA256:
                    digest = DigestUtils.sha256Hex(is);
                    break;
                case SHA384:
                    digest = DigestUtils.sha384Hex(is);
                    break;
                case SHA512:
                    digest = DigestUtils.sha512Hex(is);
                    break;
                default:
                    throw new IllegalArgumentException(
                            "Sorry, not aware of algorithm: " + algorithm.toString());
            }
        } catch (IOException e) {
            LOG.warn("Problem digesting", e);
            //swallow, or should we throw this?
        }
        if (is instanceof SimpleBoundedInputStream) {
            if (((SimpleBoundedInputStream) is).hasHitBound()) {
                return false;
            }
        }
        metadata.set(algorithm.getMetadataKey(), digest);
        return true;
    }

    /**
     *
     * @param s comma-delimited (no space) list of algorithms to use: md5,sha256
     * @return the algorithms parsed from <code>s</code>
     */
    public static DigestAlgorithm[] parse(String s) {
        assert (s != null);

        List<DigestAlgorithm> ret = new ArrayList<>();
        for (String algoString : s.split(",")) {
            String uc = algoString.toUpperCase(Locale.ROOT);
            if (uc.equals(DigestAlgorithm.MD2.toString())) {
                ret.add(DigestAlgorithm.MD2);
            } else if (uc.equals(DigestAlgorithm.MD5.toString())) {
                ret.add(DigestAlgorithm.MD5);
            } else if (uc.equals(DigestAlgorithm.SHA1.toString())) {
                ret.add(DigestAlgorithm.SHA1);
            } else if (uc.equals(DigestAlgorithm.SHA256.toString())) {
                ret.add(DigestAlgorithm.SHA256);
            } else if (uc.equals(DigestAlgorithm.SHA384.toString())) {
                ret.add(DigestAlgorithm.SHA384);
            } else if (uc.equals(DigestAlgorithm.SHA512.toString())) {
                ret.add(DigestAlgorithm.SHA512);
            } else {
                StringBuilder sb = new StringBuilder();
                int i = 0;
                for (DigestAlgorithm algo : DigestAlgorithm.values()) {
                    if (i++ > 0) {
                        sb.append(", ");
                    }
                    sb.append(algo.toString());
                }
                throw new IllegalArgumentException(
                        "Couldn't match " + s + " with any of: " + sb.toString());
            }
        }
        return ret.toArray(new DigestAlgorithm[ret.size()]);
    }

    /**
     * Very slight modification of Commons' BoundedInputStream
     * so that we can figure out if this hit the bound or not.
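     * <p>
     * Unlike the Commons version, {@link #reset()} also resets the internal
     * byte count, so each mark/reset digest pass sees the full bound again.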
     */
    private static class SimpleBoundedInputStream extends InputStream {
        private static final int EOF = -1;
        private final long max;
        private final InputStream in;
        private long pos;

        private SimpleBoundedInputStream(long max, InputStream in) {
            this.max = max;
            this.in = in;
        }

        @Override
        public int read() throws IOException {
            if (max >= 0 && pos >= max) {
                return EOF;
            }
            final int result = in.read();
            pos++;
            return result;
        }

        /**
         * Invokes the delegate's <code>read(byte[])</code> method.
         * @param b the buffer to read the bytes into
         * @return the number of bytes read or -1 if the end of stream or
         * the limit has been reached.
         * @throws IOException if an I/O error occurs
         */
        @Override
        public int read(final byte[] b) throws IOException {
            return this.read(b, 0, b.length);
        }

        /**
         * Invokes the delegate's <code>read(byte[], int, int)</code> method.
         * @param b the buffer to read the bytes into
         * @param off The start offset
         * @param len The number of bytes to read
         * @return the number of bytes read or -1 if the end of stream or
         * the limit has been reached.
         * @throws IOException if an I/O error occurs
         */
        @Override
        public int read(final byte[] b, final int off, final int len) throws IOException {
            if (max >= 0 && pos >= max) {
                return EOF;
            }
            final long maxRead = max >= 0 ? Math.min(len, max - pos) : len;
            final int bytesRead = in.read(b, off, (int) maxRead);

            if (bytesRead == EOF) {
                return EOF;
            }

            pos += bytesRead;
            return bytesRead;
        }

        /**
         * Invokes the delegate's <code>skip(long)</code> method.
         * @param n the number of bytes to skip
         * @return the actual number of bytes skipped
         * @throws IOException if an I/O error occurs
         */
        @Override
        public long skip(final long n) throws IOException {
            final long toSkip = max >= 0 ? Math.min(n, max - pos) : n;
            final long skippedBytes = in.skip(toSkip);
            pos += skippedBytes;
            return skippedBytes;
        }

        @Override
        public void reset() throws IOException {
            in.reset();
            //reset the count as well; mark is always taken at the start of the
            //stream in digest(), so the next pass should see the full bound
            pos = 0;
        }

        @Override
        public void mark(int readLimit) {
            in.mark(readLimit);
        }

        public boolean hasHitBound() {
            return pos >= max;
        }
    }
}
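/*
 * Usage sketch (illustrative only, not part of this class): wrap a parser in
 * DigestingParser so digests are written into the Metadata during parsing.
 * The file name "test.pdf" and the 20MB mark limit are arbitrary example values.
 *
 *   Parser parser = new DigestingParser(
 *           new AutoDetectParser(),
 *           new CommonsDigester(20000000, CommonsDigester.parse("md5,sha256")));
 *   Metadata metadata = new Metadata();
 *   try (InputStream stream = TikaInputStream.get(new File("test.pdf"))) {
 *       parser.parse(stream, new BodyContentHandler(), metadata, new ParseContext());
 *   }
 *   //DigestAlgorithm.MD5.getMetadataKey() resolves to "X-TIKA:digest:MD5"
 *   String md5 = metadata.get("X-TIKA:digest:MD5");
 */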