/**
 * 
 */
package uk.bl.wa.util;

/*
 * #%L
 * warc-indexer
 * %%
 * Copyright (C) 2013 - 2014 The UK Web Archive
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import static org.archive.format.warc.WARCConstants.HEADER_KEY_PAYLOAD_DIGEST;
import static org.archive.format.warc.WARCConstants.HEADER_KEY_TYPE;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.codec.digest.MessageDigestAlgorithms;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.io.IOUtils;
import org.archive.format.warc.WARCConstants;
import org.archive.io.ArchiveRecordHeader;
import org.archive.util.Base32;
import org.jwat.common.RandomAccessFileInputStream;

/**
 * Utility class that takes a given input stream and caches its content
 * either in RAM or in a temporary file on disk, based on some size limits.
 * 
 * Also calculates the SHA-1 hash of the whole stream, reported as
 * {@code "sha1:"} followed by the Base32-encoded digest.
 * 
 * All reading happens in the constructor; afterwards the cached content can
 * be re-read any number of times via {@link #getInputStream()}. Callers
 * should invoke {@link #cleanup()} once they are finished so that the
 * temporary file (if any) and the file handles opened on it are released.
 * 
 * No synchronization is performed by this class.
 * 
 * @author anj
 *
 */
public class HashedCachedInputStream {
    private static final Log log = LogFactory.getLog( HashedCachedInputStream.class );

    private MessageDigest digest = null;

    /** Payload digest declared in the record header, or null if absent. */
    private String headerHash = null;

    /** Hash reported to callers; null if hashing never completed. */
    private String hash = null;

    private boolean inMemory;

    private File cacheFile;

    private byte[] cacheBytes;

    private boolean truncated = false;

    /**
     * Handles opened on the cache file by {@link #getInputStream()}. They are
     * tracked here so that {@link #cleanup()} can close them, rather than
     * relying on the wrapping stream to do so.
     */
    private final List<RandomAccessFile> openFiles = new ArrayList<RandomAccessFile>();

    // Thresholds:
    private long inMemoryThreshold = 1024*1024; // Up to 1MB allowed in RAM.
    private long onDiskThreshold = 1024*1024*100; // Up to 100MB cached on disk.

    /**
     * Constructor with explicit thresholds; processes the payload for the
     * hash and makes the content available.
     * 
     * @param header the header of the record the payload belongs to
     * @param in the stream to hash and cache
     * @param length the number of bytes expected from the stream
     * @param inMemoryThreshold payloads shorter than this are held in RAM,
     *            anything else goes to a temporary file
     * @param onDiskThreshold maximum number of bytes that are cached; longer
     *            payloads are hashed in full but cached truncated
     */
    public HashedCachedInputStream( ArchiveRecordHeader header, InputStream in, long length, long inMemoryThreshold, long onDiskThreshold ) {
        this.inMemoryThreshold = inMemoryThreshold;
        this.onDiskThreshold = onDiskThreshold;
        init(header,in,length);
    }

    /**
     * Constructor using the default thresholds (1MB in RAM, 100MB on disk);
     * processes the payload for the hash and makes the content available.
     * 
     * @param header the header of the record the payload belongs to
     * @param in the stream to hash and cache
     * @param length the number of bytes expected from the stream
     */
    public HashedCachedInputStream( ArchiveRecordHeader header, InputStream in, long length ) {
        init(header,in,length);
    }

    /**
     * Reads the stream once: caches up to {@code onDiskThreshold} bytes,
     * digests everything, and reconciles the result with any payload digest
     * found in the record header.
     * 
     * Failures are logged rather than propagated, so a partially initialised
     * instance is possible; {@link #getHash()} then returns null.
     * 
     * @param header the header of the record the payload belongs to
     * @param in the stream to hash and cache
     * @param length the number of bytes expected from the stream
     */
    private void init(ArchiveRecordHeader header, InputStream in, long length) {
        try {
            digest = MessageDigest.getInstance( MessageDigestAlgorithms.SHA_1 );
        } catch (NoSuchAlgorithmException e) {
            log.error( "Hashing: " + header.getUrl() + "@" + header.getOffset(), e );
        }

        try {
            if( header.getHeaderFieldKeys().contains( HEADER_KEY_PAYLOAD_DIGEST ) ) {
                headerHash = ( String ) header.getHeaderValue( HEADER_KEY_PAYLOAD_DIGEST );
            }

            DigestInputStream dinput = new DigestInputStream( in, digest );
            cachePayload( dinput, length );

            // Read the remainder of the stream, to get the hash.
            if( length > this.onDiskThreshold ) {
                truncated = true;
                IOUtils.skip( dinput, length - this.onDiskThreshold );
            }
            hash = "sha1:" + Base32.encode( digest.digest() );

            reconcileWithHeaderHash( header );
        } catch( Exception i ) {
            // Deliberately best-effort: log and leave whatever state was reached.
            log.error( "Hashing: " + header.getUrl() + "@" + header.getOffset(), i );
        }
    }

    /**
     * Copies at most {@code onDiskThreshold} bytes from the source into the
     * cache, choosing RAM or a temporary file based on the declared length.
     * 
     * @param source the (digesting) stream to copy from
     * @param length the number of bytes expected from the stream
     * @throws IOException if the temporary file cannot be created or the copy fails
     */
    private void cachePayload( InputStream source, long length ) throws IOException {
        long toCopy = length;
        if( length > this.onDiskThreshold ) {
            toCopy = this.onDiskThreshold;
        }

        if( length < inMemoryThreshold ) {
            inMemory = true;
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            IOUtils.copyLarge( source, buffer, 0, toCopy );
            // Capture the bytes immediately, so that a later hash mismatch does
            // not discard the content (the on-disk cache survives that case too).
            this.cacheBytes = buffer.toByteArray();
        } else {
            inMemory = false;
            cacheFile = File.createTempFile( "warc-indexer", ".cache" );
            cacheFile.deleteOnExit();
            OutputStream out = new FileOutputStream( cacheFile );
            try {
                IOUtils.copyLarge( source, out, 0, toCopy );
            } finally {
                // Always release the file handle, even if the copy failed.
                out.close();
            }
        }
    }

    /**
     * Compares the computed hash against the payload digest from the record
     * header, if there is one.
     * 
     * For response records a mismatch raises a RuntimeException. For every
     * other record type the header digest replaces the computed hash.
     * 
     * @param header the header of the record the payload belongs to
     */
    private void reconcileWithHeaderHash( ArchiveRecordHeader header ) {
        if( headerHash == null ) {
            return;
        }

        boolean isResponse = header.getHeaderFieldKeys().contains( HEADER_KEY_TYPE )
                && header.getHeaderValue( HEADER_KEY_TYPE ).equals( WARCConstants.WARCRecordType.response.toString() );

        if( ! isResponse ) {
            // For revisit records, use the hash of the revisited payload:
            // TODO this should actually only do it for revisit type records.
            this.hash = this.headerHash;
            return;
        }

        // For response records, check the hash is consistent with the header hash:
        if( ! headerHash.equals( hash ) ) {
            log.error("Hashes are not equal for this input!");
            throw new RuntimeException("Hash check failed!");
        }
        log.debug("Hashes were found to match for "+header.getUrl());
    }

    /**
     * @return the hash as {@code "sha1:"} plus the Base32 digest, or the
     *         header-declared payload digest for non-response records that
     *         carry one; null if hashing did not complete
     */
    public String getHash() {
        return hash;
    }

    /**
     * Opens a fresh stream over the cached content.
     * 
     * For disk-backed content every call opens a new handle on the cache
     * file; those handles are closed by {@link #cleanup()}.
     * 
     * @return a stream over the cached bytes; an empty stream if the
     *         in-memory cache was never filled; null if the cache file is
     *         missing or cannot be opened
     */
    public InputStream getInputStream() {
        if( inMemory ) {
            if( this.cacheBytes != null ) {
                return new ByteArrayInputStream( this.cacheBytes );
            }
            log.error("Found a NULL byte array!");
            return new ByteArrayInputStream( new byte[] {} );
        }

        if( cacheFile == null ) {
            // Initialisation failed before a cache file could be created.
            log.error("No cache file available!");
            return null;
        }

        try {
            RandomAccessFile raf = new RandomAccessFile( cacheFile, "r" );
            openFiles.add( raf );
            return new RandomAccessFileInputStream( raf );
        } catch (FileNotFoundException e) {
            log.error( "Could not open cache file " + cacheFile, e );
            return null;
        }
    }

    /**
     * @return true if the payload was longer than the on-disk threshold, so
     *         that only the leading part of it was cached
     */
    public boolean isTruncated() {
        return truncated;
    }

    /**
     * Releases the resources held by this instance: closes every file handle
     * opened by {@link #getInputStream()} and deletes the temporary cache
     * file, if there is one. Streams obtained from a disk-backed instance
     * must not be used after this call.
     */
    public void cleanup() {
        for( RandomAccessFile raf : openFiles ) {
            try {
                raf.close();
            } catch (IOException e) {
                log.warn( "Could not close handle on cache file " + cacheFile, e );
            }
        }
        openFiles.clear();

        if( this.cacheFile != null ) {
            this.cacheFile.delete();
        }
    }
}