/*
 *  Copyright 2011-2014 Eric F. Savage, code@efsavage.com
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package com.ajah.util.io.file;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;

import lombok.extern.java.Log;

import com.ajah.util.lang.NameValuePair;

/**
 * Set of utilities for hashing files.
 * 
 * @author <a href="http://efsavage.com">Eric F. Savage</a>, <a
 *         href="mailto:code@efsavage.com">code@efsavage.com</a>.
 */
@Log
public class FileHashUtils {

    /**
     * Splits a hash signature into a directory structure so that there will not
     * be too many files in a single directory.
     * 
     * Examples:
     * 
     * <code>getHashedFileName("123abcdefhjkl",3,2)</code> will return
     * "12/3a/bc/123abcdefhjkl"
     * 
     * <code>getHashedFileName("123abcdefhjkl",2,5)</code> will return
     * "123ab/cdefh/123abcdefhjkl"
     * 
     * @param hash
     *            The hash to split. Must contain at least
     *            <code>depth * breadth</code> characters.
     * @param depth
     *            The number of subdirectories. A value of zero or less yields
     *            the hash itself with no directory prefix.
     * @param breadth
     *            The length of each split sequence.
     * @return The filename with parent directories.
     * @throws IndexOutOfBoundsException
     *             If the hash is too short to be split into the requested
     *             number of segments.
     */
    public static String getHashedFileName(final String hash, final int depth, final int breadth) {
        final StringBuilder name = new StringBuilder();
        for (int i = 0; i < depth; i++) {
            name.append(hash.substring(i * breadth, (i + 1) * breadth));
            name.append("/");
        }
        name.append(hash);
        return name.toString();
    }

    /**
     * Reads a file and returns the MD5 checksum of it.
     * 
     * @param file
     *            The file to checksum.
     * @param blockSize
     *            The size of the block to read, must be greater than zero.
     *            Higher value may result in more memory and faster
     *            performance.
     * @return The hexadecimal representation of the MD5 checksum of the file's
     *         data. Will return null if the file cannot be read or if a
     *         {@link NoSuchAlgorithmException} occurs.
     * @throws IOException
     *             If there was an error reading the file.
     * @throws IllegalArgumentException
     *             If blockSize is not greater than zero.
     */
    public static String md5Hex(final File file, final int blockSize) throws IOException {
        requirePositiveBlockSize(blockSize);
        final long start = System.currentTimeMillis();
        if (!file.canRead()) {
            return null;
        }
        final MessageDigest md = newMd5Digest();
        if (md == null) {
            return null;
        }
        // One buffer is reused for every read; its contents are never
        // inspected here because the DigestInputStream feeds the digest.
        final byte[] block = new byte[blockSize];
        try (InputStream fis = new FileInputStream(file); InputStream is = new DigestInputStream(fis, md)) {
            int read;
            do {
                read = is.read(block);
            } while (read >= 0);
        }
        final String hash = toHex(md.digest());
        if (log.isLoggable(Level.FINE)) {
            final long duration = System.currentTimeMillis() - start;
            log.fine(file.length() + " took " + duration + "ms");
        }
        return hash;
    }

    /**
     * Will return a list of individually hashed chunks of a file. Every chunk
     * except possibly the last one covers exactly <code>blockSize</code>
     * bytes.
     * 
     * @param file
     *            The file to hash/split.
     * @param blockSize
     *            The size of the block to split the file into, must be greater
     *            than zero.
     * @return A list of {@link NameValuePair} where the name is the hash and
     *         the value is the number of bytes. The list is empty for an empty
     *         file. Will return null if the file cannot be read, if an
     *         {@link IOException} occurs while reading it, or if a
     *         {@link NoSuchAlgorithmException} occurs.
     * @throws IllegalArgumentException
     *             If blockSize is not greater than zero.
     */
    public static List<NameValuePair<Integer>> md5HexChunks(final File file, final int blockSize) {
        requirePositiveBlockSize(blockSize);
        final long start = System.currentTimeMillis();
        if (!file.canRead()) {
            return null;
        }
        final MessageDigest md = newMd5Digest();
        if (md == null) {
            return null;
        }
        final List<NameValuePair<Integer>> nvps = new ArrayList<>();
        final byte[] block = new byte[blockSize];
        try (InputStream is = new FileInputStream(file)) {
            while (true) {
                final long fragmentStart = System.currentTimeMillis();
                final int read = readBlock(is, block);
                if (read == 0) {
                    // blockSize is positive, so zero bytes means end of file.
                    break;
                }
                md.update(block, 0, read);
                // digest() also resets the MessageDigest, so the next chunk
                // is hashed independently of this one.
                final String hash = toHex(md.digest());
                final NameValuePair<Integer> nvp = new NameValuePair<>(hash, Integer.valueOf(read));
                if (log.isLoggable(Level.FINE)) {
                    log.fine(FileHashUtils.getHashedFileName(hash, 3, 2));
                    final long fragmentDuration = System.currentTimeMillis() - fragmentStart;
                    log.fine(nvp.getValue() + " bytes hashing to " + nvp.getName() + " took " + fragmentDuration + "ms");
                }
                nvps.add(nvp);
            }
        } catch (final IOException e) {
            if (!isExpectedAccessFailure(e.getMessage())) {
                log.warning(file.getAbsolutePath());
                log.log(Level.WARNING, e.getMessage(), e);
            }
            return null;
        }
        if (log.isLoggable(Level.FINE)) {
            final long duration = System.currentTimeMillis() - start;
            log.fine(nvps.size() + " fragments took " + duration + "ms");
        }
        return nvps;
    }

    /**
     * Rejects block sizes that cannot be used as a read buffer. A size of zero
     * would make every read return zero bytes and never reach end of file.
     */
    private static void requirePositiveBlockSize(final int blockSize) {
        if (blockSize <= 0) {
            throw new IllegalArgumentException("blockSize must be greater than zero, was " + blockSize);
        }
    }

    /**
     * Creates an MD5 digest, or logs a warning and returns null if the
     * algorithm is not available.
     */
    private static MessageDigest newMd5Digest() {
        try {
            return MessageDigest.getInstance("MD5");
        } catch (final NoSuchAlgorithmException e) {
            log.log(Level.WARNING, e.getMessage(), e);
            return null;
        }
    }

    /**
     * Formats a digest as lowercase hexadecimal, zero-padded to two characters
     * per byte.
     */
    private static String toHex(final byte[] digest) {
        return String.format("%0" + (digest.length << 1) + "x", new BigInteger(1, digest));
    }

    /**
     * Reads from the stream until the block is full or the stream ends, since
     * a single read call may return fewer bytes than requested.
     * 
     * @return The number of bytes placed in the block; less than the block
     *         length only when end of stream was reached.
     */
    private static int readBlock(final InputStream in, final byte[] block) throws IOException {
        int filled = 0;
        while (filled < block.length) {
            final int read = in.read(block, filled, block.length - filled);
            if (read < 0) {
                break;
            }
            filled += read;
        }
        return filled;
    }

    /**
     * Tells whether an {@link IOException} message is one of the two failures
     * that are deliberately not logged: an access-denied error or a file that
     * is partially locked by another process. A null message is never treated
     * as expected.
     */
    private static boolean isExpectedAccessFailure(final String message) {
        if (message == null) {
            return false;
        }
        return message.endsWith("(Access is denied)")
                || message.equals("The process cannot access the file because another process has locked a portion of the file");
    }

}