package org.icij.extract.document;
import org.apache.tika.metadata.Metadata;
import javax.xml.bind.DatatypeConverter;
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import static java.util.Locale.ENGLISH;
public class DigestIdentifier extends AbstractIdentifier {
DigestIdentifier(final String algorithm, final Charset charset) {
super(algorithm, charset);
}
@Override
public String generate(final Document document) {
return hash(document).toLowerCase(ENGLISH);
}
@Override
public String generateForEmbed(final EmbeddedDocument embed) throws NoSuchAlgorithmException {
final MessageDigest digest = MessageDigest.getInstance(algorithm);
// Embedded documents in different files or the same file could have the same hash. Therefore, to avoid ID
// collisions within the child document tree, the digest considers:
// - the file digest hash
// - the parent path
// - the embedded relationship ID
// - the embedded document name
final Metadata metadata = embed.getMetadata();
final String embeddedRelationshipId = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
final String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
digest.update(hash(embed).getBytes(charset));
digest.update(embed.getParent().getId().getBytes(charset));
if (null != embeddedRelationshipId) {
digest.update(embeddedRelationshipId.getBytes(charset));
}
if (null != name) {
digest.update(name.getBytes(charset));
}
return DatatypeConverter.printHexBinary(digest.digest()).toLowerCase(ENGLISH);
}
}