package org.icij.extract.document;
import org.apache.tika.metadata.Metadata;
/**
 * An {@linkplain Identifier} holds logic for generating both unique identifiers for documents as well as digest
 * hashes of the underlying file data.
 *
 * <p>Two distinct concepts are exposed: a document <em>identifier</em> ({@link #generate(Document)} and
 * {@link #generateForEmbed(EmbeddedDocument)}) and a file <em>hash</em> ({@link #hash(Document)} and
 * {@link #retrieveHash(Metadata)}). See {@link #hash(Document)} for how the two differ.
 */
public interface Identifier {

    /**
     * Generate an identifier for a root document.
     *
     * @param document the document to generate an identifier for
     * @return a unique identifier, for example a fixed-length hash
     * @throws Exception if there's an error generating the ID
     */
    String generate(Document document) throws Exception;

    /**
     * Generate an identifier for an embedded document.
     *
     * @param document the embedded document to generate an ID for
     * @return a unique identifier for the embedded document
     * @throws Exception if there's an error generating the ID
     */
    String generateForEmbed(EmbeddedDocument document) throws Exception;

    /**
     * Generate or retrieve (from metadata) a hash digest of the document's underlying file data.
     *
     * <p>Even if the {@link #generate(Document)} and {@link #generateForEmbed(EmbeddedDocument)} methods of the
     * implementation generate hash digests, those are semantically different as they represent a hash of the
     * document, rather than the file. The former might comprise the relationship of the document with its
     * parent, or its position in the path hierarchy, whereas the latter must not.
     *
     * @param document the document for which to return a file hash digest
     * @return the hash
     * @throws Exception if there's an error generating the hash
     */
    String hash(Document document) throws Exception;

    /**
     * Retrieve a hash digest of the document's underlying file data from the given metadata.
     *
     * <p>Unlike {@link #hash(Document)}, this method is handed only the metadata rather than the document, and
     * it declares no checked exception.
     *
     * <p>NOTE(review): the result when the metadata holds no hash (for example a {@code null} return) is not
     * specified by this interface; confirm against the implementations before relying on it.
     *
     * @param metadata the document's metadata
     * @return the hash
     */
    String retrieveHash(Metadata metadata);
}