package org.icij.extract.document;

import org.apache.tika.metadata.Metadata;
import org.icij.task.Options;
import org.icij.task.annotation.Option;

import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Objects;
import java.util.Optional;

/**
 * A factory class for creating {@link Document} objects with default parameters.
 *
 * <p>{@link org.icij.extract.queue.DocumentQueue} implementations should use the {@literal create} method that
 * instantiates a {@link Document} with all of the information that it is capable of providing.
 *
 * <p>For example, a queue that stores only paths should use the {@link #create(Path)} method, whereas a queue
 * that stores both a path and ID should use {@link #create(String, Path)}.
 *
 * <p>The factory holds a single {@link Identifier} that is handed to every {@link Document} it creates. The
 * identifier is set either by {@link #configure(Options)} or by {@link #withIdentifier(Identifier)}.
 */
@Option(name = "idMethod", description = "The method for determining document IDs, for queues that use them. " +
        "Defaults to using the path as an ID.", parameter = "name")
@Option(name = "idDigestMethod", description = "For calculating document ID digests, where applicable depending on " +
        "the ID method.", parameter = "name")
@Option(name = "charset", description = "Set the output encoding for text and document attributes. " +
        "Defaults to UTF-8.", parameter = "name")
public class DocumentFactory {

    // Stays null until configure(...) or withIdentifier(...) is called; the create(...) methods pass it to
    // the Document constructors as-is.
    private Identifier identifier = null;

    /**
     * Configures the identifier used for created documents from the given options.
     *
     * <p>Options read:
     * <ul>
     *   <li>{@code idDigestMethod} - digest algorithm name, {@code "SHA-256"} when not set;</li>
     *   <li>{@code charset} - charset passed to the digest identifier, UTF-8 when not set;</li>
     *   <li>{@code idMethod} - {@code "path"} selects a {@link PathIdentifier}, {@code "tika-digest"} selects a
     *       {@link DigestIdentifier} built from the algorithm and charset above.</li>
     * </ul>
     *
     * <p>When {@code idMethod} is not set, a {@link DigestIdentifier} is used.
     * NOTE(review): the {@code idMethod} option description states that the default is the path; the code
     * below falls back to the digest identifier instead. Confirm which of the two is intended.
     *
     * @param options the options to read the configuration from
     * @return this factory, for chaining
     * @throws IllegalArgumentException if {@code idMethod} is set to an unrecognised value; the previously
     *                                  configured identifier is left untouched in that case
     */
    public DocumentFactory configure(final Options<String> options) {
        final String algorithm = options.get("idDigestMethod").value().orElse("SHA-256");
        final Charset charset = options.get("charset").parse().asCharset().orElse(StandardCharsets.UTF_8);
        final Optional<String> method = options.get("idMethod").value();

        if (!method.isPresent()) {
            // No explicit method requested: fall back to the digest identifier (see NOTE(review) above).
            this.identifier = new DigestIdentifier(algorithm, charset);
            return this;
        }

        final String name = method.get();
        switch (name) {
            case "path":
                this.identifier = new PathIdentifier();
                break;
            case "tika-digest":
                this.identifier = new DigestIdentifier(algorithm, charset);
                break;
            default:
                throw new IllegalArgumentException(String.format("\"%s\" is not a valid identifier.", name));
        }

        return this;
    }

    /**
     * Sets the identifier handed to every document created by this factory.
     *
     * @param identifier the identifier to use
     * @return this factory, for chaining
     * @throws NullPointerException if {@code identifier} is {@code null}
     */
    public DocumentFactory withIdentifier(final Identifier identifier) {
        Objects.requireNonNull(identifier, "Identifier generator must not be null.");
        this.identifier = identifier;
        return this;
    }

    /**
     * Creates a document from an explicit ID and a path.
     *
     * @param id the document ID
     * @param path the path of the document
     * @return a new document constructed with the given ID, this factory's identifier and the path
     */
    public Document create(final String id, final Path path) {
        return new Document(id, identifier, path);
    }

    /**
     * Creates a document from an explicit ID, a path and metadata.
     *
     * @param id the document ID
     * @param path the path of the document
     * @param metadata the metadata to pass to the document
     * @return a new document constructed with the given ID, this factory's identifier, the path and metadata
     */
    public Document create(final String id, final Path path, final Metadata metadata) {
        return new Document(id, identifier, path, metadata);
    }

    /**
     * Creates a document from a path only.
     *
     * @param path the path of the document
     * @return a new document constructed with this factory's identifier and the path
     */
    public Document create(final Path path) {
        return new Document(identifier, path);
    }

    /**
     * Creates a document from a path given as a string.
     *
     * <p>The string is converted with {@link Paths#get(String, String...)} and the result is passed to
     * {@link #create(Path)}.
     *
     * @param path the path of the document, as a string
     * @return a new document for the converted path
     */
    public Document create(final String path) {
        return create(Paths.get(path));
    }

    /**
     * Creates a document from a path and metadata.
     *
     * @param path the path of the document
     * @param metadata the metadata to pass to the document
     * @return a new document constructed with this factory's identifier, the path and metadata
     */
    public Document create(final Path path, final Metadata metadata) {
        return new Document(identifier, path, metadata);
    }

    /**
     * Creates a document from a URL.
     *
     * <p>The URL is converted to a URI and then to a {@link Path} with {@link Paths#get(java.net.URI)}; the
     * result is passed to {@link #create(Path)}.
     *
     * @param url the URL of the document
     * @return a new document for the path the URL resolves to
     * @throws URISyntaxException if the URL cannot be converted to a URI
     */
    public Document create(final URL url) throws URISyntaxException {
        return create(Paths.get(url.toURI()));
    }
}