package org.icij.extract.spewer;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.commons.io.TaggedIOException;
import org.apache.commons.io.output.TaggedOutputStream;
import org.apache.tika.metadata.Metadata;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonEncoding;
import org.icij.extract.document.Document;
import org.icij.extract.extractor.Extractor;
import org.icij.task.Options;
import org.icij.task.annotation.Option;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
/**
 * Writes the text or HTML output from a {@link Reader} to the filesystem.
 * Metadata is written to a JSON file alongside the extracted contents.
 *
 * <p>The output location for a document is derived from the document's own path, re-rooted under the
 * configured output directory. The contents file gets the configured extension ({@code txt} by default,
 * {@code html} when the HTML output format is selected) and the metadata file gets a {@code .json}
 * extension.</p>
 *
 * <p>Instances are passed as the tag of the {@link TaggedIOException}s thrown by this class, so callers can
 * tell failures that originated in the spewer apart from other I/O failures.</p>
 *
 * @since 1.0.0-beta
 */
@Option(name = "outputDirectory", description = "Directory to output extracted text. Defaults to the " +
"current directory.", parameter = "path")
@Option(name = "outputFormat", description = "Set the output format. Either \"text\" or \"HTML\". " +
"Defaults to text output.", parameter = "type")
public class FileSpewer extends Spewer implements Serializable {

    private static final Logger logger = LoggerFactory.getLogger(FileSpewer.class);
    private static final long serialVersionUID = -6541331052292803766L;

    /** Directory under which all output files are placed; {@code null} means "use the document path as is". */
    private Path outputDirectory = Paths.get(".");

    /** Extension appended to the contents file, without the leading dot. */
    private String outputExtension = "txt";

    /**
     * Creates a spewer that writes to the current directory using the {@code txt} extension.
     *
     * @param fields the field names handed to the parent {@link Spewer} and used when transforming metadata
     */
    public FileSpewer(final FieldNames fields) {
        super(fields);
    }

    /**
     * Applies the {@code outputFormat} and {@code outputDirectory} options, after letting the parent class
     * configure itself.
     *
     * @param options the options to read from
     * @return this instance, for chaining
     */
    @Override
    public FileSpewer configure(final Options<String> options) {
        super.configure(options);

        final Extractor.OutputFormat outputFormat = options.get("outputFormat").parse()
                .asEnum(Extractor.OutputFormat::parse).orElse(null);

        // Only the HTML format changes the extension; anything else leaves the current value untouched.
        if (Extractor.OutputFormat.HTML == outputFormat) {
            outputExtension = "html";
        }

        options.get("outputDirectory").parse().asPath().ifPresent(this::setOutputDirectory);
        return this;
    }

    /**
     * Sets the directory under which output files are written.
     *
     * @param outputDirectory the new output directory
     */
    public void setOutputDirectory(final Path outputDirectory) {
        this.outputDirectory = outputDirectory;
    }

    /**
     * @return the directory under which output files are written
     */
    public Path getOutputDirectory() {
        return outputDirectory;
    }

    /**
     * @return the extension appended to contents files, without the leading dot
     */
    public String getOutputExtension() {
        return outputExtension;
    }

    /**
     * Does nothing: every file is opened and closed within the call that writes it.
     */
    @Override
    public void close() throws IOException {}

    /**
     * Copies the extracted contents of a document to its output file, creating parent directories as needed,
     * and then writes the metadata file if metadata output is enabled.
     *
     * @param document the document the contents were extracted from
     * @param reader the source of the extracted contents
     * @throws IOException if reading fails; a {@link TaggedIOException} tagged with this instance if the
     * directories cannot be created or writing to the output file fails
     */
    @Override
    public void write(final Document document, final Reader reader) throws IOException {
        final Path outputPath = getOutputPath(document);
        final Path contentsOutputPath = (null != outputExtension)
                ? withExtension(outputPath, outputExtension)
                : outputPath;

        logger.info("Outputting to file: \"{}\".", contentsOutputPath);
        makeParentDirectories(contentsOutputPath);

        TaggedOutputStream tagged = null;

        // #copy buffers the input so there's no need to use an output buffer.
        try (final OutputStream output = Files.newOutputStream(contentsOutputPath)) {
            tagged = new TaggedOutputStream(output);
            copy(reader, tagged);
        } catch (IOException e) {
            // Only failures raised by the output side are re-tagged with this spewer; read failures and
            // failures to open the file propagate unchanged.
            if (null != tagged && tagged.isCauseOf(e)) {
                throw new TaggedIOException(new IOException(String.format("Error writing output to file: \"%s\".",
                        contentsOutputPath), e), this);
            }

            throw e;
        }

        if (outputMetadata) {
            writeMetadata(document);
        }
    }

    /**
     * Writes the metadata of a document as a pretty-printed JSON object to a {@code .json} file next to the
     * contents file. Single-valued fields become string members and multi-valued fields become arrays of
     * strings.
     *
     * @param document the document whose metadata should be written
     * @throws IOException a {@link TaggedIOException} tagged with this instance if the directories cannot be
     * created or the JSON cannot be written
     */
    @Override
    public void writeMetadata(final Document document) throws IOException {
        final Metadata metadata = document.getMetadata();
        final Path outputPath = withExtension(getOutputPath(document), "json");

        logger.info("Outputting metadata to file: \"{}\".", outputPath);

        // This method is public, so it may be called without #write having created the directories first.
        makeParentDirectories(outputPath);

        try (final JsonGenerator jsonGenerator = new JsonFactory().createGenerator(outputPath.toFile(),
                JsonEncoding.UTF8)) {
            jsonGenerator.useDefaultPrettyPrinter();
            jsonGenerator.writeStartObject();

            new MetadataTransformer(metadata, fields).transform(jsonGenerator::writeStringField, (name, values) -> {
                // writeArrayFieldStart already emits the START_ARRAY token, so the array must only be
                // closed here; opening a second one would nest the values and leave the array unterminated.
                jsonGenerator.writeArrayFieldStart(name);
                for (String value : values) {
                    jsonGenerator.writeString(value);
                }
                jsonGenerator.writeEndArray();
            });

            jsonGenerator.writeEndObject();
            jsonGenerator.writeRaw('\n');
        } catch (IOException e) {
            // Keep the original exception as the cause so the reason for the failure is not lost.
            throw new TaggedIOException(new IOException("Unable to output JSON.", e), this);
        }
    }

    /**
     * Appends {@code "." + extension} to the given path, staying on the path's own filesystem.
     */
    private static Path withExtension(final Path path, final String extension) {
        return path.getFileSystem().getPath(path.toString() + "." + extension);
    }

    /**
     * Creates the parent directories of the given file if it has a parent and they do not exist yet.
     *
     * @throws TaggedIOException tagged with this instance if the directories cannot be created
     */
    private void makeParentDirectories(final Path file) throws TaggedIOException {
        final Path parent = file.getParent();
        if (null == parent) {
            return;
        }

        final File parentFile = parent.toFile();

        // File#mkdirs returns false if the path already exists, so only fail when it is not a directory.
        if (!parentFile.mkdirs() && !parentFile.isDirectory()) {
            throw new TaggedIOException(new IOException(String.format("Unable to make directories for file: \"%s\".",
                    file)), this);
        }
    }

    /**
     * Maps the path of a document to the base output path (without extension) for that document.
     *
     * <p>When an output directory is set, the document path is joined onto it. An absolute document path is
     * first stripped of its root component (for example {@code /} or {@code C:\}) so that it can be nested
     * under the output directory. The path is not normalised.</p>
     */
    private Path getOutputPath(final Document document) {
        final Path path = document.getPath();

        if (null == outputDirectory) {
            return path;
        }

        if (!path.isAbsolute()) {
            return outputDirectory.resolve(path);
        }

        // A path consisting only of a root has no name elements left to nest under the output directory.
        final int nameCount = path.getNameCount();
        if (0 == nameCount) {
            return outputDirectory;
        }

        // Path#subpath drops the root whatever its textual length, unlike removing a single leading character.
        return outputDirectory.resolve(path.subpath(0, nameCount).toString());
    }
}