package org.icij.extract.spewer;

import java.io.Reader;
import java.io.Writer;
import java.io.Serializable;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.io.OutputStream;
import java.io.StringWriter;

import java.util.Map;
import java.util.HashMap;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.icij.extract.document.Document;
import org.icij.extract.parser.ParsingReader;
import org.icij.task.Options;
import org.icij.task.annotation.Option;

/**
 * Base class for {@linkplain Spewer} subclasses that write text output from a {@link ParsingReader} to specific
 * endpoints.
 *
 * <p>The output encoding is kept in a {@code transient} field and written to the serialization stream by its
 * canonical name, because {@link Charset} itself does not implement {@link Serializable}.</p>
 *
 * @since 1.0.0-beta
 */
@Option(name = "outputMetadata", description = "Output metadata along with extracted text. For the " +
        "\"file\" output type, a corresponding JSON file is created for every input file. With indexes, metadata " +
        "fields are set using an optional prefix. On by default.")
@Option(name = "tag", description = "Set the given field to a corresponding value on each document output.",
        parameter = "name-value-pair")
@Option(name = "charset", description = "Set the output encoding for text and document attributes. " +
        "Defaults to UTF-8.", parameter = "name")
public abstract class Spewer implements AutoCloseable, Serializable {

    private static final long serialVersionUID = 5169670165236652447L;

    /** Whether metadata should be output along with the extracted text. Enabled by default. */
    boolean outputMetadata = true;

    /**
     * Encoding used when copying text to byte streams.
     *
     * <p>Transient because {@link Charset} is not serializable; see {@link #writeObject(ObjectOutputStream)} and
     * {@link #readObject(ObjectInputStream)}.</p>
     */
    private transient Charset outputEncoding = StandardCharsets.UTF_8;

    /** Tag names mapped to the value registered for each of them. */
    final Map<String, String> tags = new HashMap<>();

    /** Field names handed to the constructor and exposed through {@link #getFields()}. */
    protected final FieldNames fields;

    /**
     * Creates a spewer that uses the given field names.
     *
     * @param fields the field names to expose through {@link #getFields()}
     */
    public Spewer(final FieldNames fields) {
        this.fields = fields;
    }

    /**
     * Applies the {@code outputMetadata}, {@code charset} and {@code tag} options to this spewer.
     *
     * <p>Each {@code tag} value must have the form {@code name:value}; it is split on the first colon only.</p>
     *
     * @param options the options to read from
     * @return this spewer, to allow call chaining
     * @throws IllegalArgumentException if a {@code tag} value contains no colon
     */
    public Spewer configure(final Options<String> options) {
        options.get("outputMetadata").parse().asBoolean().ifPresent(this::outputMetadata);
        options.get("charset").value(Charset::forName).ifPresent(this::setOutputEncoding);
        options.get("tag").values().forEach(this::setTag);

        return this;
    }

    /**
     * Writes the text supplied by the reader for the given document.
     *
     * @param document the document the text belongs to
     * @param reader the source of the text
     * @throws IOException if the text cannot be read or written
     */
    public abstract void write(final Document document, final Reader reader) throws IOException;

    /**
     * Writes the metadata of the given document.
     *
     * @param document the document whose metadata should be written
     * @throws IOException if the metadata cannot be written
     */
    public abstract void writeMetadata(final Document document) throws IOException;

    /**
     * @return the field names given at construction time
     */
    public FieldNames getFields() {
        return fields;
    }

    /**
     * Sets the encoding used by {@link #copy(Reader, OutputStream)}.
     *
     * @param outputEncoding the new output encoding
     */
    public void setOutputEncoding(final Charset outputEncoding) {
        this.outputEncoding = outputEncoding;
    }

    /**
     * @return the current output encoding
     */
    public Charset getOutputEncoding() {
        return outputEncoding;
    }

    /**
     * Sets the metadata output flag.
     *
     * @param outputMetadata the new value of the flag
     */
    public void outputMetadata(final boolean outputMetadata) {
        this.outputMetadata = outputMetadata;
    }

    /**
     * @return the current value of the metadata output flag
     */
    public boolean outputMetadata() {
        return outputMetadata;
    }

    /**
     * Registers every entry of the given map as a tag. Tags that are already registered under the same name are
     * overwritten; other existing tags are kept.
     *
     * @param tags the tag names and values to add
     */
    public void setTags(final Map<String, String> tags) {
        // The parameter shadows the field: this iterates the argument and stores into the field via setTag.
        tags.forEach(this::setTag);
    }

    /**
     * Registers a single tag, replacing any value previously stored under the same name.
     *
     * @param name the tag name
     * @param value the tag value
     */
    private void setTag(final String name, final String value) {
        tags.put(name, value);
    }

    /**
     * Parses a {@code name:value} pair and registers it as a tag.
     *
     * @param tag the pair to parse; only the first colon acts as the separator
     * @throws IllegalArgumentException if the argument contains no colon
     */
    private void setTag(final String tag) {
        // A limit of 2 keeps any further colons inside the value.
        final String[] pair = tag.split(":", 2);

        if (2 == pair.length) {
            setTag(pair[0], pair[1]);
        } else {
            throw new IllegalArgumentException(String.format("Invalid tag pair: \"%s\".", tag));
        }
    }

    /**
     * Copies all characters from the reader to the byte stream, encoding them with the current output encoding.
     *
     * <p>The stream is flushed through the intermediate writer but is not closed here.</p>
     *
     * @param input the source of characters
     * @param output the destination byte stream
     * @throws IOException if reading or writing fails
     */
    protected void copy(final Reader input, final OutputStream output) throws IOException {
        copy(input, new OutputStreamWriter(output, outputEncoding));
    }

    /**
     * Copies all characters from the reader to the writer and then flushes the writer. Neither argument is closed.
     *
     * @param input the source of characters
     * @param output the destination writer
     * @throws IOException if reading or writing fails
     */
    public static void copy(final Reader input, final Writer output) throws IOException {
        final char[] buffer = new char[1024];
        int n;

        while (-1 != (n = input.read(buffer))) {
            output.write(buffer, 0, n);
        }

        output.flush();
    }

    /**
     * Reads the given reader until it is exhausted and returns everything that was read.
     *
     * @param reader the source of characters
     * @return the complete content of the reader
     * @throws IOException if reading fails
     */
    public static String toString(final Reader reader) throws IOException {
        final StringWriter writer = new StringWriter(4096);

        copy(reader, writer);
        return writer.toString();
    }

    /**
     * Serialization hook: writes the default fields followed by the canonical name of the output encoding, or
     * {@code null} when no encoding is set.
     *
     * @param out the stream to write to
     * @throws IOException if writing fails
     */
    private void writeObject(final ObjectOutputStream out) throws IOException {
        out.defaultWriteObject();
        out.writeObject(null == outputEncoding ? null : outputEncoding.name());
    }

    /**
     * Serialization hook: reads the default fields and then restores the output encoding from the charset name
     * written by {@link #writeObject(ObjectOutputStream)}.
     *
     * @param in the stream to read from
     * @throws IOException if reading fails
     * @throws ClassNotFoundException if the class of a serialized object cannot be found
     */
    private void readObject(final ObjectInputStream in) throws IOException, ClassNotFoundException {
        in.defaultReadObject();

        final String name = (String) in.readObject();

        outputEncoding = null == name ? null : Charset.forName(name);
    }
}