//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.consumers.template; import java.io.IOException; import java.io.Writer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Collection; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; import org.apache.commons.io.FilenameUtils; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import com.samskivert.mustache.Mustache; import com.samskivert.mustache.Mustache.Compiler; import com.samskivert.mustache.Template; import uk.gov.dstl.baleen.consumers.template.ExtractedRecord.Kind; import uk.gov.dstl.baleen.consumers.utils.SingleDocumentConsumerFormat; import uk.gov.dstl.baleen.types.metadata.Metadata; /** * Abstract base implementation of the Mustache HTML template record consumer. * <p> * Subclasses should implement {@link #mapFields(Map)} to convert from records * to template fields. * </p> * <p> * The mustache context is populated with entries for metadata, content, * sources, records, and fields as per * {@link #writeRecords(String, JCas, Map, Map)}. * </p> * <p> * Maps exposed in the context can be iterated by using the entrySet property, * and then referring to the {{key}} and {{value}} properties, as per JMustache * convention (see * <a href="https://github.com/samskivert/jmustache/issues/82">JMustache github * ticket #82</a> for details) - eg for metadata: * </p> * * <pre> * {{#metadata.entrySet}} * {{key}} = {{value}} * {{/metadata.entrySet}} * </pre> * * An example template that uses all exposed properties in the context may look * like: * * <pre> <html> <body> <h1>All metadata</h1> <div> <table> <tbody> {{#metadata.entrySet}} <tr> <th>{{key}}</th> <td>{{value}}</td> </tr> {{/metadata.entrySet}} </tbody> </table> </div> <h1>Single metadata field</h1> <table> <tbody> <tr> <th>Author</th> <td>{{metadata.author}}</td> </tr> </tbody> </table> <div> <h1>Content</h1> <pre> {{content}} </pre> </div> <div> <h1>Sources</h1> {{#sources.entrySet}} <div> <h2>{{key}}</h2> {{#value.entrySet}} <h3>Record {{key}}</h3> <h4>Fields</h4> <table> <tbody> {{#value.entrySet}} <tr> <th>{{key}}</th> <td>{{value}}</td> </tr> {{/value.entrySet}} </tbody> </table> {{/value.entrySet}} </div> {{/sources.entrySet}} </div> <div> <h1>Flattened Records</h1> {{#records.entrySet}} <div> <h2>Record {{key}}</h2> <h3>Fields</h3> <table> <tbody> {{#value.entrySet}} <tr> <th>{{key}}</th> <td>{{value}}</td> </tr> {{/value.entrySet}} </tbody> </table> </div> {{/records.entrySet}} </div> <div> <h1>Fields</h1> <div> <table> <tbody> {{#fields.entrySet}} <tr> <th>{{key}}</th> <td>{{value}}</td> </tr> {{/fields.entrySet}} </tbody> </table> </div> </div> </body> </html> * </pre> */ public abstract class AbstractMustacheHtmlTemplateRecordConsumer extends AbstractTemplateRecordConsumer { /** The Constant PARAM_OUTPUT_DIRECTORY. */ public static final String PARAM_OUTPUT_DIRECTORY = "outputDirectory"; /** * The output directory. * * @baleen.config generatedDocuments */ @ConfigurationParameter(name = PARAM_OUTPUT_DIRECTORY, defaultValue = "generatedDocuments") private String outputDirectory = "generatedDocuments"; /** * Compile template. * * @return * * @throws IOException * Signals that an I/O exception has occurred. */ protected static Template compileTemplate(Path templateFilepath) throws IOException { Compiler compiler = Mustache.compiler(); String templateHtml = new String(Files.readAllBytes(templateFilepath), StandardCharsets.UTF_8); return compiler.compile(templateHtml); } @Override protected void writeRecords(JCas jCas, String documentSourceName, Map<String, Collection<ExtractedRecord>> records) throws AnalysisEngineProcessException { Collection<Metadata> metadata = JCasUtil.select(jCas, Metadata.class); Map<String, Object> metadataMap = SingleDocumentConsumerFormat.createMetadataMap(metadata); Map<String, ?> fields = mapFields(records); Map<String, Object> mustacheContext = new HashMap<>(fields); mustacheContext.put("metadata", metadataMap); mustacheContext.put("content", jCas.getDocumentText()); writeRecords(documentSourceName, jCas, records, mustacheContext); } /** * Write records using the given mustache template. * * The JCas and records are given for reference; subclasses should generally * use the mustache context, which contains entries for: * * <dl> * <dt>metadata</dt> * <dd>Metadata annotations, collected with * {@link SingleDocumentConsumerFormat#createMetadataMap(Collection)}. * Values for duplicate metadata keys are collected as a list.</dd> * <dt>content</dt> * <dd>The document content, as set on the JCas.</dd> * <dt>sources</dt> * <dd>A map of the records per source. If source names are duplicated then * the records from the last entry will overwrite previous entries.</dd> * <dt>records</dt> * <dd>A flattened map of the records found. If record names are duplicated * between sources then the records from the last entry will overwrite * previous entries.</dd> * <dt>fields</dt> * <dd>A flattened map of all the fields found. If record names or sources * are duplicated then the fields from the last entry will overwrite * previous entries.</dd> * </dl> * * @param documentSourceName * the document source name * @param records * the records * @param mustacheContext * the mustache context */ protected abstract void writeRecords(String documentSourceName, JCas jCas, Map<String, Collection<ExtractedRecord>> records, Map<String, Object> mustacheContext); /** * Map records to a moustache field name and value. * * In trivial cases the field value may be a String, but in others it could * be a list so the template can iterate the values. * * @param metadataMap * the map of metadata key/value pairs (values can be a String, * or a list of Strings if there are multiple values - see * {@link SingleDocumentConsumerFormat#createMetadataMap(Collection)} * @param records * the records * @return the map of field name to value */ private Map<String, ?> mapFields(Map<String, Collection<ExtractedRecord>> records) { Map<String, Object> context = new HashMap<>(); Map<String, String> flattenedFields = getFlattenedFields(records); context.put("fields", flattenedFields); Map<String, Map<String, String>> flattenedRecords = getFlattenedRecords(records); context.put("records", flattenedRecords); Map<String, Map<String, Map<String, String>>> sourceRecords = getSourceRecords(records); context.put("sources", sourceRecords); return context; } /** * Gets the flattened fields. * * Returns a Map of fieldName to fieldValue. Duplicate fields between * records (and records in sources) are flattened - which record / source * wins is undefined. * * @param records * the records * @return the flattened fields */ private static Map<String, String> getFlattenedFields(Map<String, Collection<ExtractedRecord>> records) { Map<String, String> fieldMap = new HashMap<>(); for (Entry<String, Collection<ExtractedRecord>> entry : records.entrySet()) { Collection<ExtractedRecord> sourceRecords = entry.getValue(); for (ExtractedRecord extractedRecord : sourceRecords) { Collection<ExtractedField> fields = extractedRecord.getFields(); fields.forEach(field -> fieldMap.put(field.getName(), field.getValue())); } } return fieldMap; } /** * Gets the flattened records. * * Returns a Map of recordName to fields, and fields is a map of fieldName * to fieldValue. Duplicate recordNames between sources are flattened - * which source record wins is undefined. * * @param records * the records * @return the flattened records */ private static Map<String, Map<String, String>> getFlattenedRecords( Map<String, Collection<ExtractedRecord>> records) { Map<String, Map<String, String>> recordMap = new HashMap<>(); for (Entry<String, Collection<ExtractedRecord>> entry : records.entrySet()) { Collection<ExtractedRecord> sourceRecords = entry.getValue(); for (ExtractedRecord extractedRecord : sourceRecords) { if (extractedRecord.getKind() == Kind.DEFAULT) { continue; } Collection<ExtractedField> fields = extractedRecord.getFields(); Map<String, String> fieldMap = new HashMap<>(); String name = extractedRecord.getName(); fields.forEach(field -> fieldMap.put(field.getName(), field.getValue())); if (fieldMap.size() > 0) { recordMap.put(name, fieldMap); } } } return recordMap; } /** * Gets the source records, as a nested map of maps of maps. * * Returns a Map of sourceName -> records, where records is a map of * recordName to fields, and fields is a map of fieldName to fieldValue. * * @param records * the records * @return the source records */ private static Map<String, Map<String, Map<String, String>>> getSourceRecords( Map<String, Collection<ExtractedRecord>> records) { Map<String, Map<String, Map<String, String>>> sourceMap = new HashMap<>(); for (Entry<String, Collection<ExtractedRecord>> entry : records.entrySet()) { String sourceName = entry.getKey(); Map<String, Map<String, String>> recordsMap = new HashMap<>(); Collection<ExtractedRecord> sourceRecords = entry.getValue(); for (ExtractedRecord extractedRecord : sourceRecords) { if (extractedRecord.getKind() == Kind.DEFAULT) { continue; } Collection<ExtractedField> fields = extractedRecord.getFields(); Map<String, String> recordFields = new HashMap<>(); recordsMap.put(extractedRecord.getName(), recordFields); fields.forEach(field -> recordFields.put(field.getName(), field.getValue())); } if (recordsMap.size() > 0) { sourceMap.put(sourceName, recordsMap); } } return sourceMap; } /** * Creates an output writer for a new file in the configured output * directory, with appropriate name and ".html" extension. * <p> * Note: this overwrites existing files (warning if it does so). * </p> * * @param documentSourceName * the document source name * @return the writer * @throws IOException * Signals that an I/O exception has occurred. */ protected Writer createOutputWriter(final String documentSourceName, String... parts) throws IOException { Path directoryPath = Paths.get(outputDirectory); if (!directoryPath.toFile().exists()) { Files.createDirectories(directoryPath); } String baseName = FilenameUtils.getBaseName(documentSourceName); String filename = baseName + ((parts != null && parts.length > 0) ? "-" + String.join("-", parts) : "") + ".html"; Path outputFilePath = directoryPath.resolve(filename); if (outputFilePath.toFile().exists()) { getMonitor().warn("Overwriting existing output properties file {}", outputFilePath); } return Files.newBufferedWriter(outputFilePath, StandardCharsets.UTF_8); } }