//Dstl (c) Crown Copyright 2017 // Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.consumers.utils; import java.io.File; import java.io.IOException; import org.apache.commons.io.FileUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.DocumentAnnotation; import org.apache.uima.resource.ResourceInitializationException; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import com.google.common.base.Strings; import uk.gov.dstl.baleen.consumers.Html5; import uk.gov.dstl.baleen.exceptions.BaleenException; import uk.gov.dstl.baleen.types.metadata.Metadata; import uk.gov.dstl.baleen.uima.BaleenConsumer; import uk.gov.dstl.baleen.uima.utils.UimaTypesUtils; /** * Creates HTML5 versions of the document, with entities annotated as spans. The original formatting * of the document is lost, and only the content is kept. * * Relationships are not currently supported. * * This is largely based off the original {@link Html5} consumer. * * @baleen.javadoc */ public abstract class AbstractHtmlConsumer extends BaleenConsumer { /** * The folder to output files to * * @baleen.config <i>Current directory</i> */ public static final String PARAM_OUTPUT_FOLDER = "outputFolder"; /** * Should the external ID be used for the file name? This option is useful if you have lots of * files with duplicate names, or you are reading from a source that isn't file system based (e.g. * a database). * * The external ID will be used by default if no Source URI is available, or it is badly formed. * * @baleen.config false */ public static final String PARAM_USE_EXTERNAL_ID = "useExternalId"; /** * Should a hash of the content be used to generate the ID? If false, then a hash of the Source * URI is used instead. * * @baleen.config true */ public static final String PARAM_CONTENT_HASH_AS_ID = "contentHashAsId"; /** * Set the CSS file for the output to reference. The string, if provided, will be added as a <link * ...> element in the document. * * @baleen.config */ public static final String PARAM_CSS = "css"; private static final String FILE_EXTENSION = ".html"; @ConfigurationParameter(name = PARAM_OUTPUT_FOLDER, defaultValue = "") private String outputFolderString; private File outputFolder; @ConfigurationParameter(name = PARAM_USE_EXTERNAL_ID, defaultValue = "false") private Boolean useExternalId; @ConfigurationParameter(name = PARAM_CONTENT_HASH_AS_ID, defaultValue = "true") private Boolean contentHashAsId = true; @ConfigurationParameter(name = PARAM_CSS, defaultValue = "") private String css; /* * (non-Javadoc) * * @see uk.gov.dstl.baleen.uima.BaleenAnnotator#doInitialize(org.apache.uima.UimaContext) */ @Override public void doInitialize(final UimaContext aContext) throws ResourceInitializationException { if (Strings.isNullOrEmpty(outputFolderString)) { outputFolderString = System.getProperty("user.dir"); } outputFolder = new File(outputFolderString); if (!outputFolder.exists()) { final Boolean ret = outputFolder.mkdirs(); if (!ret) { throw new ResourceInitializationException( new BaleenException("Unable to create output folder")); } } if (!outputFolder.isDirectory() || !outputFolder.canWrite()) { throw new ResourceInitializationException(new BaleenException("Unable to write to folder")); } } /** * Append meta tag to an element (typically a head). * * @param el the el * @param name the meta name * @param content the meta content * @return the meta element */ private Element appendMeta(final Element el, final String name, final String content) { if (Strings.isNullOrEmpty(name) || Strings.isNullOrEmpty(content)) { return null; } final Element meta = el.appendElement("meta"); meta.attr("name", name); meta.attr("content", content); return meta; } /** * Gets the file name for the jCas (from either contenthash or the original source). * * @param jCas the j cas * @return the file name */ private File getFileName(final JCas jCas) { File f = null; final DocumentAnnotation da = getDocumentAnnotation(jCas); final String source = da.getSourceUri(); if (useExternalId || Strings.isNullOrEmpty(source)) { final String id = ConsumerUtils.getExternalId(da, contentHashAsId); f = new File(outputFolder, id + FILE_EXTENSION); } else { try { final String name = source.substring(source.lastIndexOf(File.separator) + 1); f = new File(outputFolder, name + FILE_EXTENSION); int append = 0; while (f.exists()) { append++; f = new File(outputFolder, name + "." + append + FILE_EXTENSION); } if (append != 0) { getMonitor().info( "File with the same name already exists in {} - source file will be saved as {}", outputFolder.getName(), f.getName()); } } catch (final Exception e) { getMonitor().warn( "An error occurred trying to use the source URI {} as a file name - the external ID will be used instead", source, e); final String id = ConsumerUtils.getExternalId(da, contentHashAsId); f = new File(outputFolder, id + FILE_EXTENSION); } } return f; } /* * (non-Javadoc) * * @see uk.gov.dstl.baleen.uima.BaleenAnnotator#doProcess(org.apache.uima.jcas.JCas) */ @Override protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException { final File f = getFileName(jCas); final DocumentAnnotation da = getDocumentAnnotation(jCas); final Document doc = Jsoup.parse("<!DOCTYPE html>\n<html lang=\"" + da.getLanguage() + "\"></html>"); doc.outputSettings(new Document.OutputSettings().prettyPrint(false)); final Element head = doc.head(); if (!Strings.isNullOrEmpty(css)) { final Element cssLink = head.appendElement("link"); cssLink.attr("rel", "stylesheet"); cssLink.attr("href", css); } final Element charset = head.appendElement("meta"); charset.attr("charset", "utf-8"); appendMeta(head, "document.type", da.getDocType()); appendMeta(head, "document.sourceUri", da.getSourceUri()); appendMeta(head, "externalId", da.getHash()); appendMeta(head, "document.classification", da.getDocumentClassification()); appendMeta(head, "document.caveats", String.join(",", UimaTypesUtils.toArray(da.getDocumentCaveats()))); appendMeta(head, "document.releasability", String.join(",", UimaTypesUtils.toArray(da.getDocumentReleasability()))); String title = null; for (final Metadata md : JCasUtil.select(jCas, Metadata.class)) { appendMeta(head, md.getKey(), md.getValue()); if ("documentTitle".equalsIgnoreCase(md.getKey())) { title = md.getValue(); } } if (!Strings.isNullOrEmpty(title)) { doc.title(title); } final Element body = doc.body(); writeBody(jCas, body); try { FileUtils.writeStringToFile(f, doc.html()); } catch (final IOException e) { throw new AnalysisEngineProcessException(e); } } /** * Called to actually write into the body element, from the jCas. * * @param jCas the jcas * @param body the body */ protected abstract void writeBody(JCas jCas, Element body); }