//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.consumers;
import java.util.Locale;
import java.util.Set;
import org.apache.uima.UimaContext;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.StringArray;
import org.apache.uima.resource.ResourceInitializationException;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
import com.google.common.base.Strings;
import uk.gov.dstl.baleen.consumers.utils.AbstractHtmlConsumer;
import uk.gov.dstl.baleen.types.structure.Anchor;
import uk.gov.dstl.baleen.types.structure.Aside;
import uk.gov.dstl.baleen.types.structure.Break;
import uk.gov.dstl.baleen.types.structure.Caption;
import uk.gov.dstl.baleen.types.structure.DefinitionDescription;
import uk.gov.dstl.baleen.types.structure.DefinitionItem;
import uk.gov.dstl.baleen.types.structure.DefinitionList;
import uk.gov.dstl.baleen.types.structure.Details;
import uk.gov.dstl.baleen.types.structure.Document;
import uk.gov.dstl.baleen.types.structure.Figure;
import uk.gov.dstl.baleen.types.structure.Footer;
import uk.gov.dstl.baleen.types.structure.Footnote;
import uk.gov.dstl.baleen.types.structure.Header;
import uk.gov.dstl.baleen.types.structure.Heading;
import uk.gov.dstl.baleen.types.structure.Link;
import uk.gov.dstl.baleen.types.structure.ListItem;
import uk.gov.dstl.baleen.types.structure.Ordered;
import uk.gov.dstl.baleen.types.structure.Page;
import uk.gov.dstl.baleen.types.structure.Paragraph;
import uk.gov.dstl.baleen.types.structure.Preformatted;
import uk.gov.dstl.baleen.types.structure.Quotation;
import uk.gov.dstl.baleen.types.structure.Section;
import uk.gov.dstl.baleen.types.structure.Sentence;
import uk.gov.dstl.baleen.types.structure.Sheet;
import uk.gov.dstl.baleen.types.structure.Slide;
import uk.gov.dstl.baleen.types.structure.SlideShow;
import uk.gov.dstl.baleen.types.structure.SpreadSheet;
import uk.gov.dstl.baleen.types.structure.Structure;
import uk.gov.dstl.baleen.types.structure.Style;
import uk.gov.dstl.baleen.types.structure.Summary;
import uk.gov.dstl.baleen.types.structure.Table;
import uk.gov.dstl.baleen.types.structure.TableBody;
import uk.gov.dstl.baleen.types.structure.TableCell;
import uk.gov.dstl.baleen.types.structure.TableFooter;
import uk.gov.dstl.baleen.types.structure.TableHeader;
import uk.gov.dstl.baleen.types.structure.TableRow;
import uk.gov.dstl.baleen.types.structure.TextDocument;
import uk.gov.dstl.baleen.types.structure.Unordered;
import uk.gov.dstl.baleen.uima.utils.StructureHierarchy;
import uk.gov.dstl.baleen.uima.utils.StructureUtil;
import uk.gov.dstl.baleen.uima.utils.select.Node;
/**
* Creates a HTML5 version of the structured annotations of a document.
* <p>
* The tag structure replicates to the extent possible the input from the Format
* Extractor library, used by StructureContentExtractor. That is for example
* 'aside' to 'footnote' to 'aside'. That said the purpose is not to reproduce a
* faithful original, but instead to produce something which visibly and
* structurally looks like high quality HTML representing a document.
* <p>
* This annotator adds class information in the form of
* 'baleen-structure-[type]' to allow the originating Baleen type to be
* identified. Additional data-* attributes (depth, id, begin, end) are output
* only when the outputData parameter is enabled.
* <p>
* Naturally the HTML is best viewed with CSS; an example CSS style sheet:
*
* <pre>
* html {
* background-color: #eee;
* }
*
* body {
* max-width: 1200px;
* margin: 5px auto;
* padding: 25px;
* }
*
* article {
* padding: 25px;
* background-color: #fff;
* border: 1px solid black;
* overflow: auto;
* margin-bottom: 25px;
* }
*
* section, div {
* margin: 5px;
* padding: 25px;
* border: 1px dashed #eee;
* background-color: #fff;
* }
*
* table {
* border-collapse: collapse;
* }
*
* table, th, td {
* border: 1px solid #666;
* }
*
* th {
* font-weight: bold;
* }
*
* h1,h2,h3,h4,h5,h6 {
* margin-left: -20px;
* }
*
* </pre>
*
* This will NOT output entity, relation or other semantic annotations.
*
* @baleen.javadoc
*/
public class StructuralHtml extends AbstractHtmlConsumer {
/**
* Apply styling information in the original document to the output.
*
* Examples include colour, underline, etc. When enabled, Style annotations
* are rendered as span tags carrying an inline CSS style attribute.
*
* Unless your documents encode important information through styles, you
* should use a CSS style sheet and leave this off. Note that the default
* value is true, so this must be disabled explicitly.
*
* @baleen.config true
*/
public static final String PARAM_APPLY_STYLES = "applyStyles";
@ConfigurationParameter(name = PARAM_APPLY_STYLES, defaultValue = "true")
private Boolean applyStyles;
/**
* Outputs data-* attributes on the tags using Baleen information
* (structure depth, id, begin and end).
*
* This increases the overall size of the HTML, but is very useful for
* onward machine processing.
*
* @baleen.config false
*/
public static final String PARAM_OUTPUT_DATA = "outputData";
@ConfigurationParameter(name = PARAM_OUTPUT_DATA, defaultValue = "false")
private Boolean outputData;
/**
* Output empty tags.
*
* Should tags which have no text and no content be output to the HTML.
*
* There is little reason to do this unless debugging the structural
* processing of Baleen, as it unnecessarily complicates the documents.
*
* NOTE: This does not apply to empty table cells (th, td) because they are
* needed to preserve the table structure.
*
* @baleen.config false
*/
public static final String PARAM_OUTPUT_EMPTY_TAGS = "outputEmptyTags";
@ConfigurationParameter(name = PARAM_OUTPUT_EMPTY_TAGS, defaultValue = "false")
private Boolean outputEmptyTags;
/**
* A list of structural types which will be considered when building the
* structure hierarchy that is rendered to HTML.
*
* This parameter is optional.
*
* @baleen.config Paragraph,TableCell,ListItem,Aside, ...
*/
public static final String PARAM_TYPE_NAMES = "types";
/** The configured type names, resolved to classes in doInitialize. */
@ConfigurationParameter(name = PARAM_TYPE_NAMES, mandatory = false)
private String[] typeNames;
/** The structural classes passed to the structure hierarchy builder. */
protected Set<Class<? extends Structure>> structuralClasses;
/**
 * Initialises the parent consumer and then converts the configured type
 * names into the set of structure classes used when rendering.
 *
 * @param aContext
 *            the UIMA context passed through to the parent initialisation
 * @throws ResourceInitializationException
 *             if the parent initialisation fails
 */
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    super.doInitialize(aContext);
    // Resolution is delegated entirely to StructureUtil.
    final Set<Class<? extends Structure>> resolved = StructureUtil.getStructureClasses(typeNames);
    this.structuralClasses = resolved;
}
/**
 * Recursively converts a hierarchy node (and everything beneath it) into
 * HTML and attaches the result to the given parent element.
 * <p>
 * When the node carries a structure annotation with covered text, the text
 * lying between (and around) its children is interleaved with the child
 * elements so the document text is kept in order. Otherwise only the
 * children are rendered.
 *
 * @param parentElement
 *            the element that receives the newly created element
 * @param n
 *            the hierarchy node to render
 */
private void walk(final Element parentElement, final Node<Structure> n) {
    final Structure item = n.getItem();
    // TODO: a new element is always created here; when the node has no
    // structure (and so is just a div) the parent element could be reused,
    // which might clean up the HTML.
    final Element element = createTag(item);
    final String covered = item == null ? null : item.getCoveredText();

    if (covered != null) {
        // All offsets below are relative to the start of this node's text.
        final int base = item.getBegin();
        int cursor = 0;
        for (final Node<Structure> child : n.getChildren()) {
            final Structure childItem = child.getItem();
            // Text between the previous child (or the start) and this child
            appendText(element, covered, cursor, childItem.getBegin() - base);
            walk(element, child);
            cursor = childItem.getEnd() - base;
        }
        // Trailing text after the last child (or all text if no children)
        appendText(element, covered, cursor, item.getEnd() - base);
    } else {
        // Nothing to interleave: just descend into the children
        for (final Node<Structure> child : n.getChildren()) {
            walk(element, child);
        }
    }

    parentElement.appendChild(element);
}
/**
 * Appends the substring {@code text[start, end)} to the element as text.
 * <p>
 * Nothing is appended when the range is empty, inverted, or runs past the
 * end of the text. A negative start is clamped to 0 rather than being
 * passed to {@code substring} (which would throw); the caller computes
 * offsets by subtracting annotation positions, so a child that ends before
 * its parent begins would otherwise yield a negative start.
 *
 * @param e
 *            the element to append to
 * @param text
 *            the text buffer containing the substring
 * @param start
 *            the start offset within the text (inclusive)
 * @param end
 *            the end offset within the text (exclusive)
 * @return true if any text was appended, false otherwise
 */
private boolean appendText(final Element e, final String text, final int start, final int end) {
    final int from = Math.max(0, start);
    if (from < end && end <= text.length()) {
        e.appendText(text.substring(from, end));
        return true;
    }
    return false;
}
/**
 * Create a CSS style string for the style annotation.
 * <p>
 * The colour and font are passed through as-is, and each recognised
 * decoration name (matched case-insensitively) is translated to a CSS
 * declaration. Unrecognised or null decorations are ignored.
 *
 * @param s
 *            the style
 * @return the CSS declarations, or null if the style carries no colour,
 *         font or decorations; may be an empty string if only unrecognised
 *         decorations are present
 */
private String buildCssStyle(final Style s) {
    final String color = s.getColor();
    final StringArray decorations = s.getDecoration();
    final String font = s.getFont();
    final boolean hasDecorations = decorations != null && decorations.size() > 0;

    // If no style info stop
    if (Strings.isNullOrEmpty(color) && Strings.isNullOrEmpty(font) && !hasDecorations) {
        return null;
    }

    final StringBuilder sb = new StringBuilder();

    // This is very naive, it is a passthrough of the original format's values.
    // Effectively we are just hoping the browser knows what to do.
    if (!Strings.isNullOrEmpty(color)) {
        sb.append(String.format("color:%s; ", color));
    }
    if (!Strings.isNullOrEmpty(font)) {
        sb.append(String.format("font-family:\"%s\"; ", font));
    }

    if (hasDecorations) {
        for (final String a : decorations.toArray()) {
            if (a == null) {
                // An unset array slot carries no decoration
                continue;
            }
            // Locale.ROOT keeps the matching independent of the default
            // locale (e.g. Turkish upper-casing of 'i').
            switch (a.toUpperCase(Locale.ROOT)) {
            case "UNDERLINE":
                sb.append("text-decoration:underline; ");
                break;
            case "BOLD":
                sb.append("font-weight:bold; ");
                break;
            case "ITALIC":
            case "ITALICS":
                sb.append("font-style:italic; ");
                break;
            case "STRIKE":
            case "STRIKETHROUGH":
                sb.append("text-decoration: line-through; ");
                break;
            case "SUPERSCRIPT":
                sb.append("font-size: .7em; vertical-align: super; ");
                break;
            case "SUBSCRIPT":
                sb.append("font-size: .7em; vertical-align: sub; ");
                break;
            case "BIG":
                sb.append("font-size: 1.2em; ");
                break;
            case "SMALL":
                sb.append("font-size: .9em; ");
                break;
            case "HIGHLIGHTED":
                sb.append("background-color:#ffffe0; ");
                break;
            default:
                // Do nothing - we don't know what it means
                break;
            }
        }
    }
    return sb.toString();
}
/**
 * Builds a new, detached jsoup element for the given HTML tag name.
 *
 * @param tag
 *            the tag name, e.g. "div"
 * @return a new element with an empty base URI
 */
private Element createElement(final String tag) {
    final Tag htmlTag = Tag.valueOf(tag);
    return new Element(htmlTag, "");
}
/**
 * Creates the HTML element corresponding to a structure annotation.
 * <p>
 * The tag is chosen from the annotation's type, falling back to a div for
 * a null annotation or an unrecognised type. Some types also receive
 * attributes (anchor id, link href, inline style, table cell row / column /
 * span). Every non-null annotation gets a 'baleen-structure-[type]' class,
 * and, when outputData is enabled, data-baleen-* attributes.
 *
 * @param s
 *            the structure annotation, may be null
 * @return the new element (never null)
 */
private Element createTag(final Structure s) {
    Element e;
    if (s == null) {
        e = createElement("div");
    } else if (s instanceof Anchor) {
        e = createElement("a");
        // Only emit an id when there is one, mirroring the href handling
        // for links; an empty id is not valid HTML.
        final String id = s.getExternalId();
        if (!Strings.isNullOrEmpty(id)) {
            e.attr("id", id);
        }
    } else if (s instanceof Caption) {
        e = createElement("figcaption");
    } else if (s instanceof Document || s instanceof SpreadSheet || s instanceof SlideShow
            || s instanceof TextDocument) {
        e = createElement("main");
    } else if (s instanceof Figure) {
        // TODO This is more complex I guess if we really wanted to put in a
        // img / object tag, but we don't have that info.
        e = createElement("figure");
    } else if (s instanceof Footer) {
        e = createElement("footer");
    } else if (s instanceof Footnote) {
        e = createElement("aside");
    } else if (s instanceof Header) {
        e = createElement("header");
    } else if (s instanceof Heading) {
        final Heading h = (Heading) s;
        // HTML only defines h1 to h6, so clamp the level into that range
        final int level = Math.min(6, Math.max(1, h.getLevel()));
        e = createElement("h" + level);
    } else if (s instanceof Link) {
        e = createElement("a");
        final String target = ((Link) s).getTarget();
        if (!Strings.isNullOrEmpty(target)) {
            e.attr("href", target);
        }
    } else if (s instanceof ListItem) {
        e = createElement("li");
    } else if (s instanceof Ordered) {
        e = createElement("ol");
    } else if (s instanceof Unordered) {
        e = createElement("ul");
    } else if (s instanceof DefinitionList) {
        e = createElement("dl");
    } else if (s instanceof DefinitionItem) {
        e = createElement("dt");
    } else if (s instanceof DefinitionDescription) {
        e = createElement("dd");
    } else if (s instanceof Page || s instanceof Slide || s instanceof Sheet) {
        e = createElement("article");
    } else if (s instanceof Paragraph) {
        e = createElement("p");
    } else if (s instanceof Section) {
        e = createElement("section");
    } else if (s instanceof Summary) {
        e = createElement("summary");
    } else if (s instanceof Details) {
        e = createElement("details");
    } else if (s instanceof Aside) {
        e = createElement("aside");
    } else if (s instanceof Preformatted) {
        e = createElement("pre");
    } else if (s instanceof Quotation) {
        e = createElement("q");
    } else if (s instanceof Sentence) {
        e = createElement("span");
    } else if (s instanceof Style) {
        e = createElement("span");
        if (applyStyles) {
            final String cssStyle = buildCssStyle((Style) s);
            if (!Strings.isNullOrEmpty(cssStyle)) {
                e.attr("style", cssStyle);
            }
        }
    } else if (s instanceof Table) {
        e = createElement("table");
    } else if (s instanceof TableBody) {
        e = createElement("tbody");
    } else if (s instanceof TableCell) {
        e = createElement("td");
        final TableCell cell = (TableCell) s;
        addRowOrCol(e, "data-row", cell.getRow());
        addRowOrCol(e, "data-col", cell.getColumn());
        addRowOrColSpan(e, "rowspan", cell.getRowSpan());
        addRowOrColSpan(e, "colspan", cell.getColumnSpan());
    } else if (s instanceof TableHeader) {
        e = createElement("thead");
    } else if (s instanceof TableFooter) {
        e = createElement("tfoot");
    } else if (s instanceof TableRow) {
        e = createElement("tr");
    } else if (s instanceof Break) {
        e = createElement("hr");
    } else {
        e = createElement("div");
    }

    if (s != null) {
        // Locale.ROOT keeps the class name stable whatever the default
        // locale is (e.g. Turkish lower-casing of 'I').
        final String typeName = s.getType().getShortName().toLowerCase(Locale.ROOT);
        e.attr("class", String.format("baleen-structure-%s", typeName));

        // Add generic data attributes
        if (outputData) {
            e.attr("data-baleen-structure-depth", Integer.toString(s.getDepth()));
            e.attr("data-baleen-id", s.getExternalId());
            e.attr("data-baleen-begin", Integer.toString(s.getBegin()));
            e.attr("data-baleen-end", Integer.toString(s.getEnd()));
        }
    }
    return e;
}
/**
 * Sets a span attribute (e.g. rowspan / colspan) on the element, but only
 * when the span is a positive number.
 *
 * @param e
 *            the element to modify
 * @param key
 *            the attribute name
 * @param span
 *            the span value; values of zero or less are ignored
 */
private void addRowOrColSpan(final Element e, final String key, final int span) {
    if (span <= 0) {
        return;
    }
    e.attr(key, String.valueOf(span));
}
/**
 * Sets a row or column position attribute on the element, but only when
 * the value is a positive number.
 *
 * @param e
 *            the element to modify
 * @param key
 *            the attribute name
 * @param v
 *            the row or column value; values of zero or less are ignored
 */
private void addRowOrCol(final Element e, final String key, final int v) {
    if (v <= 0) {
        return;
    }
    e.attr(key, String.valueOf(v));
}
/**
 * Renders the structure hierarchy of the document into the body element
 * and then tidies the resulting HTML: paragraphs directly under lists are
 * wrapped in list items, cells inside a table head become th, empty cells
 * are given filler content and (unless outputEmptyTags is set) empty
 * elements are removed.
 * <p>
 * TODO: In accordance with the HTML spec, captions for a Table should be
 * moved inside the table and captions for a Figure inside the figure.
 *
 * @param jCas
 *            the document being output
 * @param body
 *            the HTML body element to populate
 */
@Override
protected void writeBody(final JCas jCas, final Element body) {
    final Node<Structure> root = StructureHierarchy.build(jCas, structuralClasses).getRoot();
    walk(body, root);

    // Lists must contain li elements, so wrap bare paragraphs
    for (final String bareParagraphs : new String[] { "ul > p", "ol > p" }) {
        body.select(bareParagraphs).wrap("<li></li>");
    }

    // Cells in the table head are header cells
    body.select("thead td").tagName("th");

    // Put filler content into any empty td or th
    body.select("td:empty,th:empty").html(" ");

    if (outputEmptyTags) {
        return;
    }

    // Removing empty elements can leave their parents empty, so repeat
    // until nothing more is selected.
    for (Elements empties = emptyElements(body); !empties.isEmpty(); empties = emptyElements(body)) {
        empties.remove();
    }
}
/**
 * Selects the elements under body which are empty and may be removed.
 * <p>
 * The body itself, and hr, img and a elements are never selected. Table
 * cells (td, th) are also excluded explicitly: they are needed to preserve
 * the table structure, and whether a cell holding only whitespace filler
 * matches ':empty' may depend on the jsoup version in use.
 *
 * @param body
 *            the element to search
 * @return the removable empty elements (possibly none)
 */
private Elements emptyElements(final Element body) {
    return body.select("*:empty")
            .not("body")
            .not("hr")
            .not("img")
            .not("a")
            .not("td")
            .not("th");
}
}