//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.contentmappers; import java.util.Collections; import org.apache.uima.jcas.JCas; import org.jsoup.nodes.Element; import com.google.common.base.Strings; import com.google.common.primitives.Ints; import uk.gov.dstl.baleen.contentmappers.helpers.AnnotationCollector; import uk.gov.dstl.baleen.contentmappers.helpers.ContentMapper; import uk.gov.dstl.baleen.types.structure.Anchor; import uk.gov.dstl.baleen.types.structure.Aside; import uk.gov.dstl.baleen.types.structure.Break; import uk.gov.dstl.baleen.types.structure.Caption; import uk.gov.dstl.baleen.types.structure.DefinitionDescription; import uk.gov.dstl.baleen.types.structure.DefinitionItem; import uk.gov.dstl.baleen.types.structure.DefinitionList; import uk.gov.dstl.baleen.types.structure.Details; import uk.gov.dstl.baleen.types.structure.Document; import uk.gov.dstl.baleen.types.structure.Figure; import uk.gov.dstl.baleen.types.structure.Footer; import uk.gov.dstl.baleen.types.structure.Header; import uk.gov.dstl.baleen.types.structure.Heading; import uk.gov.dstl.baleen.types.structure.Link; import uk.gov.dstl.baleen.types.structure.ListItem; import uk.gov.dstl.baleen.types.structure.Ordered; import uk.gov.dstl.baleen.types.structure.Page; import uk.gov.dstl.baleen.types.structure.Paragraph; import uk.gov.dstl.baleen.types.structure.Preformatted; import uk.gov.dstl.baleen.types.structure.Quotation; import uk.gov.dstl.baleen.types.structure.Section; import uk.gov.dstl.baleen.types.structure.Sheet; import uk.gov.dstl.baleen.types.structure.Slide; import uk.gov.dstl.baleen.types.structure.SlideShow; import uk.gov.dstl.baleen.types.structure.SpreadSheet; import uk.gov.dstl.baleen.types.structure.Structure; import uk.gov.dstl.baleen.types.structure.Style; import uk.gov.dstl.baleen.types.structure.Summary; import uk.gov.dstl.baleen.types.structure.Table; import uk.gov.dstl.baleen.types.structure.TableBody; import uk.gov.dstl.baleen.types.structure.TableCell; import uk.gov.dstl.baleen.types.structure.TableFooter; import uk.gov.dstl.baleen.types.structure.TableHeader; import uk.gov.dstl.baleen.types.structure.TableRow; import uk.gov.dstl.baleen.types.structure.Unordered; import uk.gov.dstl.baleen.uima.utils.UimaTypesUtils; /** * The default content mapper which convert generic HTML5 elements to Baleen Structural annotations. * * Since the Baleen structural types are modelled around HTML5 elements this mapper is mostly a * simple passthrough, dealing with some special casess (eg HTML a with and without a href has * different meanings). * * Due to the number of HTML tags refer to the source code to see the exact mapping. * * NOTE: Not all formats will produce rich enough HTML to use all the mappings defined here. * */ public class StructuralAnnotations implements ContentMapper { @Override public void map(final JCas jCas, final Element element, final AnnotationCollector collector) { Structure s = null; switch (element.tagName().toLowerCase()) { case "p": s = new Paragraph(jCas); break; // Headings case "h1": s = createHeading(jCas, 1); break; case "h2": s = createHeading(jCas, 2); break; case "h3": s = createHeading(jCas, 3); break; case "h4": s = createHeading(jCas, 4); break; case "h5": s = createHeading(jCas, 5); break; case "h6": s = createHeading(jCas, 6); break; // Lists case "ul": s = new Unordered(jCas); break; case "ol": s = new Ordered(jCas); break; case "li": s = new ListItem(jCas); break; case "dl": s = new DefinitionList(jCas); break; case "dt": s = new DefinitionItem(jCas); break; case "dd": // TODO: It might make sense to refer the dt wihtin the type system (setTerm()) s = new DefinitionDescription(jCas); break; // Table case "table": s = new Table(jCas); break; case "thead": s = new TableHeader(jCas); break; case "tfoot": s = new TableFooter(jCas); break; case "tbody": s = new TableBody(jCas); break; case "tr": final TableRow tr = new TableRow(jCas); tr.setRow(findRowIndexOfRow(element)); s = tr; break; case "th": // fall through case "td": final TableCell td = new TableCell(jCas); td.setColumn(findColIndexOfCell(element)); td.setRow(findRowIndexOfCell(element)); td.setRowSpan(getIntegerAttribute(element, "rowspan", 1)); td.setColumnSpan(getIntegerAttribute(element, "colspan", 1)); s = td; break; // Links and anchors case "a": s = createAnchor(jCas, element); break; // Images case "audio": case "video": case "embed": case "object": case "img": case "map": case "area": case "canvas": case "figure": final Figure figure = new Figure(jCas); if (element.hasAttr("src")) { figure.setTarget(element.attr("src")); } s = figure; break; case "caption": case "figcaption": s = new Caption(jCas); break; // Styling case "ins": // fall through - HTML W3 http://www.w3schools.com/tags/tag_ins.asp says that ins would // normally be underlined case "u": s = createStyle(jCas, "underline"); break; case "i": case "em": s = createStyle(jCas, "italics"); break; case "b": case "strong": s = createStyle(jCas, "bold"); break; case "strike": case "s": case "del": s = createStyle(jCas, "strike"); break; case "sup": s = createStyle(jCas, "superscript"); break; case "sub": s = createStyle(jCas, "subscript"); break; case "small": s = createStyle(jCas, "small"); break; case "big": // Not HTML5 so not likely to be seen s = createStyle(jCas, "big"); break; case "mark": s = createStyle(jCas, "highlighted"); break; // Purely structural case "aside": s = new Aside(jCas); break; case "details": s = new Details(jCas); break; case "summary": s = new Summary(jCas); break; case "section": case "div": // Div means very little nothing... but we wrap it in a section s = new Section(jCas); break; case "span": // Do nothing break; case "main": s = createFromMain(jCas, element); break; case "article": s = createFromArticle(jCas, element); break; case "header": s = new Header(jCas); break; case "footer": s = new Footer(jCas); break; case "kbd": case "samp": case "code": case "pre": s = new Preformatted(jCas); break; case "blockquote": collector.add(new Section(jCas)); // Fall through case "q": s = new Quotation(jCas); break; // Potential semantic types, but left to other mappers (SemanticHtml) to actually annotate case "time": case "meter": case "dfn": case "address": case "abbr": case "cite": return; case "hr": if (element.hasClass("pagebreak") || element.hasClass("sectionbreak")) { s = new Break(jCas); } break; // Misc ignored - head, details of embedded, ui specific, forms case "html": case "head": case "title": case "meta": case "base": case "style": case "script": case "noscript": case "link": case "dialog": case "nav": case "menu": case "menuitem": case "param": case "track": case "source": case "iframe": case "form": case "input": case "textarea": case "button": case "select": case "optgroup": case "option": case "label": case "fieldset": case "legend": case "datalist": case "keygen": case "output": case "ruby": case "rt": case "rp": case "progress": case "bdo": case "bdi": default: break; } if (s != null) { if (element.hasAttr("class")) { s.setElementClass(element.className()); } if (element.hasAttr("id")) { s.setElementId(element.id()); } collector.add(s); } } private int getIntegerAttribute(final Element element, final String key, final int defaultValue) { final String value = element.attr(key); if (Strings.isNullOrEmpty(value)) { return defaultValue; } final Integer i = Ints.tryParse(value); if (i == null) { return defaultValue; } return i; } private int findRowIndexOfCell(final Element element) { for (final Element e : element.parents()) { if (e.tagName().equalsIgnoreCase("tr")) { return findRowIndexOfRow(e); } } return -1; } private int findRowIndexOfRow(final Element e) { // TODO: The best we can do without rowspan type info return e.siblingIndex(); } private int findColIndexOfCell(final Element e) { // TODO: The best we can do without colspan type info return e.siblingIndex(); } private Structure createAnchor(final JCas jCas, final Element element) { String href = element.absUrl("href"); if (Strings.isNullOrEmpty(href)) { href = element.attr("href"); } if (!Strings.isNullOrEmpty(href)) { final Link l = new Link(jCas); l.setTarget(href); return l; } else { return new Anchor(jCas); } } private Page createFromArticle(final JCas jCas, final Element element) { final String clazz = element.attr("class"); switch (clazz.toLowerCase()) { case "sheet": return new Sheet(jCas); case "slide": return new Slide(jCas); case "page": // fall through default: return new Page(jCas); } } /** * Create a Document, selecting sub type if appropriate. * * @param jCas the jCas * @param element the element name * @return the Style */ private Document createFromMain(final JCas jCas, final Element element) { final String clazz = element.attr("class"); switch (clazz.toLowerCase()) { case "spreadsheet": return new SpreadSheet(jCas); case "slideshow": return new SlideShow(jCas); case "document": default: return new Document(jCas); } } /** * Create a Style * * @param jCas the jCas * @param styleName the style name * @return the Style */ private Style createStyle(final JCas jCas, final String styleName) { final Style style = new Style(jCas); style.setDecoration(UimaTypesUtils.toArray(jCas, Collections.singleton(styleName))); return style; } /** * Create a Heading with the given level * * @param jCas the jCas * @param level the level of the heading * @return the Heading */ private Heading createHeading(final JCas jCas, final int level) { final Heading h = new Heading(jCas); h.setLevel(level); return h; } }