// Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.consumers;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import org.apache.commons.io.FileUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.DocumentAnnotation;
import org.jsoup.Jsoup;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import com.google.common.io.Files;
import uk.gov.dstl.baleen.types.structure.Anchor;
import uk.gov.dstl.baleen.types.structure.Aside;
import uk.gov.dstl.baleen.types.structure.Caption;
import uk.gov.dstl.baleen.types.structure.DefinitionDescription;
import uk.gov.dstl.baleen.types.structure.DefinitionItem;
import uk.gov.dstl.baleen.types.structure.DefinitionList;
import uk.gov.dstl.baleen.types.structure.Details;
import uk.gov.dstl.baleen.types.structure.Document;
import uk.gov.dstl.baleen.types.structure.Figure;
import uk.gov.dstl.baleen.types.structure.Footer;
import uk.gov.dstl.baleen.types.structure.Footnote;
import uk.gov.dstl.baleen.types.structure.Header;
import uk.gov.dstl.baleen.types.structure.Heading;
import uk.gov.dstl.baleen.types.structure.Link;
import uk.gov.dstl.baleen.types.structure.ListItem;
import uk.gov.dstl.baleen.types.structure.Ordered;
import uk.gov.dstl.baleen.types.structure.Page;
import uk.gov.dstl.baleen.types.structure.Paragraph;
import uk.gov.dstl.baleen.types.structure.Preformatted;
import uk.gov.dstl.baleen.types.structure.Quotation;
import uk.gov.dstl.baleen.types.structure.Section;
import uk.gov.dstl.baleen.types.structure.Sentence;
import uk.gov.dstl.baleen.types.structure.Sheet;
import uk.gov.dstl.baleen.types.structure.Slide;
import uk.gov.dstl.baleen.types.structure.SlideShow;
import uk.gov.dstl.baleen.types.structure.SpreadSheet;
import uk.gov.dstl.baleen.types.structure.Structure;
import uk.gov.dstl.baleen.types.structure.Style;
import uk.gov.dstl.baleen.types.structure.Summary;
import uk.gov.dstl.baleen.types.structure.Table;
import uk.gov.dstl.baleen.types.structure.TableBody;
import uk.gov.dstl.baleen.types.structure.TableCell;
import uk.gov.dstl.baleen.types.structure.TableRow;
import uk.gov.dstl.baleen.types.structure.TextDocument;
import uk.gov.dstl.baleen.types.structure.Unordered;
import uk.gov.dstl.baleen.uima.testing.JCasSingleton;
import uk.gov.dstl.baleen.uima.utils.UimaTypesUtils;
public class StructuralHtmlTest {
/**
 * HTML that {@code testDocument} expects the consumer to write. The test strips all whitespace
 * from both this constant and the actual output before comparing, so the spacing inside these
 * literals is for readability only.
 */
private static final String EXPECTED = "<!doctype html>" +
"<html lang=\"x-unspecified\">" +
" <head> " +
" <meta charset=\"utf-8\"> " +
" <meta name=\"document.sourceUri\" content=\"test.txt\"> " +
" <meta name=\"externalId\" content=\"52bfa1307972d3b9158718d6a6abede86a0315b4daaf14ca1e3682310a75705d\"> "
+
" </head> " +
" <body> " +
" <div> " +
" <main class=\"baleen-structure-document\"> " +
" <section class=\"baleen-structure-section\"> " +
" <p class=\"baleen-structure-paragraph\">This is a test document, that contains structure</p>. "
+
" <details class=\"baleen-structure-details\">" +
" This test was written by " +
" <span style=\"font-weight:bold; \" class=\"baleen-structure-style\">Chris</span> " +
" </details>. " +
" <aside class=\"baleen-structure-aside\">" +
" On the on " +
" <span style=\"font-style:italic; \" class=\"baleen-structure-style\">24 December 2016</span> "
+
" </aside>. " +
" </section> " +
" </main>" +
" </div> " +
" </body>" +
"</html>";
/**
 * HTML that {@code testTables} expects for a two-row, three-column table. Compared with all
 * whitespace removed, so the spacing inside these literals is not significant.
 */
private static final String EXPECTED_TABLE = "<!doctype html><html lang=\"x-unspecified\">" +
" <head> " +
" <meta charset=\"utf-8\"> " +
" <meta name=\"document.sourceUri\" content=\"test.txt\"> " +
" <meta name=\"externalId\" content=\"5ced5f586e63306bf2232843be27dc6fbb531732bc73d6e3d521fe72edb894a1\"> "
+
" </head> " +
" <body> " +
" <div> " +
" <table class=\"baleen-structure-table\"> " +
" <tbody class=\"baleen-structure-tablebody\"> " +
" <tr class=\"baleen-structure-tablerow\"> " +
" <td class=\"baleen-structure-tablecell\">A</td> "
+
" <td class=\"baleen-structure-tablecell\">B</td> "
+
" <td class=\"baleen-structure-tablecell\">C</td> "
+
" </tr> " +
" <tr class=\"baleen-structure-tablerow\"> " +
" <td class=\"baleen-structure-tablecell\">1</td> "
+
" <td class=\"baleen-structure-tablecell\">2</td> "
+
" <td class=\"baleen-structure-tablecell\">3</td> "
+
" </tr> " +
" </tbody> " +
" </table> " +
" </div> " +
"</body>" +
"</html>";
/**
 * HTML that {@code testTags} expects: one element per structure type, in the same order as the
 * class array built in that test, each wrapping the simple name of its type. Compared with all
 * whitespace removed, so the spacing inside these literals is not significant.
 */
private static final String EXPECTED_TAGS = "<!doctype html>" +
"<html lang=\"x-unspecified\">" +
" <head> " +
" <meta charset=\"utf-8\"> " +
" <meta name=\"document.sourceUri\" content=\"test.txt\"> " +
" <meta name=\"externalId\" content=\"2664ee3899bb360a9b29a62ff24694258527dc641449329b0f842d77cb21172d\"> "
+
" </head> " +
" <body> " +
" <div> " +
" <a id=\"a83fbdbc9c736caacfd7367fe2afc081caaa7df2982cced9a9751e02645e0175\" class=\"baleen-structure-anchor\">Anchor</a> "
+
" <figcaption class=\"baleen-structure-caption\">" +
" Caption " +
" </figcaption> " +
" <main class=\"baleen-structure-document\">" +
" Document " +
" </main> " +
" <main class=\"baleen-structure-spreadsheet\">" +
" SpreadSheet " +
" </main> " +
" <main class=\"baleen-structure-slideshow\">" +
" SlideShow " +
" </main> " +
" <main class=\"baleen-structure-textdocument\">" +
" TextDocument " +
" </main> " +
" <figure class=\"baleen-structure-figure\">" +
" Figure " +
" </figure> " +
" <footer class=\"baleen-structure-footer\">" +
" Footer " +
" </footer> " +
" <aside class=\"baleen-structure-footnote\">" +
" Footnote " +
" </aside> " +
" <header class=\"baleen-structure-header\">" +
" Header " +
" </header> " +
" <h1 class=\"baleen-structure-heading\">Heading</h1> " +
" <a class=\"baleen-structure-link\">Link</a> " +
" <li class=\"baleen-structure-listitem\">ListItem</li> " +
" <ol class=\"baleen-structure-ordered\">" +
" Ordered " +
" </ol> " +
" <ul class=\"baleen-structure-unordered\">" +
" Unordered " +
" </ul> " +
" <dl class=\"baleen-structure-definitionlist\">" +
" DefinitionList " +
" </dl> " +
" <dt class=\"baleen-structure-definitionitem\">" +
" DefinitionItem " +
" </dt> " +
" <dd class=\"baleen-structure-definitiondescription\">" +
" DefinitionDescription " +
" </dd> " +
" <article class=\"baleen-structure-page\">" +
" Page " +
" </article>" +
" <article class=\"baleen-structure-slide\">" +
" Slide " +
" </article> " +
" <article class=\"baleen-structure-sheet\">" +
" Sheet " +
" </article> " +
" <p class=\"baleen-structure-paragraph\">Paragraph</p> " +
" <section class=\"baleen-structure-section\">" +
" Section " +
" </section> " +
" <summary class=\"baleen-structure-summary\">Summary</summary>" +
" <details class=\"baleen-structure-details\">" +
" Details " +
" </details> " +
" <aside class=\"baleen-structure-aside\">" +
" Aside " +
" </aside> " +
" <pre class=\"baleen-structure-preformatted\">Preformatted</pre> " +
" <q class=\"baleen-structure-quotation\">Quotation</q> " +
" <span class=\"baleen-structure-sentence\">Sentence</span> " +
" <span class=\"baleen-structure-style\">Style</span> " +
" <table class=\"baleen-structure-table\">" +
" Table " +
" </table> " +
" </div> " +
" </body>" +
"</html>";
/**
 * HTML that {@code testOutputData} expects when the consumer is created with
 * {@code StructuralHtml.PARAM_OUTPUT_DATA} set to true: the element carries additional
 * {@code data-baleen-*} attributes. Compared with all whitespace removed, so the spacing inside
 * these literals is not significant.
 */
private static final String EXPECTED_DATA = "<!doctype html>" +
"<html lang=\"x-unspecified\">" +
" <head> " +
" <meta charset=\"utf-8\">" +
" <meta name=\"document.sourceUri\" content=\"test.txt\">" +
" <meta name=\"externalId\" content=\"d029f87e3d80f8fd9b1be67c7426b4cc1ff47b4a9d0a8461c826a59d8c5eb6cd\">"
+
" </head> " +
" <body> " +
" <div> " +
" <main class=\"baleen-structure-document\" data-baleen-structure-depth=\"0\" data-baleen-id=\"012b13a6b7a5a0b69c66d6b017c53c3f70be35ac1263766e6ce16776dee58caa\" data-baleen-begin=\"0\" data-baleen-end=\"7\">"
+
" Example " +
" </main> " +
" </div> " +
" </body>" +
"</html>";
/**
 * HTML that {@code testOutputEmpty} expects when the consumer is created with
 * {@code StructuralHtml.PARAM_OUTPUT_EMPTY_TAGS} set to true and the only annotation is a
 * zero-length paragraph. Compared with all whitespace removed, so the spacing inside these
 * literals is not significant.
 */
private static final String EXPECTED_EMPTY = "<!doctype html>" +
"<html lang=\"x-unspecified\">" +
" <head> " +
" <meta charset=\"utf-8\"> " +
" <meta name=\"document.sourceUri\" content=\"test.txt\"> " +
" <meta name=\"externalId\" content=\"d92797e9f108f7dcf8beb2449d1aa046b037bdbec554960e2314f916118c37f3\"> "
+
" </head> " +
" <body> " +
" <div> " +
" <p class=\"baleen-structure-paragraph\"></p> " +
" </div> " +
" </body>" +
"</html>";
/** Temporary directory passed to the consumer as its output folder; created in beforeTest and deleted in afterTest. */
private File outputFolder;
/** CAS obtained from JCasSingleton in beforeTest and populated by each test. */
private JCas jCas;
/**
 * Prepares the per-test fixtures: a new temporary output directory and the JCas instance
 * supplied by {@link JCasSingleton}.
 *
 * @throws UIMAException if the JCas cannot be obtained
 */
@Before
public void beforeTest() throws UIMAException {
  // The directory is created first so that afterTest always has something to clean up
  // once this point has been reached.
  this.outputFolder = Files.createTempDir();
  this.jCas = JCasSingleton.getJCasInstance();
}
/**
 * Removes the temporary output directory created for the test.
 *
 * @throws IOException if the directory cannot be deleted
 */
@After
public void afterTest() throws IOException {
  // JUnit runs @After methods even when @Before threw, in which case no directory was
  // created; skip the cleanup so the original failure is not buried under a second one.
  if (outputFolder != null) {
    FileUtils.deleteDirectory(outputFolder);
  }
}
/**
 * Processes a document containing nested structure annotations (document, section, paragraph,
 * details, aside and two styles) and checks that the written HTML equals {@code EXPECTED} once
 * whitespace is removed from both.
 *
 * @throws UIMAException if the consumer cannot be created or fails while processing
 * @throws IOException if the output file cannot be read
 */
@Test
public void testDocument() throws UIMAException, IOException {
  final AnalysisEngine consumer =
      AnalysisEngineFactory.createEngine(
          StructuralHtml.class, Html5.PARAM_OUTPUT_FOLDER, outputFolder.getPath());

  final DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  da.setSourceUri("test.txt");

  final String text =
      "This is a test document, that contains structure. This test was written by Chris. On the on 24 December 2016.";
  jCas.setDocumentText(text);

  final Document doc = new Document(jCas);
  doc.setBegin(0);
  doc.setEnd(text.length());
  doc.addToIndexes();

  final Section section = new Section(jCas);
  section.setDepth(1);
  section.setBegin(0);
  section.setEnd(text.length());
  section.addToIndexes();

  // Covers "This is a test document, that contains structure"
  final Paragraph para = new Paragraph(jCas);
  para.setBegin(0);
  para.setEnd(48);
  para.addToIndexes();

  // Covers "This test was written by Chris"
  final Details details = new Details(jCas);
  details.setBegin(50);
  details.setEnd(80);
  details.addToIndexes();

  // Covers "On the on 24 December 2016"
  final Aside aside = new Aside(jCas);
  aside.setBegin(82);
  aside.setEnd(108);
  aside.addToIndexes();

  // Covers "Chris", at the end of the details span
  final Style bold = new Style(jCas);
  bold.setDecoration(UimaTypesUtils.toArray(jCas, Arrays.asList("bold")));
  bold.setBegin(75);
  bold.setEnd(80);
  bold.addToIndexes();

  // Covers "24 December 2016", at the end of the aside span
  final Style italics = new Style(jCas);
  italics.setDecoration(UimaTypesUtils.toArray(jCas, Arrays.asList("italics")));
  italics.setBegin(92);
  italics.setEnd(108);
  italics.addToIndexes();

  consumer.process(jCas);

  final File f = new File(outputFolder, "test.txt.html");
  assertTrue("Expected output file was not written: " + f, f.exists());

  // Strip out all the whitespace so that only the markup and text are compared
  final String actual = Jsoup.parse(f, "UTF-8").html().replaceAll("\\s+", "");
  assertEquals(EXPECTED.replaceAll("\\s+", ""), actual);
}
/**
 * Processes a document annotated as a table with one body, two rows and three cells per row,
 * and checks that the written HTML equals {@code EXPECTED_TABLE} once whitespace is removed
 * from both.
 *
 * @throws Exception if the consumer cannot be created, fails while processing, or the output
 *     file cannot be read
 */
@Test
public void testTables() throws Exception {
  final AnalysisEngine consumer =
      AnalysisEngineFactory.createEngine(
          StructuralHtml.class, Html5.PARAM_OUTPUT_FOLDER, outputFolder.getPath());

  final DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  da.setSourceUri("test.txt");

  // Two rows of three single-character cells; each row is five characters plus a newline
  jCas.setDocumentText("A B C\n1 2 3\n");

  final Table table = new Table(jCas);
  table.setBegin(0);
  table.setEnd(11);
  table.addToIndexes();

  final TableBody tbody = new TableBody(jCas);
  tbody.setDepth(1);
  tbody.setBegin(0);
  tbody.setEnd(11);
  tbody.addToIndexes();

  final TableRow first = new TableRow(jCas);
  first.setBegin(0);
  first.setEnd(5);
  first.addToIndexes();

  final TableRow second = new TableRow(jCas);
  second.setBegin(6);
  second.setEnd(11);
  second.addToIndexes();

  // First row: A, B, C
  addTableCell(0, 1);
  addTableCell(2, 3);
  addTableCell(4, 5);
  // Second row: 1, 2, 3
  addTableCell(6, 7);
  addTableCell(8, 9);
  addTableCell(10, 11);

  consumer.process(jCas);

  final File f = new File(outputFolder, "test.txt.html");
  assertTrue("Expected output file was not written: " + f, f.exists());

  // Strip out all the whitespace so that only the markup and text are compared
  final String actual = Jsoup.parse(f, "UTF-8").html().replaceAll("\\s+", "");
  assertEquals(EXPECTED_TABLE.replaceAll("\\s+", ""), actual);
}

/** Adds a TableCell annotation covering [begin, end) to the indexes of the test JCas. */
private void addTableCell(final int begin, final int end) {
  final TableCell cell = new TableCell(jCas);
  cell.setBegin(begin);
  cell.setEnd(end);
  cell.addToIndexes();
}
/**
 * Creates one annotation of each listed structure type, each covering its own simple class
 * name in the document text, and checks that the written HTML equals {@code EXPECTED_TAGS}
 * once whitespace is removed from both.
 *
 * @throws Exception if an annotation cannot be instantiated reflectively, the consumer cannot
 *     be created or fails while processing, or the output file cannot be read
 */
@Test
public void testTags() throws Exception {
  final AnalysisEngine consumer =
      AnalysisEngineFactory.createEngine(
          StructuralHtml.class, Html5.PARAM_OUTPUT_FOLDER, outputFolder.getPath());

  final DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  da.setSourceUri("test.txt");

  // The document generated here is nonsense as a real document, but it lets us check that
  // every structure type is mapped to the expected tag.
  // NOTE: This doesn't include table sub elements as they need valid nesting
  final Class<?>[] classes =
      new Class<?>[] {
        Anchor.class, Caption.class, Document.class, SpreadSheet.class, SlideShow.class,
        TextDocument.class, Figure.class, Footer.class, Footnote.class, Header.class,
        Heading.class, Link.class, ListItem.class, Ordered.class, Unordered.class,
        DefinitionList.class, DefinitionItem.class, DefinitionDescription.class, Page.class,
        Slide.class, Sheet.class, Paragraph.class, Section.class, Summary.class, Details.class,
        Aside.class, Preformatted.class, Quotation.class, Sentence.class, Style.class,
        Table.class
      };

  final StringBuilder sb = new StringBuilder();
  for (final Class<?> type : classes) {
    // The annotation spans exactly the class name; the trailing space separates entries
    final int begin = sb.length();
    sb.append(type.getSimpleName());
    final int end = sb.length();
    sb.append(" ");

    final Structure annotation = (Structure) type.getConstructor(JCas.class).newInstance(jCas);
    annotation.setBegin(begin);
    annotation.setEnd(end);
    annotation.addToIndexes();
  }
  jCas.setDocumentText(sb.toString());

  consumer.process(jCas);

  final File f = new File(outputFolder, "test.txt.html");
  assertTrue("Expected output file was not written: " + f, f.exists());

  // Strip out all the whitespace so that only the markup and text are compared
  final String actual = Jsoup.parse(f, "UTF-8").html().replaceAll("\\s+", "");
  assertEquals(EXPECTED_TAGS.replaceAll("\\s+", ""), actual);
}
/**
 * Creates the consumer with {@code StructuralHtml.PARAM_OUTPUT_DATA} set to true, processes a
 * document with a single Document annotation, and checks that the written HTML equals
 * {@code EXPECTED_DATA} once whitespace is removed from both.
 *
 * @throws Exception if the consumer cannot be created, fails while processing, or the output
 *     file cannot be read
 */
@Test
public void testOutputData() throws Exception {
  final AnalysisEngine consumer =
      AnalysisEngineFactory.createEngine(
          StructuralHtml.class,
          Html5.PARAM_OUTPUT_FOLDER,
          outputFolder.getPath(),
          StructuralHtml.PARAM_OUTPUT_DATA,
          true);

  final DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  da.setSourceUri("test.txt");

  final String text = "Example";
  jCas.setDocumentText(text);

  // A single annotation spanning the whole text
  final Document d = new Document(jCas);
  d.setBegin(0);
  d.setEnd(text.length());
  d.addToIndexes();

  consumer.process(jCas);

  final File f = new File(outputFolder, "test.txt.html");
  assertTrue("Expected output file was not written: " + f, f.exists());

  // Strip out all the whitespace so that only the markup and text are compared
  final String actual = Jsoup.parse(f, "UTF-8").html().replaceAll("\\s+", "");
  assertEquals(EXPECTED_DATA.replaceAll("\\s+", ""), actual);
}
/**
 * Creates the consumer with {@code StructuralHtml.PARAM_OUTPUT_EMPTY_TAGS} set to true,
 * processes a document whose only annotation is a zero-length paragraph, and checks that the
 * written HTML equals {@code EXPECTED_EMPTY} once whitespace is removed from both.
 *
 * @throws Exception if the consumer cannot be created, fails while processing, or the output
 *     file cannot be read
 */
@Test
public void testOutputEmpty() throws Exception {
  final AnalysisEngine consumer =
      AnalysisEngineFactory.createEngine(
          StructuralHtml.class,
          Html5.PARAM_OUTPUT_FOLDER,
          outputFolder.getPath(),
          StructuralHtml.PARAM_OUTPUT_EMPTY_TAGS,
          true);

  final DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
  da.setSourceUri("test.txt");

  jCas.setDocumentText("Example document: ''");

  // Zero-length annotation at offset 19, i.e. between the two quote characters
  final Paragraph d = new Paragraph(jCas);
  d.setBegin(19);
  d.setEnd(19);
  d.addToIndexes();

  consumer.process(jCas);

  final File f = new File(outputFolder, "test.txt.html");
  assertTrue("Expected output file was not written: " + f, f.exists());

  // Strip out all the whitespace so that only the markup and text are compared
  final String actual = Jsoup.parse(f, "UTF-8").html().replaceAll("\\s+", "");
  assertEquals(EXPECTED_EMPTY.replaceAll("\\s+", ""), actual);
}
}