//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.consumers;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.commons.io.FileUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.DocumentAnnotation;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import com.google.common.io.Files;
import uk.gov.dstl.baleen.types.common.Buzzword;
import uk.gov.dstl.baleen.types.common.Organisation;
import uk.gov.dstl.baleen.types.common.Person;
import uk.gov.dstl.baleen.types.common.Quantity;
import uk.gov.dstl.baleen.types.metadata.Metadata;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.Temporal;
import uk.gov.dstl.baleen.uima.testing.JCasSingleton;
import uk.gov.dstl.baleen.uima.utils.TypeSystemSingleton;
import uk.gov.dstl.baleen.uima.utils.UimaTypesUtils;
/**
 * Tests for the {@link Html5} consumer.
 *
 * <p>Each test runs the consumer against the shared test {@link JCas}, writing into a fresh
 * temporary folder, and then inspects the file names and (where relevant) the HTML that was
 * produced. The temporary folder is deleted and the analysis engine is destroyed after every
 * test.
 */
public class Html5Test {
    private static final String HELLO_TEXT = "Hello World!";
    private static final String HELLO_URI = "hello.txt";

    /** Temporary folder the consumer writes into; recreated for every test. */
    private File outputFolder;

    /** Shared JCas obtained from {@link JCasSingleton} before every test. */
    private JCas jCas;

    /** Engine created by the current test (if any), kept so it can be destroyed afterwards. */
    private AnalysisEngine consumer;

    /**
     * Creates a fresh output folder and fetches the shared JCas.
     *
     * @throws UIMAException if the JCas cannot be obtained
     */
    @Before
    public void beforeTest() throws UIMAException {
        outputFolder = Files.createTempDir();
        jCas = JCasSingleton.getJCasInstance();
    }

    /**
     * Destroys the engine created by the test and removes the temporary output folder.
     *
     * @throws IOException if the output folder cannot be deleted
     */
    @After
    public void afterTest() throws IOException {
        try {
            if (consumer != null) {
                consumer.destroy();
                consumer = null;
            }
        } finally {
            // Always clean up the folder, even if destroying the engine fails.
            FileUtils.deleteDirectory(outputFolder);
        }
    }

    /** A document with a source URI is written to {@code <sourceUri>.html}. */
    @Test
    public void testCreateFile() throws UIMAException {
        createConsumer(outputFolder);

        jCas.setDocumentText(HELLO_TEXT);
        setSourceUri(HELLO_URI);
        consumer.process(jCas);

        assertTrue(new File(outputFolder, "hello.txt.html").exists());
    }

    /** If the target file already exists, the output goes to {@code <sourceUri>.1.html}. */
    @Test
    public void testCreateExistingFile() throws UIMAException, IOException {
        createConsumer(outputFolder);

        jCas.setDocumentText(HELLO_TEXT);
        setSourceUri(HELLO_URI);

        // Occupy the default file name so the consumer has to pick an alternative one.
        File existing = new File(outputFolder, "hello.txt.html");
        assertTrue(existing.createNewFile());

        consumer.process(jCas);

        assertTrue(new File(outputFolder, "hello.txt.1.html").exists());
    }

    /**
     * With {@link Html5#PARAM_USE_EXTERNAL_ID} enabled and {@link Html5#PARAM_CONTENT_HASH_AS_ID}
     * disabled, the output file is named after the expected identifier rather than the source
     * URI.
     */
    @Test
    public void testCreateExternalIdFile() throws UIMAException {
        createConsumer(
                outputFolder,
                Html5.PARAM_USE_EXTERNAL_ID, true,
                Html5.PARAM_CONTENT_HASH_AS_ID, false);

        jCas.setDocumentText(HELLO_TEXT);
        setSourceUri(HELLO_URI);
        consumer.process(jCas);

        File expected = new File(
                outputFolder,
                "734cad14909bedfafb5b273b6b0eb01fbfa639587d217f78ce9639bba41f4415.html");
        assertTrue(expected.exists());
    }

    /** A configured output folder that does not exist yet still ends up containing the output. */
    @Test
    public void testCreateOutputDir() throws UIMAException {
        File newFolder = new File(outputFolder, "test");
        createConsumer(newFolder);

        jCas.setDocumentText(HELLO_TEXT);
        setSourceUri(HELLO_URI);
        consumer.process(jCas);

        assertTrue(new File(newFolder, "hello.txt.html").exists());
    }

    /**
     * The stylesheet configured through {@link Html5#PARAM_CSS} is emitted as a single
     * {@code <link rel="stylesheet">} element. No source URI is set, so the file is expected to
     * be named after the document hash.
     */
    @Test
    public void testCSS() throws UIMAException, IOException {
        createConsumer(outputFolder, Html5.PARAM_CSS, "test.css");

        jCas.setDocumentText("This is a test document.");
        consumer.process(jCas);

        DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
        File output = new File(outputFolder, da.getHash() + ".html");
        assertTrue(output.exists());

        Elements links = Jsoup.parse(output, "UTF-8").select("link");
        assertEquals(1, links.size());

        Element link = links.get(0);
        assertEquals("stylesheet", link.attr("rel"));
        assertEquals("test.css", link.attr("href"));
    }

    /** An entity spanning a blank line is written as one span with the line breaks preserved. */
    @Test
    public void testLineBreak() throws UIMAException, IOException {
        createConsumer(outputFolder);

        setSourceUri("multiline.txt");
        jCas.setDocumentText("His name was James\n\nBond.");

        // Offsets 13-24 cover "James\n\nBond".
        new Person(jCas, 13, 24).addToIndexes();

        consumer.process(jCas);

        File output = new File(outputFolder, "multiline.txt.html");
        assertTrue(output.exists());

        String html = Files.toString(output, StandardCharsets.UTF_8);
        assertTrue(html.contains("data-referent=\"\">James\n\nBond</span>"));
    }

    /**
     * End-to-end check of a document carrying classification, caveats, metadata and a mix of
     * nested and overlapping entities: verifies the title, the {@code <meta>} elements and the
     * entity spans in the generated HTML.
     */
    @Test
    public void testDocument() throws UIMAException, IOException {
        createConsumer(outputFolder);

        jCas.setDocumentText(
                "This is a test document, that contains a number of test entities.\n\nOn 30th June, this test was written by James.\nJames wrote this test on 30 June 2015.\n\n3kg of Sugar, 2kg of Spice");

        DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
        da.setSourceUri("test.txt");
        da.setDocumentClassification("UK OFFICIAL");
        da.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList("Test", "Caveats")));

        addMetadata("author", "bakerj");
        addMetadata("test-key", "test value");
        addMetadata("documentTitle", "Test Document");

        new Person(jCas, 106, 111).addToIndexes();
        new Person(jCas, 113, 118).addToIndexes();

        new Temporal(jCas, 70, 79).addToIndexes();
        new Temporal(jCas, 138, 145).addToIndexes();
        new Temporal(jCas, 138, 150).addToIndexes();

        new Quantity(jCas, 153, 156).addToIndexes();
        new Quantity(jCas, 167, 170).addToIndexes();

        new Buzzword(jCas, 160, 165).addToIndexes();
        new Buzzword(jCas, 174, 179).addToIndexes();

        new Entity(jCas, 153, 165).addToIndexes();
        new Entity(jCas, 167, 179).addToIndexes();

        // Abusing the Organisation type just so the outermost span can be told apart in the
        // assertions below.
        new Organisation(jCas, 153, 179).addToIndexes();

        consumer.process(jCas);

        File output = new File(outputFolder, "test.txt.html");
        assertTrue(output.exists());

        Document doc = Jsoup.parse(output, "UTF-8");
        assertEquals("Test Document", doc.title());

        // charset, source URI, external ID, classification, caveats + 3 user-defined metadata
        Elements metas = doc.select("meta");
        assertEquals(8, metas.size());

        assertEquals("utf-8", metas.get(0).attr("charset"));
        assertMeta(metas.get(1), "document.sourceUri", "test.txt");

        // The identifier value itself is not pinned; the check is made against the externalId
        // element (not the source URI element). Note that jsoup's attr() yields an empty string
        // for a missing attribute, so this only guards against a null result.
        Element externalId = metas.get(2);
        assertEquals("externalId", externalId.attr("name"));
        assertNotNull(externalId.attr("content"));

        assertMeta(metas.get(3), "document.classification", "UK OFFICIAL");
        assertMeta(metas.get(4), "document.caveats", "Test,Caveats");

        // User-defined metadata is expected in this order, not in the order it was indexed.
        assertMeta(metas.get(5), "author", "bakerj");
        assertMeta(metas.get(7), "test-key", "test value");
        assertMeta(metas.get(6), "documentTitle", "Test Document");

        assertEquals(12, doc.select("span").size());

        Elements people = doc.select(".Person");
        assertEquals(2, people.size());
        assertSpan(people.get(0), "James");
        assertSpan(people.get(1), "James");

        // "30 June 2015" encloses "30 June", so the longer span is expected first.
        Elements temporals = doc.select(".Temporal");
        assertEquals(3, temporals.size());
        assertSpan(temporals.get(0), "30th June");
        assertSpan(temporals.get(2), "30 June");
        assertSpan(temporals.get(1), "30 June 2015");

        Elements quantities = doc.select(".Quantity");
        assertEquals(2, quantities.size());
        assertSpan(quantities.get(0), "3kg");
        assertSpan(quantities.get(1), "2kg");

        Elements buzzwords = doc.select(".Buzzword");
        assertEquals(2, buzzwords.size());
        assertSpan(buzzwords.get(0), "Sugar");
        assertSpan(buzzwords.get(1), "Spice");

        Elements entities = doc.select(".Entity");
        assertEquals(2, entities.size());
        assertSpan(entities.get(0), "3kg of Sugar");
        assertSpan(entities.get(1), "2kg of Spice");

        Elements organisations = doc.select(".Organisation");
        assertEquals(1, organisations.size());
        assertSpan(organisations.get(0), "3kg of Sugar, 2kg of Spice");
    }

    /**
     * Creates the {@link Html5} engine under test and remembers it so that {@link #afterTest()}
     * can destroy it.
     *
     * @param folder folder passed as {@link Html5#PARAM_OUTPUT_FOLDER}
     * @param extraParams further configuration as alternating name/value pairs
     * @return the created engine (also stored in {@link #consumer})
     * @throws UIMAException if the engine cannot be created
     */
    private AnalysisEngine createConsumer(File folder, Object... extraParams) throws UIMAException {
        Object[] params = new Object[extraParams.length + 2];
        params[0] = Html5.PARAM_OUTPUT_FOLDER;
        params[1] = folder.getPath();
        System.arraycopy(extraParams, 0, params, 2, extraParams.length);

        consumer = AnalysisEngineFactory.createEngine(
                Html5.class, TypeSystemSingleton.getTypeSystemDescriptionInstance(), params);
        return consumer;
    }

    /** Sets the source URI on the document annotation of the shared JCas. */
    private void setSourceUri(String sourceUri) {
        DocumentAnnotation da = (DocumentAnnotation) jCas.getDocumentAnnotationFs();
        da.setSourceUri(sourceUri);
    }

    /** Adds a {@link Metadata} annotation with the given key and value to the JCas indexes. */
    private void addMetadata(String key, String value) {
        Metadata metadata = new Metadata(jCas);
        metadata.setKey(key);
        metadata.setValue(value);
        metadata.addToIndexes();
    }

    /** Asserts the {@code name} and {@code content} attributes of a {@code <meta>} element. */
    private static void assertMeta(Element meta, String expectedName, String expectedContent) {
        assertEquals(expectedName, meta.attr("name"));
        assertEquals(expectedContent, meta.attr("content"));
    }

    /** Asserts the text of an entity span and that its {@code id} attribute can be read. */
    private static void assertSpan(Element span, String expectedText) {
        assertEquals(expectedText, span.text());
        assertNotNull(span.attr("id"));
    }
}