//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.templates;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.resource.ResourceInitializationException;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import com.google.common.collect.ImmutableList;
import uk.gov.dstl.baleen.annotators.templates.TemplateRecordConfiguration.Kind;
import uk.gov.dstl.baleen.annotators.testing.AbstractAnnotatorTest;
import uk.gov.dstl.baleen.types.structure.Document;
import uk.gov.dstl.baleen.types.structure.Heading;
import uk.gov.dstl.baleen.types.structure.Link;
import uk.gov.dstl.baleen.types.structure.Paragraph;
import uk.gov.dstl.baleen.types.structure.Quotation;
import uk.gov.dstl.baleen.types.structure.Section;
import uk.gov.dstl.baleen.types.structure.Table;
import uk.gov.dstl.baleen.types.structure.TableBody;
import uk.gov.dstl.baleen.types.structure.TableCell;
import uk.gov.dstl.baleen.types.structure.TableRow;
import uk.gov.dstl.baleen.types.templates.TemplateRecord;
import uk.gov.dstl.baleen.types.templates.TemplateField;
public class RepeatingRecordsAnnotatorTest extends AbstractAnnotatorTest {
protected static final ObjectMapper YAMLMAPPER = new ObjectMapper(new YAMLFactory());
private static final String QUOTE1 = "quote";
private static final String QUOTE2 = "better quote";
private static final String LINK = "link";
private static final String QUOTE3 = "best quote";
private static final String R1C1 = "r1c1";
private static final String R1C2 = "r1c2";
private static final String R2C1 = "r2c1";
private static final String R2C2 = "r2c2";
private static final String ROW1 = R1C1 + " " + R1C2;
private static final String ROW2 = R2C1 + " " + R2C2;
private static final String HEADING1 = "h1";
private static final String PARA1 = "This is para 1";
private static final String HEADING2 = "h2";
private static final String PARA2 = "This is para 2";
private static final String HEADING3 = "h3";
private static final String PARA3 = "This is para 3";
private static final String TEXT = String.join("\n", QUOTE1, QUOTE2, LINK, QUOTE1, QUOTE2, LINK, LINK, QUOTE3, ROW1,
ROW2, HEADING1, PARA1, HEADING2, PARA2, HEADING3, PARA3);
protected Path tempDirectory;
private Quotation quotation1;
private Quotation quotation2;
private Link link1;
private Quotation quotation3;
private Quotation quotation4;
private Link link2;
private Link link3;
private Quotation quotation5;
private Table table;
private TableBody tableBody;
private TableRow tableRow1;
private TableCell tableCell11;
private TableCell tableCell12;
private TableRow tableRow2;
private TableCell tableCell21;
private TableCell tableCell22;
private Section section;
private Heading heading1;
private Paragraph paragraph1;
private Heading heading2;
private Paragraph paragraph2;
private Heading heading3;
private Paragraph paragraph3;
public RepeatingRecordsAnnotatorTest() {
super(TemplateAnnotator.class);
}
@Before
public void setup() throws IOException {
tempDirectory = Files.createTempDirectory(getClass().getSimpleName());
jCas.setDocumentText(TEXT);
addAnnotations();
}
protected void addAnnotations() {
int cursor = 0;
int depth = 0;
Document document = new Document(jCas);
document.setBegin(cursor);
document.setDepth(depth);
document.setEnd(TEXT.length());
document.addToIndexes();
quotation1 = new Quotation(jCas);
quotation1.setBegin(cursor);
quotation1.setDepth(++depth);
cursor += QUOTE1.length();
quotation1.setEnd(cursor);
quotation1.addToIndexes();
quotation2 = new Quotation(jCas);
quotation2.setBegin(++cursor);
quotation2.setDepth(depth);
cursor += QUOTE2.length();
quotation2.setEnd(cursor);
quotation2.addToIndexes();
link1 = new Link(jCas);
link1.setBegin(++cursor);
link1.setDepth(depth);
cursor += LINK.length();
link1.setEnd(cursor);
link1.addToIndexes();
quotation3 = new Quotation(jCas);
quotation3.setBegin(++cursor);
quotation3.setDepth(++depth);
cursor += QUOTE1.length();
quotation3.setEnd(cursor);
quotation3.addToIndexes();
quotation4 = new Quotation(jCas);
quotation4.setBegin(++cursor);
quotation4.setDepth(depth);
cursor += QUOTE2.length();
quotation4.setEnd(cursor);
quotation4.addToIndexes();
link2 = new Link(jCas);
link2.setBegin(++cursor);
link2.setDepth(depth);
cursor += LINK.length();
link2.setEnd(cursor);
link2.addToIndexes();
link3 = new Link(jCas);
link3.setBegin(++cursor);
link3.setDepth(depth);
cursor += LINK.length();
link3.setEnd(cursor);
link3.addToIndexes();
quotation5 = new Quotation(jCas);
quotation5.setBegin(++cursor);
quotation5.setDepth(++depth);
cursor += QUOTE3.length();
quotation5.setEnd(cursor);
quotation5.addToIndexes();
table = new Table(jCas);
table.setBegin(++cursor);
table.setDepth(depth);
tableBody = new TableBody(jCas);
tableBody.setBegin(cursor);
tableBody.setDepth(++depth);
tableRow1 = new TableRow(jCas);
tableRow1.setBegin(cursor);
tableRow1.setDepth(++depth);
tableCell11 = new TableCell(jCas);
tableCell11.setBegin(cursor);
tableCell11.setDepth(++depth);
cursor += R1C1.length();
tableCell11.setEnd(cursor);
tableCell11.addToIndexes();
tableCell12 = new TableCell(jCas);
tableCell12.setBegin(++cursor);
tableCell12.setDepth(depth);
cursor += R1C2.length();
tableCell12.setEnd(cursor);
tableCell12.addToIndexes();
tableRow1.setEnd(cursor);
tableRow1.addToIndexes();
tableRow2 = new TableRow(jCas);
tableRow2.setBegin(++cursor);
tableRow2.setDepth(--depth);
tableCell21 = new TableCell(jCas);
tableCell21.setBegin(cursor);
tableCell21.setDepth(++depth);
cursor += R2C1.length();
tableCell21.setEnd(cursor);
tableCell21.addToIndexes();
tableCell22 = new TableCell(jCas);
tableCell22.setBegin(++cursor);
tableCell22.setDepth(depth);
cursor += R2C2.length();
tableCell22.setEnd(cursor);
tableCell22.addToIndexes();
tableRow2.setEnd(cursor);
tableRow2.addToIndexes();
tableBody.setEnd(cursor);
tableBody.addToIndexes();
--depth;
table.setEnd(cursor);
table.addToIndexes();
--depth;
section = new Section(jCas);
section.setBegin(++cursor);
section.setDepth(depth);
heading1 = new Heading(jCas);
heading1.setBegin(cursor);
heading1.setDepth(++depth);
cursor += HEADING1.length();
heading1.setEnd(cursor);
heading1.addToIndexes();
paragraph1 = new Paragraph(jCas);
paragraph1.setBegin(++cursor);
paragraph1.setDepth(depth);
cursor += PARA1.length();
paragraph1.setEnd(cursor);
paragraph1.addToIndexes();
heading2 = new Heading(jCas);
heading2.setBegin(++cursor);
heading2.setDepth(depth);
cursor += HEADING1.length();
heading2.setEnd(cursor);
heading2.addToIndexes();
paragraph2 = new Paragraph(jCas);
paragraph2.setBegin(++cursor);
paragraph2.setDepth(depth);
cursor += PARA1.length();
paragraph2.setEnd(cursor);
paragraph2.addToIndexes();
heading3 = new Heading(jCas);
heading3.setBegin(++cursor);
heading3.setDepth(depth);
cursor += HEADING1.length();
heading3.setEnd(cursor);
heading3.addToIndexes();
paragraph3 = new Paragraph(jCas);
paragraph3.setBegin(++cursor);
paragraph3.setDepth(depth);
cursor += PARA1.length();
paragraph3.setEnd(cursor);
paragraph3.addToIndexes();
section.setEnd(cursor);
section.addToIndexes();
}
@After
public void tearDown() throws IOException {
Files.delete(tempDirectory);
}
protected Path writeRecordDefinitions() throws IOException, JsonGenerationException, JsonMappingException {
Path definitionFile = Files.createTempFile(tempDirectory, AbstractRecordAnnotatorTest.class.getSimpleName(),
".yml");
YAMLMAPPER.writeValue(definitionFile.toFile(), createRecordDefinitions());
return definitionFile;
}
private List<TemplateRecordConfiguration> createRecordDefinitions() {
List<TemplateRecordConfiguration> recordDefinitionConfigurations = new ArrayList<>();
recordDefinitionConfigurations.add(createRepeatQuoteRecord1());
recordDefinitionConfigurations.add(createRepeatQuoteRecord2());
recordDefinitionConfigurations.add(createMissingRepeatQuoteRecord());
recordDefinitionConfigurations.add(createSingleQuoteRecord());
recordDefinitionConfigurations.add(createRowRecord());
recordDefinitionConfigurations.add(createSectionRecord());
return recordDefinitionConfigurations;
}
private TemplateRecordConfiguration createRepeatQuoteRecord1() {
TemplateRecordConfiguration record = new TemplateRecordConfiguration();
record.setName("quote1");
record.setOrder(1);
record.setPrecedingPath("");
record.setFollowingPath("Document > Link");
record.setCoveredPaths(ImmutableList.of("Document > Quotation"));
record.setMinimalRepeat("Document > Quotation");
record.setRepeat(true);
record.setKind(Kind.NAMED);
List<TemplateFieldConfiguration> fields = ImmutableList
.of(new TemplateFieldConfiguration("quote", "Document > Quotation:nth-of-type(1)"));
record.setFieldPaths(fields);
return record;
}
private TemplateRecordConfiguration createRepeatQuoteRecord2() {
TemplateRecordConfiguration record = new TemplateRecordConfiguration();
record.setName("quote2");
record.setOrder(2);
record.setPrecedingPath("Document > Link");
record.setFollowingPath("Document > Link:nth-of-type(2)");
record.setCoveredPaths(ImmutableList.of("Document > Quotation"));
record.setMinimalRepeat("Document > Quotation");
record.setRepeat(true);
record.setKind(Kind.NAMED);
List<TemplateFieldConfiguration> fields = ImmutableList
.of(new TemplateFieldConfiguration("quote", "Document > Quotation:nth-of-type(2)"));
record.setFieldPaths(fields);
return record;
}
private TemplateRecordConfiguration createMissingRepeatQuoteRecord() {
TemplateRecordConfiguration record = new TemplateRecordConfiguration();
record.setName("missing");
record.setOrder(3);
record.setPrecedingPath("Document > Link:nth-of-type(2)");
record.setFollowingPath("Document > Link:nth-of-type(3)");
record.setCoveredPaths(ImmutableList.of("Document > Quotation"));
record.setMinimalRepeat("Document > Quotation");
record.setRepeat(true);
record.setKind(Kind.NAMED);
List<TemplateFieldConfiguration> fields = ImmutableList
.of(new TemplateFieldConfiguration("quote", "Document > Quotation:nth-of-type(3)"));
record.setFieldPaths(fields);
return record;
}
private TemplateRecordConfiguration createSingleQuoteRecord() {
TemplateRecordConfiguration record = new TemplateRecordConfiguration();
record.setName("single");
record.setOrder(4);
record.setPrecedingPath("Document > Link:nth-of-type(3)");
record.setFollowingPath("Document > Table");
record.setRepeat(false);
record.setKind(Kind.NAMED);
List<TemplateFieldConfiguration> fields = ImmutableList
.of(new TemplateFieldConfiguration("quote", "Document > Quotation:nth-of-type(4)"));
record.setFieldPaths(fields);
return record;
}
private TemplateRecordConfiguration createRowRecord() {
TemplateRecordConfiguration record = new TemplateRecordConfiguration();
record.setName("row");
record.setOrder(5);
record.setPrecedingPath("Document > Quotation:nth-of-type(4)");
record.setFollowingPath("Document > Section");
record.setCoveredPaths(ImmutableList.of("Document > Table"));
record.setMinimalRepeat("Document > Table > TableBody > TableRow");
record.setRepeat(true);
record.setKind(Kind.NAMED);
List<TemplateFieldConfiguration> fields = ImmutableList.of(
new TemplateFieldConfiguration("cell1",
"Document > Table > TableBody > TableRow > TableCell:nth-of-type(1)"),
new TemplateFieldConfiguration("cell2",
"Document > Table > TableBody > TableRow > TableCell:nth-of-type(2)"));
record.setFieldPaths(fields);
return record;
}
private TemplateRecordConfiguration createSectionRecord() {
TemplateRecordConfiguration record = new TemplateRecordConfiguration();
record.setName("section");
record.setOrder(6);
record.setPrecedingPath("Document > Table > TableBody > TableRow > TableCell:nth-of-type(2)");
record.setFollowingPath("");
record.setCoveredPaths(ImmutableList.of("Document > Section > Heading", "Document > Section > Paragraph"));
record.setMinimalRepeat("");
record.setRepeat(true);
record.setKind(Kind.NAMED);
List<TemplateFieldConfiguration> fields = ImmutableList.of(
new TemplateFieldConfiguration("heading", "Document > Section > Heading"),
new TemplateFieldConfiguration("para", "Document > Section > Paragraph"));
record.setFieldPaths(fields);
return record;
}
@Test
public void testRepeatingRecords()
throws AnalysisEngineProcessException, ResourceInitializationException, IOException {
Path definitionFile = writeRecordDefinitions();
try {
processJCas(TemplateAnnotator.PARAM_RECORD_DEFINITIONS_DIRECTORY, tempDirectory.toString());
List<TemplateRecord> records = new ArrayList<>(JCasUtil.select(jCas, TemplateRecord.class));
assertEquals(10, records.size());
TemplateRecord r1 = records.get(0);
assertEquals("quote1", r1.getName());
assertEquals(0, r1.getBegin());
assertEquals(quotation1.getEnd(), r1.getEnd());
List<TemplateField> fields = JCasUtil.selectCovered(TemplateField.class, r1);
assertEquals(1, fields.size());
TemplateField field = fields.get(0);
assertEquals(0, field.getBegin());
assertEquals("quote", field.getName());
assertEquals(QUOTE1.length(), field.getEnd());
assertEquals(QUOTE1, field.getCoveredText());
assertEquals(QUOTE1, field.getValue());
TemplateRecord r2 = records.get(1);
assertEquals("quote1", r2.getName());
assertEquals(quotation1.getEnd(), r2.getBegin());
assertEquals(quotation2.getEnd(), r2.getEnd());
fields = JCasUtil.selectCovered(TemplateField.class, r2);
assertEquals(1, fields.size());
field = fields.get(0);
assertEquals("quote", field.getName());
assertEquals(quotation2.getBegin(), field.getBegin());
assertEquals(field.getBegin() + QUOTE2.length(), field.getEnd());
assertEquals(QUOTE2, field.getCoveredText());
assertEquals(QUOTE2, field.getValue());
TemplateRecord r3 = records.get(2);
assertEquals("quote2", r3.getName());
assertEquals(link1.getEnd(), r3.getBegin());
assertEquals(quotation3.getEnd(), r3.getEnd());
fields = JCasUtil.selectCovered(TemplateField.class, r3);
assertEquals(1, fields.size());
field = fields.get(0);
assertEquals(quotation3.getBegin(), field.getBegin());
assertEquals("quote", field.getName());
assertEquals(field.getBegin() + QUOTE1.length(), field.getEnd());
assertEquals(QUOTE1, field.getCoveredText());
assertEquals(QUOTE1, field.getValue());
TemplateRecord r4 = records.get(3);
assertEquals("quote2", r4.getName());
assertEquals(quotation3.getEnd(), r4.getBegin());
assertEquals(quotation4.getEnd(), r4.getEnd());
fields = JCasUtil.selectCovered(TemplateField.class, r4);
assertEquals(1, fields.size());
field = fields.get(0);
assertEquals("quote", field.getName());
assertEquals(quotation4.getBegin(), field.getBegin());
assertEquals(field.getBegin() + QUOTE2.length(), field.getEnd());
assertEquals(QUOTE2, field.getCoveredText());
assertEquals(QUOTE2, field.getValue());
TemplateRecord r5 = records.get(4);
assertEquals("single", r5.getName());
assertEquals(link3.getEnd(), r5.getBegin());
assertEquals(table.getBegin(), r5.getEnd());
fields = JCasUtil.selectCovered(TemplateField.class, r5);
assertEquals(1, fields.size());
field = fields.get(0);
assertEquals("quote", field.getName());
assertEquals(quotation5.getBegin(), field.getBegin());
assertEquals(field.getBegin() + QUOTE3.length(), field.getEnd());
assertEquals(QUOTE3, field.getCoveredText());
assertEquals(QUOTE3, field.getValue());
TemplateRecord r6 = records.get(5);
assertEquals(1, fields.size());
field = fields.get(0);
assertEquals("row", r6.getName());
assertEquals(quotation5.getEnd(), r6.getBegin());
assertEquals(tableRow1.getEnd(), r6.getEnd());
fields = JCasUtil.selectCovered(TemplateField.class, r6);
assertEquals(2, fields.size());
TemplateField cell11 = fields.get(0);
assertEquals("cell1", cell11.getName());
assertEquals(tableCell11.getBegin(), cell11.getBegin());
assertEquals(tableCell11.getEnd(), cell11.getEnd());
assertEquals(R1C1, cell11.getCoveredText());
assertEquals(R1C1, cell11.getValue());
TemplateField cell12 = fields.get(1);
assertEquals("cell2", cell12.getName());
assertEquals(tableCell12.getBegin(), cell12.getBegin());
assertEquals(tableCell12.getEnd(), cell12.getEnd());
assertEquals(R1C2, cell12.getCoveredText());
assertEquals(R1C2, cell12.getValue());
TemplateRecord r7 = records.get(6);
assertEquals("row", r7.getName());
assertEquals(tableRow1.getEnd(), r7.getBegin());
assertEquals(tableRow2.getEnd(), r7.getEnd());
fields = JCasUtil.selectCovered(TemplateField.class, r7);
assertEquals(2, fields.size());
TemplateField cell21 = fields.get(0);
assertEquals("cell1", cell21.getName());
assertEquals(tableCell21.getBegin(), cell21.getBegin());
assertEquals(tableCell21.getEnd(), cell21.getEnd());
assertEquals(R2C1, cell21.getCoveredText());
assertEquals(R2C1, cell21.getValue());
TemplateField cell22 = fields.get(1);
assertEquals("cell2", cell22.getName());
assertEquals(tableCell22.getBegin(), cell22.getBegin());
assertEquals(tableCell22.getEnd(), cell22.getEnd());
assertEquals(R2C2, cell22.getCoveredText());
assertEquals(R2C2, cell22.getValue());
TemplateRecord r8 = records.get(7);
assertEquals("section", r8.getName());
assertEquals(tableRow2.getEnd(), r8.getBegin());
assertEquals(paragraph1.getEnd(), r8.getEnd());
fields = JCasUtil.selectCovered(TemplateField.class, r8);
assertEquals(2, fields.size());
TemplateField heading = fields.get(0);
assertEquals("heading", heading.getName());
assertEquals(heading1.getBegin(), heading.getBegin());
assertEquals(heading.getBegin() + HEADING1.length(), heading.getEnd());
assertEquals(HEADING1, heading.getCoveredText());
assertEquals(HEADING1, heading.getValue());
TemplateField para = fields.get(1);
assertEquals("para", para.getName());
assertEquals(paragraph1.getBegin(), para.getBegin());
assertEquals(para.getBegin() + PARA1.length(), para.getEnd());
assertEquals(PARA1, para.getCoveredText());
assertEquals(PARA1, para.getValue());
TemplateRecord r9 = records.get(8);
assertEquals("section", r9.getName());
assertEquals(paragraph1.getEnd(), r9.getBegin());
assertEquals(paragraph2.getEnd(), r9.getEnd());
fields = JCasUtil.selectCovered(TemplateField.class, r9);
assertEquals(2, fields.size());
heading = fields.get(0);
assertEquals("heading", heading.getName());
assertEquals(heading2.getBegin(), heading.getBegin());
assertEquals(heading.getBegin() + HEADING2.length(), heading.getEnd());
assertEquals(HEADING2, heading.getCoveredText());
assertEquals(HEADING2, heading.getValue());
para = fields.get(1);
assertEquals("para", para.getName());
assertEquals(paragraph2.getBegin(), para.getBegin());
assertEquals(para.getBegin() + PARA2.length(), para.getEnd());
assertEquals(PARA2, para.getCoveredText());
assertEquals(PARA2, para.getValue());
TemplateRecord r10 = records.get(9);
assertEquals("section", r10.getName());
assertEquals(paragraph2.getEnd(), r10.getBegin());
assertEquals(paragraph3.getEnd(), r10.getEnd());
fields = JCasUtil.selectCovered(TemplateField.class, r10);
assertEquals(2, fields.size());
heading = fields.get(0);
assertEquals("heading", heading.getName());
assertEquals(heading3.getBegin(), heading.getBegin());
assertEquals(heading.getBegin() + HEADING3.length(), heading.getEnd());
assertEquals(HEADING3, heading.getCoveredText());
assertEquals(HEADING3, heading.getValue());
para = fields.get(1);
assertEquals("para", para.getName());
assertEquals(paragraph3.getBegin(), para.getBegin());
assertEquals(para.getBegin() + PARA3.length(), para.getEnd());
assertEquals(PARA3, para.getCoveredText());
assertEquals(PARA3, para.getValue());
} finally {
Files.delete(definitionFile);
}
}
}