//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.templates; import static org.junit.Assert.assertEquals; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.resource.ResourceInitializationException; import org.junit.After; import org.junit.Before; import org.junit.Test; import com.fasterxml.jackson.core.JsonGenerationException; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import com.google.common.collect.ImmutableList; import uk.gov.dstl.baleen.annotators.templates.TemplateRecordConfiguration.Kind; import uk.gov.dstl.baleen.annotators.testing.AbstractAnnotatorTest; import uk.gov.dstl.baleen.types.structure.Document; import uk.gov.dstl.baleen.types.structure.Link; import uk.gov.dstl.baleen.types.structure.Paragraph; import uk.gov.dstl.baleen.types.structure.Quotation; import uk.gov.dstl.baleen.types.structure.Table; import uk.gov.dstl.baleen.types.structure.TableBody; import uk.gov.dstl.baleen.types.structure.TableCell; import uk.gov.dstl.baleen.types.structure.TableRow; import uk.gov.dstl.baleen.types.templates.TemplateRecord; import uk.gov.dstl.baleen.types.templates.TemplateField; public class RepeatingFieldAnnotatorTest extends AbstractAnnotatorTest { protected static final ObjectMapper YAMLMAPPER = new ObjectMapper(new YAMLFactory()); private static final String QUOTE1 = "quote"; private static final String QUOTE2 = "better quote"; private static final String LINK = "link"; private static final String QUOTE3 = "best quote"; private static final String R1C1P1 = "r1c1p1"; private static final String R1C1P2 = "r1c1p2"; private static final String R1C2 = "r1c2"; private static final String R2C1 = "r2c1"; private static final String R2C2P1 = "r2c2p1"; private static final String R2C2P2 = "r2c2p2"; private static final String ROW1 = R1C1P1 + " " + R1C1P2 + " " + R1C2; private static final String ROW2 = R2C1 + " " + R2C2P1 + " " + R2C2P2; private static final String TEXT = String.join("\n", QUOTE1, QUOTE2, LINK, QUOTE1, QUOTE2, LINK, LINK, QUOTE3, ROW1, ROW2); protected Path tempDirectory; private Quotation quotation1; private Quotation quotation2; private Link link1; private Quotation quotation3; private Quotation quotation4; private Link link2; private Link link3; private Quotation quotation5; private Table table; private TableBody tableBody; private TableRow tableRow1; private TableCell tableCell11; private TableCell tableCell12; private TableRow tableRow2; private TableCell tableCell21; private TableCell tableCell22; private Paragraph paragraph1; private Paragraph paragraph2; private Paragraph paragraph3; private Paragraph paragraph4; private Paragraph paragraph5; private Paragraph paragraph6; public RepeatingFieldAnnotatorTest() { super(TemplateAnnotator.class); } @Before public void setup() throws IOException { tempDirectory = Files.createTempDirectory(getClass().getSimpleName()); jCas.setDocumentText(TEXT); addAnnotations(); } protected void addAnnotations() { int cursor = 0; int depth = 0; Document document = new Document(jCas); document.setBegin(cursor); document.setDepth(depth); document.setEnd(TEXT.length()); document.addToIndexes(); quotation1 = new Quotation(jCas); quotation1.setBegin(cursor); quotation1.setDepth(++depth); cursor += QUOTE1.length(); quotation1.setEnd(cursor); quotation1.addToIndexes(); quotation2 = new Quotation(jCas); quotation2.setBegin(++cursor); quotation2.setDepth(depth); cursor += QUOTE2.length(); quotation2.setEnd(cursor); quotation2.addToIndexes(); link1 = new Link(jCas); link1.setBegin(++cursor); link1.setDepth(depth); cursor += LINK.length(); link1.setEnd(cursor); link1.addToIndexes(); quotation3 = new Quotation(jCas); quotation3.setBegin(++cursor); quotation3.setDepth(++depth); cursor += QUOTE1.length(); quotation3.setEnd(cursor); quotation3.addToIndexes(); quotation4 = new Quotation(jCas); quotation4.setBegin(++cursor); quotation4.setDepth(depth); cursor += QUOTE2.length(); quotation4.setEnd(cursor); quotation4.addToIndexes(); link2 = new Link(jCas); link2.setBegin(++cursor); link2.setDepth(depth); cursor += LINK.length(); link2.setEnd(cursor); link2.addToIndexes(); link3 = new Link(jCas); link3.setBegin(++cursor); link3.setDepth(depth); cursor += LINK.length(); link3.setEnd(cursor); link3.addToIndexes(); quotation5 = new Quotation(jCas); quotation5.setBegin(++cursor); quotation5.setDepth(++depth); cursor += QUOTE3.length(); quotation5.setEnd(cursor); quotation5.addToIndexes(); table = new Table(jCas); table.setBegin(++cursor); table.setDepth(depth); tableBody = new TableBody(jCas); tableBody.setBegin(cursor); tableBody.setDepth(++depth); tableRow1 = new TableRow(jCas); tableRow1.setBegin(cursor); tableRow1.setDepth(++depth); tableCell11 = new TableCell(jCas); tableCell11.setBegin(cursor); tableCell11.setDepth(++depth); paragraph1 = new Paragraph(jCas); paragraph1.setBegin(cursor); paragraph1.setDepth(++depth); cursor += R1C1P1.length(); paragraph1.setEnd(cursor); paragraph1.addToIndexes(); paragraph2 = new Paragraph(jCas); paragraph2.setBegin(++cursor); paragraph2.setDepth(depth); cursor += R1C1P2.length(); paragraph2.setEnd(cursor); paragraph2.addToIndexes(); --depth; tableCell11.setEnd(cursor); tableCell11.addToIndexes(); tableCell12 = new TableCell(jCas); tableCell12.setBegin(++cursor); tableCell12.setDepth(depth); paragraph3 = new Paragraph(jCas); paragraph3.setBegin(cursor); paragraph3.setDepth(++depth); cursor += R1C2.length(); paragraph3.setEnd(cursor); paragraph3.addToIndexes(); --depth; tableCell12.setEnd(cursor); tableCell12.addToIndexes(); tableRow1.setEnd(cursor); tableRow1.addToIndexes(); tableRow2 = new TableRow(jCas); tableRow2.setBegin(++cursor); tableRow2.setDepth(--depth); tableCell21 = new TableCell(jCas); tableCell21.setBegin(cursor); tableCell21.setDepth(++depth); paragraph4 = new Paragraph(jCas); paragraph4.setBegin(cursor); paragraph4.setDepth(++depth); cursor += R2C1.length(); paragraph4.setEnd(cursor); paragraph4.addToIndexes(); --depth; tableCell21.setEnd(cursor); tableCell21.addToIndexes(); tableCell22 = new TableCell(jCas); tableCell22.setBegin(++cursor); tableCell22.setDepth(depth); paragraph5 = new Paragraph(jCas); paragraph5.setBegin(cursor); paragraph5.setDepth(++depth); cursor += R2C2P1.length(); paragraph5.setEnd(cursor); paragraph5.addToIndexes(); paragraph6 = new Paragraph(jCas); paragraph6.setBegin(++cursor); paragraph6.setDepth(depth); cursor += R2C2P2.length(); paragraph6.setEnd(cursor); paragraph6.addToIndexes(); --depth; tableCell22.setEnd(cursor); tableCell22.addToIndexes(); tableRow2.setEnd(cursor); tableRow2.addToIndexes(); tableBody.setEnd(cursor); tableBody.addToIndexes(); --depth; table.setEnd(cursor); table.addToIndexes(); --depth; } @After public void tearDown() throws IOException { Files.delete(tempDirectory); } protected Path writeRecordDefinitions() throws IOException, JsonGenerationException, JsonMappingException { Path definitionFile = Files.createTempFile(tempDirectory, AbstractRecordAnnotatorTest.class.getSimpleName(), ".yml"); YAMLMAPPER.writeValue(definitionFile.toFile(), createRecordDefinitions()); return definitionFile; } private List<TemplateRecordConfiguration> createRecordDefinitions() { List<TemplateRecordConfiguration> recordDefinitionConfigurations = new ArrayList<>(); recordDefinitionConfigurations.add(createRepeatQuoteRecord1()); recordDefinitionConfigurations.add(createRepeatQuoteRecord2()); recordDefinitionConfigurations.add(createMissingRepeatQuoteRecord()); recordDefinitionConfigurations.add(createSingleQuoteRecord()); recordDefinitionConfigurations.add(createRowRecord()); return recordDefinitionConfigurations; } private TemplateRecordConfiguration createRepeatQuoteRecord1() { TemplateRecordConfiguration record = new TemplateRecordConfiguration(); record.setName("quote1"); record.setOrder(1); record.setPrecedingPath(""); record.setFollowingPath("Document > Link"); record.setKind(Kind.NAMED); TemplateFieldConfiguration field = new TemplateFieldConfiguration("quote", "Document > Quotation:nth-of-type(1)"); field.setRepeat(true); record.setFieldPaths(ImmutableList.of(field)); return record; } private TemplateRecordConfiguration createRepeatQuoteRecord2() { TemplateRecordConfiguration record = new TemplateRecordConfiguration(); record.setName("quote2"); record.setOrder(2); record.setPrecedingPath("Document > Link"); record.setFollowingPath("Document > Link:nth-of-type(2)"); record.setKind(Kind.NAMED); TemplateFieldConfiguration field = new TemplateFieldConfiguration("quote", "Document > Quotation:nth-of-type(2)"); field.setRepeat(true); record.setFieldPaths(ImmutableList.of(field)); return record; } private TemplateRecordConfiguration createMissingRepeatQuoteRecord() { TemplateRecordConfiguration record = new TemplateRecordConfiguration(); record.setName("missing"); record.setOrder(3); record.setPrecedingPath("Document > Link:nth-of-type(2)"); record.setFollowingPath("Document > Link:nth-of-type(3)"); record.setKind(Kind.NAMED); TemplateFieldConfiguration field = new TemplateFieldConfiguration("quote", "Document > Quotation:nth-of-type(3)"); field.setRepeat(true); record.setFieldPaths(ImmutableList.of(field)); return record; } private TemplateRecordConfiguration createSingleQuoteRecord() { TemplateRecordConfiguration record = new TemplateRecordConfiguration(); record.setName("single"); record.setOrder(4); record.setPrecedingPath("Document > Link:nth-of-type(3)"); record.setFollowingPath("Document > Table"); record.setRepeat(false); record.setKind(Kind.NAMED); List<TemplateFieldConfiguration> fields = ImmutableList .of(new TemplateFieldConfiguration("quote", "Document > Quotation:nth-of-type(4)")); record.setFieldPaths(fields); return record; } private TemplateRecordConfiguration createRowRecord() { TemplateRecordConfiguration record = new TemplateRecordConfiguration(); record.setName("row"); record.setOrder(5); record.setPrecedingPath("Document > Quotation:nth-of-type(4)"); record.setFollowingPath("Document > Section"); record.setCoveredPaths(ImmutableList.of("Document > Table")); record.setMinimalRepeat("Document > Table > TableBody > TableRow"); record.setRepeat(true); record.setKind(Kind.NAMED); TemplateFieldConfiguration cell1 = new TemplateFieldConfiguration("cell1", "Document > Table > TableBody > TableRow > TableCell:nth-of-type(1) > Paragraph"); cell1.setRepeat(true); TemplateFieldConfiguration cell2 = new TemplateFieldConfiguration("cell2", "Document > Table > TableBody > TableRow > TableCell:nth-of-type(2)> Paragraph"); cell2.setRepeat(true); record.setFieldPaths(ImmutableList.of(cell1, cell2)); return record; } @Test public void testCreateRepeatingFields() throws AnalysisEngineProcessException, ResourceInitializationException, IOException { Path definitionFile = writeRecordDefinitions(); try { processJCas(TemplateAnnotator.PARAM_RECORD_DEFINITIONS_DIRECTORY, tempDirectory.toString()); List<TemplateRecord> records = new ArrayList<>(JCasUtil.select(jCas, TemplateRecord.class)); assertEquals(6, records.size()); TemplateRecord r1 = records.get(0); assertEquals("quote1", r1.getName()); assertEquals(0, r1.getBegin()); assertEquals(link1.getBegin(), r1.getEnd()); List<TemplateField> fields = JCasUtil.selectCovered(TemplateField.class, r1); assertEquals(2, fields.size()); TemplateField field = fields.get(0); assertEquals(0, field.getBegin()); assertEquals("quote", field.getName()); assertEquals(QUOTE1.length(), field.getEnd()); assertEquals(QUOTE1, field.getCoveredText()); assertEquals(QUOTE1, field.getValue()); field = fields.get(1); assertEquals("quote", field.getName()); assertEquals(quotation2.getBegin(), field.getBegin()); assertEquals(field.getBegin() + QUOTE2.length(), field.getEnd()); assertEquals(QUOTE2, field.getCoveredText()); assertEquals(QUOTE2, field.getValue()); TemplateRecord r2 = records.get(1); assertEquals("quote2", r2.getName()); assertEquals(link1.getEnd(), r2.getBegin()); assertEquals(link2.getBegin(), r2.getEnd()); fields = JCasUtil.selectCovered(TemplateField.class, r2); assertEquals(2, fields.size()); field = fields.get(0); assertEquals(quotation3.getBegin(), field.getBegin()); assertEquals("quote", field.getName()); assertEquals(field.getBegin() + QUOTE1.length(), field.getEnd()); assertEquals(QUOTE1, field.getCoveredText()); assertEquals(QUOTE1, field.getValue()); field = fields.get(1); assertEquals("quote", field.getName()); assertEquals(quotation4.getBegin(), field.getBegin()); assertEquals(field.getBegin() + QUOTE2.length(), field.getEnd()); assertEquals(QUOTE2, field.getCoveredText()); assertEquals(QUOTE2, field.getValue()); TemplateRecord r3 = records.get(2); assertEquals("missing", r3.getName()); assertEquals(link2.getEnd(), r3.getBegin()); assertEquals(link3.getBegin(), r3.getEnd()); fields = JCasUtil.selectCovered(TemplateField.class, r3); assertEquals(0, fields.size()); TemplateRecord r4 = records.get(3); assertEquals("single", r4.getName()); assertEquals(link3.getEnd(), r4.getBegin()); assertEquals(table.getBegin(), r4.getEnd()); fields = JCasUtil.selectCovered(TemplateField.class, r4); assertEquals(1, fields.size()); field = fields.get(0); assertEquals("quote", field.getName()); assertEquals(quotation5.getBegin(), field.getBegin()); assertEquals(field.getBegin() + QUOTE3.length(), field.getEnd()); assertEquals(QUOTE3, field.getCoveredText()); assertEquals(QUOTE3, field.getValue()); TemplateRecord r5 = records.get(4); assertEquals(1, fields.size()); field = fields.get(0); assertEquals("row", r5.getName()); assertEquals(quotation5.getEnd(), r5.getBegin()); assertEquals(tableRow1.getEnd(), r5.getEnd()); fields = JCasUtil.selectCovered(TemplateField.class, r5); assertEquals(3, fields.size()); TemplateField cell111 = fields.get(0); assertEquals("cell1", cell111.getName()); assertEquals(paragraph1.getBegin(), cell111.getBegin()); assertEquals(paragraph1.getEnd(), cell111.getEnd()); assertEquals(R1C1P1, cell111.getCoveredText()); assertEquals(R1C1P1, cell111.getValue()); TemplateField cell112 = fields.get(1); assertEquals("cell1", cell112.getName()); assertEquals(paragraph2.getBegin(), cell112.getBegin()); assertEquals(paragraph2.getEnd(), cell112.getEnd()); assertEquals(R1C1P2, cell112.getCoveredText()); assertEquals(R1C1P2, cell112.getValue()); TemplateField cell12 = fields.get(2); assertEquals("cell2", cell12.getName()); assertEquals(tableCell12.getBegin(), cell12.getBegin()); assertEquals(tableCell12.getEnd(), cell12.getEnd()); assertEquals(R1C2, cell12.getCoveredText()); assertEquals(R1C2, cell12.getValue()); TemplateRecord r6 = records.get(5); assertEquals("row", r6.getName()); assertEquals(tableRow1.getEnd(), r6.getBegin()); assertEquals(tableRow2.getEnd(), r6.getEnd()); fields = JCasUtil.selectCovered(TemplateField.class, r6); assertEquals(3, fields.size()); TemplateField cell21 = fields.get(0); assertEquals("cell1", cell21.getName()); assertEquals(tableCell21.getBegin(), cell21.getBegin()); assertEquals(tableCell21.getEnd(), cell21.getEnd()); assertEquals(R2C1, cell21.getCoveredText()); assertEquals(R2C1, cell21.getValue()); TemplateField cell221 = fields.get(1); assertEquals("cell2", cell221.getName()); assertEquals(paragraph5.getBegin(), cell221.getBegin()); assertEquals(paragraph5.getEnd(), cell221.getEnd()); assertEquals(R2C2P1, cell221.getCoveredText()); assertEquals(R2C2P1, cell221.getValue()); TemplateField cell222 = fields.get(2); assertEquals("cell2", cell222.getName()); assertEquals(paragraph6.getBegin(), cell222.getBegin()); assertEquals(paragraph6.getEnd(), cell222.getEnd()); assertEquals(R2C2P2, cell222.getCoveredText()); assertEquals(R2C2P2, cell222.getValue()); } finally { Files.delete(definitionFile); } } }