//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.consumers;

import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.List;

import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.jcas.cas.StringArray;
import org.apache.uima.jcas.tcas.DocumentAnnotation;
import org.apache.uima.resource.ExternalResourceDescription;
import org.apache.uima.resource.ResourceAccessException;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.impl.CustomResourceSpecifier_impl;
import org.bson.Document;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.mockito.Mockito;

import com.mongodb.client.MongoCollection;

import uk.gov.dstl.baleen.consumers.utils.DefaultFields;
import uk.gov.dstl.baleen.consumers.utils.IEntityConverterFields;
import uk.gov.dstl.baleen.core.history.BaleenHistory;
import uk.gov.dstl.baleen.core.history.DocumentHistory;
import uk.gov.dstl.baleen.core.history.HistoryEvent;
import uk.gov.dstl.baleen.core.history.HistoryEvents;
import uk.gov.dstl.baleen.core.history.memory.InMemoryBaleenHistory;
import uk.gov.dstl.baleen.core.pipelines.PipelineBuilder;
import uk.gov.dstl.baleen.resources.SharedFongoResource;
import uk.gov.dstl.baleen.types.common.Buzzword;
import uk.gov.dstl.baleen.types.common.CommsIdentifier;
import uk.gov.dstl.baleen.types.common.Person;
import uk.gov.dstl.baleen.types.metadata.Metadata;
import uk.gov.dstl.baleen.types.metadata.PublishedId;
import uk.gov.dstl.baleen.types.semantic.Location;
import uk.gov.dstl.baleen.types.semantic.ReferenceTarget;
import uk.gov.dstl.baleen.types.semantic.Relation;
import uk.gov.dstl.baleen.types.semantic.Temporal;
import uk.gov.dstl.baleen.uima.utils.UimaTypesUtils;

/**
 * Tests for the {@link Mongo} consumer.
 *
 * <p>Each test populates the shared {@code jCas}, runs the consumer, and then inspects the
 * {@code documents}, {@code entities} and {@code relations} collections of the database exposed
 * by a {@link SharedFongoResource}.
 */
public class MongoTest extends ConsumerTestBase {
  private static final String LONDON = "London";
  private static final String NAME_1 = "Bill";
  private static final String NAME_2 = "William";
  private static final String TYPE = "type";
  private static final String VALUE = "value";
  private static final String CONFIDENCE = "confidence";
  private static final String END = "end";
  private static final String BEGIN = "begin";
  private static final String EMAIL = "james@example.com";
  private static final String DATE = "19th February 2015";
  private static final String PERSON = "James";
  private static final String TEXT = "Hello World";
  private static final String MONGO = "mongo";
  private static final String WENT = "went";
  private static final String LONDON_GEOJSON = "{\"type\": \"Point\", \"coordinates\": [-0.1, 51.5]}";

  private AnalysisEngine ae;
  private MongoCollection<Document> documents;
  private MongoCollection<Document> entities;
  private MongoCollection<Document> relations;
  private BaleenHistory history;

  private final IEntityConverterFields fields = new DefaultFields();

  /**
   * Builds the analysis engine under test and grabs handles on the three collections it writes to.
   *
   * @throws ResourceInitializationException if the engine or its resources cannot be created
   * @throws ResourceAccessException if a resource cannot be fetched from the UIMA context
   */
  @Before
  public void setUp() throws ResourceInitializationException, ResourceAccessException {
    // Create a description of an external resource - a fongo instance, in the same way we would
    // have created a shared mongo resource
    ExternalResourceDescription erd =
        ExternalResourceFactory.createExternalResourceDescription(
            MONGO, SharedFongoResource.class, "fongo.collection", "test", "fongo.data", "[]");
    ExternalResourceDescription historyErd =
        ExternalResourceFactory.createExternalResourceDescription(
            PipelineBuilder.BALEEN_HISTORY, InMemoryBaleenHistory.class);

    // Create the analysis engine
    AnalysisEngineDescription aed =
        AnalysisEngineFactory.createEngineDescription(
            Mongo.class,
            MONGO,
            erd,
            "collection",
            "test",
            PipelineBuilder.BALEEN_HISTORY,
            historyErd,
            "outputHistory",
            Boolean.TRUE);
    ae = AnalysisEngineFactory.createEngine(aed);
    ae.initialize(new CustomResourceSpecifier_impl(), Collections.emptyMap());

    // Use the very resources the engine was wired with, so the tests see what the consumer wrote
    SharedFongoResource sfr = (SharedFongoResource) ae.getUimaContext().getResourceObject(MONGO);
    history = (BaleenHistory) ae.getUimaContext().getResourceObject(PipelineBuilder.BALEEN_HISTORY);
    entities = sfr.getDB().getCollection("entities");
    documents = sfr.getDB().getCollection("documents");
    relations = sfr.getDB().getCollection("relations");

    // Ensure we start with no data!
    assertEquals(0L, documents.count());
    assertEquals(0L, entities.count());
    assertEquals(0L, relations.count());
  }

  /** Destroys the analysis engine, if one was created. */
  @After
  public void tearDown() {
    if (ae != null) {
      ae.destroy();
    }
  }

  /** A document with text only produces one document record and no entity records. */
  @Test
  public void testSave() throws Exception {
    // Set the document content
    jCas.setDocumentText("Hello world, this is a test");

    ae.process(jCas);

    assertEquals(1L, documents.count());
    assertEquals(0L, entities.count());
  }

  /** Document-level annotation properties are written to the document record. */
  @SuppressWarnings("unchecked")
  @Test
  public void testNoEntities() throws Exception {
    jCas.setDocumentText(TEXT);
    jCas.setDocumentLanguage("en");

    long timestamp = System.currentTimeMillis();

    DocumentAnnotation da = getDocumentAnnotation(jCas);
    da.setTimestamp(timestamp);
    da.setSourceUri("test/no_entities");
    da.setDocType("test");
    da.setDocumentClassification("OFFICIAL");
    da.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList("TEST_A", "TEST_B")));
    da.setDocumentReleasability(UimaTypesUtils.toArray(jCas, Arrays.asList("ENG", "SCO", "WAL")));

    ae.process(jCas);

    assertEquals(1, documents.count());
    Document result = documents.find().first();
    Document doc = (Document) result.get(Mongo.FIELD_DOCUMENT);

    assertEquals(TEXT, result.get(Mongo.FIELD_CONTENT));
    assertEquals("en", doc.get(Mongo.FIELD_DOCUMENT_LANGUAGE));
    assertEquals(new Date(timestamp), doc.get(Mongo.FIELD_DOCUMENT_TIMESTAMP));
    assertEquals("test/no_entities", doc.get(Mongo.FIELD_DOCUMENT_SOURCE));
    assertEquals("test", doc.get(Mongo.FIELD_DOCUMENT_TYPE));
    assertEquals("OFFICIAL", doc.get(Mongo.FIELD_DOCUMENT_CLASSIFICATION));
    assertArrayEquals(
        new String[] {"TEST_A", "TEST_B"},
        ((Collection<String>) doc.get(Mongo.FIELD_DOCUMENT_CAVEATS)).toArray());
    assertArrayEquals(
        new String[] {"ENG", "SCO", "WAL"},
        ((Collection<String>) doc.get(Mongo.FIELD_DOCUMENT_RELEASABILITY)).toArray());

    assertEquals(getDocumentAnnotation(jCas).getHash(), result.get(fields.getExternalId()));
  }

  /** Published IDs and metadata annotations are written to the document record. */
  @Test
  public void testMetadata() throws Exception {
    jCas.setDocumentText(TEXT);

    addPublishedId("id_1");
    addPublishedId("id_2");

    addMetadata("sourceAndInformationGrading", "D3");
    addMetadata("countryInfo", "ENG|WAL|SCO");
    addMetadata("documentTitle", "Test Title");
    addMetadata("test.key", "test.value");

    ae.process(jCas);

    assertEquals(1, documents.count());
    Document result = documents.find().first();

    List<?> publishedIds = (List<?>) result.get(Mongo.FIELD_PUBLISHEDIDS);
    assertEquals("id_1", ((Document) publishedIds.get(0)).get(Mongo.FIELD_PUBLISHEDIDS_ID));

    Document meta = (Document) result.get(Mongo.FIELD_METADATA);
    assertMeta(meta, "sourceAndInformationGrading", "D3");
    // The key was added as "test.key" but is looked up as "test_key"
    assertMeta(meta, "test_key", "test.value");
    assertMeta(meta, "documentTitle", "Test Title");
    assertMeta(meta, "countryInfo", "ENG|WAL|SCO");
  }

  /**
   * Asserts that the first element of the list stored under {@code key} equals {@code value}.
   *
   * @param meta the metadata sub-document of a stored document record
   * @param key the metadata key to look up
   * @param value the expected first value
   */
  private void assertMeta(Document meta, String key, Object value) {
    assertEquals(value, ((List<?>) meta.get(key)).get(0));
  }

  /** Entities of several types are each written to their own entity record. */
  @SuppressWarnings("unchecked")
  @Test
  public void testEntities() throws Exception {
    jCas.setDocumentText(
        "James went to London on 19th February 2015. His e-mail address is james@example.com");

    addPerson(0, 5, PERSON);
    addLondon();
    addDate();

    CommsIdentifier ci = new CommsIdentifier(jCas);
    ci.setBegin(66);
    ci.setEnd(83);
    ci.setSubType("email");
    ci.setValue(EMAIL);
    ci.addToIndexes();

    Buzzword bw = new Buzzword(jCas);
    bw.setBegin(6);
    bw.setEnd(10);
    bw.setValue(WENT);
    StringArray tags = new StringArray(jCas, 2);
    tags.set(0, "verb");
    tags.set(1, "past");
    bw.setTags(tags);
    bw.addToIndexes();

    ae.process(jCas);

    assertEquals(1, documents.count());
    assertEquals(5, entities.count());

    assertJamesStored();
    assertLondonStored();
    assertDateStored();

    Document email = findFirstEntity(EMAIL);
    assertEquals(9, email.size());
    assertSpan(email, 66, 83, 0.0);
    assertEquals("CommsIdentifier", email.get(TYPE));
    assertEquals("email", email.get("subType"));
    assertEquals(EMAIL, email.get(VALUE));

    Document went = findFirstEntity(WENT);
    assertEquals(10, went.size());
    assertSpan(went, 6, 10, 0.0);
    List<String> wentTags = (List<String>) went.get("tags");
    assertEquals("verb", wentTags.get(0));
    assertEquals("past", wentTags.get(1));
    assertEquals(WENT, went.get(VALUE));
  }

  /** Two entities sharing a {@link ReferenceTarget} end up in a single entity record. */
  @Test
  public void testReferenceTargets() throws AnalysisEngineProcessException {
    jCas.setDocumentText("Bill went to London. William came back.");

    ReferenceTarget rt = new ReferenceTarget(jCas);
    rt.addToIndexes();

    Person p = addPerson(0, 4, NAME_1);
    p.setReferent(rt);

    Person q = addPerson(21, 28, NAME_2);
    q.setReferent(rt);

    ae.process(jCas);

    assertEquals(1, documents.count());
    assertEquals(1, entities.count());

    Document a = entities.find().first();
    assertEquals(2, ((List<?>) a.get(Mongo.FIELD_ENTITIES)).size());
  }

  /** History events recorded against entities are written alongside the stored entities. */
  @SuppressWarnings("unchecked")
  @Test
  public void testHistory() throws AnalysisEngineProcessException {
    jCas.setDocumentText("Bill went to London. William came back.");

    Person p = addPerson(0, 4, NAME_1);
    Person q = addPerson(21, 28, NAME_2);

    DocumentHistory documentHistory =
        history.getHistory("unknown:" + getDocumentAnnotation(jCas).getHash());
    documentHistory.add(HistoryEvents.createAdded(p, "test"));
    documentHistory.add(HistoryEvents.createAdded(q, "test"));
    documentHistory.add(HistoryEvents.createMerged(p, "test", q.getInternalId()));
    documentHistory.add(HistoryEvents.createMerged(p, "fakeId merge", 500));
    documentHistory.add(HistoryEvents.createRemoved(q, "test"));

    ae.process(jCas);

    Collection<HistoryEvent> pHistory = documentHistory.getHistory(p.getInternalId());
    Collection<HistoryEvent> qHistory = documentHistory.getHistory(q.getInternalId());

    assertEquals(1, documents.count());
    assertEquals(2, entities.count());

    // The stored history for p is expected to be as long as p's and q's histories combined
    List<Document> pH = (List<Document>) findFirstEntity(NAME_1).get(fields.getHistory());
    assertEquals(pHistory.size() + qHistory.size(), pH.size());

    List<Document> qH = (List<Document>) findFirstEntity(NAME_2).get(fields.getHistory());
    assertEquals(qHistory.size(), qH.size());
  }

  /** A relation between two entities is written to the relations collection. */
  @Test
  public void testRelations() throws Exception {
    jCas.setDocumentText("James went to London on 19th February 2015.");

    Person p = addPerson(0, 5, PERSON);
    Location l = addLondon();
    addDate();

    Relation r = new Relation(jCas);
    r.setBegin(0);
    r.setEnd(20);
    r.setValue("James went to London");
    r.setSource(p);
    r.setTarget(l);
    r.setRelationshipType("AT");
    r.setConfidence(0.7);
    r.addToIndexes();

    ae.process(jCas);

    assertEquals(1, documents.count());
    assertEquals(3, entities.count());
    assertEquals(1, relations.count());

    Document person = assertJamesStored();
    Document location = assertLondonStored();
    assertDateStored();

    Document relation = relations.find().first();
    assertEquals(13, relation.size());
    assertSpan(relation, 0, 20, 0.7);
    assertEquals(person.get("externalId"), relation.get("source"));
    assertEquals(location.get("externalId"), relation.get("target"));
    assertEquals("AT", relation.get("relationshipType"));
    assertEquals("James went to London", relation.get(VALUE));
  }

  /** Adds a {@link PublishedId} with the given value to the CAS indexes. */
  private void addPublishedId(String value) {
    PublishedId pid = new PublishedId(jCas);
    pid.setValue(value);
    pid.addToIndexes();
  }

  /** Adds a {@link Metadata} key/value annotation to the CAS indexes. */
  private void addMetadata(String key, String value) {
    Metadata md = new Metadata(jCas);
    md.setKey(key);
    md.setValue(value);
    md.addToIndexes();
  }

  /**
   * Creates a {@link Person} over the given span and adds it to the CAS indexes.
   *
   * @return the indexed annotation, so callers can set further features or reference it
   */
  private Person addPerson(int begin, int end, String value) {
    Person person = new Person(jCas);
    person.setBegin(begin);
    person.setEnd(end);
    person.setValue(value);
    person.addToIndexes();
    return person;
  }

  /** Adds the "London" {@link Location} (span 14-20, with GeoJSON) used by the "James" texts. */
  private Location addLondon() {
    Location location = new Location(jCas);
    location.setBegin(14);
    location.setEnd(20);
    location.setValue(LONDON);
    location.setGeoJson(LONDON_GEOJSON);
    location.addToIndexes();
    return location;
  }

  /** Adds the date {@link Temporal} (span 24-42, confidence 1.0) used by the "James" texts. */
  private Temporal addDate() {
    Temporal temporal = new Temporal(jCas);
    temporal.setBegin(24);
    temporal.setEnd(42);
    temporal.setConfidence(1.0);
    temporal.setValue(DATE);
    temporal.addToIndexes();
    return temporal;
  }

  /**
   * Looks up the first entity record containing an entity with the given value and returns the
   * first entry of that record's entity list.
   *
   * @param value the entity value to search for
   * @return the first entity sub-document of the first matching record
   */
  @SuppressWarnings("unchecked")
  private Document findFirstEntity(String value) {
    Document record = entities.find(new Document(Mongo.FIELD_ENTITIES + "." + VALUE, value)).first();
    return ((List<Document>) record.get(Mongo.FIELD_ENTITIES)).get(0);
  }

  /** Asserts the begin, end and confidence fields of a stored entity or relation. */
  private void assertSpan(Document stored, int begin, int end, double confidence) {
    assertEquals(begin, stored.get(BEGIN));
    assertEquals(end, stored.get(END));
    assertEquals(confidence, stored.get(CONFIDENCE));
  }

  /**
   * Asserts the stored form of the "James" person created by {@code addPerson(0, 5, PERSON)}.
   *
   * @return the stored person sub-document
   */
  private Document assertJamesStored() {
    Document person = findFirstEntity(PERSON);
    assertEquals(11, person.size());
    assertSpan(person, 0, 5, 0.0);
    assertEquals("Person", person.get(TYPE));
    assertEquals(PERSON, person.get(VALUE));
    return person;
  }

  /**
   * Asserts the stored form of the location created by {@link #addLondon()}.
   *
   * @return the stored location sub-document
   */
  private Document assertLondonStored() {
    Document location = findFirstEntity(LONDON);
    assertEquals(10, location.size());
    assertSpan(location, 14, 20, 0.0);
    assertEquals("Location", location.get(TYPE));
    assertEquals(LONDON, location.get(VALUE));

    Document geoJson = (Document) location.get("geoJson");
    assertEquals("Point", geoJson.get(TYPE));
    assertEquals(Arrays.asList(-0.1, 51.5), geoJson.get("coordinates"));
    return location;
  }

  /**
   * Asserts the stored form of the temporal created by {@link #addDate()}.
   *
   * @return the stored temporal sub-document
   */
  private Document assertDateStored() {
    Document date = findFirstEntity(DATE);
    assertEquals(14, date.size());
    assertSpan(date, 24, 42, 1.0);
    assertEquals("Temporal", date.get(TYPE));
    assertEquals(DATE, date.get(VALUE));
    return date;
  }
}