//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.consumers;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.DocumentAnnotation;
import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.Test;
import uk.gov.dstl.baleen.types.common.CommsIdentifier;
import uk.gov.dstl.baleen.types.common.Person;
import uk.gov.dstl.baleen.types.metadata.Metadata;
import uk.gov.dstl.baleen.types.metadata.PublishedId;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.Location;
import uk.gov.dstl.baleen.types.semantic.Temporal;
import uk.gov.dstl.baleen.uima.testing.JCasSingleton;
import uk.gov.dstl.baleen.uima.utils.UimaTypesUtils;
/**
 * Shared test cases for consumers that write documents to the {@code baleen_index}
 * Elasticsearch index.
 *
 * <p>This class never assigns {@link #client} or {@link #ae}; concrete subclasses are
 * expected to initialise both (for example in a {@code @BeforeClass} method) before any
 * test runs. {@link #jCas} is re-fetched from {@link JCasSingleton} before every test.
 */
public abstract class ElasticsearchTestBase {
    private static final String EXTERNAL_ID = "externalId";
    private static final String VALUE = "value";
    private static final String TYPE = "type";
    private static final String CONFIDENCE = "confidence";
    private static final String END = "end";
    private static final String BEGIN = "begin";
    private static final String DOC_TYPE = "docType";

    /** Name of the index the tests read from and clean up. */
    private static final String BALEEN_INDEX = "baleen_index";

    /** Document type used when deleting documents from {@link #BALEEN_INDEX}. */
    private static final String BALEEN_TYPE = "baleen_output";

    /**
     * Maximum number of documents fetched (and therefore deleted) by the pre-test clean-up.
     * The tests in this class index at most two documents, so this is a generous bound.
     */
    private static final int CLEANUP_BATCH_SIZE = 1000;

    protected static JCas jCas;
    protected static Client client;
    protected static AnalysisEngine ae;

    /** Releases the Elasticsearch client and the analysis engine, if they were created. */
    @AfterClass
    public static void destroyClass() {
        if (client != null) {
            client.close();
        }

        if (ae != null) {
            ae.destroy();
        }
    }

    /**
     * Obtains the shared JCas and removes any documents left in the index by a previous test.
     *
     * @throws Exception if the JCas cannot be obtained or the clean-up fails for any reason
     *         other than the index not existing yet
     */
    @Before
    public void beforeTest() throws Exception {
        jCas = JCasSingleton.getJCasInstance();

        try {
            // Restrict the search to our own index and ask for an explicit page size; an
            // unqualified search only returns the default first page of hits from all indices.
            SearchHits results = client.prepareSearch(BALEEN_INDEX)
                    .setSize(CLEANUP_BATCH_SIZE)
                    .execute()
                    .actionGet()
                    .getHits();

            BulkRequestBuilder brb = client.prepareBulk();
            for (SearchHit sh : results) {
                brb.add(client.prepareDelete(BALEEN_INDEX, BALEEN_TYPE, sh.getId()).request());
            }

            // Only send the bulk request when there is something to delete
            if (brb.numberOfActions() > 0) {
                brb.get();
            }
        } catch (IndexNotFoundException infe) {
            // Index doesn't exist yet, so there is nothing to clean up - ignore
        }
    }

    /** Checks the document-level fields written for a document with no entities. */
    @SuppressWarnings("unchecked")
    @Test
    public void testNoEntities() throws Exception {
        long timestamp = createNoEntitiesDocument();
        ae.process(jCas);
        refreshIndex();

        assertEquals(Long.valueOf(1L), getCount());

        Map<String, Object> source = firstHit().getSource();
        assertEquals("Hello World", source.get("content"));
        assertEquals("en", source.get("language"));
        assertEquals(timestamp, source.get("dateAccessed"));
        assertEquals("test/no_entities", source.get("sourceUri"));
        assertEquals("test", source.get(DOC_TYPE));
        assertEquals("OFFICIAL", source.get("classification"));

        List<String> rels = (List<String>) source.get("releasability");
        assertEquals(3, rels.size());
        assertTrue(rels.contains("ENG"));

        List<String> cavs = (List<String>) source.get("caveats");
        assertEquals(2, cavs.size());
        assertTrue(cavs.contains("TEST_A"));
    }

    /** Checks that published IDs and metadata key/value pairs are written to the document. */
    @SuppressWarnings("unchecked")
    @Test
    public void testMetadata() throws Exception {
        createMetadataDocument();
        ae.process(jCas);
        refreshIndex();

        assertEquals(Long.valueOf(1L), getCount());

        Map<String, Object> source = firstHit().getSource();

        List<String> pids = (List<String>) source.get("publishedId");
        assertEquals("id_1", pids.get(0));
        assertEquals("id_2", pids.get(1));

        Map<String, Object> metadataMap = (Map<String, Object>) source.get("metadata");
        assertEquals("D3", metadataMap.get("sourceAndInformationGrading"));
        assertEquals("test_value", metadataMap.get("test_key"));
        assertEquals("Test Title", metadataMap.get("documentTitle"));
        assertEquals("ENG|WAL|SCO", metadataMap.get("countryInfo"));
    }

    /** Checks the fields written for each of the four entities of the first entities document. */
    @SuppressWarnings("unchecked")
    @Test
    public void testEntities() throws Exception {
        createEntitiesDocument();
        ae.process(jCas);
        refreshIndex();

        assertEquals(Long.valueOf(1L), getCount());

        List<Map<String, Object>> entities =
                (List<Map<String, Object>>) firstHit().getSource().get("entities");
        assertEquals(4, entities.size());

        Map<String, Object> person = entities.get(0);
        // The REST API only adds non-null fields, the Transport API adds null fields too
        assertTrue(person.size() >= 6);
        assertEquals(0, person.get(BEGIN));
        assertEquals(5, person.get(END));
        assertEquals(0.0, person.get(CONFIDENCE));
        assertEquals("Person", person.get(TYPE));
        assertEquals("James", person.get(VALUE));
        assertNotNull(person.get(EXTERNAL_ID));

        Map<String, Object> location = entities.get(1);
        // The REST API only adds non-null fields, the Transport API adds null fields too
        assertTrue(location.size() >= 7);
        assertEquals(14, location.get(BEGIN));
        assertEquals(20, location.get(END));
        assertEquals(0.0, location.get(CONFIDENCE));
        assertEquals("Location", location.get(TYPE));
        assertEquals("London", location.get(VALUE));
        assertNotNull(location.get(EXTERNAL_ID));

        // The GeoJSON string set on the Location is expected back as a parsed map
        Map<String, Object> geometryMap = new HashMap<>();
        geometryMap.put(TYPE, "Point");
        geometryMap.put("coordinates", new ArrayList<Double>(Arrays.asList(-0.1, 51.5)));
        assertEquals(geometryMap, location.get("geoJson"));

        Map<String, Object> date = entities.get(2);
        // The REST API only adds non-null fields, the Transport API adds null fields too
        assertTrue(date.size() >= 6);
        assertEquals(24, date.get(BEGIN));
        assertEquals(42, date.get(END));
        assertEquals(1.0, date.get(CONFIDENCE));
        assertEquals("Temporal", date.get(TYPE));
        // No value is set explicitly on the Temporal; this matches the text it spans
        assertEquals("19th February 2015", date.get(VALUE));
        assertNotNull(date.get(EXTERNAL_ID));

        Map<String, Object> email = entities.get(3);
        assertEquals(8, email.size());
        assertEquals(66, email.get(BEGIN));
        assertEquals(83, email.get(END));
        assertEquals(0.0, email.get(CONFIDENCE));
        assertEquals("CommsIdentifier", email.get(TYPE));
        assertEquals("email", email.get("subType"));
        // No value is set explicitly on the CommsIdentifier; this matches the text it spans
        assertEquals("james@example.com", email.get(VALUE));
        assertNotNull(email.get(EXTERNAL_ID));
    }

    /**
     * Checks that processing the same document repeatedly leaves a single document in the
     * index, and that it reflects the most recently processed state.
     */
    @Test
    public void testReindexEntities() throws Exception {
        createEntitiesDocument();
        ae.process(jCas);
        ae.process(jCas);

        // Change the last document so we can check it's been updated
        getDocumentAnnotation(jCas).setDocumentClassification("TEST");
        ae.process(jCas);
        refreshIndex();

        assertEquals(Long.valueOf(1L), getCount());

        // This checks the last document is the one we are getting
        assertEquals("TEST", firstHit().getSource().get("classification"));
    }

    /**
     * Checks that a nested query on entity type and value only matches a document in which
     * a single entity satisfies both conditions.
     */
    @Test
    public void testNestedEntities() throws Exception {
        createEntitiesDocument();
        ae.process(jCas);

        createEntitiesDocument2();
        ae.process(jCas);
        refreshIndex();

        assertEquals(Long.valueOf(2L), getCount());

        // The second document has "London" as a plain Entity and a separate Location
        // ("Paris"), so only the first document should match both clauses on one entity.
        SearchRequestBuilder srb = client.prepareSearch(BALEEN_INDEX).setQuery(
                QueryBuilders.nestedQuery("entities", QueryBuilders.boolQuery()
                        .must(QueryBuilders.matchQuery("entities.type", "Location"))
                        .must(QueryBuilders.matchQuery("entities.value", "London"))));

        SearchHits results = client.search(srb.request()).actionGet().getHits();
        assertEquals(1, results.getTotalHits());
    }

    /**
     * Returns the document annotation of the given JCas.
     *
     * @param jCas the JCas to read the document annotation from
     * @return the document annotation, cast to the Baleen {@link DocumentAnnotation} type
     */
    protected DocumentAnnotation getDocumentAnnotation(JCas jCas) {
        return (DocumentAnnotation) jCas.getDocumentAnnotationFs();
    }

    /**
     * Resets the shared JCas and populates it with a short document that has document-level
     * properties (language, timestamp, source, type, classification, caveats and
     * releasability) but no entities.
     *
     * @return the timestamp that was set on the document annotation
     */
    protected long createNoEntitiesDocument() {
        jCas.reset();
        jCas.setDocumentText("Hello World");
        jCas.setDocumentLanguage("en");

        long timestamp = System.currentTimeMillis();

        DocumentAnnotation da = getDocumentAnnotation(jCas);
        da.setTimestamp(timestamp);
        da.setSourceUri("test/no_entities");
        da.setDocType("test");
        da.setDocumentClassification("OFFICIAL");
        da.setDocumentCaveats(UimaTypesUtils.toArray(jCas, Arrays.asList("TEST_A", "TEST_B")));
        da.setDocumentReleasability(UimaTypesUtils.toArray(jCas, Arrays.asList("ENG", "SCO", "WAL")));

        return timestamp;
    }

    /**
     * Resets the shared JCas and populates it with a short document carrying two published
     * IDs and four metadata key/value pairs.
     */
    protected void createMetadataDocument() {
        jCas.reset();
        jCas.setDocumentText("Hello World");

        PublishedId pid1 = new PublishedId(jCas);
        pid1.setValue("id_1");
        pid1.addToIndexes();

        PublishedId pid2 = new PublishedId(jCas);
        pid2.setValue("id_2");
        pid2.addToIndexes();

        Metadata mdSourceAndInformation = new Metadata(jCas);
        mdSourceAndInformation.setKey("sourceAndInformationGrading");
        mdSourceAndInformation.setValue("D3");
        mdSourceAndInformation.addToIndexes();

        Metadata mdCountries = new Metadata(jCas);
        mdCountries.setKey("countryInfo");
        mdCountries.setValue("ENG|WAL|SCO");
        mdCountries.addToIndexes();

        Metadata mdTitle = new Metadata(jCas);
        mdTitle.setKey("documentTitle");
        mdTitle.setValue("Test Title");
        mdTitle.addToIndexes();

        Metadata mdMisc = new Metadata(jCas);
        mdMisc.setKey("test_key");
        mdMisc.setValue("test_value");
        mdMisc.addToIndexes();
    }

    /**
     * Resets the shared JCas and populates it with a document containing a Person, a
     * Location (with GeoJSON), a Temporal and a CommsIdentifier annotation.
     */
    protected void createEntitiesDocument() {
        jCas.reset();
        jCas.setDocumentText("James went to London on 19th February 2015. His e-mail address is james@example.com");

        Person p = new Person(jCas);
        p.setBegin(0);
        p.setEnd(5);
        p.setValue("James");
        p.addToIndexes();

        Location l = new Location(jCas);
        l.setBegin(14);
        l.setEnd(20);
        l.setValue("London");
        l.setGeoJson("{\"type\": \"Point\", \"coordinates\": [-0.1, 51.5]}");
        l.addToIndexes();

        // Value deliberately left unset on the Temporal and the CommsIdentifier
        Temporal d = new Temporal(jCas);
        d.setBegin(24);
        d.setEnd(42);
        d.setConfidence(1.0);
        d.addToIndexes();

        CommsIdentifier ci = new CommsIdentifier(jCas);
        ci.setBegin(66);
        ci.setEnd(83);
        ci.setSubType("email");
        ci.addToIndexes();
    }

    /**
     * Resets the shared JCas and populates it with a second document in which "London" is
     * annotated as a generic Entity and "Paris" as a Location.
     */
    protected void createEntitiesDocument2() {
        jCas.reset();
        jCas.setDocumentText("Paula went to London on 12th February 2017. In Paris, she met a UID male.");

        Person p = new Person(jCas);
        p.setBegin(0);
        p.setEnd(5);
        p.setValue("Paula");
        p.addToIndexes();

        Entity e = new Entity(jCas);
        e.setBegin(14);
        e.setEnd(20);
        e.setValue("London");
        e.addToIndexes();

        Temporal d = new Temporal(jCas);
        d.setBegin(24);
        d.setEnd(42);
        d.setConfidence(1.0);
        d.addToIndexes();

        Location l = new Location(jCas);
        l.setBegin(47);
        l.setEnd(53);
        l.setValue("Paris");
        l.addToIndexes();
    }

    /**
     * Counts the documents currently searchable in the index.
     *
     * @return the total number of hits for an unfiltered search of {@code baleen_index}
     */
    protected Long getCount() {
        SearchResponse sr = client.prepareSearch(BALEEN_INDEX).setSize(0).execute().actionGet();
        return sr.getHits().getTotalHits();
    }

    /** Refreshes the index so that documents written by the consumer become searchable. */
    private void refreshIndex() {
        client.admin().indices().refresh(new RefreshRequest(BALEEN_INDEX)).actionGet();
    }

    /**
     * Returns the first hit of an unfiltered search of the index. Callers assert the
     * document count beforehand, so at least one hit is expected.
     */
    private SearchHit firstHit() {
        return client.search(new SearchRequest(BALEEN_INDEX)).actionGet().getHits().hits()[0];
    }
}