//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.consumers.utils;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.DocumentAnnotation;

import com.google.common.base.Strings;

import uk.gov.dstl.baleen.types.metadata.Metadata;
import uk.gov.dstl.baleen.types.metadata.PublishedId;
import uk.gov.dstl.baleen.types.semantic.Entity;
import uk.gov.dstl.baleen.types.semantic.Relation;
import uk.gov.dstl.baleen.uima.UimaMonitor;
import uk.gov.dstl.baleen.uima.UimaSupport;

/**
 * Helper class for converting a CAS object into a single document
 * (i.e. with entities and relations embedded) for persistence using
 * the following schema:
 * <pre>
 * {
 *   content,
 *   language,
 *   externalId,
 *   dateAccessed,
 *   sourceUri,
 *   docType,
 *   classification,
 *   caveats: [],
 *   releasability: [],
 *   publishedId: [],
 *   metadata: {
 *     key1: value1,
 *     key2: value2,
 *     ...
 *   },
 *   entities: [
 *     {
 *       externalId,
 *       value,
 *       confidence,
 *       type,
 *       begin,
 *       end,
 *       ...
 *     }
 *   ],
 *   relations: [
 *     {
 *       ...
 *     }
 *   ]
 * }</pre>
 * The protective marking set on the DocumentAnnotation is used as the classification of the
 * document, and ProtectiveMarking annotations are ignored.
 * Events are not currently supported.
 *
 * Use of this class ensures consistency of formats across databases, e.g. Elasticsearch and ActiveMQ
 */
public class SingleDocumentConsumerFormat {

	/**
	 * Private constructor for utility class
	 */
	private SingleDocumentConsumerFormat() {
		//Do nothing
	}

	/**
	 * Convert the provided jCas object into a standardised representation.
	 *
	 * The returned map always contains "content", "dateAccessed", "externalId", "entities" and
	 * "relations"; the remaining keys of the schema are only present when the corresponding
	 * value is non-empty.
	 *
	 * @param jCas
	 * 		The CAS to convert
	 * @param fields
	 * 		An instance of IEntityConverterFields to be used
	 * @param contentHashAsId
	 * 		Should a hash of the content be used to generate the ID? If false, then a hash of the Source URI is used instead.
	 * @param monitor
	 * 		The monitor handed to the EntityRelationConverter
	 * @param support
	 * 		Used to obtain the document history and the DocumentAnnotation for jCas
	 *
	 * @return
	 * 		Standardised representation of jCas
	 */
	public static Map<String, Object> formatCas(JCas jCas, IEntityConverterFields fields, boolean contentHashAsId, UimaMonitor monitor, UimaSupport support) {
		// Features that are excluded from the converted entities and relations
		Set<String> stopFeatures = new HashSet<>();
		stopFeatures.add("uima.cas.AnnotationBase:sofa");
		stopFeatures.add("uk.gov.dstl.baleen.types.BaleenAnnotation:internalId");

		Map<String, Object> output = new HashMap<>();

		EntityRelationConverter entityRelationConverter = new EntityRelationConverter(monitor, false, support.getDocumentHistory(jCas), stopFeatures, fields);

		//Content and language
		output.put("content", jCas.getDocumentText());
		String language = jCas.getDocumentLanguage();
		if (!Strings.isNullOrEmpty(language)) {
			output.put("language", language);
		}

		//Document Annotations
		DocumentAnnotation da = support.getDocumentAnnotation(jCas);
		output.putAll(createDocumentAnnotationMap(da));
		output.put("externalId", ConsumerUtils.getExternalId(da, contentHashAsId));

		//Metadata Annotations
		Collection<PublishedId> publishedIds = JCasUtil.select(jCas, PublishedId.class);
		if (!publishedIds.isEmpty()) {
			output.put("publishedId", createPublishedIdList(publishedIds));
		}

		Collection<Metadata> metadata = JCasUtil.select(jCas, Metadata.class);
		if (!metadata.isEmpty()) {
			output.put("metadata", createMetadataMap(metadata));
		}

		//Entities
		List<Map<String, Object>> entitiesList = new ArrayList<>();
		for (Entity ent : JCasUtil.select(jCas, Entity.class)) {
			entitiesList.add(entityRelationConverter.convertEntity(ent));
		}
		output.put("entities", entitiesList);

		//Relations
		List<Map<String, Object>> relationsList = new ArrayList<>();
		for (Relation rel : JCasUtil.select(jCas, Relation.class)) {
			relationsList.add(entityRelationConverter.convertRelation(rel));
		}
		output.put("relations", relationsList);

		return output;
	}

	/**
	 * Create a map containing information from the DocumentAnnotation object.
	 *
	 * "dateAccessed" is always set; "sourceUri", "docType", "classification", "caveats" and
	 * "releasability" are only added when the corresponding value is non-null and non-empty.
	 * The classification is upper-cased.
	 *
	 * @param da
	 * 		The DocumentAnnotation to read from
	 * @return
	 * 		A new map holding the document level properties
	 */
	public static Map<String, Object> createDocumentAnnotationMap(DocumentAnnotation da) {
		Map<String, Object> map = new HashMap<>();

		if (!Strings.isNullOrEmpty(da.getSourceUri())) {
			map.put("sourceUri", da.getSourceUri());
		}
		map.put("dateAccessed", da.getTimestamp());
		if (!Strings.isNullOrEmpty(da.getDocType())) {
			map.put("docType", da.getDocType());
		}
		if (!Strings.isNullOrEmpty(da.getDocumentClassification())) {
			// Locale.ROOT keeps the upper-casing independent of the JVM default locale
			map.put("classification", da.getDocumentClassification().toUpperCase(Locale.ROOT));
		}

		if (da.getDocumentCaveats() != null) {
			String[] caveats = da.getDocumentCaveats().toArray();
			if (caveats.length > 0) {
				map.put("caveats", caveats);
			}
		}

		if (da.getDocumentReleasability() != null) {
			String[] rels = da.getDocumentReleasability().toArray();
			if (rels.length > 0) {
				map.put("releasability", rels);
			}
		}

		return map;
	}

	/**
	 * Create a map of all metadata objects in a collection.
	 * Duplicate key values will be converted into a list of objects, holding the values
	 * in the order they were encountered.
	 *
	 * Any '.' in a metadata key is replaced with '_'
	 * (NOTE(review): presumably because some target stores reject dots in field names - confirm).
	 *
	 * @param md
	 * 		The metadata annotations to convert
	 * @return
	 * 		A new map from (sanitised) key to either a single value or a list of values
	 */
	public static Map<String, Object> createMetadataMap(Collection<Metadata> md) {
		Map<String, Object> metadata = new HashMap<>();

		for (Metadata m : md) {
			String key = m.getKey().replace('.', '_');
			Object value = m.getValue();

			if (!metadata.containsKey(key)) {
				metadata.put(key, value);
				continue;
			}

			// Key seen before: collect the earlier value(s) and the new one into a list
			Object existing = metadata.get(key);
			List<Object> values = new ArrayList<>();
			if (existing instanceof List) {
				values.addAll((List<?>) existing);
			} else {
				values.add(existing);
			}
			values.add(value);
			metadata.put(key, values);
		}

		return metadata;
	}

	/**
	 * Create a list of PublishedId values from a collection of PublishedIds
	 *
	 * @param publishedIds
	 * 		The PublishedId annotations to read the values from
	 * @return
	 * 		A new list with one value per PublishedId, in iteration order
	 */
	public static List<String> createPublishedIdList(Collection<PublishedId> publishedIds) {
		List<String> pids = new ArrayList<>(publishedIds.size());
		for (PublishedId publishedId : publishedIds) {
			pids.add(publishedId.getValue());
		}
		return pids;
	}
}