//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.consumers; import java.sql.Array; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.sql.Timestamp; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.StringArray; import org.apache.uima.jcas.tcas.DocumentAnnotation; import org.apache.uima.resource.ResourceInitializationException; import uk.gov.dstl.baleen.consumers.utils.ConsumerUtils; import uk.gov.dstl.baleen.exceptions.BaleenException; import uk.gov.dstl.baleen.exceptions.InvalidParameterException; import uk.gov.dstl.baleen.resources.SharedPostgresResource; import uk.gov.dstl.baleen.types.metadata.Metadata; import uk.gov.dstl.baleen.types.semantic.Entity; import uk.gov.dstl.baleen.types.semantic.Location; import uk.gov.dstl.baleen.types.semantic.ReferenceTarget; import uk.gov.dstl.baleen.uima.BaleenConsumer; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.type.MapLikeType; import com.fasterxml.jackson.databind.type.TypeFactory; import com.google.common.base.Strings; /** * Consumer to persist Baleen outputs into Postgres. * * Assumes that the provided schema has the following tables. * If these do not exist, then they are created when the consumer is first run. * If they exist, but have the wrong definition, then the consumer will be unable to insert into them. * A prefix, defaulting to 'baleen_', can be specified to avoid name conflicts. * * <ul> * <li>docs</li> * <li>doc_metadata</li> * <li>entities</li> * <li>entity_geos</li> * </ul> * * Requires PostGIS 2 or later. * * * @baleen.javadoc */ public class Postgres extends BaleenConsumer { /** * Connection to PostgreSQL * * @baleen.resource uk.gov.dstl.baleen.resources.SharedPostgresResource */ public static final String KEY_POSTGRES = "postgres"; @ExternalResource(key = KEY_POSTGRES) private SharedPostgresResource postgresResource; /** * The Postgres schema containing the tables * * @baleen.config */ public static final String PARAM_SCHEMA = "schema"; @ConfigurationParameter(name = PARAM_SCHEMA, defaultValue = "") private String schema; /** * The prefix to add to table names * * @baleen.config baleen_ */ public static final String PARAM_PREFIX = "prefix"; @ConfigurationParameter(name = PARAM_PREFIX, defaultValue = "baleen_") private String prefix; /** * Should a hash of the content be used to generate the ID? * If false, then a hash of the Source URI is used instead. * * @baleen.config true */ public static final String PARAM_CONTENT_HASH_AS_ID = "contentHashAsId"; @ConfigurationParameter(name = PARAM_CONTENT_HASH_AS_ID, defaultValue = "true") private boolean contentHashAsId = true; private PreparedStatement insertDocStatement; private PreparedStatement insertDocMetadataStatement; private PreparedStatement insertEntityStatement; private PreparedStatement insertEntityGeoStatement; private static final ObjectMapper MAPPER = new ObjectMapper(); private static final MapLikeType MAP_LIKE_TYPE = TypeFactory .defaultInstance() .constructMapLikeType(Map.class, String.class, Object.class); private static final String DOC_ROOT = "docs"; private static final String DOC_METADATA_ROOT = "doc_metadata"; private static final String ENTITY_ROOT = "entities"; private static final String ENTITY_GEO_ROOT = "entity_geos"; private static final String VARCHAR = "varchar"; private static final String CREATE_TABLE_PREFIX = "CREATE TABLE IF NOT EXISTS "; private static final String INSERT_INTO_PREFIX = "INSERT INTO "; @Override public void doInitialize(UimaContext aContext) throws ResourceInitializationException { checkVersion(); createTables(); try{ insertDocStatement = postgresResource.getConnection().prepareStatement(INSERT_INTO_PREFIX + getTableName(DOC_ROOT) + " (externalId, type, source, content, language, processed, classification, caveats, releasability) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", Statement.RETURN_GENERATED_KEYS); insertDocMetadataStatement = postgresResource.getConnection().prepareStatement(INSERT_INTO_PREFIX + getTableName(DOC_METADATA_ROOT) + " (doc_key, name, value) VALUES (?, ?, ?)"); insertEntityStatement = postgresResource.getConnection().prepareStatement(INSERT_INTO_PREFIX + getTableName(ENTITY_ROOT) + " (doc_key, externalId, type, value) VALUES (?, ?, ?, ?)", Statement.RETURN_GENERATED_KEYS); insertEntityGeoStatement = postgresResource.getConnection().prepareStatement(INSERT_INTO_PREFIX + getTableName(ENTITY_GEO_ROOT) + " (entity_key, geo) VALUES (?, ST_GeomFromGeoJSON(?))"); }catch(SQLException e){ throw new ResourceInitializationException(e); } } /** * Check that Postgres has at least version 2 of PostGIS installed */ private void checkVersion() throws ResourceInitializationException{ try( Statement s = postgresResource.getConnection().createStatement(); ){ ResultSet rs = s.executeQuery("SELECT PostGIS_Lib_Version() AS version"); rs.next(); String version = rs.getString("version"); String[] versionParts = version.split("\\."); Integer majorVersion = Integer.parseInt(versionParts[0]); if(majorVersion < 2){ throw new BaleenException("Unsupported PostGIS Version"); } }catch(SQLException | NumberFormatException | NullPointerException e){ getMonitor().error("Unable to retrieve PostGIS version"); throw new ResourceInitializationException(e); }catch(BaleenException e){ throw new ResourceInitializationException(e); } } /** * If the tables don't already exist, then create them. */ private void createTables() throws ResourceInitializationException{ try( Statement s = postgresResource.getConnection().createStatement(); ){ s.execute(CREATE_TABLE_PREFIX + getTableName(DOC_ROOT) + " (key serial primary key, externalId character varying, type character varying, source character varying, content character varying, language character varying, processed timestamp, classification character varying, caveats character varying[], releasability character varying[])"); s.execute(CREATE_TABLE_PREFIX + getTableName(DOC_METADATA_ROOT) + " (key serial primary key, doc_key integer references " + getTableName(DOC_ROOT) + "(key), name character varying, value character varying)"); s.execute(CREATE_TABLE_PREFIX + getTableName(ENTITY_ROOT) + " (key serial primary key, doc_key integer references " + getTableName(DOC_ROOT) + "(key), externalId character varying[], type character varying, value character varying[])"); s.execute(CREATE_TABLE_PREFIX + getTableName(ENTITY_GEO_ROOT) + " (key serial primary key, entity_key integer references " + getTableName(ENTITY_ROOT) + "(key), geo geometry(Geometry, 4326))"); postgresResource.getConnection().setAutoCommit(false); }catch(SQLException e){ throw new ResourceInitializationException(e); } } @Override protected void doProcess(JCas jCas) throws AnalysisEngineProcessException { Connection conn = postgresResource.getConnection(); try{ //Insert document and metadata into database Integer docKey = executeDocInsert(jCas); for(Metadata md : JCasUtil.select(jCas, Metadata.class)){ executeDocMetadataInsert(docKey, md); } processEntities(jCas, docKey); conn.commit(); }catch(SQLException | BaleenException e){ getMonitor().error("Unable to insert document into Postgres database", e); if(conn != null){ try{ conn.rollback(); }catch(SQLException e2){ getMonitor().error("Unable to rollback insertion - state of the database may have been left inconsistent", e2); } } } } private Integer executeDocInsert(JCas jCas) throws SQLException, BaleenException{ DocumentAnnotation da = getDocumentAnnotation(jCas); String documentId = ConsumerUtils.getExternalId(da, contentHashAsId); insertDocStatement.clearParameters(); insertDocStatement.setString(1, documentId); insertDocStatement.setString(2, da.getDocType()); insertDocStatement.setString(3, da.getSourceUri()); insertDocStatement.setString(4, jCas.getDocumentText()); insertDocStatement.setString(5, jCas.getDocumentLanguage()); insertDocStatement.setTimestamp(6, new Timestamp(da.getTimestamp())); insertDocStatement.setString(7, da.getDocumentClassification()); insertDocStatement.setArray(8, createVarcharArray(postgresResource.getConnection(), da.getDocumentCaveats())); insertDocStatement.setArray(9, createVarcharArray(postgresResource.getConnection(), da.getDocumentReleasability())); insertDocStatement.executeUpdate(); Integer docKey = getKey(insertDocStatement); if(docKey == null){ throw new BaleenException("No document key returned"); } return docKey; } private void executeDocMetadataInsert(Integer docKey, Metadata md) throws SQLException{ insertDocMetadataStatement.clearParameters(); insertDocMetadataStatement.setInt(1, docKey); insertDocMetadataStatement.setString(2, md.getKey()); insertDocMetadataStatement.setString(3, md.getValue()); insertDocMetadataStatement.executeUpdate(); } private void processEntities(JCas jCas, Integer docKey) throws SQLException{ //Insert entities Map<ReferenceTarget, List<Entity>> coreferenceEntities = new HashMap<>(); for(Entity ent : JCasUtil.select(jCas, Entity.class)){ ReferenceTarget rt = ent.getReferent(); if(rt == null){ rt = new ReferenceTarget(jCas); } List<Entity> entities = coreferenceEntities.getOrDefault(rt, new ArrayList<>()); entities.add(ent); coreferenceEntities.put(rt, entities); } for(List<Entity> entities : coreferenceEntities.values()){ processCoreferencedEntities(docKey, entities); } } private void processCoreferencedEntities(Integer docKey, List<Entity> entities) throws SQLException{ Set<String> values = new HashSet<>(); Set<String> externalIds = new HashSet<>(); Set<String> geoJsons = new HashSet<>(); Class<? extends Entity> type = null; for(Entity e : entities){ values.add(e.getValue()); externalIds.add(e.getExternalId()); if(e instanceof Location){ Location l = (Location) e; try{ geoJsons.add(addCrsToGeoJSON(l.getGeoJson())); }catch(BaleenException ex){ getMonitor().warn("Unable to add CRS to GeoJSON", ex); } } type = getSuperclass(type, e.getClass()); } if(type == null){ //No entities processed return; } Integer entityKey = executeEntityInsert(docKey, values, externalIds, type.getName()); if(entityKey != null){ for(String geoJson : geoJsons){ executeEntityGeoInsert(entityKey, geoJson); } } } private Integer executeEntityInsert(Integer docKey, Collection<String> values, Collection<String> externalIds, String type) throws SQLException{ insertEntityStatement.clearParameters(); insertEntityStatement.setInt(1, docKey); insertEntityStatement.setArray(2, postgresResource.getConnection().createArrayOf(VARCHAR, externalIds.toArray(new String[0]))); insertEntityStatement.setString(3, type); insertEntityStatement.setArray(4, postgresResource.getConnection().createArrayOf(VARCHAR, values.toArray(new String[0]))); insertEntityStatement.executeUpdate(); Integer entityKey = getKey(insertEntityStatement); if(entityKey == null){ getMonitor().error("No entity key returned - Geo insertion, if applicable, will be skipped"); } return entityKey; } private void executeEntityGeoInsert(Integer entityKey, String geoJson) throws SQLException{ insertEntityGeoStatement.clearParameters(); insertEntityGeoStatement.setInt(1, entityKey); insertEntityGeoStatement.setString(2, geoJson); insertEntityGeoStatement.executeUpdate(); } /** * Add CRS (assumed to be EPSG:4326) to a GeoJSON string if it doesn't already exist */ public static String addCrsToGeoJSON(String geoJson) throws BaleenException{ if(Strings.isNullOrEmpty(geoJson)){ return geoJson; } try{ Map<String, Object> geoJsonObj = MAPPER.readValue(geoJson, MAP_LIKE_TYPE); if(geoJsonObj == null){ throw new InvalidParameterException("Mapper returned null"); } if(geoJsonObj.get("crs") == null){ Map<String, Object> crs = new HashMap<>(); crs.put("type", "name"); Map<String, Object> srid = new HashMap<>(); srid.put("name", "EPSG:4326"); crs.put("properties", srid); geoJsonObj.put("crs", crs); return MAPPER.writeValueAsString(geoJsonObj); }else{ return geoJson; } }catch(Exception e){ throw new BaleenException("Unable to parse GeoJSON", e); } } /** * From the two classes, return the one that is the superclass. * If the two classes aren't in the same hierarchy, then c1 will be returned. */ public static Class<? extends Entity> getSuperclass(Class<? extends Entity> c1, Class<? extends Entity> c2){ if(c1 == null) return c2; if(c2 == null) return c1; if(c2.isAssignableFrom(c1)){ return c2; } return c1; } /** * Get the table name with the schema and prefix if one is set */ public String getTableName(String table){ String ret = table; if(!Strings.isNullOrEmpty(prefix)){ ret = prefix + ret; } if(!Strings.isNullOrEmpty(schema)){ ret = schema + "." + ret; } return ret; } private Array createVarcharArray(Connection conn, StringArray s) throws SQLException{ if(s == null){ return conn.createArrayOf(VARCHAR, new String[]{}); }else{ return conn.createArrayOf(VARCHAR, s.toArray()); } } private Integer getKey(Statement s) throws SQLException{ ResultSet generatedKeys = s.getGeneratedKeys(); if(generatedKeys.next()){ return generatedKeys.getInt(1); }else{ return null; } } }