/* * DBSemanticAnnotationHelper.java * * Copyright (c) 2007-2011, The University of Sheffield. * * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), * and is free software, licenced under the GNU Lesser General Public License, * Version 3, June 2007 (also included with this distribution as file * LICENCE-LGPL3.html). * * Valentin Tablan, 08 Feb 2011 * * $Id$ */ package gate.mimir.db; import gate.Annotation; import gate.Document; import gate.FeatureMap; import gate.mimir.AbstractSemanticAnnotationHelper; import gate.mimir.Constraint; import gate.mimir.ConstraintType; import gate.mimir.IndexConfig; import gate.mimir.SemanticAnnotationHelper; import gate.mimir.index.AtomicAnnotationIndex; import gate.mimir.index.Mention; import gate.mimir.search.QueryEngine; import java.io.File; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.sql.Types; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import java.util.concurrent.Callable; import org.apache.log4j.Logger; /** * A Semantic annotation helper that uses an embedded RDBMS for storing * annotation data. */ public class DBSemanticAnnotationHelper extends AbstractSemanticAnnotationHelper{ /** * A callable that generates Level 1 IDs given a set of features. */ protected class Level1IdGenerator implements Callable<Long> { public Level1IdGenerator(FeatureMap features) { this.features = features; } protected FeatureMap features; /** * Retrieves the level1 ID for the given set of features. If no ID can be * found (i.e. this combination of features has not been seen before), it * inserts a new row in the level 1 table, and returns the ID for it. 
* @see java.util.concurrent.Callable#call() */ @Override public Long call() throws Exception { setStatementParameters(level1SelectStmt, features); ResultSet res = level1SelectStmt.executeQuery(); if(!res.next()) { // no results found -> insert the new row setStatementParameters(level1InsertStmt, features); if(level1InsertStmt.executeUpdate() != 1) { // the update failed throw new RuntimeException("Error while inserting into database. Annotation was lost!"); } res = level1InsertStmt.getGeneratedKeys(); if(!res.next()) throw new RuntimeException( "Could not insert new Level 1 row for features: " + features); } // we have found the level 1 ID Long level1id = res.getLong(1); // sanity check if(res.next()) throw new RuntimeException( "Multiple Unique IDs foud in Level 1 table for features: " + features.toString()); return level1id; } } /** * A callable that generates Level 2 IDs given a set of features. */ protected class Level2IdGenerator implements Callable<Long> { private Long level1Id; public Level2IdGenerator(Long level1Id, FeatureMap features) { this.level1Id = level1Id; this.features = features; } protected FeatureMap features; /** * Retrieves the level2 ID for the given set of features. If no ID can be * found (i.e. this combination of features has not been seen before), it * inserts a new row in the level 2 table, and returns the ID for it. 
* @see java.util.concurrent.Callable#call() */ @Override public Long call() throws Exception { level2SelectStmt.setLong(1, level1Id); setStatementParameters(level2SelectStmt, features); ResultSet res = level2SelectStmt.executeQuery(); if(!res.next()) { // no results -> insert new row level2InsertStmt.setLong(1, level1Id); setStatementParameters(level2InsertStmt, features); if(level2InsertStmt.executeUpdate() != 1) { // the update failed throw new RuntimeException( "Could not insert new Level 2 row for Level 1 ID: \"" + level1Id + "\" and features: " + features); } res = level2InsertStmt.getGeneratedKeys(); if(!res.next()) throw new RuntimeException( "Could not insert new Level 2 row for Level 1 ID: \"" + level1Id + "\" and features: " + features); } // we have found the level 2 ID Long level2Id = res.getLong(1); // sanity check if(res.next()) { throw new RuntimeException( "Multiple Unique IDs found in Level 2 table for Level 1 ID: \"" + level1Id + "\" and features: " + features); } return level2Id; } } /** * A callable that generates Level 3 IDs (i.e. mention IDs) given a Level 1 * ID and/or a Level 2 ID, and a mention length. 
*/ protected class Level3IdGenerator implements Callable<Long> { private Long level1Id; private Long level2Id; private int mentionLength; public Level3IdGenerator(Long level1Id, Long level2Id, int mentionLength) { super(); this.level1Id = level1Id; this.level2Id = level2Id; this.mentionLength = mentionLength; } /* (non-Javadoc) * @see java.util.concurrent.Callable#call() */ @Override public Long call() throws Exception { mentionsSelectStmt.setLong(1, level1Id); if(level2Used) { if(level2Id != null) { mentionsSelectStmt.setLong(2, level2Id); } else { mentionsSelectStmt.setNull(2, Types.BIGINT); } mentionsSelectStmt.setInt(3, mentionLength); } else { mentionsSelectStmt.setInt(2, mentionLength); } ResultSet res = mentionsSelectStmt.executeQuery(); if(!res.next()) { // no results -> insert new row mentionsInsertStmt.setLong(1, level1Id); if(level2Used) { if(level2Id != null) { mentionsInsertStmt.setLong(2, level2Id); } else { mentionsInsertStmt.setNull(2, Types.BIGINT); } mentionsInsertStmt.setInt(3, mentionLength); } else { mentionsInsertStmt.setInt(2, mentionLength); } if(mentionsInsertStmt.executeUpdate() != 1) { // the update failed throw new RuntimeException( "Could not insert new mention ID for Level 1 ID: " + level1Id + ", Level 2 ID: " + level2Id + ", and mention length: " + mentionLength); } res = mentionsInsertStmt.getGeneratedKeys(); if(!res.next()) { throw new RuntimeException( "Could not insert new mention ID for Level 1 ID: " + level1Id + ", Level 2 ID: " + level2Id + ", and mention length: " + mentionLength); } } // we have found the level 3 (mention) ID Long mentionId = res.getLong(1); // sanity check if(res.next()){ throw new RuntimeException( "Multiple Unique IDs foud in mentions table for Level 1 ID: " + level1Id + ", Level 2 ID: " + level2Id + ", and mention length: " + mentionLength); } return mentionId; } } private static final long serialVersionUID = 2734946594117068194L; /** * Empty array to return when there are no mention URIs. 
/**
 * Empty array to return when there are no mention URIs.
 */
private static final String[] EMPTY_STRING_ARRAY = new String[0];

/**
 * The directory name for the database data (relative to the top level index
 * directory).
 */
public static final String DB_DIR_NAME = "db";

/**
 * Key used to retrieve the {@link List} of table base names (see
 * {@link #tableBaseName}) from the {@link IndexConfig#getContext()} context.
 */
public static final String DB_NAMES_CONTEXT_KEY = DBSemanticAnnotationHelper.class.getName() + ":dbNames";

/** Suffix appended to {@link #tableBaseName} to name the Level-1 table. */
protected static final String L1_TABLE_SUFFIX = "L1";

/** Suffix appended to {@link #tableBaseName} to name the Level-2 table. */
protected static final String L2_TABLE_SUFFIX = "L2";

/** Suffix appended to {@link #tableBaseName} to name the mentions table. */
protected static final String MENTIONS_TABLE_SUFFIX = "Mentions";

/**
 * The key in the {@link IndexConfig#getOptions()} Map for the size of the
 * memory cache to be used by the database. The option value is appended
 * verbatim to the connection URL as the <code>CACHE_SIZE</code> setting.
 * When the option is not provided, <code>init</code> falls back to
 * <code>100 * 1024</code>, which the code describes as 100 MiB (i.e. the
 * unit is presumably KiB - confirm against the H2 documentation).
 * Too small a cache size can lead to out of memory errors during indexing!
 */
public static final String DB_CACHE_SIZE_OPTIONS_KEY = "databaseCacheSize";

/**
 * The base name (prefix) used for all tables created by this helper.
 * The name is derived from the annotation type: every character that is not
 * alphanumeric or an underscore is replaced with an underscore, and
 * underscores are appended until the name is not already registered under
 * {@link #DB_NAMES_CONTEXT_KEY}.
 */
protected String tableBaseName;

/**
 * Flag showing if the second level model is needed (i.e. if the annotation
 * has any non-nominal features). The initial value is only a placeholder:
 * it is recomputed when the database tables are created.
 */
protected boolean level2Used = true;

/**
 * Sizes handed to the annotation template cache for the level 1, 2 and 3
 * caches during initialisation. NOTE(review): -1 presumably means "use the
 * cache's own default" - confirm against AnnotationTemplateCache.
 */
protected int level1CacheSize = -1;

protected int level2CacheSize = -1;

protected int level3CacheSize = -1;

/**
 * Should we index "null" instances where all the configured features are
 * null or missing? Normally this would be true for a normal annotation-mode
 * helper but false for a document-mode helper.
 */
protected boolean indexNulls = true;
/**
 * Should this helper index "null" instances, where none of the configured
 * features has a value set in the target (annotation or document) feature
 * map? Default is true, both for backwards compatibility and because this
 * is the only value that makes sense for normal annotation-mode helpers, but
 * it may be useful to set it to false for document-mode helpers where not
 * every document has the target feature(s).
 *
 * @param indexNulls true to index instances whose configured features are
 *        all null or missing, false to drop them.
 */
public void setIndexNulls(boolean indexNulls) {
  this.indexNulls = indexNulls;
}

/**
 * Should this helper index "null" instances, where none of the configured
 * features has a value set in the target (annotation or document) feature
 * map?
 *
 * @return the current value of the flag.
 */
public boolean isIndexNulls() {
  return indexNulls;
}

/**
 * Prepared statement used to obtain the Level-1 ID based on the values of
 * nominal features. Only used at indexing time.
 */
protected transient PreparedStatement level1SelectStmt;

/**
 * Prepared statement used to obtain the Level-1 feature values based on a
 * mention ID. Only used at search time.
 */
protected transient PreparedStatement level1DescribeStmt;

/**
 * Prepared statement used to obtain the Level-1 and Level-2 feature values
 * based on a mention ID. Only used at search time. It is null when level 2
 * is not in use.
 */
protected transient PreparedStatement level1And2DescribeStmt;

/**
 * Prepared statement used to insert a new row into the Level-1 table.
 * Only used at indexing time.
 */
protected transient PreparedStatement level1InsertStmt;

/**
 * Prepared statement used to obtain the Level-2 ID based on the values of
 * non-nominal features. Only used at indexing time.
 */
protected transient PreparedStatement level2SelectStmt;

/**
 * Prepared statement used to insert a new row into the Level-2 table.
 * Only used at indexing time.
 */
protected transient PreparedStatement level2InsertStmt;

/**
 * Prepared statement used to obtain the Mention ID based on the Level-1 ID,
 * the Level-2 ID and the annotation length. Only used at indexing time.
 */
protected transient PreparedStatement mentionsSelectStmt;
*/ protected transient PreparedStatement mentionsInsertStmt; /** * The set of feature names for all the nominal features. */ protected transient Set<String> nominalFeatureNameSet; /** * The set of feature names for all the non-nominal features. */ protected transient Set<String> nonNominalFeatureNameSet; /** * A cached connection used throughout the life of this helper. */ protected transient Connection dbConnection; protected transient AnnotationTemplateCache cache; private transient int docsSoFar = 0; private static transient NumberFormat percentFormat = NumberFormat.getPercentInstance(); private static transient Logger logger = Logger.getLogger(DBSemanticAnnotationHelper.class); /** * When in document mode (see * {@link SemanticAnnotationHelper#isInDocumentMode()}), stores the features * for the current document. */ private transient FeatureMap documentFeatures; @Override public void init(AtomicAnnotationIndex index) { super.init(index); if(getUriFeatures() != null && getUriFeatures().length > 0) { logger.warn( "This helper type does not fully support URI features, " + "they will be indexed but only as text literals!"); } setTextFeatures(concatenateArrays(getTextFeatures(), getUriFeatures())); setUriFeatures(new String[0]); cache = new AnnotationTemplateCache(this); cache.setL1CacheSize(level1CacheSize); cache.setL2CacheSize(level2CacheSize); cache.setL3CacheSize(level3CacheSize); // calculate the basename // to avoid inter-locking between the multiple SB-based indexers, they each // create their own database. 
tableBaseName = annotationType.replaceAll("[^\\p{Alnum}_]", "_"); List<String> baseNames = (List<String>)index.getParent().getIndexConfig() .getContext().get(DB_NAMES_CONTEXT_KEY); if(baseNames == null) { baseNames = new LinkedList<String>(); index.getParent().getIndexConfig().getContext().put(DB_NAMES_CONTEXT_KEY, baseNames); } while(baseNames.contains(tableBaseName)) { tableBaseName += "_"; } baseNames.add(tableBaseName); File dbDir = new File(index.getIndexDirectory(), DB_DIR_NAME); try { Class.forName("org.h2.Driver"); String cacheSizeStr = index.getParent().getIndexConfig().getOptions().get( DB_CACHE_SIZE_OPTIONS_KEY); // default to 100 MiB, if not provided if(cacheSizeStr == null) cacheSizeStr = Integer.toString(100 *1024); dbConnection = DriverManager.getConnection( "jdbc:h2:file:" + dbDir.getAbsolutePath() + "/" + tableBaseName + ";CACHE_SIZE=" + cacheSizeStr, "sa", ""); dbConnection.setAutoCommit(true); dbConnection.setTransactionIsolation(Connection.TRANSACTION_READ_COMMITTED); createDb(index); } catch(SQLException e) { throw new RuntimeException("Error while initialising the database", e); } catch(ClassNotFoundException e) { throw new RuntimeException("Database driver not loaded.", e); } nominalFeatureNameSet = new HashSet<String>(); if(nominalFeatureNames != null){ for(String name : nominalFeatureNames) nominalFeatureNameSet.add(name); } nonNominalFeatureNameSet = new HashSet<String>(); if(integerFeatureNames != null){ for(String name : integerFeatureNames) nonNominalFeatureNameSet.add(name); } if(floatFeatureNames != null){ for(String name : floatFeatureNames) nonNominalFeatureNameSet.add(name); } if(textFeatureNames != null){ for(String name : textFeatureNames) nonNominalFeatureNameSet.add(name); } try { constructDescriptionStatements(); } catch(SQLException e) { throw new RuntimeException("Error while opening database", e); } } protected void constructDescriptionStatements() throws SQLException { // level 1 query List<String> nomFeatNames = new 
ArrayList<String>( Arrays.asList(descriptiveFeatures)); nomFeatNames.retainAll(nominalFeatureNameSet); StringBuilder stmt = new StringBuilder("SELECT DISTINCT "); stmt.append(tableName(null, MENTIONS_TABLE_SUFFIX)).append(".\"ID\""); for(int i = 0; i < nomFeatNames.size(); i++) { String featName = nomFeatNames.get(i); stmt.append(", ").append(tableName(null, L1_TABLE_SUFFIX)) .append(".\"").append(featName).append("\" AS \"").append(featName).append("\""); } stmt.append(" FROM ") .append(tableName(null, MENTIONS_TABLE_SUFFIX)) .append(", ").append(tableName(null, L1_TABLE_SUFFIX)) .append(" WHERE ").append(tableName(null, MENTIONS_TABLE_SUFFIX)) .append(".\"ID\" IS ?") .append(" AND ").append(tableName(null, MENTIONS_TABLE_SUFFIX)) .append(".\"L1_ID\" = ").append(tableName(null, L1_TABLE_SUFFIX)) .append(".\"ID\""); if(level2Used){ stmt.append(" AND ").append(tableName(null, MENTIONS_TABLE_SUFFIX)) .append(".\"L2_ID\" IS NULL;"); }else { stmt.append(";"); } // logger.debug("L1 description statement: " + stmt.toString()); level1DescribeStmt = dbConnection.prepareStatement(stmt.toString()); if(level2Used) { // levels 1 and 2 query List<String> nonNomFeatNames = new ArrayList<String>( Arrays.asList(descriptiveFeatures)); nonNomFeatNames.retainAll(nonNominalFeatureNameSet); stmt = new StringBuilder("SELECT DISTINCT "); stmt.append(tableName(null, MENTIONS_TABLE_SUFFIX)).append(".\"ID\""); for(int i = 0; i < nomFeatNames.size(); i++) { String featName = nomFeatNames.get(i); stmt.append(", ").append(tableName(null, L1_TABLE_SUFFIX)) .append(".\"").append(featName).append("\" AS \"").append(featName).append('"'); } for(int i = 0; i < nonNomFeatNames.size(); i++) { String featName = nonNomFeatNames.get(i); stmt.append(", ").append(tableName(null, L2_TABLE_SUFFIX)) .append(".\"").append(featName).append("\" AS \"").append(featName).append('"'); } stmt.append(" FROM ") .append(tableName(null, MENTIONS_TABLE_SUFFIX)) .append(", ").append(tableName(null, L1_TABLE_SUFFIX)) 
.append(", ").append(tableName(null, L2_TABLE_SUFFIX)) .append(" WHERE ").append(tableName(null, MENTIONS_TABLE_SUFFIX)) .append(".\"ID\" IS ?") .append(" AND ").append(tableName(null, MENTIONS_TABLE_SUFFIX)) .append(".\"L1_ID\" = ").append(tableName(null, L1_TABLE_SUFFIX)) .append(".\"ID\" AND ").append(tableName(null, MENTIONS_TABLE_SUFFIX)) .append(".\"L2_ID\" = ").append(tableName(null, L2_TABLE_SUFFIX)) .append(".\"ID\";"); // logger.debug("L1+2 description statement: " + stmt.toString()); level1And2DescribeStmt = dbConnection.prepareStatement(stmt.toString()); } else { level1And2DescribeStmt = null; } } /** * Creates in the database the tables required by this helper for indexing. * Called at index creation, during the initialisation process. * * During indexing, the only tests are equality tests (to check we're not * inserting duplicate rows). To support those in the most efficient way * possible, we're creating MEMORY tables (with the indexes stored in RAM) * and HASH indexes for all data columns. 
* * @throws SQLException */ protected void createDb(AtomicAnnotationIndex indexer) throws SQLException { Statement stmt = dbConnection.createStatement(); // //////////////////////////////// // create the Level 1 table // //////////////////////////////// StringBuilder createStr = new StringBuilder(); StringBuilder selectStr = new StringBuilder(); StringBuilder insertStr = new StringBuilder(); createStr.append("CREATE TABLE IF NOT EXISTS " + tableName(null, L1_TABLE_SUFFIX) + " (ID IDENTITY NOT NULL PRIMARY KEY"); selectStr.append("SELECT ID FROM " + tableName(null, L1_TABLE_SUFFIX)); insertStr.append("INSERT INTO " + tableName(null, L1_TABLE_SUFFIX) + " VALUES(DEFAULT"); if(nominalFeatureNames != null && nominalFeatureNames.length > 0) { selectStr.append(" WHERE"); boolean firstWhere = true; for(String aFeatureName : nominalFeatureNames) { createStr.append(", \"" + aFeatureName + "\" VARCHAR(255)"); if(firstWhere) firstWhere = false; else selectStr.append(" AND"); selectStr.append(" \"" + aFeatureName + "\" IS ?"); insertStr.append(", ?"); } } createStr.append(")"); insertStr.append(")"); logger.debug("Create statement:\n" + createStr.toString()); stmt.execute(createStr.toString()); logger.debug("Select Level 1:\n" + selectStr.toString()); level1SelectStmt = dbConnection.prepareStatement(selectStr.toString()); level1InsertStmt = dbConnection.prepareStatement(insertStr.toString()); // //////////////////////////////// // create the Level 2 table // //////////////////////////////// int nonNominalFeats = 0; if(integerFeatureNames != null) nonNominalFeats += integerFeatureNames.length; if(floatFeatureNames != null) nonNominalFeats += floatFeatureNames.length; if(textFeatureNames != null) nonNominalFeats += textFeatureNames.length; level2Used = (nonNominalFeats > 0); if(level2Used) { createStr = new StringBuilder( "CREATE TABLE IF NOT EXISTS " + tableName(null, L2_TABLE_SUFFIX) + " (ID IDENTITY NOT NULL PRIMARY KEY, L1_ID BIGINT," + " FOREIGN KEY(L1_ID) REFERENCES " + 
tableName(null, L1_TABLE_SUFFIX) + "(ID)" ); selectStr = new StringBuilder( "SELECT ID FROM " + tableName(null, L2_TABLE_SUFFIX) + " WHERE L1_ID IS ?"); insertStr = new StringBuilder( "INSERT INTO " + tableName(null, L2_TABLE_SUFFIX) + " VALUES(DEFAULT, ?"); if(integerFeatureNames != null && integerFeatureNames.length > 0) { for(String aFeatureName : integerFeatureNames) { createStr.append(", \"" + aFeatureName + "\" BIGINT"); selectStr.append(" AND \"" + aFeatureName + "\" IS ?"); insertStr.append(", ?"); } } if(floatFeatureNames != null && floatFeatureNames.length > 0) { for(String aFeatureName : floatFeatureNames) { createStr.append(", \"" + aFeatureName + "\" DOUBLE"); selectStr.append(" AND \"" + aFeatureName + "\" IS ?"); insertStr.append(", ?"); } } if(textFeatureNames != null && textFeatureNames.length > 0) { for(String aFeatureName : textFeatureNames) { createStr.append(", \"" + aFeatureName + "\" VARCHAR(255)"); selectStr.append(" AND \"" + aFeatureName + "\" IS ?"); insertStr.append(", ?"); } } createStr.append(")"); insertStr.append(")"); logger.debug("Create statement:\n" + createStr.toString()); stmt.execute(createStr.toString()); logger.debug("Select Level 2:\n" + selectStr.toString()); level2SelectStmt = dbConnection.prepareStatement(selectStr.toString()); level2InsertStmt = dbConnection.prepareStatement(insertStr.toString()); } // ///////////////////////////// // create the Mentions table // ///////////////////////////// createStr = new StringBuilder( "CREATE TABLE IF NOT EXISTS " + tableName(null, MENTIONS_TABLE_SUFFIX) + " (ID IDENTITY NOT NULL PRIMARY KEY, L1_ID BIGINT," + " FOREIGN KEY (L1_ID) REFERENCES " + tableName(null, L1_TABLE_SUFFIX) + "(ID)"); selectStr = new StringBuilder( "SELECT ID FROM " + tableName(null, MENTIONS_TABLE_SUFFIX) + " WHERE L1_ID IS ?"); insertStr = new StringBuilder( "INSERT INTO " + tableName(null, MENTIONS_TABLE_SUFFIX) + " VALUES(DEFAULT, ?"); if(level2Used) { createStr.append(", L2_ID BIGINT, FOREIGN KEY (L2_ID) 
REFERENCES " + tableName(null, L2_TABLE_SUFFIX) + "(ID)"); selectStr.append(" AND L2_ID IS ?"); insertStr.append(", ?"); } createStr.append(", Length INT)"); selectStr.append(" AND Length IS ?"); insertStr.append(", ?)"); logger.debug("Create statement:\n" + createStr.toString()); stmt.execute(createStr.toString()); logger.debug("Select Mentions:\n" + selectStr.toString()); mentionsSelectStmt = dbConnection.prepareStatement(selectStr.toString()); mentionsInsertStmt = dbConnection.prepareStatement(insertStr.toString()); // create all the indexes createIndexes(stmt); dbConnection.commit(); } protected void createIndexes(Statement stmt) throws SQLException { // //////////////////////////////// // Level 1 table // //////////////////////////////// List<String> indexStatements = new LinkedList<String>(); if(nominalFeatureNames != null && nominalFeatureNames.length > 0) { for(String aFeatureName : nominalFeatureNames) { // create the index statement indexStatements.add( "CREATE INDEX IF NOT EXISTS "+ tableName("IDX-", L1_TABLE_SUFFIX + aFeatureName) + " ON " + tableName(null, L1_TABLE_SUFFIX) + "(\"" + aFeatureName + "\")"); } } for(String aStmt : indexStatements) { logger.debug("Index statement:\n" + aStmt); stmt.execute(aStmt); } // //////////////////////////////// // Level 2 table // //////////////////////////////// if(level2Used) { indexStatements.clear(); if(integerFeatureNames != null && integerFeatureNames.length > 0) { for(String aFeatureName : integerFeatureNames) { indexStatements.add( "CREATE INDEX IF NOT EXISTS " + tableName("IDX", L2_TABLE_SUFFIX + aFeatureName) + " ON " + tableName(null, L2_TABLE_SUFFIX) + "(\"" + aFeatureName + "\")"); } } if(floatFeatureNames != null && floatFeatureNames.length > 0) { for(String aFeatureName : floatFeatureNames) { indexStatements.add( "CREATE INDEX IF NOT EXISTS " + tableName("IDX", L2_TABLE_SUFFIX + aFeatureName) + " ON " + tableName(null, L2_TABLE_SUFFIX) + "(\"" + aFeatureName + "\")"); } } if(textFeatureNames != null 
&& textFeatureNames.length > 0) { for(String aFeatureName : textFeatureNames) { indexStatements.add( "CREATE INDEX IF NOT EXISTS " + tableName("IDX", L2_TABLE_SUFFIX + aFeatureName) + " ON " + tableName(null, L2_TABLE_SUFFIX) + "(\"" + aFeatureName + "\")"); } } // create the indexes for(String aStmt : indexStatements) { logger.debug("Index statement:\n" + aStmt); stmt.execute(aStmt); } } // ///////////////////////////// // Mentions table // ///////////////////////////// // all other fields are either primary or foreign keys, so they get indexes String idxStmt = "CREATE INDEX IF NOT EXISTS " + tableName("IDX", MENTIONS_TABLE_SUFFIX + "Length") + " ON " + tableName(null, MENTIONS_TABLE_SUFFIX) + " (Length)"; logger.debug("Index statement:\n" + idxStmt); stmt.execute(idxStmt); } /** * Creates a table (index, etc.) name. Uses the value in * {@link #tableBaseName} as a base name, to which it prepends the supplied * prefix (if any), it appends the supplied suffix(if any). The constructed * string is then surrounded with double quotes. 
* @param suffix * @return */ protected String tableName(String prefix, String suffix) { StringBuilder str = new StringBuilder("\""); if(prefix != null) str.append(prefix); str.append(tableBaseName); if(suffix != null) str.append(suffix); str.append("\""); return str.toString(); } @Override public String[] getMentionUris(Annotation ann, int length, AtomicAnnotationIndex index) { FeatureMap featuresToIndex; if(getMode() == Mode.DOCUMENT) { length = -1; featuresToIndex = documentFeatures; } else { featuresToIndex = ann.getFeatures(); } if(!indexNulls) { // we don't want to index instances where all the features are null, so // check to see whether this is the case boolean allFeaturesNull = true; shortCircuit:do { for(String featureName : nominalFeatureNameSet) { if(featuresToIndex.get(featureName) != null) { allFeaturesNull = false; break shortCircuit; } } for(String featureName : nonNominalFeatureNameSet) { if(featuresToIndex.get(featureName) != null) { allFeaturesNull = false; break shortCircuit; } } } while(false); // no value found for any of the features, so drop this instance if(allFeaturesNull) { return EMPTY_STRING_ARRAY; } } try { // find the level 1 ID Long level1Tag = cache.getLevel1Id(featuresToIndex, new Level1IdGenerator(featuresToIndex)); // find the Level-1 Mention ID (ignoring the L2 values) Long mentionL1Tag = cache.getLevel3Id(level1Tag, null, length, new Level3IdGenerator(level1Tag, null, length)); Long mentionL2Tag = null; if(level2Used){ // find the level 2 ID Long level2Tag = cache.getLevel2Id(level1Tag, featuresToIndex, new Level2IdGenerator(level1Tag, featuresToIndex)); // find the Level-2 Mention ID mentionL2Tag = cache.getLevel3Id(level1Tag, level2Tag, length, new Level3IdGenerator(level1Tag, level2Tag, length)); } // now we finally have the mention ID if(level2Used) { return new String[] { annotationType + ":" + mentionL1Tag, annotationType + ":" + mentionL2Tag}; } else { return new String[] { annotationType + ":" + mentionL1Tag}; } } 
catch(Exception e) { // something went bad: we can't fix it :( logger.error("Error while interogating database. Annotation was lost!", e); return EMPTY_STRING_ARRAY; } } /* (non-Javadoc) * @see gate.mimir.SemanticAnnotationHelper#isMentionUri(java.lang.String) */ @Override public boolean isMentionUri(String mentionUri) { final String prefix = annotationType + ":"; if(mentionUri.startsWith(prefix)) { try{ return Long.parseLong(mentionUri.substring(prefix.length())) >= 0; } catch (Exception e) {} } return false; } /* (non-Javadoc) * @see gate.mimir.AbstractSemanticAnnotationHelper#getDescriptiveFeatureValues(java.lang.String) */ @Override protected String[] getDescriptiveFeatureValues(String mentionUri) { long mentionId = -1; try { mentionId = Long.parseLong( mentionUri.substring(annotationType.length() + 1)); } catch(Exception e) { logger.error("Could not describe mention with invalid URI: \"" + mentionUri + "\" (" + e.getMessage() + ")." ); return null; } if(level1DescribeStmt == null) return null; ResultSet res = null; try { level1DescribeStmt.setLong(1, mentionId); res = level1DescribeStmt.executeQuery(); if(!res.next()) { // no level 1 results: try levels 1+2 res.close(); if(level2Used && level1And2DescribeStmt != null) { level1And2DescribeStmt.setLong(1, mentionId); res = level1And2DescribeStmt.executeQuery(); if(!res.next()){ logger.error("Was asked to describe mention with ID " + mentionId + " but was unable to find it."); return null; } } else { // no results from level 1, and level2 not used return null; } } // by this point the result set was advanced to the one and only row String[] result = new String[descriptiveFeatures.length]; for(int i = 0; i < descriptiveFeatures.length; i++) { try { Object sqlValue = res.getObject(descriptiveFeatures[i]); if(sqlValue != null) result[i] = sqlValue.toString(); } catch(SQLException e) { // non-nominal features are not available for level 1 mentions result[i] = null; } catch (Exception e) { logger.error("Error while 
obtaining description feature value.", e); } } return result; } catch(SQLException e) { logger.error("Database error while describing mention with ID: " + mentionId, e); return null; } finally { if(res != null){ try { res.close(); } catch(SQLException e) { logger.error("Error while closing SQL result set", e); } } } } /** * Sets all the values for a prepared statement (which must be one of the * cached transient statements!) * For level-2 statements, it does not set the L1_ID parameter (i.e. it starts * with the parameter at position 2). * @param stmt * @param annotation * @throws SQLException */ protected void setStatementParameters(PreparedStatement stmt, FeatureMap annFeats) throws SQLException { if(stmt == level1InsertStmt || stmt == level1SelectStmt) { if(nominalFeatureNames != null){ int paramIdx = 1; for(String aFeatureName : nominalFeatureNames) { Object value = annFeats.get(aFeatureName); if(value != null) { stmt.setString(paramIdx++, value.toString()); } else { stmt.setNull(paramIdx++, Types.VARCHAR); } } } } else if(stmt == level2InsertStmt || stmt == level2SelectStmt) { if(!level2Used) throw new RuntimeException( "Was asked to populate a Level-2 statement, but Level-2 is not in use!"); int paramIdx = 2; if(integerFeatureNames != null){ for(String aFeatureName : integerFeatureNames) { Object valueObj = annFeats.get(aFeatureName); Long value = null; if(valueObj != null){ if(valueObj instanceof Number) { value = ((Number)valueObj).longValue(); } else if(valueObj instanceof String) { try { value = Long.valueOf((String)valueObj); } catch(NumberFormatException e) { logger.warn("Value provided for feature \"" + aFeatureName + "\" is a String that cannot be parsed to a Long. Value (" + valueObj.toString() + ") will be ignored!"); } } else { logger.warn("Value provided for feature \"" + aFeatureName + "\" is not a subclass of java.lang.Number. 
Value (" + valueObj.toString() + ") will be ignored!"); } } if(value != null) { stmt.setLong(paramIdx++, value); } else { stmt.setNull(paramIdx++, Types.BIGINT); } } } if(floatFeatureNames != null){ for(String aFeatureName : floatFeatureNames) { Object valueObj = annFeats.get(aFeatureName); Double value = null; if(valueObj != null){ if(valueObj instanceof Number) { value = ((Number)valueObj).doubleValue(); } else if(valueObj instanceof String) { try { value = Double.valueOf((String)valueObj); } catch(NumberFormatException e) { logger.warn("Value provided for feature \"" + aFeatureName + "\" is a String that cannot be parsed to a Double. Value (" + valueObj.toString() + ") will be ignored!"); } } else { logger.warn("Value provided for feature \"" + aFeatureName + "\" is not a subclass of java.lang.Number. Value (" + valueObj.toString() + ") will be ignored!"); } } if(value != null) { stmt.setDouble(paramIdx++, value); } else { stmt.setNull(paramIdx++, Types.DOUBLE); } } } if(textFeatureNames != null) { for(String aFeatureName : textFeatureNames) { Object valueObj = annFeats.get(aFeatureName); if(valueObj != null) { stmt.setString(paramIdx++, valueObj.toString()); } else { stmt.setNull(paramIdx++, Types.VARCHAR); } } } } else { throw new RuntimeException("Cannot recognise the the provided prepared statement!"); } } @Override public List<Mention> getMentions(String annotationType, List<Constraint> constraints, QueryEngine engine) { if(!annotationType.equals(this.annotationType)) { throw new IllegalArgumentException("Wrong annotation type \"" + annotationType + "\", this helper can only handle " + this.annotationType + "!"); } List<Mention> mentions = new LinkedList<Mention>(); boolean hasLevel1Constraints = false; for(Constraint aConstraint : constraints) { if(nominalFeatureNameSet.contains(aConstraint.getFeatureName())) { hasLevel1Constraints = true; break; } } boolean hasLevel2Constraints = false; for(Constraint aConstraint : constraints) { 
if(nonNominalFeatureNameSet.contains(aConstraint.getFeatureName())) { hasLevel2Constraints = true; break; } } List<Object> params = new ArrayList<Object>(); StringBuilder selectStr = new StringBuilder( "SELECT DISTINCT " + tableName(null, MENTIONS_TABLE_SUFFIX) + ".ID, " + tableName(null, MENTIONS_TABLE_SUFFIX) + ".Length FROM " + tableName(null, MENTIONS_TABLE_SUFFIX)); if(hasLevel1Constraints) { selectStr.append(", " + tableName(null, L1_TABLE_SUFFIX)); } if(hasLevel2Constraints) { selectStr.append(", " + tableName(null, L2_TABLE_SUFFIX)); } boolean firstWhere = true; // add constraints List<Constraint> unusedConstraints = new ArrayList<Constraint>(constraints); if(hasLevel1Constraints) { if(nominalFeatureNames != null) { for(String aFeatureName : nominalFeatureNames) { for(Constraint aConstraint : constraints) { if(aFeatureName.equals(aConstraint.getFeatureName())){ if(firstWhere){ firstWhere = false; selectStr.append(" WHERE"); } else { selectStr.append(" AND"); } selectStr.append(" " + tableName(null, L1_TABLE_SUFFIX) + ".\"" + aFeatureName + "\""); switch( aConstraint.getPredicate() ) { case EQ: selectStr.append(" ="); break; case GT: selectStr.append(" >"); break; case GE: selectStr.append(" >="); break; case LT: selectStr.append(" <"); break; case LE: selectStr.append(" <="); break; case REGEX: selectStr.append(" REGEXP"); } if(aConstraint.getValue() instanceof String) { selectStr.append(" ?"); params.add(aConstraint.getValue()); } else if(aConstraint.getValue() instanceof String[]) { // this only makes sense for REGEX if(aConstraint.getPredicate() != ConstraintType.REGEX) { throw new IllegalArgumentException("Got a two-valued constraint that is not a REGEXP!"); } selectStr.append(" ?"); params.add("(?" 
+ ((String[])aConstraint.getValue())[1] + ")" + ((String[])aConstraint.getValue())[0]); } unusedConstraints.remove(aConstraint); } } } } // join L1 with Mentions selectStr.append(" AND " + tableName(null, L1_TABLE_SUFFIX) + ".ID = " + tableName(null, MENTIONS_TABLE_SUFFIX) + ".L1_ID"); if(hasLevel2Constraints) { // join L1 with L2 selectStr.append(" AND " + tableName(null, L1_TABLE_SUFFIX) + ".ID = " + tableName(null, L2_TABLE_SUFFIX) + ".L1_ID"); } } if(hasLevel2Constraints) { if(integerFeatureNames != null) { for(String aFeatureName : integerFeatureNames) { for(Constraint aConstraint : constraints) { if(aFeatureName.equals(aConstraint.getFeatureName())){ if(firstWhere){ firstWhere = false; selectStr.append(" WHERE"); } else { selectStr.append(" AND"); } selectStr.append(" " + tableName(null, L2_TABLE_SUFFIX) + ".\"" + aFeatureName + "\""); switch( aConstraint.getPredicate() ) { case EQ: selectStr.append(" ="); break; case GT: selectStr.append(" >"); break; case GE: selectStr.append(" >="); break; case LT: selectStr.append(" <"); break; case LE: selectStr.append(" <="); break; case REGEX: throw new IllegalArgumentException("Cannot use a REGEX predicate for numeric features!"); } selectStr.append(" ?"); if(aConstraint.getValue() instanceof Number) { params.add(Long.valueOf(((Number)aConstraint.getValue()).longValue())); } else { params.add(Long.valueOf(aConstraint.getValue().toString())); } unusedConstraints.remove(aConstraint); } } } } if(floatFeatureNames != null) { for(String aFeatureName : floatFeatureNames) { for(Constraint aConstraint : constraints) { if(aFeatureName.equals(aConstraint.getFeatureName())){ if(firstWhere){ firstWhere = false; selectStr.append(" WHERE"); } else { selectStr.append(" AND"); } selectStr.append(" " + tableName(null, L2_TABLE_SUFFIX) + ".\"" + aFeatureName + "\""); switch( aConstraint.getPredicate() ) { case EQ: selectStr.append(" ="); break; case GT: selectStr.append(" >"); break; case GE: selectStr.append(" >="); break; case LT: 
selectStr.append(" <"); break; case LE: selectStr.append(" <="); break; case REGEX: throw new IllegalArgumentException("Cannot use a REGEX predicate for numeric features!"); } selectStr.append(" ?"); if(aConstraint.getValue() instanceof Number) { params.add(Double.valueOf(((Number)aConstraint.getValue()).doubleValue())); } else { params.add(Double.valueOf(aConstraint.getValue().toString())); } unusedConstraints.remove(aConstraint); } } } } if(textFeatureNames != null) { for(String aFeatureName : textFeatureNames) { for(Constraint aConstraint : constraints) { if(aFeatureName.equals(aConstraint.getFeatureName())){ if(firstWhere){ firstWhere = false; selectStr.append(" WHERE"); } else { selectStr.append(" AND"); } selectStr.append(" " + tableName(null, L2_TABLE_SUFFIX) + ".\"" + aFeatureName + "\""); switch( aConstraint.getPredicate() ) { case EQ: selectStr.append(" ="); break; case GT: selectStr.append(" >"); break; case GE: selectStr.append(" >="); break; case LT: selectStr.append(" <"); break; case LE: selectStr.append(" <="); break; case REGEX: selectStr.append(" REGEXP"); } selectStr.append(" ?"); if(aConstraint.getValue() instanceof String) { params.add(aConstraint.getValue()); } else if(aConstraint.getValue() instanceof String[]) { // this only makes sense for REGEX if(aConstraint.getPredicate() != ConstraintType.REGEX) { throw new IllegalArgumentException("Got a two-valued constraint that is not a REGEXP!"); } params.add("(?" 
+ ((String[])aConstraint.getValue())[1] + ")" + ((String[])aConstraint.getValue())[0]); } unusedConstraints.remove(aConstraint); } } } } // join L2 with Mentions selectStr.append(" AND "+ tableName(null, L2_TABLE_SUFFIX) + ".ID = " + tableName(null, MENTIONS_TABLE_SUFFIX) + ".L2_ID"); } if(unusedConstraints.size() > 0) { StringBuilder msg = new StringBuilder(); if(unusedConstraints.size() == 1) { msg.append("The following constraint name was not recognised: \""); msg.append(unusedConstraints.get(0).getFeatureName()); msg.append("\"."); } else { msg.append("The following constraint names were not recognised: "); boolean first = true; for(Constraint aConstraint : unusedConstraints) { if(first) first = false; else msg.append(", "); msg.append('"'); msg.append(aConstraint.getFeatureName()); msg.append('"'); } msg.append("."); } throw new RuntimeException(msg.toString()); } if(!hasLevel2Constraints && level2Used) { // no level 2 constraints if(firstWhere){ firstWhere = false; selectStr.append(" WHERE "); } else { selectStr.append(" AND "); } selectStr.append(tableName(null, MENTIONS_TABLE_SUFFIX) + ".L2_ID IS NULL"); } logger.debug("Select query:\n" + selectStr.toString()); try { PreparedStatement stmt = dbConnection.prepareStatement(selectStr.toString()); int pos = 1; for(Object val : params) { stmt.setObject(pos++, val); } ResultSet res = stmt.executeQuery(); while(res.next()) { long id = res.getLong(1); int length = getMode() == Mode.DOCUMENT? 
Mention.NO_LENGTH : res.getInt(2); mentions.add(new Mention(annotationType + ":" + id, length)); } stmt.close(); } catch(SQLException e) { logger.error("DB error", e); throw new RuntimeException("DB error", e); } return mentions; } @Override public void documentStart(Document document) { if(getMode() == Mode.DOCUMENT) { documentFeatures = document.getFeatures(); } } @Override public void documentEnd() { documentFeatures = null; if(cache != null) { double l1ratio = cache.getL1CacheHitRatio(); double l2ratio = cache.getL2CacheHitRatio(); double l3ratio = cache.getL3CacheHitRatio(); logger.debug("Cache size(" + annotationType + "):" + cache.size() + ". Hit ratios L1, L2, L3: " + (Double.isNaN(l1ratio) ? "N/A" : percentFormat.format(l1ratio)) + ", " + (Double.isNaN(l2ratio) ? "N/A" : percentFormat.format(l2ratio)) + ", " + (Double.isNaN(l3ratio) ? "N/A" : percentFormat.format(l3ratio))); docsSoFar++; } else { logger.debug("Cache size(" + annotationType + "): null"); } } @Override public void close(AtomicAnnotationIndex indexer) { closeDB(); } @Override public void close(QueryEngine qEngine) { closeDB(); } private void closeDB() { //Explicitly close and nullify all the prepared statements. 
level1InsertStmt = closeAndNullify(level1InsertStmt); level1SelectStmt = closeAndNullify(level1SelectStmt); level2InsertStmt = closeAndNullify(level2InsertStmt); level2SelectStmt = closeAndNullify(level2SelectStmt); mentionsInsertStmt = closeAndNullify(mentionsInsertStmt); mentionsSelectStmt = closeAndNullify(mentionsSelectStmt); //now close the connection try { if(dbConnection != null) { dbConnection.close(); dbConnection = null; } } catch(SQLException e) { logger.warn("Error while closing DB COnnection", e); } } /** * Close a prepared statement to help free resources * @param stmt * @return null, as a utility for easily nullifying the original object */ private PreparedStatement closeAndNullify(PreparedStatement stmt) { try { if (stmt != null) stmt.close(); } catch (SQLException e) { logger.warn("Error closing DB statement"); } return null; } /** * Sets the size for the three level caches used by this helper. * * A negative value for each cache size sets the cache size to its default * value. * * @param level1 the size for the Level 1 cache. The Level 1 cache stores * previously seen combinations of nominal feature values. * * @param level2 the size for the Level 2 cache. The Level 2 cache stores * previously seen combinations of non-nominal feature values. * * @param level3 the size for the Level 3 cache. The Level 1 cache stores * previously seen mention IDs. */ public void setCacheSizes(int level1, int level2, int level3) { this.level1CacheSize = level1; this.level2CacheSize = level2; this.level3CacheSize = level3; if(cache != null) { cache.setL1CacheSize(level1CacheSize); cache.setL2CacheSize(level2CacheSize); cache.setL3CacheSize(level3CacheSize); } } }