/*
* (C) Copyright IBM Corp. 2014
*
* LICENSE: Eclipse Public License v1.0
* http://www.eclipse.org/legal/epl-v10.html
*/
package com.ibm.db2j;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.accumulo.core.client.BatchScanner;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.IteratorSetting;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.ScannerBase;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.client.ZooKeeperInstance;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.iterators.user.RegExFilter;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.derby.iapi.error.StandardException;
import org.apache.derby.iapi.store.access.Qualifier;
import org.apache.derby.iapi.types.DataValueDescriptor;
import org.apache.derby.iapi.types.Orderable;
import org.apache.derby.iapi.types.SQLChar;
import org.apache.derby.vti.IFastPath;
import org.apache.derby.vti.VTIEnvironment;
import org.apache.hadoop.io.Text;
import com.ibm.db2j.AbstractVTI;
import com.ibm.gaiandb.GaianDBConfigProcedures;
import com.ibm.gaiandb.GaianResultSetMetaData;
import com.ibm.gaiandb.Logger;
import com.ibm.gaiandb.RowsFilter;
import com.ibm.gaiandb.Util;
import com.ibm.gaiandb.diags.GDBMessages;
/**
* This VTI provides access to an Accumulo Database Table.
* It reads configuration parameters from gaiandb_config.properties (e.g. instance name, zookeeper ips, user, password, table, visibility) - see configuration example below.
* It supports use of all fields in the Accumulo table structure:
*
* Row ID, Column Family, Column Qualifier, Column Visibility, Timestamp, Value
*
* The first 5 fields represent a Key. All fields are byte arrays except for the Timestamp which is a long.
* Row ID is the equivalent of a "primary key". All Accumulo table rows having a same Row ID represent a "record" returned by this VTI - i.e. a set of related column family values.
* Column Family effectively holds the column name, and column values returned by this VTI can be configured to contain the Column Qualifier or Value field.
* Column Visibility holds a logical expression allowing restrictions on cell-level access by a table scan.
* Timestamp allows ordering by time of the records. If the field is not provided at ingestion-time, it is automatically generated instead.
*
* Configuration example:
* ======================
*
* # Logical table 'LTBIKES' definition:
* LTBIKES_DEF=ROWID INT, AREA VARCHAR(50), INSTALLED BOOLEAN, LAT DOUBLE, LOCKED BOOLEAN, LON DOUBLE, NUMBIKES INT, NUMEMPTYDOCKS INT, STATION VARCHAR(50), NOT_FIXED BOOLEAN
*
* # Data-source wrapper 'LTBIKES_DS0' properties - federated under logical table LTBIKES:
* LTBIKES_DS0_VTI=com.ibm.db2j.AccumuloVTI
* LTBIKES_DS0_ARGS=AccumuloUserInstance1, mytable qualifiers [vis1&vis2], DERIVE_SCHEMA_FROM_FIRST_ROW
* LTBIKES_DS0_OPTIONS=MAP_COLUMNS_BY_POSITION
*
* # NOTE: The property ending with "_ARGS" specifies the arguments that are passed to the VTI constructor:
* # 1) A reference to the static Accumulo connection properties
* # 2) The physical accumulo table name, an identifier specifying the field to extract for table values (either "values" or "qualifiers"), and a visibility expression
* # 3) An optional 'DERIVE_SCHEMA_FROM_FIRST_ROW' argument to tell the VTI to derive its schema based on the first RowID row.
* # => Default would be to use property: LTBIKES_DS0_SCHEMA, or failing that: LT_BIKES_DEF
*
* # Static VTI properties:
* AccumuloVTI.AccumuloUserInstance1.INSTANCE=instance1
* AccumuloVTI.AccumuloUserInstance1.ZOOKEEPERS=9.71.39.154
* AccumuloVTI.AccumuloUserInstance1.USR=root
* AccumuloVTI.AccumuloUserInstance1.PWD=<pwd>
*
* @author DavidVyvyan
*/
public class AccumuloVTI extends AbstractVTI { //implements PluralizableVTI {
// Use PROPRIETARY notice if class contains a main() method, otherwise use COPYRIGHT notice.
public static final String COPYRIGHT_NOTICE = "(c) Copyright IBM Corp. 2014";
private static final Logger logger = new Logger( "AccumuloVTI", 30 );
// Cache of ZooKeeperInstance objects, shared across all AccumuloVTI instances, keyed by Accumulo instance name.
private final static Map<String, ZooKeeperInstance> zooKeeperInstances = new Hashtable<String, ZooKeeperInstance>();
private final Connector accumuloConnector;
private final String accumuloTable; // physical Accumulo table name (first token of the 2nd VTI argument)
// True when the optional 3rd VTI argument requests schema derivation from the first record (see class javadoc).
private final boolean isDeriveSchemaFromFirstRow;
private static final String DERIVE_SCHEMA_FROM_FIRST_ROW = "DERIVE_SCHEMA_FROM_FIRST_ROW";
// Table meta-data, i.e. schema/shape including column indexes, names, types and sizes
private GaianResultSetMetaData accumuloTableRSMD = null;
// Two scanner flavours: the plain Scanner handles a single Range, the BatchScanner handles multiple Ranges.
private final Scanner standardScanner;
private final BatchScanner batchScanner;
private ScannerBase tableScanner = null; // Can switch between the 2 scanner types above
private List<Range> rowidRangesPredicates = null; // Ranges on ROWID derived from pushed-down qualifiers
private List<IteratorSetting> preScanIterators = new ArrayList<IteratorSetting>(); // iterators based on regex filters built from predicates
private Iterator<Map.Entry<Key,Value>> rowScanIterator = null; // live iterator over Accumulo entries for the current scan
private Map.Entry<Key,Value> currentAccumuloRow = null; // last entry pulled off rowScanIterator; null once exhausted
private int[] projectedColumns = null; // 1-based column IDs requested by the engine; null until pushProjection()/executeAsFastPath()
private Qualifier[][] qualifiers = null; // pushed-down predicates; pruned in setQualifiers() once converted to Ranges/regexes
Map<String, Integer> projectedColumnsNameToIndexMap = new HashMap<String, Integer>();
private final boolean isExtractAccumuloColumnQualifiersInPlaceOfValues; // true = "qualifiers" mode: expose ColumnQualifier instead of Value
private final Authorizations tableScannerAuthorizations; // built from the optional [visibility] VTI argument
// Fields used for building meta-data
private static final String ROWID = "ROWID";
private static final String VC256 = "VARCHAR(256)";
private static final boolean isRowidInSchema = true; // Include Accumulo RowID as a column in this VTI's table shape?
// NOTE(review): static mutable state - concurrent queries through separate AccumuloVTI instances would share/overwrite
// this shift value; confirm queries are executed single-threaded per JVM, or make this an instance field.
private static int rowidColShift = 0;
private static final String PROPERTY_INSTANCE = "INSTANCE";
private static final String PROPERTY_ZOOKEEPERS = "ZOOKEEPERS";
private static final String PROPERTY_USR = "USR";
private static final String PROPERTY_PWD = "PWD";
private int rowCount = 0; // relational rows returned so far for the current query (reset in reinitialise())
private int numRowsReceivedFromAccumulo; // records assembled from Accumulo before local qualifier filtering
private static final String VTI_ARGS_SYNTAX_HELP =
"AccumuloVTI args syntax (LTX_DSY_ARGS): <vtiPropertiesID>, <accumuloTable [fieldToExtractExpression] [visibilityExpressionInSquareBrackets]>[, "
+DERIVE_SCHEMA_FROM_FIRST_ROW+"]";
/**
 * Build an AccumuloVTI based on passed in CSV String holding:
 *
 * 1) A reference to the static Accumulo connection properties
 * 2) The physical accumulo table name, an identifier specifying the field to extract for table values (either "values" or "qualifiers"), and a visibility expression
 * 3) An optional 'DERIVE_SCHEMA_FROM_FIRST_ROW' argument to tell the VTI to derive its schema based on the first RowID row.
 * => Default would be to use property: LTBIKES_DS0_SCHEMA, or failing that: LT_BIKES_DEF
 *
 * @param vtiArgs CSV arguments as configured in the LTX_DSY_ARGS property - see VTI_ARGS_SYNTAX_HELP
 * @throws Exception when mandatory arguments are missing or malformed, or the Accumulo connection/scanners cannot be established
 */
public AccumuloVTI(String vtiArgs) throws Exception {
	super(vtiArgs, "AccumuloVTI");
	// 'replacements' holds the CSV arguments beyond the <vtiPropertiesID> prefix, as parsed by AbstractVTI.
	String[] locatorArgs = replacements.toArray( new String[0] );
	if ( 1 > locatorArgs.length )
		throw new Exception("Missing AccumuloVTI table argument: " + vtiArgs + ". " + VTI_ARGS_SYNTAX_HELP);
	final String accumuloTableAndConstraints = locatorArgs[0];
	isDeriveSchemaFromFirstRow = 1 < locatorArgs.length && DERIVE_SCHEMA_FROM_FIRST_ROW.equals(locatorArgs[1]);
	// Split "<tableName> [fieldToExtract] [visibilityExpression]" on spaces, ignoring spaces nested in [] or ""
	String[] toks = Util.splitByTrimmedDelimiterNonNestedInSquareBracketsOrDoubleQuotes(accumuloTableAndConstraints, ' ');
	accumuloTable = toks[0];
	// A token starting with '[' is the visibility expression; otherwise it is the field-to-extract identifier.
	final boolean isFirstConstraintFieldToExtract = 1 < toks.length && '[' != toks[1].charAt(0);
	final int visFieldIdx = 1 < toks.length && '[' == toks[1].charAt(0) ? 1 : ( 2 < toks.length ? 2 : -1 );
	isExtractAccumuloColumnQualifiersInPlaceOfValues = isFirstConstraintFieldToExtract && toks[1].equalsIgnoreCase("qualifiers");
	String visibilityExpression = null; // default null = not specified
	if ( 0 < visFieldIdx ) {
		final String vis = toks[visFieldIdx];
		final int vlen = vis.length();
		if ( '[' != vis.charAt(0) || ']' != vis.charAt(vlen-1) )
			// BUG FIX: previously reported "Missing AccumuloVTI table argument", which was misleading -
			// the table argument is present; it is the visibility expression that is malformed.
			throw new Exception("Malformed visibility expression (expected square brackets) in AccumuloVTI arguments: "
				+ vtiArgs + ". " + VTI_ARGS_SYNTAX_HELP);
		visibilityExpression = vis.substring(1, vlen-1);
	}
	tableScannerAuthorizations = null == visibilityExpression ? new Authorizations() : new Authorizations( visibilityExpression );
	// All properties are referenced relative to: AccumuloVTI.<vtiArgs prefix argument>, e.g. AccumuloVTI.MyUserInstanceTable1
	String instanceName = getVTIProperty(PROPERTY_INSTANCE);
	String zooKeeperHostPortLocationsCSV = getVTIProperty(PROPERTY_ZOOKEEPERS);
	String usr = getVTIProperty(PROPERTY_USR);
	String pwd = getVTIProperty(PROPERTY_PWD);
	// Re-use the cached ZooKeeperInstance for this Accumulo instance name when one exists.
	ZooKeeperInstance instance;
	if ( zooKeeperInstances.containsKey( instanceName ) ) instance = zooKeeperInstances.get( instanceName );
	else zooKeeperInstances.put( instanceName, instance = new ZooKeeperInstance( instanceName, zooKeeperHostPortLocationsCSV ) );
	accumuloConnector = instance.getConnector( usr, pwd );
	try { standardScanner = accumuloConnector.createScanner( accumuloTable, tableScannerAuthorizations ); }
	catch ( TableNotFoundException e ) { throw new SQLException("Unable to construct Accumulo Scanner. table = " + accumuloTable + ", cause: " + e); }
	// Thread-count 10 for the BatchScanner, used when multiple rowID ranges must be scanned.
	try { batchScanner = accumuloConnector.createBatchScanner(accumuloTable, tableScannerAuthorizations, 10); }
	catch ( TableNotFoundException e ) { throw new SQLException("Unable to construct Accumulo BatchScanner. table = " + accumuloTable + ", cause: " + e); }
	tableScanner = standardScanner; // default
}
/**
 * Gives VTI's table schema, i.e. number of columns, their types, names, sizes etc.
 * Deduces this from the first row of data in the targeted table (whose name should be specified in gaiandb_config.properties).
 * This method is always called by the querying engine (Gaian or Derby) *before* query execution.
 */
@Override public GaianResultSetMetaData getMetaData() throws SQLException {
// Default case: schema comes from the configured properties (LTX_DSY_SCHEMA or the logical table def), resolved by the super class.
if ( false == isDeriveSchemaFromFirstRow ) accumuloTableRSMD = super.getMetaData();
else if ( null == accumuloTableRSMD ) { // derive once, then cache for subsequent calls
// Get table shape from first accumulo record
rowScanIterator = standardScanner.iterator();
if ( false == rowScanIterator.hasNext() )
throw new SQLException("Table has no data to derive it's schema. Table name = " + accumuloTable);
Key key = rowScanIterator.next().getKey();
Text rowID = key.getRow(), previousRowID = null;
// Each distinct column family seen under the first rowID becomes a VARCHAR(256) column, optionally prefixed by a ROWID column.
StringBuilder tableDefSB = new StringBuilder(
( isRowidInSchema ? ROWID + ' ' + VC256 + ',' : "" ) + key.getColumnFamily() + ' ' + VC256 );
while ( rowScanIterator.hasNext() ) {
key = rowScanIterator.next().getKey();
previousRowID = rowID;
rowID = key.getRow();
if ( false == rowID.equals(previousRowID) ) break; // stop when a full record has been read.
tableDefSB.append( ',' + key.getColumnFamily().toString() + ' ' + VC256 );
}
reinitialise(); // clear scanner for re-use
try { accumuloTableRSMD = new GaianResultSetMetaData( tableDefSB.toString() ); }
catch (Exception e) {
throw new SQLException("Unable to build AccumuloVTI RSMD table schema from definition: "
+ tableDefSB + " (returning null), cause: " + e);
}
}
// ROWID must always be included... if missing then hardly any qualifiers can be pushed down at all.
// The logical table could still be configured to cut out the ROWID if this was really necessary (but performance could no longer be optimised).
// isIncludeRowID = null != accumuloTableRSMD && "ROWID".equalsIgnoreCase( accumuloTableRSMD.getColumnName(1) );
return accumuloTableRSMD;
}
/**
 * The args passed here include:
 * 1) End-point instance ID (if VTI implements Pluralizable),
 * 2) All arguments passed to or created in the invoking parent GaianTable().
 * The end-point instance ID is passed as first argument of the String[] in this method
 *
 * @param args argument array forwarded by the engine; may legitimately be empty
 * @throws Exception propagated from the super class argument processing
 */
@Override public void setArgs(String[] args) throws Exception {
	super.setArgs(args); // allow super class to extract arguments it may need, e.g. QRY_CONTEXT_ARG_LT_DEF, to deduce default table schema.
	// Note we don't use default table schema based on logical table definition because we need to give exact column family names
	// to accumulo when specifying projected columns - there is no room for using differently named columns or different character case
	// for them in the logical table.
	// BUG FIX: guard against a null array - Arrays.asList(null) would throw a NullPointerException from a mere log statement.
	logger.logInfo("Entered setArgs(), args are: " + (null == args ? "null" : Arrays.asList(args)) );
	// if ( null != args && 0 < args.length ) { if ( null != args[0] ) pluralizationInstance = args[0]; } // Not interested in pluralization yet
}
/**
 * Records the 1-based column indexes requested by the query engine.
 * Always returns true, i.e. the projection is always accepted.
 */
@Override public boolean pushProjection(VTIEnvironment arg0, int[] arg1) throws SQLException {
	logger.logThreadDetail("Entered AccumuloVTI.pushProjection(), projection: " + Util.intArrayAsString(arg1));
	if ( null == arg1 ) return true; // nothing to record
	projectedColumns = arg1;
	return true;
}
/**
 * Qualifyable interface - used to process predicates against our columns -
 * Note that predicates on our *constant* columns (e.g. for each pluralizable instance) should ideally be processed above by GaianDB (if we passed
 * the constant instances up with getEndpointConstantColumns()) - however it does little harm to test/filter them again here just in case.
 *
 * Converts ROWID predicates into Accumulo Ranges and other-column predicates into regex pre-scan iterators,
 * then keeps only the qualifiers that could not be converted (for local re-checking in nextRow()).
 */
@Override public void setQualifiers(VTIEnvironment vtie, Qualifier[][] qual) throws SQLException {
	logger.logThreadDetail("Entered AccumuloVTI.setQualifiers(), qualifiers: " + RowsFilter.reconstructSQLWhereClause(qual));
	qualifiers = qual;
	try {
		preScanIterators.clear();
		// Prepare range predicates based on given column qualifiers.
		QualifiersToAccumuloRangesConverter predicatesConverter = new QualifiersToAccumuloRangesConverter(qualifiers);
		if ( predicatesConverter.isQualifiersResolveToFalse() ) {
			logger.logInfo("Resolved column ranges - one of which is empty - ABORTING QUERY");
			projectedColumns = new int[0]; // empty projection signals nextRow() to return SCAN_COMPLETED immediately
			return;
		}
		rowidRangesPredicates = predicatesConverter.getAccumuloRanges();
		logger.logInfo("Qualifiers resolved to "+ (null==rowidRangesPredicates?"0 Accumulo Ranges on ROWID":
		rowidRangesPredicates.size() + " Accumulo Ranges: " + rowidRangesPredicates) );
		qualifiers = predicatesConverter.getPrunedQualifiers();
		logger.logInfo("Remaining Qualifiers: " + RowsFilter.reconstructSQLWhereClause(qualifiers, accumuloTableRSMD));
		// BUG FIX: projectedColumns is null when the engine pushed no projection (pushProjection() not called).
		// Previously the loop below threw a NullPointerException (swallowed by the catch), losing the regex pushdown.
		// Regex pre-scan iterators are an optimisation only, so it is safe to skip building them in that case.
		if ( null != projectedColumns ) {
			for ( int i=0; i<projectedColumns.length; i++ ) {
				final int pColID = projectedColumns[i];
				if ( 1 == pColID ) continue; // ROWID predicates are handled via Ranges, not regex filters
				final String colName = accumuloTableRSMD.getColumnName( pColID );
				String regex = predicatesConverter.getPredicatesRegexForColID( pColID-1 ); // passed in colID must be 0-based
				if ( null == regex || ".*".equals(regex) ) continue; // null case should not happen
				logger.logInfo("Built REGEX filter for column '" + colName + "' (colID " + pColID + "): " + regex);
				IteratorSetting regexIteratorSetting = new IteratorSetting(1, "FilterForQualsOrValues_" + pColID, RegExFilter.class);
				// RegExFilter.setRegexs( regexIteratorSetting, rowTerm, cfTerm, cqTerm, valueTerm, orFields? );
				if ( isExtractAccumuloColumnQualifiersInPlaceOfValues )
					RegExFilter.setRegexs( regexIteratorSetting, null, "^(?:"+colName+")$", regex, null, false );
				else RegExFilter.setRegexs( regexIteratorSetting, null, "^(?:"+colName+")$", null, regex, false );
				// Adding 1 regex scan iterator per column directly to the scanner doesn't work the way we want
				// (only the column with the predicate is retrieved), so they are stashed for pre-scans in executeAsFastPath().
				preScanIterators.add( regexIteratorSetting );
			}
		}
	} catch ( Exception e ) {
		logger.logImportant("Exception whilst resolving Accumulo Ranges from qualifiers (using empty Ranges list), cause " + Util.getStackTraceDigest(e));
	}
}
/**
 * Prepare table scanner and its iterators.
 * Use scanner.fetchColumnFamily() to specify requested columns.
 * Use scanner.setRanges(), scanner.fetchColumn() and custom scanner iterators to specify column filters.
 */
@Override public boolean executeAsFastPath() throws SQLException {
try {
final int columnCount = accumuloTableRSMD.getColumnCount();
// Initialise array of queried columns if not set.
if ( null == projectedColumns ) {
projectedColumns = new int[ columnCount ];
for ( int i=0; i<columnCount; i++ ) projectedColumns[i] = i+1; // 1-based
}
// Pick scanner type: BatchScanner for multiple rowID Ranges, plain Scanner for a single one.
if ( null != rowidRangesPredicates ) {
int numRanges = rowidRangesPredicates.size();
if ( 1 < numRanges ) ((BatchScanner) (tableScanner = batchScanner)).setRanges( rowidRangesPredicates );
else if ( 1 == numRanges ) ((Scanner) (tableScanner = standardScanner)).setRange( rowidRangesPredicates.get(0) );
}
if ( false == preScanIterators.isEmpty() ) {
// Single regex filter, and the only extracted column is the filtered one (possibly plus ROWID):
if ( 1 == preScanIterators.size() && ( 1 == projectedColumns.length || ( 2 == projectedColumns.length && 1 == projectedColumns[0] ) ) )
// No need for pre-scans as there is only 1 column with predicates and no other columns are being extracted.
tableScanner.addScanIterator( preScanIterators.get(0) );
else {
// Otherwise run one pre-scan per regex filter, each narrowing the set of qualifying rowID Ranges for the next.
for ( IteratorSetting it : preScanIterators ) {
tableScanner.addScanIterator( it );
long millis = System.currentTimeMillis();
rowidRangesPredicates = iterateAndExtractRowRanges( tableScanner.iterator() );
logger.logInfo("Scanned rowID ranges matching a column's predicates in " + (System.currentTimeMillis() - millis)
+ "ms. Remaining RowIDs ranges count = " + rowidRangesPredicates.size());
// No rowIDs survive the filter: empty projection signals nextRow() to complete immediately.
if ( rowidRangesPredicates.isEmpty() ) { projectedColumns = new int[0]; break; }
tableScanner.clearScanIterators();
int numRanges = rowidRangesPredicates.size();
if ( 1 < numRanges ) ((BatchScanner) (tableScanner = batchScanner)).setRanges( rowidRangesPredicates );
else if ( 1 == numRanges ) ((Scanner) (tableScanner = standardScanner)).setRange( rowidRangesPredicates.get(0) );
}
}
}
// Initialise a shift value to 1 if ROWID needs setting in nextRow()
if ( 0 == projectedColumns.length ) return true;
rowidColShift = 1 == projectedColumns[0] ? 1 : 0;
// Initialise mapping of column names to column indexes
for ( int i=0; i<projectedColumns.length; i++ ) {
int pColID = projectedColumns[i];
projectedColumnsNameToIndexMap.put( accumuloTableRSMD.getColumnName(pColID), pColID );
}
// Specify all columns required to be extracted for this query
for ( int i=0; i<projectedColumns.length; i++ ) {
final int pColID = projectedColumns[i];
if ( 1 == pColID ) continue;
final String colName = accumuloTableRSMD.getColumnName( pColID );
logger.logInfo("Adding projected column (column family): " + colName);
tableScanner.fetchColumnFamily( new Text(colName) );
// NOTE - Could make use of the following if the column name were to be built from family+qualifier fields:
// tableScanner.fetchColumn( new Text("<colFamily>"), new Text("<colQualifier>") );
}
rowScanIterator = tableScanner.iterator();
} catch ( Exception e ) {
throw new SQLException("Exception in AccumuloVTI.executeAsFastPath(), cause: " + Util.getStackTraceDigest(e));
}
return true;
}
/**
 * Collects one single-row Range per record returned by the given scan iterator, then merges overlapping Ranges.
 * This method must only be called with an iterator that will retrieve 1 Accumulo record per rowID.
 *
 * @param scanIterator iterator over the Accumulo entries of a pre-scan
 * @return merged list of Ranges, one per distinct matching rowID
 */
private List<Range> iterateAndExtractRowRanges( Iterator<Map.Entry<Key, Value>> scanIterator ) {
	final List<Range> matchedRowRanges = new ArrayList<Range>();
	for ( ; scanIterator.hasNext(); )
		matchedRowRanges.add( new Range( scanIterator.next().getKey().getRow() ) );
	return Range.mergeOverlapping( matchedRowRanges );
}
/**
 * GaianDB extracts rows by calling this method repeatedly.
 * 'dvdRecord' contains the number of columns resolved in tableShapeRSMD.
 * However we only need to populate the projected columns indexes.
 *
 * Each relational row is assembled from the run of consecutive Accumulo entries sharing one rowID
 * (rows don't have to be complete - missing column families are left as NULL).
 *
 * @param dvdRecord output row buffer; projected column cells are populated, others untouched
 * @return IFastPath.GOT_ROW when a row was produced, IFastPath.SCAN_COMPLETED when exhausted
 */
@Override public int nextRow( final DataValueDescriptor[] dvdRecord ) throws StandardException, SQLException {
	if ( 0 == rowCount ) {
		numRowsReceivedFromAccumulo = 0;
		if ( 0 == projectedColumns.length || false == rowScanIterator.hasNext() ) return IFastPath.SCAN_COMPLETED; // empty table or aborted query
		else currentAccumuloRow = rowScanIterator.next(); // kick-start row extraction
	}
	// Check if there are any Accumulo records left...
	if ( null == currentAccumuloRow ) return IFastPath.SCAN_COMPLETED;
	Key key = currentAccumuloRow.getKey(); // lots of info available off the Key: rowID, col name/family, col qualifier, visibility, timestamp
	Text rowID = key.getRow();
	// Look for a new record... until one is found that meets qualifiers, or until none are left
	do {
		// Check if there are any Accumulo records left...
		if ( null == currentAccumuloRow ) return IFastPath.SCAN_COMPLETED;
		numRowsReceivedFromAccumulo++;
		// Set rowID column before extracting others associated with it in the while loop
		if ( 1 == rowidColShift ) dvdRecord[0].setValue( rowID.toString() );
		// Initialise column cells to NULL value.
		for ( int i=rowidColShift; i<projectedColumns.length; i++ )
			dvdRecord[ projectedColumns[i]-1 ].setToNull();
		// Extract columns from Accumulo records for this rowID - note: Accumulo rows don't have to be complete
		Text previousRowID = rowID;
		while ( rowID.equals( previousRowID ) ) {
			final String colName = key.getColumnFamily().toString();
			final Integer pColID = projectedColumnsNameToIndexMap.get(colName);
			// BUG FIX: an unrequested column family previously triggered a 'continue' *without advancing the
			// iterator*, spinning in an infinite loop. Now we just skip the cell assignment and always advance.
			if ( null == pColID )
				logger.logImportant("Encountered Accumulo column which was not requested as column family (skipped): " + colName);
			else {
				// Extract the configured field (column qualifier or value) for this cell
				final String cellStringValue = isExtractAccumuloColumnQualifiersInPlaceOfValues ?
					currentAccumuloRow.getKey().getColumnQualifier().toString() : currentAccumuloRow.getValue().toString();
				// Set column value for the row - this also does type conversion.
				dvdRecord[ pColID-1 ].setValue( cellStringValue ); // normalise to 0-based
			}
			// Scroll to the next Accumulo entry - break if we run out of records (rows don't have to be complete)
			if ( false == rowScanIterator.hasNext() ) {
				currentAccumuloRow = null;
				break;
			}
			currentAccumuloRow = rowScanIterator.next();
			key = currentAccumuloRow.getKey();
			previousRowID = rowID;
			rowID = key.getRow();
		}
	// Re-check any remaining (unconverted) qualifiers locally before surfacing the row
	} while ( null != qualifiers && false == RowsFilter.testQualifiers( dvdRecord, qualifiers ) );
	rowCount++;
	return IFastPath.GOT_ROW;
}
// PluralizableVTI methods - used to re-factor/generalise/simplify data source wrapper configurations
// Note each endpointID will appear as a suffix in the GDB_LEAF column when querying the logical table augmented with the provenance columns.
// @Override public Set<String> getEndpointIDs() { return endpointConstants.keySet(); }
// @Override public DataValueDescriptor[] getEndpointConstants(String endpointID) { return endpointConstants.get( endpointID ); }
// Number of relational rows returned so far for the current query (reset by reinitialise()).
@Override public int getRowCount() throws Exception { return rowCount; }
// Empty local heap resources for this instance and set them to null if possible
// Delegates to the super class then resets scanners/counters via reinitialise() so the instance can be re-used.
@Override public void close() throws SQLException { super.close(); reinitialise(); } //endpointConstants.clear(); }
// An unbounded Range (and single-element list of it) used to reset the scanners to full-table scope.
private static final Range RANGE_INFINITY = new Range();
private static final List<Range> RANGES_INFINITY = Arrays.asList( RANGE_INFINITY );
/**
 * Clears the query objects and returns true
 */
@Override public boolean reinitialise() {
// Only log the per-query row counts if a query actually ran since the last reset.
if ( 0 < rowCount )
logger.logImportant("Re-initialising AccumuloVTI. Row Counts for last query - filtered locally/remotely: "
+ rowCount + '/' + numRowsReceivedFromAccumulo);
// Drop fetched-column restrictions and scan iterators, and restore an unbounded range on the active scanner.
tableScanner.clearColumns();
tableScanner.clearScanIterators();
if ( tableScanner == standardScanner ) standardScanner.setRange( RANGE_INFINITY );
else batchScanner.setRanges( RANGES_INFINITY );
tableScanner = standardScanner; // default
rowCount = 0;
return true;
}
// The cursor is before the first row until nextRow() has returned at least one row for the current query.
@Override public boolean isBeforeFirst() throws SQLException { return 0 == rowCount; }
// VTICosting methods - used for JOIN optimisations
// Returning 0 for both estimates makes this data source look maximally cheap to the optimiser.
@Override public double getEstimatedCostPerInstantiation(VTIEnvironment arg0) throws SQLException { return 0; }
@Override public double getEstimatedRowCount(VTIEnvironment arg0) throws SQLException { return 0; }
// NOTE(review): presumably disabled because scanner/iterator state is not re-entrant across instantiations - confirm.
@Override public boolean supportsMultipleInstantiations(VTIEnvironment arg0) throws SQLException { return false; }
// Converts pushed-down Derby qualifiers into Accumulo Ranges (for the ROWID column) and, via
// getPredicatesRegexForColID(), into regex expressions for other columns; also prunes the converted qualifiers.
private class QualifiersToAccumuloRangesConverter {
private final static int ROWID_COLID = 0; // This VTI always exposes the rowID as the first column of the result.
private final Qualifier[][] originalQualifiers;
private Qualifier[][] prunedQualifiers; // qualifiers left over after conversion - still need local testing in nextRow()
// Mapping from column ID to Lists of Range objects converted from qualifiers on the columnID
private final Map<Integer, List<Range>> accumuloColIDs2RangesMap; // Integer colID is 0-based
// Derives the per-column Ranges immediately on construction; the map is null when some range resolved to empty.
private QualifiersToAccumuloRangesConverter( Qualifier[][] queryColumnPredicates ) throws StandardException, SQLException {
originalQualifiers = queryColumnPredicates;
accumuloColIDs2RangesMap = deriveAccumuloRangesAndPruneQualifiers(); // might be null if a range resolves to empty
}
// True when the predicates can never match any record (signalled by a null map).
private boolean isQualifiersResolveToFalse() { return null==accumuloColIDs2RangesMap; }
// Ranges derived for the ROWID column; null when qualifiers resolve to false or when ROWID has no range restriction.
private List<Range> getAccumuloRanges() { return null==accumuloColIDs2RangesMap ? null : accumuloColIDs2RangesMap.get( ROWID_COLID ); }
// Qualifiers remaining after converted ones were pruned out.
public Qualifier[][] getPrunedQualifiers() { return prunedQualifiers; }
/**
 * Converts the List of Ranges derived for a column to a regular expression - expects the column type to be String (UTF8).
 *
 * Example:
 * Original predicates/range: (C>="c" AND C<"i") OR C="bob"  =>  equivalent REGEX: ^(?:[c-h].*|bob)$
 *
 * We wanted a solution that would work with Accumulo 1.4.4.
 * However a better solution in future (1.5+) is to use a full server-side Filter, e.g. AccumuloPredicateHandler:
 * https://github.com/bfemiano/accumulo-hive-storage-manager/wiki/Iterator%20Predicate%20pushdown
 * http://storage-handler-docs.s3.amazonaws.com/javadocs/org/apache/accumulo/storagehandler/predicate/AccumuloPredicateHandler.html
 *
 * @param colID - 0-based column ID
 * @return a regular expression equivalent to the list of Ranges converted from the qualifiers; or ".*" if all records pass, and null if none do (empty range).
 */
public String getPredicatesRegexForColID( int colID ) { // Passed in colID must be 0-based
	if ( null == accumuloColIDs2RangesMap ) return null; // it has been determined that no records can match the query
	final List<Range> columnRanges = accumuloColIDs2RangesMap.get(colID);
	if ( null == columnRanges ) return ".*"; // no range restrictions (i.e. no filter), so all records match
	if ( columnRanges.isEmpty() ) return null; // an empty range means that no records can match the query
	// Ranges list is already ordered... OR each range's regex inside one anchored non-capturing group.
	final StringBuilder regexBuilder = new StringBuilder("^(?:");
	for ( Range colRange : columnRanges ) {
		regexBuilder.append( constructRegexForRange( colRange ) );
		regexBuilder.append('|');
	}
	regexBuilder.setLength( regexBuilder.length() - 1 ); // drop the trailing '|'
	return regexBuilder.append(")$").toString();
}
/**
* Example ranges to consider for conversion to regex (Note Accumulo Ranges methods will have already eliminated impossible ranges like >="aaa" and <"aa"):
*
* v>"a" and v<max => a..* | [b-max].*
*
* v>"aaa" and v<max => aaa..*| aa[b-max].* | a[b-max].* | [b-max].*
* v>="aaa" => aaa.* | aa[b-max].* | a[b-max].* | [b-max].*
* v>="aaa" and v<="ad" => aaa.* | aa[b-max].* | a[b-c].* | ad
* v>="aaa" and v<"ad" => aaa.* | aa[b-max].* | a[b-c].*
* v>="aaa" and v<"ac" => aaa.* | aa[b-max].* | ab.*
* v>="aaa" and v<"ab" => aaa.* | aa[b-max].*
*
* v>="ab" and v<"azz" => ab.* | a[c-y].* | az | az[min-y].*
*
* v>="aaa" and v<"acc" => aaa.* | aa[b-max].* | ab.* | ac | ac[min-b].*
* v>="aaa" and v<"aad" => aaa.* | aa[b-c].*
* v>="aaa" and v<"aadd" => aaa.* | aa[b-c].* | aad[e-max].*
*
* v>="aaa" and v<"aaad" => aaa | aaa[min-c].*
* v>"aaa" and v<"aaadd" => aaa[min-c].* | aaad | aaad[min-c].*
*
* v>="aaa" and v<="aaa" => aaa
*
* v>="aaa" and v<"bbb" => aaa.* | aa[b-max].* | a[b-max].* | b | b[min-a].* | bb | bb[min-a].*
* v>="aaa" and v<"cbb" => aaa.* | aa[b-max].* | a[b-max].* | b.* | c | c[min-a].* | cb | cb[min-a].*
* v>="aaa" and v<"dbb" => aaa.* | aa[b-max].* | a[b-max].* | [b-c].* | d | d[min-a].* | db | db[min-a].*
*
* v>min and v<"bbb" => [min-a].* | b | b[min-a].* | bb | bb[min-a].*
* v>min and v<"b" => [min-a].*
*
* v>="a" and v<"b" => a.*
*
* Basic algorithm:
* 1. If keys are equal and included, quote the value and return it.
* 2. If the start key is negative infinity skip to step 7.
* 3. If the start key is a sub-string of the end key, skip to step 7
* 4. Create a starting regex matching the whole left bound string. If the start key is not included, then enforce existence of a trailing char
* 5. Create a new ORed regex for every sub-string of it, until it matches the beginning of the end key
* -> build every new substring by removing right-most chars 1 at a time, and adding 1 to the rightmost char
* 6. If/when the last sub-string matches the beginning of the end key, create an ORed regex range bounded on both sides (unless left bound+1 > right bound-1)
* 7. Add an ORed regex for every remaining super-string of the end key (adding chars left to right and subtracting 1 from the rightmost char).
* -> For each of these, also pre-pend another ORed value for the exact new super-string (i.e. with no suffix)
* 8. Add a final regex matching the whole right-bound string if it is included.
*
* @param range
* @return
*/
private String constructRegexForRange( Range range ) {
final Key k1 = range.getStartKey(), k2 = range.getEndKey();
final boolean inf1 = range.isInfiniteStartKey(), inf2 = range.isInfiniteStopKey();
if ( true == inf1 && true == inf2 ) return ".*"; // infinite range
final String v1 = true == inf1 ? null : k1.getRow().toString(), v2 = true == inf2 ? null : k2.getRow().toString();
logger.logDetail( "startKey: " + (null == v1 ? "-inf" : "'" + v1 + "' (incl? " + range.isStartKeyInclusive() + ")") );
logger.logDetail( "endKey: " + (null == v2 ? "+inf" : "'" + v2 + "' (incl? " + range.isEndKeyInclusive() + ")") );
// 1. If keys are equal and included, quote the value and return it.
if ( null != v1 && v1.equals(v2) && range.isStartKeyInclusive() ) return rQuote(v1); // Quote/wrap in case v1 contains special characters
// Keys are not equal, so get ready for an actual range...
StringBuilder regex = new StringBuilder();
int prefixLen = 0; // the prefix which may be common to v1 and v2, e.g: "aa" is a common prefix for v1="aaaa" and v2="aadd", and used to form expressions like: "aa[b-c].*"
final int len1 = inf1 ? 0 : v1.length(), len2 = inf2 ? 0 : v2.length();
// 2. If the start key is infinite skip to step 7.
if ( false == inf1 ) {
// 3. If the start key is a sub-string of the end key, skip to step 7
if ( false == inf2 && v2.startsWith(v1) )
prefixLen = v1.length();
else {
// 4. Create a starting regex matching the whole left bound string. If the start key is not included, then enforce existence of a trailing char
regex.append( rQuote(v1) + (range.isStartKeyInclusive()?"":".") + ".*" + '|' );
logger.logDetail("Char values in '" + v1 + "': " + unicodeValuesAsString(v1) );
// 5. Create a new ORed regex for every sub-string of it, until it matches the beginning of the end key
for ( int i=1; i<=len1; i++ ) {
prefixLen = len1-i; // length of the next prefix
String prefix = v1.substring(0, prefixLen);
if ( '\u00FF' == v1.charAt(prefixLen) ) continue;
char c1 = (char) (v1.charAt(prefixLen)+1); // character just beyond the prefix, shifted up 1 in the utf8 range.
// logger.logDetail("len2 " + len2 + ", prefixLen " + prefixLen + ", prefix " + prefix);
if ( false == inf2 && len2 > prefixLen && v2.startsWith(prefix) ) {
// 6. If/when the last sub-string matches the beginning of the end key, create an ORed regex range bounded on both sides (unless left bound+1 > right bound-1)
char c2 = (char) (v2.charAt(prefixLen)-1); // character just beyond the prefix in the upper bound, shifted down 1 in the utf8 character set.
if ( c1 < c2 ) regex.append( rQuote(prefix) + '[' + toUnicode(c1) + '-' + toUnicode(c2) + "].*" + '|' );
else if ( c1 == c2 ) regex.append( rQuote(prefix) + toUnicode(c1) + ".*" + '|' );
prefixLen++;
break;
}
if ( '\u00FF' > c1 ) regex.append( rQuote(prefix) + '[' + toUnicode(c1) + '-' + UMAX + "].*" + '|' );
else if ( '\u00FF' == c1 ) regex.append( rQuote(prefix) + UMAX + ".*" + '|' );
}
}
}
// 7. Add an ORed regex for every remaining super-string of the end key (adding chars left to right and subtracting 1 from the rightmost char).
if ( false == inf2 ) {
logger.logDetail("Char values in '" + v2 + "': " + unicodeValuesAsString(v2) );
for ( int i=prefixLen; i<len2; i++ ) {
String prefix = v2.substring(0, i);
regex.append( rQuote(prefix) + '|' );
if ( '\u0000' == v2.charAt(i) ) continue;
// if ( '\u0000' == v2.charAt(i) ) { regex.append( rQuote(prefix) + '|' ); continue; }
char c2 = (char) (v2.charAt(i)-1); // character just beyond the prefix, shifted down 1 in the utf8 character set.
if ( '\u0000' < c2 ) regex.append( rQuote(prefix) + '[' + UMIN + '-' + toUnicode(c2) + "].*" + '|' );
else if ( '\u0000' == c2 ) regex.append( rQuote(prefix) + UMIN + ".*" + '|' );
}
}
// 8. Add a final regex matching the whole right-bound string if it is included.
if ( range.isEndKeyInclusive() ) regex.append( rQuote(v2) );
else regex.deleteCharAt( regex.length()-1 ); // remove last '|' symbol
return regex.toString();
}
/**
 * Renders the characters of s (each via toUnicode()) as a bracketed, comma-separated list,
 * e.g. "abc" -> "[a, b, c]". Used for diagnostic logging of range-key strings.
 *
 * @param s the string to render; may be null
 * @return null when s is null; "[]" for an empty string; otherwise "[c0, c1, ...]"
 */
public String unicodeValuesAsString( String s ) {
	if ( null == s ) return null;
	final int len = s.length();
	// StringBuilder rather than StringBuffer: this is a method-local buffer, no synchronization needed.
	StringBuilder pcs = new StringBuilder( "[" );
	for ( int i = 0; i < len; i++ ) {
		if ( 0 < i ) pcs.append( ", " );
		pcs.append( toUnicode( s.charAt(i) ) );
	}
	return pcs.append(']').toString();
}
private String rQuote( String s ) { return 0 == s.length() ? "" : "\\Q" + s + "\\E"; }
private String toUnicode( char c ) { return c+""; }
// private String toUnicode( char c ) { return ((16>(int)c) ? "\\u000" : "\\u00" ) + Integer.toHexString(c); }
// Regex escape sequences for the lowest/highest characters used as character-class bounds
// (e.g. "[\u0000-c]" / "[c-\u00FF]") when converting key Ranges into regular expressions.
// NOTE(review): '\u00FF' is treated as the maximum key character by the range-to-regex logic
// (see the '\u00FF' checks above) - this assumes keys are confined to the Latin-1 range; confirm.
private static final String UMIN = "\\u0000";
private static final String UMAX = "\\u00FF";
/**
 * Create a list of ranges for each column, based on conditional expressions in the Qualifier[][] structure, which holds predicates in conjunctive normal form.
 * Each list of ranges represents conditions for 1 column.
 * The ranges are only built for columns that are compared to strings.
 * All comparison expressions ORed with comparisons involving other columnIDs are excluded entirely from all derived lists of ranges.
 *
 * Finally, this method also creates the prunedQualifiers field, which holds the original qualifiers minus those that were extracted to build the ranges lists.
 *
 * @return a map from 0-based column ID to the merged list of Accumulo Ranges derived for that column;
 *         or null if the ANDed predicates (first qualifier row) are contradictory - no record can match.
 * @throws SQLException, StandardException propagated from qualifier value extraction.
 */
private Map<Integer, List<Range>> deriveAccumuloRangesAndPruneQualifiers() throws SQLException, StandardException {
	// Mapping from column ID to Lists of Range objects converted from qualifiers on the columnID
	Map<Integer, List<Range>> colIDs2RangesMap = new HashMap<Integer, List<Range>>(); // Integer colID is 0-based
	// Default: nothing pruned until predicates are successfully converted to ranges below.
	prunedQualifiers = originalQualifiers;
	if ( null == originalQualifiers || 0 == originalQualifiers.length ) return colIDs2RangesMap;
	Set<Integer> excludedColIDs = new HashSet<Integer>();
	// Find all column IDs that are compared to values that are not strings.
	// We will not push predicates down to Accumulo for these columns.
	for ( int i=0; i<originalQualifiers.length; i++ )
		for ( int j=0; j<originalQualifiers[i].length; j++ )
			if ( false == originalQualifiers[i][j].getOrderable() instanceof SQLChar )
				excludedColIDs.add( originalQualifiers[i][j].getColumnId() );
	// NOTE(review): excludedColIDs is populated above but never consulted in the loops below -
	// confirm whether non-string comparisons should be skipped when building/pruning ranges.
	// Process 1st row, which contains ANDed expressions.
	Qualifier[] qRow = originalQualifiers[0];
	for ( int j=0; j<qRow.length; j++ ) {
		Qualifier qCell = qRow[j];
		int colID = qCell.getColumnId();
		// Create new Ranges for this qCell.
		// Example: ROWID!=3 becomes ranges: [null,3[ ; ]3,null]
		List<Range> newRanges = constructRangesFromSingleQualifier( qCell ); // can't be empty
		List<Range> accumuloRanges = colIDs2RangesMap.get( colID );
		// First predicate seen for this column: its ranges stand alone.
		if ( null == accumuloRanges ) { colIDs2RangesMap.put( colID, newRanges ); continue; }
		// Clip new ranges (i.e. intersect) with current ones.
		// Example for ROWID!=3 AND ROWID!=5 we end up with: [null,3[ ; ]3,5[ ; ]5,null]
		List<Range> combinedRanges = new ArrayList<Range>();
		for ( Range r1 : accumuloRanges ) for ( Range r2 : newRanges ) {
			Range r = r1.clip(r2, true); // 'true' => clip yields null (checked below) for disjoint ranges
			if ( null != r ) combinedRanges.add( r );
		}
		if ( (newRanges = Range.mergeOverlapping( combinedRanges )).isEmpty() ) return null; // empty range - no records can match
		// Reduce current set before moving on to the next ANDed cell in this first row.
		colIDs2RangesMap.put( colID, newRanges );
	}
	List<Qualifier[]> remainingQualifiers = new ArrayList<Qualifier[]>();
	remainingQualifiers.add( new Qualifier[0] ); // The first row contains anded predicates - we have pruned them all...
	// Process all subsequent rows, which now contain ORed expressions - also combine these with previous rows
	// ORed rows that reference columns other than the ROW ID column are skipped/ignored because they may just be true.
	// Rows are combined with previous ones by intersecting them
	for ( int i=1; i<originalQualifiers.length; i++ ) {
		qRow = originalQualifiers[i];
		boolean isOnlyOneColIDReferencedInThisORedQrow = true;
		int colID = qRow[0].getColumnId();
		for ( int j=1; j<qRow.length; j++ )
			if ( colID != qRow[j].getColumnId() ) { isOnlyOneColIDReferencedInThisORedQrow = false; break; }
		// Ignore ORed expressions containing more than 1 column ID - they can't enforce constraints on the Ranges list because the other column may resolve to true.
		if ( false == isOnlyOneColIDReferencedInThisORedQrow ) {
			remainingQualifiers.add( qRow ); // keep the row so the engine can still post-filter on it
			continue;
		}
		// Create ranges for this ORed row
		List<Range> oredRanges = new ArrayList<Range>();
		for ( int j=0; j<qRow.length; j++ )
			oredRanges.addAll( constructRangesFromSingleQualifier(qRow[j]) ); // can't become empty
		List<Range> accumuloRanges = colIDs2RangesMap.get( colID );
		if ( null == accumuloRanges ) { colIDs2RangesMap.put( colID, Range.mergeOverlapping( oredRanges ) ); continue; } // can't resolve to empty here
		// Now clip (i.e. intersect) all current ranges with ranges built in this row.
		List<Range> combinedRanges = new ArrayList<Range>();
		for ( Range r1 : accumuloRanges ) for ( Range r2 : oredRanges ) {
			Range r = r1.clip(r2, true);
			if ( null != r ) combinedRanges.add( r );
		}
		// NOTE(review): the isEmpty() result below is deliberately discarded (empty 'if' statement;
		// the short-circuit 'return null' is commented out) - unlike the ANDed case above, an empty
		// intersection here is stored in the map as an empty list rather than aborting the scan.
		// Confirm this asymmetry is intended.
		if ( (accumuloRanges = Range.mergeOverlapping( combinedRanges )).isEmpty() ); // return null; // empty range - no records can match
		// Finally, try to reduce the current set before moving on to the next ORed row
		colIDs2RangesMap.put( colID, accumuloRanges );
	}
	logger.logDetail("Built Map <colID -> List<Range>> (size " + colIDs2RangesMap.size() + "): " + colIDs2RangesMap);
	// Expose any rows that could not be converted to ranges; null when every row was converted.
	prunedQualifiers = 2 > remainingQualifiers.size() ? null : (Qualifier[][]) remainingQualifiers.toArray( new Qualifier[0][] );
	return colIDs2RangesMap;
}
}
/**
 * Converts a single Derby Qualifier (column op value, possibly negated) into the equivalent
 * list of Accumulo Ranges over the compared string value.
 * The returned list is never empty: equality yields one single-value range, negated equality
 * yields the two open ranges either side of the value, and every (possibly negated)
 * inequality yields exactly one half-bounded range.
 *
 * @param q the qualifier to convert
 * @return the list of ranges equivalent to the qualifier's predicate
 * @throws SQLException if the qualifier's operator is not a recognised Orderable operator
 * @throws StandardException propagated from reading the qualifier's comparison value
 */
private static List<Range> constructRangesFromSingleQualifier( final Qualifier q ) throws SQLException, StandardException {
	final String val = q.getOrderable().getString();
	final int operator = q.getOperator();
	final boolean negate = q.negateCompareResult();
	final List<Range> ranges = new ArrayList<Range>();
	switch ( operator ) {
		case Orderable.ORDER_OP_EQUALS:
			if ( negate ) {
				// col != val  =>  (-inf, val) U (val, +inf)
				ranges.add( new Range(null, false, val, false) );
				ranges.add( new Range(val, false, null, false) );
			}
			else ranges.add( new Range(val) ); // col == val => single-value range
			break;
		case Orderable.ORDER_OP_LESSTHAN:
			// col < val  =>  (-inf, val) ; negated means col >= val  =>  [val, +inf)
			ranges.add( negate ? new Range(val, null) : new Range(null, false, val, false) );
			break;
		case Orderable.ORDER_OP_GREATEROREQUALS:
			// col >= val  =>  [val, +inf) ; negated means col < val  =>  (-inf, val)
			ranges.add( negate ? new Range(null, false, val, false) : new Range(val, null) );
			break;
		case Orderable.ORDER_OP_GREATERTHAN:
			// col > val  =>  (val, +inf) ; negated means col <= val  =>  (-inf, val]
			ranges.add( negate ? new Range(null, val) : new Range(val, false, null, false) );
			break;
		case Orderable.ORDER_OP_LESSOREQUALS:
			// col <= val  =>  (-inf, val] ; negated means col > val  =>  (val, +inf)
			ranges.add( negate ? new Range(val, false, null, false) : new Range(null, val) );
			break;
		default:
			String errmsg = "Invalid operator detected (not one of the Orderable interface): " + operator;
			logger.logThreadWarning(GDBMessages.ENGINE_OPERATOR_INVALID, "DERBY ERROR: " + errmsg);
			throw new SQLException( errmsg );
	}
	return ranges;
}
// /**
// * Automatically creates a logical table based on an Accumulo one.
// * Logical table column types can later be changed manually by the user in gaiandb_config.properties if they want
// * to expose them via more useful types for searching.
// *
// * @param ltName
// * @param instanceID
// * @param zookeepersCSV
// * @param usr
// * @param pwd
// * @param accumuloTable
// */
// public static void setLogicalTableForAccumulo( final String ltName,
// final String instanceID, final String zookeepersCSV, final String usr, final String pwd, final String accumuloTable ) {
//
// // Use a hash of the connector properties to derive a unique vtiPropertiesID for looking them up.
// // Note we need to order the Zookeepers list so it doesn't affect the hash.
// final String accumuloInstancePropertiesHash = Util.getFileMD5(file)( instanceID + usr + pwd
// + new TreeSet<String>( Arrays.asList( Util.splitByCommas(zookeepersCSV) ) ) ).hashCode();
//
// final String vtiPropertiesID = AccumuloVTI.class.getSimpleName() + '.' + ;
//
// GaianDBConfigProcedures.setConfigProperties(
// "values ('" + vtiInstancePrefix + ".INSTANCE', '" + instanceID)
//
// }
/**
 * Automatically creates a logical table based on an Accumulo one.
 * Logical table column types can later be changed manually by the user if they want to expose them via more useful types for searching.
 *
 * This is the registration SQL for this procedure:
 * CREATE PROCEDURE setltforaccumulo( ltname varchar(32672), connectorID varchar(32672), tablexpr varchar(32672) )
 * PARAMETER STYLE JAVA LANGUAGE JAVA MODIFIES SQL DATA EXTERNAL NAME 'com.ibm.db2j.AccumuloVTI.setLogicalTableForAccumulo'
 *
 * Example use:
 * call setltforaccumulo('LTBIKES', 'connectAcc:root:password@ets03.hursley.ibm.com/instance1', 'mytable')
 *
 * @param ltName
 * @param connectorID - Either just a connectorID (to point to an existing one), or
 * set a new connectorID, using format: connectorID:user:password@zookeepersCSV/instance;
 * Example: "Connector1:user:password@host1,9.12.34.56,host3/MyAccumuloInstance"
 * @param accumuloTableExpression - This may contain a simple accumulo table name, or a more complex expression to allow:
 * - Access to different combinations of Accumulo table fields for each column family (e.g. extract column qualifier instead of the value field).
 * - Specification of the "Visibility" logical expression in square brackets. This visibility will be attributed
 * to the Scanner that will run against the Accumulo table - this needs to be a subset of the User's Visibility.
 * Example: "myAccumuloTable qualifiers [A&B]"
 * @throws Exception if the connector specification is malformed, or if logical table creation fails.
 */
public static void setLogicalTableForAccumulo( String ltName, String connectorID, String accumuloTableExpression ) throws Exception {
	final String HELP_SYNTAX =
		"Syntax: call setLogicalTableForAccumulo('<ltName>', '<connectorID>', '<accumuloTable>'), where "
	+	"connectorID may also set itself: <connectorID:user:password@zookeepersCSV/instance>";
	// Split off the optional ":user:password@zookeepersCSV/instance" connector definition suffix.
	int idx = connectorID.indexOf(':');
	String connectorInfo = null;
	if ( -1 < idx ) {
		connectorInfo = connectorID.substring(idx+1);
		connectorID = connectorID.substring(0, idx);
	}
	final String vtiPropertiesPrefix = AccumuloVTI.class.getSimpleName() + '.' + connectorID;
	String instance, zookeepersList, usr, pwd;
	if ( null != connectorInfo ) {
		// Create new connector properties. Expected layout: user:password@zookeepersCSV/instance
		// NOTE: this format cannot express a password containing ':' or '@'.
		int idx1 = connectorInfo.indexOf(':');		// end of user name
		int idx2 = connectorInfo.indexOf('@');		// end of password
		int idx3 = connectorInfo.lastIndexOf('/');	// end of zookeepers list (lastIndexOf, so host entries must not contain '/')
		// Validate presence AND ordering of the delimiters. A mis-ordered spec (e.g. '@' appearing
		// before ':') used to pass a presence-only check and then fail below with an unhelpful
		// StringIndexOutOfBoundsException instead of this syntax message.
		if ( 0 > idx1 || idx1 >= idx2 || idx2 >= idx3 )
			throw new Exception("Incorrect syntax for Accumulo connectorID argument. " + HELP_SYNTAX);
		usr = connectorInfo.substring(0, idx1);
		pwd = connectorInfo.substring(idx1+1, idx2);
		zookeepersList = connectorInfo.substring(idx2+1, idx3);
		instance = connectorInfo.substring(idx3+1);
		// Persist the connector properties so future VTI instantiations can look them up by connectorID.
		GaianDBConfigProcedures.setConfigProperties(
			"values ('" + vtiPropertiesPrefix + '.' + PROPERTY_INSTANCE + "', '" + instance + "')"
			+ ", ('" + vtiPropertiesPrefix + '.' + PROPERTY_ZOOKEEPERS + "', '" + zookeepersList + "')"
			+ ", ('" + vtiPropertiesPrefix + '.' + PROPERTY_USR + "', '" + usr + "')"
			+ ", ('" + vtiPropertiesPrefix + '.' + PROPERTY_PWD + "', '" + pwd + "')"
		);
	}
	// Now build VTI instance to derive schema, then set logical table and data source properties
	final String vtiArgs = connectorID + ',' + accumuloTableExpression + ',' + DERIVE_SCHEMA_FROM_FIRST_ROW;
	AccumuloVTI vtiInstance = new AccumuloVTI( vtiArgs );
	String tableDef = vtiInstance.getMetaData().getColumnsDefinition();
	logger.logInfo("Derived schema (from 1st row) for Accumulo table '" + accumuloTableExpression + "': " + tableDef);
	GaianDBConfigProcedures.setLogicalTable(ltName, tableDef, "");
	GaianDBConfigProcedures.setDataSourceVTI(ltName, "0", AccumuloVTI.class.getName(), vtiArgs, "MAP_COLUMNS_BY_POSITION", "");
	// TODO: add _SCHEMA property in deriveRequiredDataSourceUpdatesFromOptionsAndColumnsLists()
}
}