AccumuloDefaultIndexScanner.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.accumulo;

import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.accumulo.serde.AccumuloIndexParameters;
import org.apache.hadoop.hive.accumulo.serde.AccumuloSerDeParameters;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static java.util.Collections.EMPTY_SET;

/**
 * This default index scanner expects indexes to be in the same format as presto's
 * accumulo index tables defined as:
 * [rowid=field value] [cf=cfname_cqname] [cq=rowid] [visibility] [value=""]
 * <p>
 * This handler looks for the following hive serde properties:
 * 'accumulo.indextable.name' = 'table_idx' (required - name of the corresponding index table)
 * 'accumulo.indexed.columns' = 'name,age,phone' (optional - comma separated list of indexed
 *                      hive columns if not defined or defined as '*' all columns are
 *                      assumed to be indexed )
 * 'accumulo.index.rows.max' = '20000' (optional - maximum number of match indexes to use
 *                      before converting to a full table scan default=20000'
 *                      Note: This setting controls the size of the in-memory list of rowids
 *                      each search predicate. Using large values for this setting or having
 *                      very large rowid values may require additional memory to prevent
 *                      out of memory errors
 * 'accumulo.index.scanner'  = 'org.apache.hadoop.hive.accumulo.AccumuloDefaultIndexScanner'
 *                      (optional - name of the index scanner)
 * <p>
 * To implement your own index table scheme it should be as simple as sub-classing
 * this class and overriding getIndexRowRanges() and optionally init() if you need more
 * config settings
 */
public class AccumuloDefaultIndexScanner implements AccumuloIndexScanner {
  private static final Logger LOG = LoggerFactory.getLogger(AccumuloDefaultIndexScanner.class);

  private AccumuloConnectionParameters connectParams;
  private AccumuloIndexParameters indexParams;
  private int maxRowIds;
  private Authorizations auths;
  private String indexTable;
  private Set<String> indexColumns = EMPTY_SET;
  private Connector connect;
  private Map<String, String> colMap;

  /**
   * Initialize object based on configuration.
   *
   * @param conf - Hive configuration
   */
  @Override
  public void init(Configuration conf) {
    connectParams = new AccumuloConnectionParameters(conf);
    indexParams = new AccumuloIndexParameters(conf);
    maxRowIds = indexParams.getMaxIndexRows();
    auths = indexParams.getTableAuths();
    indexTable = indexParams.getIndexTable();
    indexColumns = indexParams.getIndexColumns();
    colMap = createColumnMap(conf);

  }

  /**
   * Get a list of rowid ranges by scanning a column index.
   *
   * @param column     - the hive column name
   * @param indexRange - Key range to scan on the index table
   * @return List of matching rowid ranges or null if too many matches found
   * if index values are not found a newline range is added to list to
   * short-circuit the query
   */
  @Override
  public List<Range> getIndexRowRanges(String column, Range indexRange) {
    List<Range> rowIds = new ArrayList<Range>();
    Scanner scan = null;
    String col = this.colMap.get(column);

    if (col != null) {

      try {
        LOG.debug("Searching tab=" + indexTable + " column=" + column + " range=" + indexRange);
        Connector conn = getConnector();
        scan = conn.createScanner(indexTable, auths);
        scan.setRange(indexRange);
        Text cf = new Text(col);
        LOG.debug("Using Column Family=" + toString());
        scan.fetchColumnFamily(cf);

        for (Map.Entry<Key, Value> entry : scan) {

          rowIds.add(new Range(entry.getKey().getColumnQualifier()));

          // if we have too many results return null for a full scan
          if (rowIds.size() > maxRowIds) {
            return null;
          }
        }

        // no hits on the index so return a no match range
        if (rowIds.isEmpty()) {
          LOG.debug("Found 0 index matches");
        } else {
          LOG.debug("Found " + rowIds.size() + " index matches");
        }

        return rowIds;
      } catch (AccumuloException | AccumuloSecurityException | TableNotFoundException e) {
        LOG.error("Failed to scan index table: " + indexTable, e);
      } finally {
        if (scan != null) {
          scan.close();
        }
      }
    }

    // assume the index is bad and do a full scan
    LOG.debug("Index lookup failed for table " + indexTable);
    return null;
  }

  /**
   * Test if column is defined in the index table.
   *
   * @param column - hive column name
   * @return true if the column is defined as part of the index table
   */
  @Override
  public boolean isIndexed(String column) {
    return indexTable != null
        && (indexColumns.isEmpty() || indexColumns.contains("*")
        || this.indexColumns.contains(column.toLowerCase())
        || this.indexColumns.contains(column.toUpperCase()));

  }

  protected Map<String, String> createColumnMap(Configuration conf) {
    Map<String, String> colsMap = new HashMap<String, String>();
    String accColString = conf.get(AccumuloSerDeParameters.COLUMN_MAPPINGS);
    if (accColString != null && !accColString.trim().isEmpty()) {
      String[] accCols = accColString.split(",");
      String[] hiveCols = conf.get(serdeConstants.LIST_COLUMNS).split(",");
      for (int i = 0; i < accCols.length; i++) {
        colsMap.put(hiveCols[i], accCols[i].replace(':', '_'));
      }
    }
    return colsMap;
  }

  protected Connector getConnector() throws AccumuloSecurityException, AccumuloException {
    if (connect == null) {
      connect = connectParams.getConnector();
    }
    return connect;
  }

  public void setConnectParams(AccumuloConnectionParameters connectParams) {
    this.connectParams = connectParams;
  }

  public AccumuloConnectionParameters getConnectParams() {
    return connectParams;
  }

  public AccumuloIndexParameters getIndexParams() {
    return indexParams;
  }

  public int getMaxRowIds() {
    return maxRowIds;
  }

  public Authorizations getAuths() {
    return auths;
  }

  public String getIndexTable() {
    return indexTable;
  }

  public Set<String> getIndexColumns() {
    return indexColumns;
  }

  public Connector getConnect() {
    return connect;
  }
}