/*
 * Copyright © 2014-2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.data2.dataset2.lib.table.hbase;

import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.DataSetException;
import co.cask.cdap.api.dataset.DatasetContext;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.api.dataset.table.ConflictDetection;
import co.cask.cdap.api.dataset.table.Filter;
import co.cask.cdap.api.dataset.table.Scanner;
import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.common.conf.CConfiguration;
import co.cask.cdap.common.utils.ImmutablePair;
import co.cask.cdap.data2.dataset2.lib.table.BufferingTable;
import co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter;
import co.cask.cdap.data2.dataset2.lib.table.IncrementValue;
import co.cask.cdap.data2.dataset2.lib.table.PutValue;
import co.cask.cdap.data2.dataset2.lib.table.Update;
import co.cask.cdap.data2.dataset2.lib.table.inmemory.PrefixedNamespaces;
import co.cask.cdap.data2.util.TableId;
import co.cask.cdap.data2.util.hbase.DeleteBuilder;
import co.cask.cdap.data2.util.hbase.GetBuilder;
import co.cask.cdap.data2.util.hbase.HBaseTableUtil;
import co.cask.cdap.data2.util.hbase.PutBuilder;
import co.cask.cdap.data2.util.hbase.ScanBuilder;
import co.cask.tephra.Transaction;
import co.cask.tephra.TransactionCodec;
import co.cask.tephra.TxConstants;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.util.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import javax.annotation.Nullable;

/**
 * Dataset client for HBase tables.
 */
// todo: do periodic flush when certain threshold is reached
// todo: extract separate "no delete inside tx" table?
// todo: consider writing & reading using HTable to do in multi-threaded way
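// Illustrative usage (a sketch, not code from this repository): instances of this class are
// normally obtained through the dataset framework and driven by the Tephra transaction
// lifecycle that BufferingTable implements. Assuming that lifecycle, a write roughly flows as:
//
//   HBaseTable table = ...;               // provided by the dataset framework
//   table.startTx(tx);                    // bind the current transaction
//   table.put(row, column, value);        // buffered client-side by BufferingTable
//   table.commitTx();                     // persist() below flushes the buffer to HBase
//
// Method names other than startTx() come from the inherited BufferingTable/TransactionAware
// contract; treat the sequence above as an assumption-laden illustration, not the exact API.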
public class HBaseTable extends BufferingTable {
  private static final Logger LOG = LoggerFactory.getLogger(HBaseTable.class);

  public static final String DELTA_WRITE = "d";

  private final HBaseTableUtil tableUtil;
  private final HTable hTable;
  private final String hTableName;
  private final byte[] columnFamily;
  private final TransactionCodec txCodec;
  // name length + name of the table: handy to have one cached
  private final byte[] nameAsTxChangePrefix;

  private Transaction tx;

  public HBaseTable(DatasetContext datasetContext, DatasetSpecification spec,
                    CConfiguration cConf, Configuration hConf, HBaseTableUtil tableUtil) throws IOException {
    super(PrefixedNamespaces.namespace(cConf, datasetContext.getNamespaceId(), spec.getName()),
          ConflictDetection.valueOf(spec.getProperty(PROPERTY_CONFLICT_LEVEL, ConflictDetection.ROW.name())),
          HBaseTableAdmin.supportsReadlessIncrements(spec),
          spec.getProperty(Table.PROPERTY_SCHEMA) == null
            ? null
            : Schema.parseJson(spec.getProperty(Table.PROPERTY_SCHEMA)),
          spec.getProperty(Table.PROPERTY_SCHEMA_ROW_FIELD));
    TableId tableId = TableId.from(datasetContext.getNamespaceId(), spec.getName());
    HTable hTable = tableUtil.createHTable(hConf, tableId);
    // todo: make configurable
    hTable.setWriteBufferSize(HBaseTableUtil.DEFAULT_WRITE_BUFFER_SIZE);
    hTable.setAutoFlush(false);
    this.tableUtil = tableUtil;
    this.hTable = hTable;
    this.hTableName = Bytes.toStringBinary(hTable.getTableName());
    this.columnFamily = HBaseTableAdmin.getColumnFamily(spec);
    this.txCodec = new TransactionCodec();
    // Overriding the HBase tx change prefix so it resembles the HBase table name more closely, since the
    // HBase table name is no longer the same as the dataset name
    this.nameAsTxChangePrefix = Bytes.add(new byte[]{(byte) this.hTableName.length()},
                                          Bytes.toBytes(this.hTableName));
  }

  @Override
  public String toString() {
    return Objects.toStringHelper(this)
      .add("hTable", hTable)
      .add("hTableName", hTableName)
      .add("nameAsTxChangePrefix", nameAsTxChangePrefix)
      .toString();
  }

  @Override
  public void startTx(Transaction tx) {
    super.startTx(tx);
    this.tx = tx;
  }

  @Override
  public List<Map<byte[], byte[]>> getPersisted(List<co.cask.cdap.api.dataset.table.Get> gets) {
    List<Get> hbaseGets = new ArrayList<>();
    for (co.cask.cdap.api.dataset.table.Get get : gets) {
      List<byte[]> cols = get.getColumns();
      // Our Get class (co.cask.cdap.api.dataset.table.Get) with an empty column list means "get nothing",
      // but there is no way to express this in an HBase Get (org.apache.hadoop.hbase.client.Get). That's
      // why we don't call createGet for every get.
      if (cols == null || !cols.isEmpty()) {
        hbaseGets.add(createGet(get.getRow(), cols == null ? null : cols.toArray(new byte[cols.size()][])));
      }
    }

    try {
      Result[] hbaseResults = hTable.get(hbaseGets);
      List<Map<byte[], byte[]>> results = new ArrayList<>(gets.size());
      int hbaseResultsIndex = 0;
      for (co.cask.cdap.api.dataset.table.Get get : gets) {
        List<byte[]> cols = get.getColumns();
        if (cols == null || !cols.isEmpty()) {
          Result hbaseResult = hbaseResults[hbaseResultsIndex++];
          Map<byte[], byte[]> familyMap = hbaseResult.getFamilyMap(columnFamily);
          results.add(familyMap != null ? familyMap : ImmutableMap.<byte[], byte[]>of());
        } else {
          results.add(ImmutableMap.<byte[], byte[]>of());
        }
      }
      return results;
    } catch (IOException ioe) {
      throw new DataSetException("Multi-get failed on table " + hTableName, ioe);
    }
  }
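  // Note on the multi-get above: the returned list is position-aligned with the input list, not
  // with the (possibly shorter) list of gets actually sent to HBase. For example (hypothetical
  // rows), given gets for r1 (all columns), r2 (explicit empty column list) and r3 (columns c1,
  // c2), only r1 and r3 are fetched from HBase and the result is [familyMap(r1), {}, familyMap(r3)].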
  @Override
  public byte[] getNameAsTxChangePrefix() {
    return nameAsTxChangePrefix;
  }

  @Override
  public void close() throws IOException {
    try {
      super.close();
    } finally {
      hTable.close();
    }
  }

  @Override
  protected void persist(NavigableMap<byte[], NavigableMap<byte[], Update>> buff) throws Exception {
    List<Put> puts = Lists.newArrayList();
    for (Map.Entry<byte[], NavigableMap<byte[], Update>> row : buff.entrySet()) {
      PutBuilder put = tableUtil.buildPut(row.getKey());
      Put incrementPut = null;
      for (Map.Entry<byte[], Update> column : row.getValue().entrySet()) {
        // we want to support tx and non-tx modes
        if (tx != null) {
          // TODO: hijacking timestamp... bad
          Update val = column.getValue();
          if (val instanceof IncrementValue) {
            incrementPut = getIncrementalPut(incrementPut, row.getKey());
            incrementPut.add(columnFamily, column.getKey(), tx.getWritePointer(),
                             Bytes.toBytes(((IncrementValue) val).getValue()));
          } else if (val instanceof PutValue) {
            put.add(columnFamily, column.getKey(), tx.getWritePointer(),
                    wrapDeleteIfNeeded(((PutValue) val).getValue()));
          }
        } else {
          Update val = column.getValue();
          if (val instanceof IncrementValue) {
            incrementPut = getIncrementalPut(incrementPut, row.getKey());
            incrementPut.add(columnFamily, column.getKey(), Bytes.toBytes(((IncrementValue) val).getValue()));
          } else if (val instanceof PutValue) {
            put.add(columnFamily, column.getKey(), ((PutValue) val).getValue());
          }
        }
      }
      if (incrementPut != null) {
        puts.add(incrementPut);
      }
      if (!put.isEmpty()) {
        puts.add(put.build());
      }
    }
    if (!puts.isEmpty()) {
      hTable.put(puts);
      hTable.flushCommits();
    } else {
      LOG.info("No writes to persist!");
    }
  }

  private Put getIncrementalPut(Put existing, byte[] row) {
    if (existing != null) {
      return existing;
    }
    return tableUtil.buildPut(row)
      .setAttribute(DELTA_WRITE, Bytes.toBytes(true))
      .build();
  }
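  // Increments are persisted as ordinary Puts flagged with the DELTA_WRITE attribute (see
  // getIncrementalPut above). A sketch of the intended effect, assuming readless increments are
  // enabled for this table (HBaseTableAdmin.supportsReadlessIncrements in the constructor): a
  // server-side coprocessor recognizes the attribute and treats the flagged cells as deltas,
  // summing them with the stored value at read and compaction time. E.g. persisting increment(+2)
  // and later increment(+3) for the same cell stores two delta cells that a reader observes as 5,
  // with no client-side read-modify-write cycle. The coprocessor itself lives outside this class.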
  @Override
  protected void undo(NavigableMap<byte[], NavigableMap<byte[], Update>> persisted) throws Exception {
    // NOTE: we use Delete with the write pointer as the specific version to delete.
    List<Delete> deletes = Lists.newArrayList();
    for (Map.Entry<byte[], NavigableMap<byte[], Update>> row : persisted.entrySet()) {
      DeleteBuilder delete = tableUtil.buildDelete(row.getKey());
      for (Map.Entry<byte[], Update> column : row.getValue().entrySet()) {
        // we want to support tx and non-tx modes
        if (tx != null) {
          delete.setAttribute(TxConstants.TX_ROLLBACK_ATTRIBUTE_KEY, new byte[0]);
          // TODO: hijacking timestamp... bad
          delete.deleteColumn(columnFamily, column.getKey(), tx.getWritePointer());
        } else {
          delete.deleteColumns(columnFamily, column.getKey());
        }
      }
      deletes.add(delete.build());
    }
    hTable.delete(deletes);
    hTable.flushCommits();
  }

  @Override
  protected NavigableMap<byte[], byte[]> getPersisted(byte[] row, byte[] startColumn,
                                                      byte[] stopColumn, int limit) throws Exception {
    // todo: this is very inefficient: column range + limit should be pushed down via server-side filters
    return getRange(getInternal(row, null), startColumn, stopColumn, limit);
  }

  @Override
  protected NavigableMap<byte[], byte[]> getPersisted(byte[] row, @Nullable byte[][] columns) throws Exception {
    return getInternal(row, columns);
  }

  @Override
  protected Scanner scanPersisted(co.cask.cdap.api.dataset.table.Scan scan) throws Exception {
    ScanBuilder hScan = tableUtil.buildScan();
    hScan.addFamily(columnFamily);

    // todo: should be configurable
    // NOTE: by default we assume the scanner is used in a mapreduce job, hence no block caching
    hScan.setCacheBlocks(false);
    hScan.setCaching(1000);

    byte[] startRow = scan.getStartRow();
    byte[] stopRow = scan.getStopRow();
    if (startRow != null) {
      hScan.setStartRow(startRow);
    }
    if (stopRow != null) {
      hScan.setStopRow(stopRow);
    }

    setFilterIfNeeded(hScan, scan.getFilter());
    hScan.setAttribute(TxConstants.TX_OPERATION_ATTRIBUTE_KEY, txCodec.encode(tx));

    ResultScanner resultScanner = hTable.getScanner(hScan.build());
    return new HBaseScanner(resultScanner, columnFamily);
  }

  private void setFilterIfNeeded(ScanBuilder scan, @Nullable Filter filter) {
    if (filter == null) {
      return;
    }

    if (filter instanceof FuzzyRowFilter) {
      FuzzyRowFilter fuzzyRowFilter = (FuzzyRowFilter) filter;
      List<Pair<byte[], byte[]>> fuzzyPairs =
        Lists.newArrayListWithExpectedSize(fuzzyRowFilter.getFuzzyKeysData().size());
      for (ImmutablePair<byte[], byte[]> pair : fuzzyRowFilter.getFuzzyKeysData()) {
        fuzzyPairs.add(Pair.newPair(pair.getFirst(), pair.getSecond()));
      }
      scan.setFilter(new org.apache.hadoop.hbase.filter.FuzzyRowFilter(fuzzyPairs));
    } else {
      throw new IllegalArgumentException("Unsupported filter: " + filter);
    }
  }

  /**
   * Creates an {@link Get} for the specified row and columns.
   *
   * @param row the row key for the Get
   * @param columns the columns to fetch; null means to retrieve all columns
   * @throws IllegalArgumentException if columns has length 0
   */
  private Get createGet(byte[] row, @Nullable byte[][] columns) {
    Preconditions.checkArgument(columns == null || columns.length != 0);
    GetBuilder get = tableUtil.buildGet(row);
    if (columns != null && columns.length > 0) {
      for (byte[] column : columns) {
        get.addColumn(columnFamily, column);
      }
    } else {
      get.addFamily(columnFamily);
    }
    try {
      // no tx logic needed
      if (tx == null) {
        get.setMaxVersions(1);
      } else {
        get.setAttribute(TxConstants.TX_OPERATION_ATTRIBUTE_KEY, txCodec.encode(tx));
      }
    } catch (IOException ioe) {
      throw Throwables.propagate(ioe);
    }
    return get.build();
  }
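  // Version-handling recap for the read path (a summary of createGet above, not new behavior):
  // outside a transaction, results are capped at one version per column, so a Result already
  // holds only the latest cell. Inside a transaction, the encoded tx attached via
  // TX_OPERATION_ATTRIBUTE_KEY lets the server-side transaction filter prune to the latest
  // version visible to this tx, which is why getRowMap() below can safely take firstEntry()
  // of each column's version map.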
  // columns being null means to get all columns; an empty columns array means to get none
  private NavigableMap<byte[], byte[]> getInternal(byte[] row, @Nullable byte[][] columns) throws IOException {
    if (columns != null && columns.length == 0) {
      return EMPTY_ROW_MAP;
    }
    Get get = createGet(row, columns);
    Result result = hTable.get(get);

    // no tx logic needed
    if (tx == null) {
      return result.isEmpty() ? EMPTY_ROW_MAP : result.getFamilyMap(columnFamily);
    }

    return getRowMap(result, columnFamily);
  }

  static NavigableMap<byte[], byte[]> getRowMap(Result result, byte[] columnFamily) {
    if (result.isEmpty()) {
      return EMPTY_ROW_MAP;
    }

    // note: the server-side filter removes everything except the latest visible version for us,
    // so we can flatten the versioned map here
    NavigableMap<byte[], NavigableMap<Long, byte[]>> versioned = result.getMap().get(columnFamily);
    NavigableMap<byte[], byte[]> rowMap = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    for (Map.Entry<byte[], NavigableMap<Long, byte[]>> column : versioned.entrySet()) {
      rowMap.put(column.getKey(), column.getValue().firstEntry().getValue());
    }

    return unwrapDeletes(rowMap);
  }
}