/*
 * Copyright © 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.data2.increment.hbase12cdh570;

import co.cask.cdap.data2.dataset2.lib.table.hbase.HBaseTable;
import co.cask.cdap.data2.increment.hbase.IncrementHandlerState;
import co.cask.cdap.data2.increment.hbase.TimestampOracle;
import co.cask.cdap.data2.util.hbase.HTable12CDH570NameConverter;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.regionserver.InternalScanner;
import org.apache.hadoop.hbase.regionserver.Region;
import org.apache.hadoop.hbase.regionserver.RegionScanner;
import org.apache.hadoop.hbase.regionserver.ScanType;
import org.apache.hadoop.hbase.regionserver.Store;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionRequest;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;

/**
 * HBase coprocessor that handles reading and writing read-less increment operations.
 *
 * <p>Writes of incremental values are performed as normal {@code Put}s, flagged with a special attribute
 * {@link HBaseTable#DELTA_WRITE}. The coprocessor intercepts these writes and rewrites the cell value
 * to use a special marker prefix.</p>
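 *
 * <p>For illustration, a client-side delta write might look like the following sketch; {@code table},
 * {@code row}, {@code family}, and {@code qualifier} are placeholders, and the actual value encoding is
 * handled by {@link HBaseTable}:</p>
 *
 * <pre>{@code
 * // Illustrative sketch only: any non-null DELTA_WRITE attribute marks the Put as a delta,
 * // and this coprocessor rewrites the stored value with the delta marker prefix.
 * Put put = new Put(row);
 * put.addColumn(family, qualifier, Bytes.toBytes(5L)); // delta amount to add
 * put.setAttribute(HBaseTable.DELTA_WRITE, Bytes.toBytes(true));
 * table.put(put);
 * }</pre>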
 *
 * <p>For read operations ({@code Get} and {@code Scan}), all of the delta values for a column are summed up,
 * up to and including the most recent "full" (non-delta) value. The sum of these delta values, plus the full
 * value (if found), is returned for the column.</p>
 *
 * <p>To mitigate the performance impact on reading, this coprocessor also overrides the scanner used in flush
 * and compaction operations, using {@link IncrementSummingScanner} to generate a new "full" value aggregated
 * from all the successfully committed delta values.</p>
 */
public class IncrementHandler extends BaseRegionObserver {

  private Region region;
  private IncrementHandlerState state;

  @Override
  public void start(CoprocessorEnvironment e) throws IOException {
    if (e instanceof RegionCoprocessorEnvironment) {
      RegionCoprocessorEnvironment env = (RegionCoprocessorEnvironment) e;
      this.region = env.getRegion();
      this.state = new IncrementHandlerState(env.getConfiguration(), env.getRegion().getTableDesc(),
                                             new HTable12CDH570NameConverter());

      HTableDescriptor tableDesc = env.getRegion().getTableDesc();
      for (HColumnDescriptor columnDesc : tableDesc.getFamilies()) {
        state.initFamily(columnDesc.getName(), convertFamilyValues(columnDesc.getValues()));
      }
    }
  }

  @VisibleForTesting
  public void setTimestampOracle(TimestampOracle timeOracle) {
    state.setTimestampOracle(timeOracle);
  }

  private Map<byte[], byte[]> convertFamilyValues(Map<ImmutableBytesWritable, ImmutableBytesWritable> writableValues) {
    Map<byte[], byte[]> converted = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    for (Map.Entry<ImmutableBytesWritable, ImmutableBytesWritable> e : writableValues.entrySet()) {
      converted.put(e.getKey().get(), e.getValue().get());
    }
    return converted;
  }

  @Override
  public void preGetOp(ObserverContext<RegionCoprocessorEnvironment> ctx, Get get, List<Cell> results)
    throws IOException {
    Scan scan = new Scan(get);
    scan.setMaxVersions();
    scan.setFilter(Filters.combine(new IncrementFilter(), scan.getFilter()));
    RegionScanner scanner = null;
    try {
      scanner = new IncrementSummingScanner(region, scan.getBatch(), region.getScanner(scan), ScanType.USER_SCAN);
      scanner.next(results);
      ctx.bypass();
    } finally {
      if (scanner != null) {
        scanner.close();
      }
    }
  }

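  /**
   * Intercepts writes before they are applied to the region. Cells of a delta write (a {@code Put}
   * carrying the {@link HBaseTable#DELTA_WRITE} attribute) are rewritten with the
   * {@link IncrementHandlerState#DELTA_MAGIC_PREFIX} value marker, and non-transactional writes
   * are assigned a unique timestamp.
   */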
  @Override
  public void prePut(ObserverContext<RegionCoprocessorEnvironment> ctx, Put put, WALEdit edit, Durability durability)
    throws IOException {
    // we assume that if any of the column families written to are transactional, the entire write is transactional
    boolean transactional = state.containsTransactionalFamily(put.getFamilyCellMap().keySet());
    boolean isIncrement = put.getAttribute(HBaseTable.DELTA_WRITE) != null;

    if (isIncrement || !transactional) {
      // incremental write
      NavigableMap<byte[], List<Cell>> newFamilyMap = new TreeMap<>(Bytes.BYTES_COMPARATOR);

      long tsToAssign = 0;
      if (!transactional) {
        tsToAssign = state.getUniqueTimestamp();
      }
      for (Map.Entry<byte[], List<Cell>> entry : put.getFamilyCellMap().entrySet()) {
        List<Cell> newCells = new ArrayList<>(entry.getValue().size());
        for (Cell cell : entry.getValue()) {
          // rewrite the cell value with a special prefix to identify it as a delta
          // for 0.98 we can update this to use cell tags
          byte[] newValue = isIncrement ?
            Bytes.add(IncrementHandlerState.DELTA_MAGIC_PREFIX, CellUtil.cloneValue(cell)) :
            CellUtil.cloneValue(cell);
          newCells.add(CellUtil.createCell(CellUtil.cloneRow(cell), CellUtil.cloneFamily(cell),
                                           CellUtil.cloneQualifier(cell),
                                           transactional ? cell.getTimestamp() : tsToAssign,
                                           cell.getTypeByte(), newValue));
        }
        newFamilyMap.put(entry.getKey(), newCells);
      }
      put.setFamilyCellMap(newFamilyMap);
    }
    // put completes normally with value prefix marker
  }

  @Override
  public void preDelete(ObserverContext<RegionCoprocessorEnvironment> e, Delete delete, WALEdit edit,
                        Durability durability) throws IOException {
    boolean transactional = state.containsTransactionalFamily(delete.getFamilyCellMap().keySet());
    if (!transactional) {
      long tsToAssign = state.getUniqueTimestamp();
      delete.setTimestamp(tsToAssign);
      // new key values
      NavigableMap<byte[], List<Cell>> newFamilyMap = new TreeMap<>(Bytes.BYTES_COMPARATOR);
      for (Map.Entry<byte[], List<Cell>> entry : delete.getFamilyCellMap().entrySet()) {
        List<Cell> newCells = new ArrayList<>(entry.getValue().size());
        for (Cell kv : entry.getValue()) {
          // replace the timestamp
          newCells.add(CellUtil.createCell(CellUtil.cloneRow(kv), CellUtil.cloneFamily(kv),
                                           CellUtil.cloneQualifier(kv), tsToAssign, kv.getTypeByte(),
                                           CellUtil.cloneValue(kv)));
        }
        newFamilyMap.put(entry.getKey(), newCells);
      }
      delete.setFamilyCellMap(newFamilyMap);
    }
  }

  @Override
  public RegionScanner preScannerOpen(ObserverContext<RegionCoprocessorEnvironment> e, Scan scan, RegionScanner s)
    throws IOException {
    // must see all versions to aggregate increments
    scan.setMaxVersions();
    scan.setFilter(Filters.combine(new IncrementFilter(), scan.getFilter()));
    return s;
  }

  @Override
  public RegionScanner postScannerOpen(ObserverContext<RegionCoprocessorEnvironment> ctx, Scan scan,
                                       RegionScanner scanner) throws IOException {
    return new IncrementSummingScanner(region, scan.getBatch(), scanner, ScanType.USER_SCAN);
  }

  @Override
  public InternalScanner preFlush(ObserverContext<RegionCoprocessorEnvironment> e, Store store,
                                  InternalScanner scanner) throws IOException {
    byte[] family = store.getFamily().getName();
    return new IncrementSummingScanner(region, IncrementHandlerState.BATCH_UNLIMITED, scanner,
                                       ScanType.COMPACT_RETAIN_DELETES, state.getCompactionBound(family),
                                       state.getOldestVisibleTimestamp(family));
  }

  public static boolean isIncrement(Cell cell) {
    return !CellUtil.isDelete(cell) && cell.getValueLength() == IncrementHandlerState.DELTA_FULL_LENGTH &&
      Bytes.equals(cell.getValueArray(), cell.getValueOffset(), IncrementHandlerState.DELTA_MAGIC_PREFIX.length,
                   IncrementHandlerState.DELTA_MAGIC_PREFIX, 0, IncrementHandlerState.DELTA_MAGIC_PREFIX.length);
  }

  @Override
  public InternalScanner preCompact(ObserverContext<RegionCoprocessorEnvironment> e, Store store,
                                    InternalScanner scanner, ScanType scanType) throws IOException {
    byte[] family = store.getFamily().getName();
    return new IncrementSummingScanner(region, IncrementHandlerState.BATCH_UNLIMITED, scanner, scanType,
                                       state.getCompactionBound(family), state.getOldestVisibleTimestamp(family));
  }

  @Override
  public InternalScanner preCompact(ObserverContext<RegionCoprocessorEnvironment> e, Store store,
                                    InternalScanner scanner, ScanType scanType, CompactionRequest request)
    throws IOException {
    byte[] family = store.getFamily().getName();
    return new IncrementSummingScanner(region, IncrementHandlerState.BATCH_UNLIMITED, scanner, scanType,
                                       state.getCompactionBound(family), state.getOldestVisibleTimestamp(family));
  }
}