/**
* (c) Copyright 2014 WibiData, Inc.
*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.kiji.mapreduce.framework;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.SerializationUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.mapreduce.impl.KijiTableSplit;
import org.kiji.schema.EntityId;
import org.kiji.schema.HBaseEntityId;
import org.kiji.schema.Kiji;
import org.kiji.schema.KijiColumnName;
import org.kiji.schema.KijiDataRequest;
import org.kiji.schema.KijiRegion;
import org.kiji.schema.KijiRowData;
import org.kiji.schema.KijiRowScanner;
import org.kiji.schema.KijiTable;
import org.kiji.schema.KijiTableReader;
import org.kiji.schema.KijiTableReader.KijiScannerOptions;
import org.kiji.schema.KijiURI;
import org.kiji.schema.filter.KijiRowFilter;
import org.kiji.schema.hbase.HBaseScanOptions;
import org.kiji.schema.impl.hbase.HBaseKijiRowData;
import org.kiji.schema.impl.hbase.HBaseKijiTable;
import org.kiji.schema.layout.ColumnReaderSpec;
import org.kiji.schema.util.ResourceUtils;
/** InputFormat for Hadoop MapReduce jobs reading from a Kiji table. */
@ApiAudience.Framework
@ApiStability.Stable
public final class HBaseKijiTableInputFormat
extends KijiTableInputFormat {
/**
* Number of bytes from the row-key to include when reporting progress.
* Use 32 bits precision, ie. 4 billion row keys granularity.
*/
private static final int PROGRESS_PRECISION_NBYTES = 4;
/** Configuration of this input format. */
private Configuration mConf;
/** {@inheritDoc} */
@Override
public void setConf(Configuration conf) {
mConf = conf;
}
/** {@inheritDoc} */
@Override
public Configuration getConf() {
return mConf;
}
/** {@inheritDoc} */
@Override
public RecordReader<EntityId, KijiRowData> createRecordReader(
InputSplit split,
TaskAttemptContext context
) throws IOException {
return new KijiTableRecordReader(mConf);
}
/**
* Reports the HBase table name for the specified Kiji table.
*
* @param table Kiji table to report the HBase table name of.
* @return the HBase table name for the specified Kiji table.
* @throws java.io.IOException on I/O error.
*/
private static byte[] getHBaseTableName(KijiTable table) throws IOException {
final HBaseKijiTable htable = HBaseKijiTable.downcast(table);
final HTableInterface hti = htable.openHTableConnection();
try {
return hti.getTableName();
} finally {
hti.close();
}
}
/** {@inheritDoc} */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
final Configuration conf = context.getConfiguration();
final KijiURI inputTableURI =
KijiURI.newBuilder(conf.get(KijiConfKeys.KIJI_INPUT_TABLE_URI)).build();
final Kiji kiji = Kiji.Factory.open(inputTableURI, conf);
try {
final KijiTable table = kiji.openTable(inputTableURI.getTable());
try {
final byte[] htableName = getHBaseTableName(table);
final List<InputSplit> splits = Lists.newArrayList();
byte[] scanStartKey = HConstants.EMPTY_START_ROW;
if (null != conf.get(KijiConfKeys.KIJI_START_ROW_KEY)) {
scanStartKey = Base64.decodeBase64(conf.get(KijiConfKeys.KIJI_START_ROW_KEY));
}
byte[] scanLimitKey = HConstants.EMPTY_END_ROW;
if (null != conf.get(KijiConfKeys.KIJI_LIMIT_ROW_KEY)) {
scanLimitKey = Base64.decodeBase64(conf.get(KijiConfKeys.KIJI_LIMIT_ROW_KEY));
}
for (KijiRegion region : table.getRegions()) {
final byte[] regionStartKey = region.getStartKey();
final byte[] regionEndKey = region.getEndKey();
// Determine if the scan start and limit key fall into the region.
// Logic was copied from o.a.h.h.m.TableInputFormatBase
if ((scanStartKey.length == 0 || regionEndKey.length == 0
|| Bytes.compareTo(scanStartKey, regionEndKey) < 0)
&& (scanLimitKey.length == 0
|| Bytes.compareTo(scanLimitKey, regionStartKey) > 0)) {
byte[] splitStartKey = (scanStartKey.length == 0
|| Bytes.compareTo(regionStartKey, scanStartKey) >= 0)
? regionStartKey : scanStartKey;
byte[] splitEndKey = ((scanLimitKey.length == 0
|| Bytes.compareTo(regionEndKey, scanLimitKey) <= 0)
&& regionEndKey.length > 0)
? regionEndKey : scanLimitKey;
// TODO(KIJIMR-65): For now pick the first available location (ie. region server),
// if any.
final String location =
region.getLocations().isEmpty() ? null : region.getLocations().iterator().next();
final TableSplit tableSplit =
new TableSplit(htableName, splitStartKey, splitEndKey, location);
splits.add(new KijiTableSplit(tableSplit));
}
}
return splits;
} finally {
ResourceUtils.releaseOrLog(table);
}
} finally {
ResourceUtils.releaseOrLog(kiji);
}
}
/** Hadoop record reader for Kiji table rows. */
public static final class KijiTableRecordReader
extends RecordReader<EntityId, KijiRowData> {
private static final Logger LOG = LoggerFactory.getLogger(KijiTableRecordReader.class);
/** Data request. */
private final KijiDataRequest mDataRequest;
private Kiji mKiji = null;
private KijiTable mTable = null;
private KijiTableReader mReader = null;
private KijiRowScanner mScanner = null;
private Iterator<KijiRowData> mIterator = null;
private KijiTableSplit mSplit = null;
private HBaseKijiRowData mCurrentRow = null;
private long mStartPos;
private long mStopPos;
/**
* Creates a new RecordReader for this input format.
*
* Perform the actual reads from Kiji.
*
* @param conf Configuration for the target Kiji.
*/
private KijiTableRecordReader(Configuration conf) {
// Get data request from the job configuration.
final String dataRequestB64 = conf.get(KijiConfKeys.KIJI_INPUT_DATA_REQUEST);
Preconditions.checkNotNull(dataRequestB64, "Missing data request in job configuration.");
final byte[] dataRequestBytes = Base64.decodeBase64(Bytes.toBytes(dataRequestB64));
mDataRequest = (KijiDataRequest) SerializationUtils.deserialize(dataRequestBytes);
}
/** {@inheritDoc} */
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
Preconditions.checkArgument(split instanceof KijiTableSplit,
"InputSplit is not a KijiTableSplit: %s", split);
mSplit = (KijiTableSplit) split;
final Configuration conf = context.getConfiguration();
final KijiURI inputURI =
KijiURI.newBuilder(conf.get(KijiConfKeys.KIJI_INPUT_TABLE_URI)).build();
// When using Kiji tables as an input to MapReduce jobs, turn off block caching.
final HBaseScanOptions hBaseScanOptions = new HBaseScanOptions();
hBaseScanOptions.setCacheBlocks(false);
// Extract the ColumnReaderSpecs and build a mapping from column to the appropriate overrides.
final ImmutableMap.Builder<KijiColumnName, ColumnReaderSpec> overridesBuilder =
ImmutableMap.builder();
for (KijiDataRequest.Column column : mDataRequest.getColumns()) {
if (column.getReaderSpec() != null) {
overridesBuilder.put(column.getColumnName(), column.getReaderSpec());
}
}
final KijiScannerOptions scannerOptions = new KijiScannerOptions()
.setStartRow(HBaseEntityId.fromHBaseRowKey(mSplit.getStartRow()))
.setStopRow(HBaseEntityId.fromHBaseRowKey(mSplit.getEndRow()))
.setHBaseScanOptions(hBaseScanOptions);
final String filterJson = conf.get(KijiConfKeys.KIJI_ROW_FILTER);
if (null != filterJson) {
final KijiRowFilter filter = KijiRowFilter.toFilter(filterJson);
scannerOptions.setKijiRowFilter(filter);
}
mKiji = Kiji.Factory.open(inputURI, conf);
mTable = mKiji.openTable(inputURI.getTable());
mReader = mTable.getReaderFactory().readerBuilder()
.withColumnReaderSpecOverrides(overridesBuilder.build())
.buildAndOpen();
mScanner = mReader.getScanner(mDataRequest, scannerOptions);
mIterator = mScanner.iterator();
mCurrentRow = null;
mStartPos = bytesToPosition(mSplit.getStartRow(), PROGRESS_PRECISION_NBYTES);
long stopPos = bytesToPosition(mSplit.getEndRow(), PROGRESS_PRECISION_NBYTES);
mStopPos = (stopPos > 0) ? stopPos : (1L << (PROGRESS_PRECISION_NBYTES * 8));
LOG.info("Progress reporting: start={} stop={}", mStartPos, mStopPos);
}
/** {@inheritDoc} */
@Override
public EntityId getCurrentKey() throws IOException {
return mCurrentRow.getEntityId();
}
/** {@inheritDoc} */
@Override
public KijiRowData getCurrentValue() throws IOException {
return mCurrentRow;
}
/**
* Converts a byte array into an integer position in the row-key space.
*
* @param bytes Byte array to convert to an approximate position.
* @param nbytes Number of bytes to use (must be in the range 1..8).
* @return the approximate position in the row-key space.
*/
public static long bytesToPosition(final byte[] bytes, final int nbytes) {
long position = 0;
if (bytes != null) {
for (int i = 0; i < nbytes; ++i) {
final int bvalue = (i < bytes.length) ? (0xff & bytes[i]) : 0;
position = (position << 8) + bvalue;
}
}
return position;
}
/**
* Computes the start position from the start row key, for progress reporting.
*
* @param startRowKey Start row key to compute the position of.
* @return the start position from the start row key.
*/
public static long getStartPos(byte[] startRowKey) {
return bytesToPosition(startRowKey, PROGRESS_PRECISION_NBYTES);
}
/**
* Computes the stop position from the stop row key, for progress reporting.
*
* @param stopRowKey Stop row key to compute the position of.
* @return the stop position from the start row key.
*/
public static long getStopPos(byte[] stopRowKey) {
long stopPos = bytesToPosition(stopRowKey, PROGRESS_PRECISION_NBYTES);
return (stopPos > 0) ? stopPos : (1L << (PROGRESS_PRECISION_NBYTES * 8));
}
/**
* Compute the progress (between 0.0f and 1.0f) for the current row key.
*
* @param startPos Computed start position (using getStartPos).
* @param stopPos Computed stop position (using getStopPos).
* @param currentRowKey Current row to compute a progress for.
* @return the progress indicator for the given row, start and stop positions.
*/
public static float computeProgress(long startPos, long stopPos, byte[] currentRowKey) {
Preconditions.checkArgument(startPos <= stopPos,
"Invalid start/stop positions: start=%s stop=%s", startPos, stopPos);
final long currentPos = bytesToPosition(currentRowKey, PROGRESS_PRECISION_NBYTES);
Preconditions.checkArgument(startPos <= currentPos,
"Invalid start/current positions: start=%s current=%s", startPos, currentPos);
Preconditions.checkArgument(currentPos <= stopPos,
"Invalid current/stop positions: current=%s stop=%s", currentPos, stopPos);
if (startPos == stopPos) {
// Row key range is too small to perceive progress: report 50% completion
return 0.5f;
} else {
return (float) (((double) currentPos - startPos) / (stopPos - startPos));
}
}
/** {@inheritDoc} */
@Override
public float getProgress() throws IOException {
if (mCurrentRow == null) {
return 0.0f;
}
final byte[] currentRowKey = mCurrentRow.getHBaseResult().getRow();
return computeProgress(mStartPos, mStopPos, currentRowKey);
}
/** {@inheritDoc} */
@Override
public boolean nextKeyValue() throws IOException {
if (mIterator.hasNext()) {
mCurrentRow = (HBaseKijiRowData) mIterator.next();
return true;
} else {
mCurrentRow = null;
return false;
}
}
/** {@inheritDoc} */
@Override
public void close() throws IOException {
ResourceUtils.closeOrLog(mScanner);
ResourceUtils.closeOrLog(mReader);
ResourceUtils.releaseOrLog(mTable);
ResourceUtils.releaseOrLog(mKiji);
mIterator = null;
mScanner = null;
mReader = null;
mTable = null;
mKiji = null;
mSplit = null;
mCurrentRow = null;
}
}
}