/** * (c) Copyright 2014 WibiData, Inc. * * See the NOTICE file distributed with this work for additional * information regarding copyright ownership. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.kiji.mapreduce.pivot; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.Map; import com.google.common.base.Preconditions; import com.google.common.collect.Maps; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.io.DatumReader; import org.apache.avro.io.DatumWriter; import org.apache.avro.io.Decoder; import org.apache.avro.io.DecoderFactory; import org.apache.avro.io.Encoder; import org.apache.avro.io.EncoderFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.HConstants; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.kiji.annotations.ApiAudience; import org.kiji.annotations.ApiStability; import org.kiji.avro.dsl.JavaAvroDSL; import org.kiji.mapreduce.KijiContext; import org.kiji.mapreduce.KijiTableContext; import org.kiji.mapreduce.avro.generated.CellRewriteSpec; import org.kiji.schema.DecodedCell; import org.kiji.schema.KijiCell; import org.kiji.schema.KijiColumnName; import org.kiji.schema.KijiDataRequest; import org.kiji.schema.KijiDataRequestBuilder.ColumnsDef; import org.kiji.schema.KijiIOException; import org.kiji.schema.KijiRowData; import org.kiji.schema.layout.ColumnReaderSpec; /** * Pivot M/R job to rewrite cells in a Kiji table. * * <p> * KijiCellRewriter is an example of a Pivot M/R job to rewrite cells in a Kiji table by * performing a chain of compatible Avro conversions. * <br/> * This job rewrites the cells from a map-type family or a fully-qualified column at a time. * The rewritten cells may be written either to the same column or family of the input Kiji table, * or to the same column or family of a different Kiji table, potentially in a different Kiji * instance. * <br/> * An example demonstrating how to use this job is available in {@code TestKijiCellRewriter}. * </p> */ @ApiAudience.Public @ApiStability.Experimental public class KijiCellRewriter extends KijiPivoter { private static final Logger LOG = LoggerFactory.getLogger(KijiCellRewriter.class); /** Configuration keys used by the cell rewriter job. */ public static enum ConfKeys { /** Configuration key associated to the ColumnRewriteSpec record. */ spec; /** * Returns the string representation of this configuration key. * * @return the string representation of this configuration key. */ public String get() { return String.format("%s.%s", KijiCellRewriter.class.getCanonicalName(), name()); } } /** Specific counters for this Map/Reduce job. */ public static enum Counters { /** Total number of cells processed (successfully or not). */ CELLS_PROCESSED, /** Total number of cells rewritten. */ CELLS_REWRITTEN, } /** Name of the column to rewrite. */ private KijiColumnName mColumn = null; /** * Schema rewriting rules: each datum with a schema present in this map will be rewritten. * For a given cell, the process is repeated until no rule applies. */ private Map<Schema, Schema> mRules = null; /** * Decodes a CellRewriteSpec from an encoded entry in a Hadoop configuration. * * @param conf Hadoop configuration with a CellRewriteSpec entry. * @return the decoded CellRewriteSpec. * @throws IOException on I/O error. */ private static CellRewriteSpec getSpecFromConf(final Configuration conf) throws IOException { final String specStr = conf.get(ConfKeys.spec.get()); Preconditions.checkArgument(specStr != null, "Missing configuration entry: %s", ConfKeys.spec.get()); final JavaAvroDSL avroDSL = new JavaAvroDSL(); final GenericData.Record spec = avroDSL.parseValue(specStr, CellRewriteSpec.getClassSchema()); final CellRewriteSpec specific = new CellRewriteSpec(); for (Schema.Field field : CellRewriteSpec.getClassSchema().getFields()) { specific.put(field.name(), spec.get(field.name())); } return specific; } /** {@inheritDoc} */ @Override public KijiDataRequest getDataRequest() { final CellRewriteSpec spec; try { spec = getSpecFromConf(getConf()); } catch (IOException ioe) { throw new KijiIOException(ioe); } final KijiColumnName column = new KijiColumnName(spec.getColumn()); final ColumnReaderSpec readerSpec; if (spec.getReaderSchema() == null) { readerSpec = ColumnReaderSpec.avroWriterSchemaGeneric(); } else { final Schema readerSchema = new Schema.Parser().parse(spec.getReaderSchema()); readerSpec = ColumnReaderSpec.avroReaderSchemaGeneric(readerSchema); } return KijiDataRequest.builder() .addColumns(ColumnsDef.create() .withMaxVersions(HConstants.ALL_VERSIONS) .add(column, readerSpec)) .build(); } /** {@inheritDoc} */ @Override public void setup(KijiContext context) throws IOException { super.setup(context); final CellRewriteSpec spec = getSpecFromConf(getConf()); mColumn = new KijiColumnName(spec.getColumn()); LOG.info("Rewriting cells for column {}", mColumn); // Build the map of schema-rewrite rules: mRules = Maps.newHashMap(); final JavaAvroDSL avroDSL = new JavaAvroDSL(); for (Map.Entry<String, String> entry : spec.getRules().entrySet()) { final Schema fromSchema = new Schema.Parser().parse(entry.getKey()); final Schema toSchema = new Schema.Parser().parse(entry.getValue()); mRules.put(fromSchema, toSchema); LOG.info("Rewriting cell with schema {} into schema {}", avroDSL.schemaToString(fromSchema), avroDSL.schemaToString(toSchema)); } // TODO(KIJIMR-264) Validate the requested conversion. In particular, detect cycles. } /** {@inheritDoc} */ @Override public void produce(final KijiRowData row, final KijiTableContext context) throws IOException { final Iterable<KijiCell<Object>> cells; if (mColumn.isFullyQualified()) { cells = row.asIterable(mColumn.getFamily(), mColumn.getQualifier()); } else { cells = row.asIterable(mColumn.getFamily()); } for (KijiCell<Object> cell : cells) { context.incrementCounter(Counters.CELLS_PROCESSED); final DecodedCell<Object> original = new DecodedCell<Object>(cell.getWriterSchema(), cell.getData()); final DecodedCell<Object> rewritten = rewriteCell(original); if (rewritten != original) { context.put( row.getEntityId(), mColumn.getFamily(), mColumn.getQualifier(), cell.getTimestamp(), rewritten.getData()); context.incrementCounter(Counters.CELLS_REWRITTEN); } } } /** * Rewrites a cell. * * <p> * This method is meant to be overloaded in case custom cell rewriting rules are necessary. * </p> * * @param cell Original value of the cell. * @return the new rewritten value of the cell, * or the original cell value if no translation rules apply. * @throws IOException on I/O error. * * @param <U> type of the input cell to rewrite. * @param <T> type to rewrite the input cell into. */ protected <T, U> DecodedCell<T> rewriteCell(final DecodedCell<U> cell) throws IOException { // Apply conversion rules as long as some rule matches: DecodedCell<U> rewritten = cell; while (true) { final Schema newSchema = mRules.get(rewritten.getWriterSchema()); if (newSchema == null) { // No rule apply, we are done. break; } else { rewritten = convertAvro(rewritten, newSchema); } } return (DecodedCell<T>) rewritten; } /** * Converts an Avro datum using a new (compatible) Avro schema. * * @param original Original Avro datum with its schema. * @param schema New Avro schema to convert the datum into. * @return the Avro datum converted into the specified new schema. * @throws IOException on I/O error. * * @param <U> type of the input cell to rewrite. * @param <T> type to rewrite the input cell into. */ public static <T, U> DecodedCell<T> convertAvro( DecodedCell<U> original, Schema schema ) throws IOException { final ByteArrayOutputStream baos = new ByteArrayOutputStream(); // Encode original datum to bytes: final Encoder encoder = EncoderFactory.get().directBinaryEncoder(baos, null); final DatumWriter<U> writer = new GenericDatumWriter<U>(original.getWriterSchema()); writer.write(original.getData(), encoder); encoder.flush(); // Decode bytes according to the new schema: final Decoder decoder = DecoderFactory.get().binaryDecoder(baos.toByteArray(), null); final DatumReader<T> reader = new GenericDatumReader<T>(original.getWriterSchema(), schema); final T data = reader.read(null, decoder); return new DecodedCell<T>(schema, data); } }