package org.coursera.mapreducer;

import com.netflix.aegisthus.io.writable.AegisthusKey;
import com.netflix.aegisthus.io.writable.AtomWritable;
import com.netflix.aegisthus.util.CFMetadataUtility;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.cql3.CFDefinition;
import org.apache.cassandra.cql3.statements.ColumnGroupMap;
import org.apache.cassandra.db.Column;
import org.apache.cassandra.db.OnDiskAtom;
import org.apache.cassandra.db.marshal.*;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.sql.Timestamp;
import java.util.Date;
import java.util.UUID;

/**
 * Mapper that turns a stream of {@link AtomWritable} column atoms into Avro {@link GenericRecord}s.
 *
 * <p>Consecutive atoms that share the same partition key are accumulated in a
 * {@link ColumnGroupMap.Builder}. When the key changes (or the task finishes) the accumulated
 * groups are flushed: one Avro record is written per column group, populated with the partition
 * key components, the clustering columns, the regular columns and the static columns.
 *
 * <p>The grouping relies on all atoms of one partition key arriving back to back; atoms of the
 * same key that are separated by another key are flushed as separate batches.
 */
public class CQLMapper extends Mapper<AegisthusKey, AtomWritable, AvroKey<GenericRecord>, NullWritable> {
    private static final Logger LOG = LoggerFactory.getLogger(CQLMapper.class);

    /** Accumulates the columns of the partition currently being read; replaced after every flush. */
    ColumnGroupMap.Builder cgmBuilder;
    /** Table metadata, loaded from the job configuration in {@link #setup}. */
    CFMetaData cfMetaData;
    /** CQL view of {@link #cfMetaData}. */
    CFDefinition cfDef;
    /** Partition key of the atoms currently buffered in {@link #cgmBuilder}; null when nothing is pending. */
    ByteBuffer currentKey;
    /** Schema of the emitted Avro records, read from the job's output key schema. */
    Schema avroSchema;

    /**
     * Loads the Avro output schema and the table metadata from the job configuration and prepares
     * the first column-group builder.
     *
     * @throws RuntimeException if the table definition is not composite
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        avroSchema = AvroJob.getOutputKeySchema(context.getConfiguration());
        cfMetaData = CFMetadataUtility.initializeCfMetaData(context.getConfiguration());
        cfDef = cfMetaData.getCfDef();

        /* This exporter assumes tables are composite, which should be true of all current schemas */
        // Validate before building: initBuilder() casts the comparator to CompositeType, so running
        // it first on a non-composite table would presumably fail with a ClassCastException before
        // this explanatory error could be raised.
        if (!cfDef.isComposite) {
            throw new RuntimeException("Only can export composite CQL table schemas.");
        }

        initBuilder();
    }

    /**
     * Buffers one atom, flushing the previously buffered partition first if the key has changed.
     *
     * @param key   carries the partition key of the atom
     * @param value wrapper around the atom; a null atom is logged and skipped
     * @throws IllegalArgumentException if the atom is not a {@link Column}
     */
    @Override
    protected void map(AegisthusKey key, AtomWritable value, Context context)
            throws IOException, InterruptedException {
        if (currentKey == null) {
            currentKey = key.getKey();
        } else if (!currentKey.equals(key.getKey())) {
            // A new partition starts: emit everything collected for the previous one.
            flushCgm(context);
            currentKey = key.getKey();
        }

        OnDiskAtom atom = value.getAtom();
        if (atom == null) {
            LOG.warn("Got null atom for key {}.", cfMetaData.getKeyValidator().compose(key.getKey()));
            return;
        }

        if (atom instanceof Column) {
            cgmBuilder.add((Column) atom);
        } else {
            LOG.error("Non-colum atom. {} {}", atom.getClass(), atom);
            throw new IllegalArgumentException("Got a non-column Atom.");
        }
    }

    /** Flushes the last buffered partition, if any, once all input has been consumed. */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        super.cleanup(context);

        if (currentKey != null) {
            flushCgm(context);
        }
    }

    /** Replaces {@link #cgmBuilder} with a fresh, empty builder stamped with the current wall-clock time. */
    private void initBuilder() {
        // TODO: we might need to make "current" time configurable to avoid wrongly expiring data when trying to backfill.
        cgmBuilder = new ColumnGroupMap.Builder((CompositeType) cfMetaData.comparator, cfDef.hasCollections,
                System.currentTimeMillis());
    }

    /**
     * Writes one record per buffered column group for {@link #currentKey}, then resets the builder
     * and clears the current key. Does nothing (and resets nothing) when the builder is empty.
     */
    private void flushCgm(Context context) throws IOException, InterruptedException {
        if (cgmBuilder.isEmpty()) {
            return;
        }

        ByteBuffer[] keyComponents = cfDef.hasCompositeKey
                ? ((CompositeType) cfMetaData.getKeyValidator()).split(currentKey)
                : new ByteBuffer[] { currentKey };

        ColumnGroupMap staticGroup = ColumnGroupMap.EMPTY;
        // The builder is known to be non-empty here, so firstGroup() can be inspected directly.
        if (cgmBuilder.firstGroup().isStatic) {
            staticGroup = cgmBuilder.firstGroup();
            cgmBuilder.discardFirst();

            // Special case: if there are no rows, but only the static values, just flush the static values.
            if (cgmBuilder.isEmpty()) {
                handleGroup(context, ColumnGroupMap.EMPTY, keyComponents, staticGroup);
            }
        }

        for (ColumnGroupMap group : cgmBuilder.groups()) {
            handleGroup(context, group, keyComponents, staticGroup);
        }

        initBuilder();
        currentKey = null;
    }

    /**
     * Builds a single Avro record from one column group and writes it to the job output.
     *
     * @param group         source of the clustering and regular column values
     * @param keyComponents partition key components, indexed by the key column's position
     * @param staticGroup   source of the static column values
     */
    private void handleGroup(Context context, ColumnGroupMap group, ByteBuffer[] keyComponents,
                             ColumnGroupMap staticGroup) throws IOException, InterruptedException {
        GenericRecord record = new GenericData.Record(avroSchema);

        // write out partition keys
        for (CFDefinition.Name name : cfDef.partitionKeys()) {
            addCqlValueToRecord(record, name, keyComponents[name.position]);
        }

        // write out clustering columns
        for (CFDefinition.Name name : cfDef.clusteringColumns()) {
            addCqlValueToRecord(record, name, group.getKeyComponent(name.position));
        }

        // regular columns
        for (CFDefinition.Name name : cfDef.regularColumns()) {
            addValue(record, name, group);
        }

        // static columns
        for (CFDefinition.Name name : cfDef.staticColumns()) {
            addValue(record, name, staticGroup);
        }

        // Parameterized (not raw) AvroKey so the write matches the mapper's declared output key type.
        context.write(new AvroKey<GenericRecord>(record), NullWritable.get());
    }

    /**
     * Looks up a simple (non-collection) column in {@code group} and stores its value, or null when
     * the column is absent, in {@code record}.
     *
     * <p>Adapted from org.apache.cassandra.cql3.statements.SelectStatement.addValue.
     *
     * @throws RuntimeException if the column has a collection type
     */
    private void addValue(GenericRecord record, CFDefinition.Name name, ColumnGroupMap group) {
        if (name.type.isCollection()) {
            // TODO(danchia): support collections
            throw new RuntimeException("Collections not supported yet.");
        }

        Column c = group.getSimple(name.name.key);
        addCqlValueToRecord(record, name, (c == null) ? null : c.value());
    }

    /**
     * Deserializes {@code value} with the column's type and stores it in {@code record} under the
     * column's name. A null {@code value} is stored as null.
     *
     * <p>Some types are converted before being stored: UUID and TimeUUID values become their string
     * form, bytes become a {@code byte[]} copy, and timestamps become epoch milliseconds. A
     * {@link ReversedType} is unwrapped to its base type when choosing the conversion.
     */
    private void addCqlValueToRecord(GenericRecord record, CFDefinition.Name name, ByteBuffer value) {
        if (value == null) {
            record.put(name.name.toString(), null);
            return;
        }

        AbstractType<?> type = name.type;
        Object valueDeserialized = type.compose(value);

        AbstractType<?> baseType = (type instanceof ReversedType<?>)
                ? ((ReversedType<?>) type).baseType
                : type;

        /* special case some unsupported CQL3 types to Hive types. */
        if (baseType instanceof UUIDType || baseType instanceof TimeUUIDType) {
            valueDeserialized = ((UUID) valueDeserialized).toString();
        } else if (baseType instanceof BytesType) {
            ByteBuffer buffer = (ByteBuffer) valueDeserialized;
            byte[] data = new byte[buffer.remaining()];
            // Read through a duplicate so the position of the source buffer is left untouched.
            buffer.duplicate().get(data);
            valueDeserialized = data;
        } else if (baseType instanceof TimestampType) {
            Date date = (Date) valueDeserialized;
            valueDeserialized = date.getTime();
        }

        record.put(name.name.toString(), valueDeserialized);
    }
}