package com.linkedin.camus.sweeper.mapreduce;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.SchemaCompatibility;
import org.apache.avro.SchemaCompatibility.SchemaCompatibilityType;
import org.apache.avro.SchemaParseException;
import org.apache.avro.SchemaValidationException;
import org.apache.avro.SchemaValidatorBuilder;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroOutputFormat;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapred.FsInput;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import com.linkedin.camus.sweeper.utils.RelaxedAvroKeyOutputFormat;
import com.linkedin.camus.sweeper.utils.RelaxedAvroSerialization;
import com.linkedin.camus.sweeper.utils.RelaxedSchemaUtils;
public class CamusSweeperAvroKeyJob extends CamusSweeperJob {
private static final Log LOG = LogFactory.getLog(CamusSweeperAvroKeyJob.class.getName());
@Override
public void configureJob(String topic, Job job) {
boolean skipNameValidation = RelaxedSchemaUtils.skipNameValidation(job.getConfiguration());
if (skipNameValidation) {
RelaxedAvroSerialization.addToConfiguration(job.getConfiguration());
}
// setting up our input format and map output types
super.configureInput(job, AvroKeyCombineFileInputFormat.class, AvroKeyMapper.class, AvroKey.class, AvroValue.class);
// setting up our output format and output types
super.configureOutput(job, skipNameValidation ? RelaxedAvroKeyOutputFormat.class : AvroKeyOutputFormat.class,
AvroKeyReducer.class, AvroKey.class, NullWritable.class);
// finding the newest file from our input. this file will contain the newest version of our avro
// schema.
Schema schema;
try {
schema = getNewestSchemaFromSource(job);
} catch (IOException e) {
throw new RuntimeException(e);
}
// checking if we have a key schema used for deduping. if we don't then we make this a map only
// job and set the key schema
// to the newest input schema
String keySchemaStr = getConfValue(job, topic, "camus.sweeper.avro.key.schema");
Schema keySchema;
if (job.getConfiguration().getBoolean("camus.sweeper.use.all.attributes", false)) {
log.info("Using all attributes in the schema (except Map fields) for deduping");
keySchema = getAllFieldsExceptMap(schema);
} else if (keySchemaStr == null || keySchemaStr.isEmpty()
|| job.getConfiguration().getBoolean("second.stage", false)) {
job.setNumReduceTasks(0);
keySchema = schema;
} else {
keySchema = RelaxedSchemaUtils.parseSchema(keySchemaStr, job.getConfiguration());
keySchema = duplicateRecord(keySchema, schema);
if (!validateKeySchema(schema, keySchema)) {
log.info("topic:" + topic + " key invalid, using map only job");
job.setNumReduceTasks(0);
keySchema = schema;
}
}
setupSchemas(topic, job, schema, keySchema);
// setting the compression level. Only used if compression is enabled. default is 6
job.getConfiguration().setInt(AvroOutputFormat.DEFLATE_LEVEL_KEY,
job.getConfiguration().getInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, 6));
}
private Schema getAllFieldsExceptMap(Schema schema) {
List<Field> fields = new ArrayList<Schema.Field>();
for (Field f : schema.getFields()) {
if (f.schema().getType() != Schema.Type.MAP) {
fields.add(new Field(f.name(), f.schema(), f.doc(), f.defaultValue(), f.order()));
}
}
Schema newSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getName(), false);
newSchema.setFields(fields);
return newSchema;
}
private boolean validateKeySchema(Schema schema, Schema keySchema) {
return SchemaCompatibility.checkReaderWriterCompatibility(keySchema, schema).getType()
.equals(SchemaCompatibilityType.COMPATIBLE);
}
public Schema duplicateRecord(Schema record, Schema original) {
List<Field> fields = new ArrayList<Schema.Field>();
for (Field f : record.getFields()) {
Schema fldSchema;
if (original.getField(f.name()) != null) {
fldSchema = original.getField(f.name()).schema();
} else {
fldSchema = f.schema();
}
fields.add(new Field(f.name(), fldSchema, f.doc(), f.defaultValue(), f.order()));
}
Schema newRecord = Schema.createRecord(original.getName(), record.getDoc(), original.getNamespace(), false);
newRecord.setFields(fields);
return newRecord;
}
private void setupSchemas(String topic, Job job, Schema schema, Schema keySchema) {
AvroJob.setInputKeySchema(job, schema);
AvroJob.setMapOutputKeySchema(job, keySchema);
AvroJob.setMapOutputValueSchema(job, schema);
Schema reducerSchema =
RelaxedSchemaUtils.parseSchema(getConfValue(job, topic, "camus.output.schema", schema.toString()),
job.getConfiguration());
AvroJob.setOutputKeySchema(job, reducerSchema);
}
private Schema getNewestSchemaFromSource(Job job) throws IOException {
FileSystem fs = FileSystem.get(job.getConfiguration());
Path[] sourceDirs = FileInputFormat.getInputPaths(job);
List<FileStatus> files = new ArrayList<FileStatus>();
for (Path sourceDir : sourceDirs) {
files.addAll(Arrays.asList(fs.listStatus(sourceDir)));
}
Collections.sort(files, new ReverseLastModifiedComparitor());
for (FileStatus f : files) {
Schema schema = getNewestSchemaFromSource(f.getPath(), fs);
if (schema != null)
return schema;
}
return null;
}
private Schema getNewestSchemaFromSource(Path sourceDir, FileSystem fs) throws IOException {
FileStatus[] files = fs.listStatus(sourceDir);
Arrays.sort(files, new ReverseLastModifiedComparitor());
for (FileStatus f : files) {
if (f.isDir()) {
Schema schema = getNewestSchemaFromSource(f.getPath(), fs);
if (schema != null)
return schema;
} else if (f.getPath().getName().endsWith(".avro")) {
FsInput fi = new FsInput(f.getPath(), fs.getConf());
GenericDatumReader<GenericRecord> genReader = new GenericDatumReader<GenericRecord>();
DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(fi, genReader);
return reader.getSchema();
}
}
return null;
}
class ReverseLastModifiedComparitor implements Comparator<FileStatus> {
@Override
public int compare(FileStatus o1, FileStatus o2) {
if (o2.getModificationTime() < o1.getModificationTime())
return -1;
else if (o2.getModificationTime() > o1.getModificationTime())
return 1;
else
return 0;
}
}
}