/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.examples.datacleansing;

import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.Resources;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.lib.DynamicPartitioner;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import co.cask.cdap.api.dataset.lib.partitioned.KVTableStatePersistor;
import co.cask.cdap.api.dataset.lib.partitioned.PartitionBatchInput;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import co.cask.cdap.api.metrics.Metrics;
import com.google.common.collect.ImmutableMap;
import com.google.gson.JsonParser;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * A simple MapReduce program that reads records from the rawRecords PartitionedFileSet, writes each record that
 * matches a particular {@link Schema} to the cleanRecords PartitionedFileSet, and routes records that do not match
 * to a separate PartitionedFileSet of invalid records. It also keeps track of which input partitions it has already
 * processed, so that each run consumes only the partitions that are new since the previous run. The output time
 * partition is taken from the required runtime argument "output.partition.key"; an alternative schema may be
 * supplied through the optional runtime argument "schema.key".
 */
public class DataCleansingMapReduce extends AbstractMapReduce {
protected static final String NAME = "DataCleansingMapReduce";
protected static final String OUTPUT_PARTITION_KEY = "output.partition.key";
protected static final String SCHEMA_KEY = "schema.key";
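
  // Assigned in beforeSubmit(); used in onFinish() to persist which input partitions this run consumed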
  private PartitionBatchInput.BatchPartitionCommitter partitionCommitter;

@Override
public void configure() {
setName(NAME);
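    // Request 1024 MB of memory for each mapper and reducer container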
setMapperResources(new Resources(1024));
setReducerResources(new Resources(1024));
  }

@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
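    // Consume only those input partitions that earlier runs have not yet processed; the returned
    // committer persists this consumed state via the KVTableStatePersistor in onFinish()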
partitionCommitter =
PartitionBatchInput.setInput(context, DataCleansing.RAW_RECORDS,
new KVTableStatePersistor(DataCleansing.CONSUMING_STATE, "state.key"));

    // Each run writes its output to a time partition determined by a runtime argument
Long timeKey = Long.valueOf(context.getRuntimeArguments().get(OUTPUT_PARTITION_KEY));
PartitionKey outputKey = PartitionKey.builder().addLongField("time", timeKey).build();
Map<String, String> metadataToAssign = ImmutableMap.of("source.program", "DataCleansingMapReduce");

    // Set up two outputs: one for invalid records and one for valid records
Map<String, String> invalidRecordsArgs = new HashMap<>();
PartitionedFileSetArguments.setOutputPartitionKey(invalidRecordsArgs, outputKey);
PartitionedFileSetArguments.setOutputPartitionMetadata(invalidRecordsArgs, metadataToAssign);
context.addOutput(Output.ofDataset(DataCleansing.INVALID_RECORDS, invalidRecordsArgs));
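
    // For clean records, the partition key is computed per record at write time by the TimeAndZipPartitioner below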
Map<String, String> cleanRecordsArgs = new HashMap<>();
PartitionedFileSetArguments.setDynamicPartitioner(cleanRecordsArgs, TimeAndZipPartitioner.class);
PartitionedFileSetArguments.setOutputPartitionMetadata(cleanRecordsArgs, metadataToAssign);
context.addOutput(Output.ofDataset(DataCleansing.CLEAN_RECORDS, cleanRecordsArgs));

    Job job = context.getHadoopJob();
job.setMapperClass(SchemaMatchingFilter.class);
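    // This is a map-only job: the mappers write directly to the partitioned file set outputs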
job.setNumReduceTasks(0);
    // Propagate the schema (if any) to the mappers through the job configuration
String schemaJson = context.getRuntimeArguments().get(SCHEMA_KEY);
if (schemaJson != null) {
job.getConfiguration().set(SCHEMA_KEY, schemaJson);
}
  }

@Override
public void onFinish(boolean succeeded, MapReduceContext context) throws Exception {
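    // Persist the consumed-partition state only if the run succeeded, so that a failed run's
    // partitions are picked up again by the next run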
partitionCommitter.onFinish(succeeded);
  }

  /**
   * Partitions records by a runtime argument (time) and by a field extracted from each record being written (zip).
   */
public static final class TimeAndZipPartitioner extends DynamicPartitioner<NullWritable, Text> {
private Long time;
private JsonParser jsonParser;
@Override
public void initialize(MapReduceTaskContext<NullWritable, Text> mapReduceTaskContext) {
this.time = Long.valueOf(mapReduceTaskContext.getRuntimeArguments().get(OUTPUT_PARTITION_KEY));
this.jsonParser = new JsonParser();
}
@Override
public PartitionKey getPartitionKey(NullWritable key, Text value) {
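      // Partition each record by this run's time and by the record's 'zip' field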
int zip = jsonParser.parse(value.toString()).getAsJsonObject().get("zip").getAsInt();
return PartitionKey.builder().addLongField("time", time).addIntField("zip", zip).build();
}
  }

  /**
   * A Mapper that routes each input record to either the clean or the invalid output, depending on whether the
   * record matches a given schema.
   */
public static class SchemaMatchingFilter extends Mapper<LongWritable, Text, NullWritable, Text>
implements ProgramLifecycle<MapReduceTaskContext<NullWritable, Text>> {
    public static final Schema DEFAULT_SCHEMA =
      Schema.recordOf("person",
                      Schema.Field.of("pid", Schema.of(Schema.Type.LONG)),
                      Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
                      Schema.Field.of("dob", Schema.of(Schema.Type.STRING)),
                      Schema.Field.of("zip", Schema.of(Schema.Type.INT)));

private SimpleSchemaMatcher schemaMatcher;
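    // Injected by the CDAP runtime; used to emit counts of valid and invalid records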
private Metrics mapMetrics;
    private MapReduceTaskContext<NullWritable, Text> mapReduceTaskContext;

@Override
public void initialize(MapReduceTaskContext<NullWritable, Text> context) throws Exception {
this.mapReduceTaskContext = context;
    }

@Override
public void destroy() {
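      // no-op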
    }

@Override
protected void setup(Context context) throws IOException, InterruptedException {
      // Build the schema matcher, falling back to the default schema when none was supplied in the job configuration
String schemaJson = context.getConfiguration().get(SCHEMA_KEY);
if (schemaJson == null) {
schemaMatcher = new SimpleSchemaMatcher(DEFAULT_SCHEMA);
} else {
schemaMatcher = new SimpleSchemaMatcher(Schema.parseJson(schemaJson));
}
}
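
    // Writes the record to the named clean or invalid output through the CDAP task context; invoked by the
    // Hadoop map() method below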
public void map(LongWritable key, Text data, MapReduceTaskContext<NullWritable, Text> context)
throws IOException, InterruptedException {
if (!schemaMatcher.matches(data.toString())) {
context.write(DataCleansing.INVALID_RECORDS, NullWritable.get(), data);
mapMetrics.count("records.invalid", 1);
} else {
context.write(DataCleansing.CLEAN_RECORDS, NullWritable.get(), data);
mapMetrics.count("records.valid", 1);
}
}
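
    // Hadoop's entry point; delegates to the multi-output variant above, supplying the CDAP task context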
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
map(key, value, this.mapReduceTaskContext);
}
}
}