/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.examples.datacleansing;
import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
/**
 * Sample application that demonstrates data cleansing.
 *
 * <p>The application registers one service and one MapReduce program, and creates four datasets:
 * a key-value table named {@value #CONSUMING_STATE} and three partitioned file sets named
 * {@value #RAW_RECORDS}, {@value #CLEAN_RECORDS} and {@value #INVALID_RECORDS}.
 */
public class DataCleansing extends AbstractApplication {

  /** Name under which this application is registered. */
  protected static final String NAME = "DataCleansing";

  /** Name of the partitioned file set that stores the input records. */
  protected static final String RAW_RECORDS = "rawRecords";

  /** Name of the partitioned file set that stores the clean records. */
  protected static final String CLEAN_RECORDS = "cleanRecords";

  /** Name of the partitioned file set that stores the invalid records. */
  protected static final String INVALID_RECORDS = "invalidRecords";

  /** Name of the key-value table that stores the state of the incrementally processing MapReduce. */
  protected static final String CONSUMING_STATE = "consumingState";

  /**
   * Declares the application's name, description, programs and datasets.
   *
   * <p>Each step is delegated to a private helper; the helpers are invoked in the order
   * programs first, then the state table, then the raw, clean and invalid file sets.
   */
  @Override
  public void configure() {
    setName(NAME);
    setDescription("Example data cleansing application");

    registerPrograms();
    createConsumingStateTable();
    createRawRecordsFileSet();
    createCleanRecordsFileSet();
    createInvalidRecordsFileSet();
  }

  /** Adds the service used to ingest and retrieve data, and the MapReduce that processes the raw records. */
  private void registerPrograms() {
    // Ingest and retrieve the data using a Service
    addService(new DataCleansingService());
    // Process the records from the "rawRecords" partitioned file set using MapReduce
    addMapReduce(new DataCleansingMapReduce());
  }

  /** Creates the key-value table that keeps the state of the incrementally processing MapReduce. */
  private void createConsumingStateTable() {
    createDataset(CONSUMING_STATE, KeyValueTable.class);
  }

  /**
   * Creates the "rawRecords" partitioned file set that holds the input records.
   * It is partitioned by a single long field, "time", and configured with a text input format
   * so that it works with MapReduce.
   */
  private void createRawRecordsFileSet() {
    Partitioning byTime = Partitioning.builder()
      .addLongField("time")
      .build();

    createDataset(
      RAW_RECORDS,
      PartitionedFileSet.class,
      PartitionedFileSetProperties.builder()
        // Partitioning
        .setPartitioning(byTime)
        // File set
        .setInputFormat(TextInputFormat.class)
        .setDescription("Store input records")
        .build());
  }

  /**
   * Creates the "cleanRecords" partitioned file set.
   * It is partitioned by a long field, "time", followed by an int field, "zip"; it is written
   * with a text output format and has Explore enabled on creation.
   */
  private void createCleanRecordsFileSet() {
    Partitioning byTimeAndZip = Partitioning.builder()
      .addLongField("time")
      .addIntField("zip")
      .build();

    createDataset(
      CLEAN_RECORDS,
      PartitionedFileSet.class,
      PartitionedFileSetProperties.builder()
        // Partitioning
        .setPartitioning(byTimeAndZip)
        // File set
        .setOutputFormat(TextOutputFormat.class)
        // Explore (to create a partitioned Hive table)
        .setEnableExploreOnCreate(true)
        .setExploreFormat("text")
        .setExploreFormatProperty("delimiter", "\n")
        .setExploreSchema("record STRING")
        .setDescription("Store clean records")
        .build());
  }

  /**
   * Creates the "invalidRecords" partitioned file set.
   * It is partitioned by a single long field, "time"; it is written with a text output format
   * and has Explore enabled on creation.
   */
  private void createInvalidRecordsFileSet() {
    Partitioning byTime = Partitioning.builder()
      .addLongField("time")
      .build();

    createDataset(
      INVALID_RECORDS,
      PartitionedFileSet.class,
      PartitionedFileSetProperties.builder()
        // Partitioning
        .setPartitioning(byTime)
        // File set
        .setOutputFormat(TextOutputFormat.class)
        // Explore (to create a partitioned Hive table)
        .setEnableExploreOnCreate(true)
        .setExploreFormat("text")
        .setExploreFormatProperty("delimiter", "\n")
        .setExploreSchema("record STRING")
        .setDescription("Store invalid records")
        .build());
  }
}