/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.internal.app.runtime.batch;

import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;
import co.cask.cdap.api.dataset.table.Put;
import co.cask.cdap.api.dataset.table.Row;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

/**
 * App used to test whether M/R works well with partitioned file sets.
 * It uses one M/R to read from a table and write partitions, and another M/R to read partitions and write to a table.
*/
public class AppWithPartitionedFileSet extends AbstractApplication {

public static final String INPUT = "in-table";
public static final String PARTITIONED = "partitioned";
public static final String OUTPUT = "out-table";
public static final byte[] ONLY_COLUMN = { 'x' };
public static final String ROW_TO_WRITE = "row.to.write";
  private static final String SEPARATOR = ":";

@Override
public void configure() {
    setName("AppWithPartitionedFileSet");
    setDescription("Application with MapReduce jobs using a partitioned file set as dataset");
createDataset(INPUT, "table");
createDataset(OUTPUT, "table");
createDataset(PARTITIONED, "partitionedFileSet", PartitionedFileSetProperties.builder()
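      // partition key consists of a string field "type" and a long field "time";
      // this is a general partitioned file set, not a time-partitioned one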
.setPartitioning(Partitioning.builder()
.addStringField("type")
.addLongField("time")
.build())
// properties for file set
.setBasePath("partitioned")
.setInputFormat(TextInputFormat.class)
.setOutputFormat(TextOutputFormat.class)
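      // note: SEPERATOR (sic) is the actual constant name in Hadoop's TextOutputFormat;
      // it configures the separator written between each key and value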
.setOutputProperty(TextOutputFormat.SEPERATOR, SEPARATOR)
// don't configure properties for the Hive table - this is used in a context where explore is disabled
.build());
addMapReduce(new PartitionWriter());
addMapReduce(new PartitionReader());
  }

/**
* Map/Reduce that reads the "input" table and writes to a partition.
*/
public static final class PartitionWriter extends AbstractMapReduce {
@Override
public void configure() {
setInputDataset(INPUT);
setOutputDataset(PARTITIONED);
    }

@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
Job job = context.getHadoopJob();
job.setMapperClass(SimpleMapper.class);
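      // map-only job: no reducers, mapper output goes straight to the output partition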
job.setNumReduceTasks(0);
}
}
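
  /**
   * Mapper that emits each row key and the value of its single column as text;
   * TextOutputFormat then joins them with the configured separator.
   */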
public static class SimpleMapper extends Mapper<byte[], Row, Text, Text> {
@Override
public void map(byte[] rowKey, Row row, Context context)
throws IOException, InterruptedException {
context.write(new Text(Bytes.toString(rowKey)),
new Text(Bytes.toString(row.get(ONLY_COLUMN))));
}
  }

/**
   * Map/Reduce that reads partitions of the file set and writes to the "output" table.
*/
public static final class PartitionReader extends AbstractMapReduce {
@Override
public void configure() {
setInputDataset(PARTITIONED);
setOutputDataset(OUTPUT);
    }

@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
Job job = context.getHadoopJob();
job.setMapperClass(ReaderMapper.class);
job.setNumReduceTasks(0);
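      // the row to write must be passed in as a runtime argument; hand it to the
      // mapper tasks through the job configuration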
String row = context.getRuntimeArguments().get(ROW_TO_WRITE);
job.getConfiguration().set(ROW_TO_WRITE, row);
}
}
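
  /**
   * Mapper that parses each line of a partition into two fields and writes them
   * as column and value of a Put to the configured row.
   */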
public static class ReaderMapper extends Mapper<LongWritable, Text, byte[], Put> {
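    // static is acceptable here: every task assigns the same value in setup()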
private static byte[] rowToWrite;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
rowToWrite = Bytes.toBytes(context.getConfiguration().get(ROW_TO_WRITE));
    }

@Override
public void map(LongWritable pos, Text text, Context context)
throws IOException, InterruptedException {
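      // each line has the form "<original row key><SEPARATOR><value>"; the original
      // row key becomes the column name in the output table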
String line = text.toString();
String[] fields = line.split(SEPARATOR);
context.write(rowToWrite, new Put(rowToWrite, Bytes.toBytes(fields[0]), Bytes.toBytes(fields[1])));
}
}
}