/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.internal.app.runtime.batch;
import co.cask.cdap.api.ProgramLifecycle;
import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import co.cask.cdap.api.mapreduce.AbstractMapReduce;
import co.cask.cdap.api.mapreduce.MapReduceContext;
import co.cask.cdap.api.mapreduce.MapReduceTaskContext;
import co.cask.cdap.common.io.Locations;
import co.cask.cdap.common.lang.jar.BundleJarUtil;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
/**
* App to test localizing files and archives in MapReduce programs.
*/
public class AppWithLocalFiles extends AbstractApplication {
public static final String MR_INPUT_DATASET = "input";
public static final String MR_OUTPUT_DATASET = "output";
public static final String STOPWORDS_FILE_ARG = "stopwords.file";
public static final String STOPWORDS_FILE_ALIAS = "stopwords.txt";
private static final String LOCAL_ARCHIVE_ALIAS = "archive.jar";
/**
 * Declares everything this application consists of: the two {@link KeyValueTable} datasets
 * ({@link #MR_INPUT_DATASET} and {@link #MR_OUTPUT_DATASET}), the stream {@code LocalFileStream},
 * and the {@link MapReduceWithLocalFiles} program.
 */
@Override
public void configure() {
  // Both datasets share the same type, so they are created in one pass (input first, then output).
  for (String datasetName : new String[] {MR_INPUT_DATASET, MR_OUTPUT_DATASET}) {
    createDataset(datasetName, KeyValueTable.class);
  }
  addStream("LocalFileStream");
  addMapReduce(new MapReduceWithLocalFiles());
}
public static class MapReduceWithLocalFiles extends AbstractMapReduce {
/**
 * Prepares the job before submission: localizes the resources the tasks read, wires the input and
 * output datasets, and sets the mapper and reducer classes on the Hadoop job.
 *
 * <p>The dataset names are not taken from the constants directly; they are looked up in the runtime
 * arguments under the keys {@link #MR_INPUT_DATASET} and {@link #MR_OUTPUT_DATASET}.
 *
 * @param context the context used to localize resources and configure the job
 * @throws Exception if the temporary archive cannot be created or the job cannot be configured
 */
@Override
public void beforeSubmit(MapReduceContext context) throws Exception {
  Map<String, String> runtimeArgs = context.getRuntimeArguments();

  // The stop words file is only localized when its location was passed as a runtime argument.
  if (runtimeArgs.containsKey(STOPWORDS_FILE_ARG)) {
    URI stopWordsLocation = URI.create(runtimeArgs.get(STOPWORDS_FILE_ARG));
    context.localize(STOPWORDS_FILE_ALIAS, stopWordsLocation);
  }

  // NOTE(review): the trailing 'true' presumably marks the resource as an archive to be expanded;
  // the mapper later expects this alias to resolve to a directory.
  URI archiveLocation = createTemporaryArchiveFile();
  context.localize(LOCAL_ARCHIVE_ALIAS, archiveLocation, true);

  String inputDatasetName = runtimeArgs.get(MR_INPUT_DATASET);
  context.addInput(Input.ofDataset(inputDatasetName));
  String outputDatasetName = runtimeArgs.get(MR_OUTPUT_DATASET);
  context.addOutput(Output.ofDataset(outputDatasetName));

  Job hadoopJob = context.getHadoopJob();
  hadoopJob.setMapperClass(TokenizerMapper.class);
  hadoopJob.setReducerClass(IntSumReducer.class);
}
/**
 * Builds a throwaway jar archive that the job localizes.
 *
 * <p>Three empty temporary files are staged in a fresh temporary directory and bundled into
 * {@code myBundle.jar} inside a second temporary directory. Each staged file name is then looked up
 * in the jar and its stream opened and closed once. The staging files and directory are removed
 * before returning; the jar itself is left in place for the caller.
 *
 * @return the URI of the created jar file
 * @throws IOException if any of the file or jar operations fail
 */
private URI createTemporaryArchiveFile() throws IOException {
  File stagingDir = com.google.common.io.Files.createTempDir();
  List<File> stagedFiles = new ArrayList<>();
  try {
    for (int i = 0; i < 3; i++) {
      // The suffix must carry its own dot; a bare "txt" yields names such as "abcd0123456txt".
      stagedFiles.add(File.createTempFile("abcd" + i, ".txt", stagingDir));
    }
    File archiveDir = com.google.common.io.Files.createTempDir();
    File destArchive = new File(archiveDir, "myBundle.jar");
    BundleJarUtil.createJar(stagingDir, destArchive);
    for (File staged : stagedFiles) {
      // Sanity check: presumably fails if the entry did not make it into the jar.
      BundleJarUtil.getEntry(Locations.toLocation(destArchive), staged.getName()).getInput().close();
    }
    return destArchive.toURI();
  } finally {
    // Best-effort cleanup of the staging area, which is no longer needed once the jar exists.
    // Failures to delete are deliberately ignored so they never mask the real outcome.
    for (File staged : stagedFiles) {
      staged.delete();
    }
    stagingDir.delete();
  }
}
/**
 * Mapper that splits each input value into whitespace-delimited tokens and emits
 * {@code (token, 1)} for every token that is not a stop word.
 *
 * <p>The stop words are loaded in {@link #initialize(MapReduceTaskContext)} from the file localized
 * under {@link #STOPWORDS_FILE_ALIAS}.
 */
public static class TokenizerMapper extends Mapper<byte[], byte[], Text, IntWritable>
  implements ProgramLifecycle<MapReduceTaskContext> {

  private static final IntWritable ONE = new IntWritable(1);

  // Reused for every emitted token to avoid allocating a new Text per record.
  private final Text word = new Text();

  // Hash-based so the per-token membership test does not scan the whole stop word list.
  private final Set<String> stopWords = new HashSet<>();

  /**
   * Tokenizes {@code value} and writes {@code (token, ONE)} for each token that is not a stop word.
   * The input key is ignored.
   */
  @Override
  public void map(byte[] key, byte[] value, Context context) throws IOException, InterruptedException {
    StringTokenizer itr = new StringTokenizer(Bytes.toString(value));
    while (itr.hasMoreTokens()) {
      String token = itr.nextToken();
      if (!stopWords.contains(token)) {
        word.set(token);
        context.write(word, ONE);
      }
    }
  }

  /**
   * Validates the localized resources and loads the stop words.
   *
   * <p>Requires that exactly two files were localized, that the {@link #STOPWORDS_FILE_ARG} runtime
   * argument is set, that the stop words file exists, and that the local archive exists as a
   * directory. Every line of the stop words file is then added, unmodified, to the stop word set.
   *
   * @param context the task context giving access to localized files and runtime arguments
   * @throws Exception if a precondition fails or the stop words file cannot be read
   */
  @Override
  public void initialize(MapReduceTaskContext context) throws Exception {
    Map<String, File> localFiles = context.getAllLocalFiles();
    Preconditions.checkState(localFiles.size() == 2, "Expected 2 files to have been localized.");
    Map<String, String> args = context.getRuntimeArguments();
    Preconditions.checkArgument(args.containsKey(STOPWORDS_FILE_ARG),
                                "Runtime argument %s must be set.", STOPWORDS_FILE_ARG);
    // Only used to name the original file in the failure message below.
    String localFilePath = URI.create(args.get(STOPWORDS_FILE_ARG)).getPath();
    // will throw FileNotFoundException if stopwords file does not exist
    File stopWordsFile = context.getLocalFile(STOPWORDS_FILE_ALIAS);
    Preconditions.checkState(stopWordsFile.exists(), "Stopwords file %s must exist", localFilePath);
    File localArchive = context.getLocalFile(LOCAL_ARCHIVE_ALIAS);
    Preconditions.checkState(localArchive.exists(), "Local archive %s must exist", LOCAL_ARCHIVE_ALIAS);
    Preconditions.checkState(localArchive.isDirectory(), "Local archive %s must have been extracted to a " +
      "directory", LOCAL_ARCHIVE_ALIAS);
    try (BufferedReader reader = Files.newBufferedReader(stopWordsFile.toPath(), Charsets.UTF_8)) {
      String line;
      while ((line = reader.readLine()) != null) {
        stopWords.add(line);
      }
    }
  }

  /**
   * Nothing to release; no resources are held after {@link #initialize(MapReduceTaskContext)}.
   */
  @Override
  public void destroy() {
  }
}
/**
 * Reducer that adds up all {@link IntWritable} values received for a key and writes one record per
 * key: the key's string form and the sum, both converted to byte arrays with {@link Bytes}.
 */
public static class IntSumReducer extends Reducer<Text, IntWritable, byte[], byte[]> {

  /**
   * Sums {@code values} and emits {@code (key, sum)} as byte arrays.
   */
  @Override
  public void reduce(Text key, Iterable<IntWritable> values, Context context)
    throws IOException, InterruptedException {
    int total = 0;
    for (IntWritable count : values) {
      total = total + count.get();
    }
    byte[] wordBytes = Bytes.toBytes(key.toString());
    byte[] totalBytes = Bytes.toBytes(total);
    context.write(wordBytes, totalBytes);
  }
}
}
}