package ml.shifu.shifu.core.shuffle;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import ml.shifu.guagua.hadoop.util.HDPUtils;
import ml.shifu.guagua.mapreduce.GuaguaMapReduceConstants;
import ml.shifu.guagua.util.NumberFormatUtils;
import ml.shifu.shifu.container.obj.ModelConfig;
import ml.shifu.shifu.container.obj.RawSourceData;
import ml.shifu.shifu.core.dtrain.nn.NNConstants;
import ml.shifu.shifu.fs.PathFinder;
import ml.shifu.shifu.fs.ShifuFileUtils;
import ml.shifu.shifu.util.CommonUtils;
import ml.shifu.shifu.util.Constants;
import ml.shifu.shifu.util.Environment;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.collections.Predicate;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.jexl2.JexlException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.pig.impl.util.JarManager;
import org.encog.ml.data.MLDataSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* Created by zhanhu on 2/22/17.
*/
public class MapReduceShuffle {

    private static final Logger log = LoggerFactory.getLogger(MapReduceShuffle.class);

    // Both are set once in the constructor and never reassigned.
    private final PathFinder pathFinder;
    private final ModelConfig modelConfig;

    /**
     * Creates a shuffler bound to one model configuration; all paths are resolved
     * through a {@link PathFinder} built from that configuration.
     *
     * @param modelConfig the model configuration driving source type, paths and job naming
     */
    public MapReduceShuffle(ModelConfig modelConfig) {
        this.modelConfig = modelConfig;
        this.pathFinder = new PathFinder(this.modelConfig);
    }

    /**
     * Runs a MapReduce job that shuffles the normalized data under {@code srcDataPath}
     * into {@code shuffleSize} parts, then copies the shuffled output back over the
     * source path.
     *
     * @param srcDataPath path of the normalized data to shuffle
     * @throws IOException on HDFS access or job-submission failure
     * @throws ClassNotFoundException if job classes cannot be resolved at submission
     * @throws InterruptedException if the job wait is interrupted
     * @throws RuntimeException if the MapReduce job completes unsuccessfully
     */
    public void run(String srcDataPath) throws IOException, ClassNotFoundException, InterruptedException {
        RawSourceData.SourceType source = this.modelConfig.getDataSet().getSource();

        Configuration conf = new Configuration();
        // GenericOptionsParser mutates conf as a side effect: it registers the runtime
        // jars under -libjars so mappers/reducers can load them.
        new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars() });

        // Shuffle output is order-independent, so speculative execution is safe and
        // helps with straggler tasks. Both old and new property names are set to
        // cover different Hadoop versions.
        conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
        conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
        conf.setBoolean(GuaguaMapReduceConstants.MAPREDUCE_MAP_SPECULATIVE, true);
        conf.setBoolean(GuaguaMapReduceConstants.MAPREDUCE_REDUCE_SPECULATIVE, true);
        conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
        conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 100);

        String hdpVersion = HDPUtils.getHdpVersionForHDP224();
        if(StringUtils.isNotBlank(hdpVersion)) {
            // for hdp 2.2.4, hdp.version should be set and configuration files should be add to container class path
            conf.set("hdp.version", hdpVersion);
            HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
            HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
            HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
            HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
        }

        // One can set guagua/hadoop conf overrides in shifuconfig; inject them here.
        for(Map.Entry<Object, Object> entry: Environment.getProperties().entrySet()) {
            if(CommonUtils.isHadoopConfigurationInjected(entry.getKey().toString())) {
                conf.set(entry.getKey().toString(), entry.getValue().toString());
            }
        }

        int shuffleSize = getDataShuffleSize(srcDataPath, source);
        log.info("Try to shuffle data into - {} parts.", shuffleSize);
        conf.set(Constants.SHIFU_NORM_SHUFFLE_SIZE, Integer.toString(shuffleSize));

        Job job = Job.getInstance(conf, "Shifu: Shuffling normalized data - " + this.modelConfig.getModelSetName());
        job.setJarByClass(getClass());
        job.setMapperClass(DataShuffle.ShuffleMapper.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setPartitionerClass(DataShuffle.KvalPartitioner.class);

        job.setReducerClass(DataShuffle.ShuffleReducer.class);
        // One reducer per shuffle part: the reducer count defines the output part count.
        job.setNumReduceTasks(shuffleSize);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, srcDataPath);
        FileOutputFormat.setOutputPath(job, new Path(this.pathFinder.getShuffleDataPath()));

        // Clean output firstly; FileOutputFormat fails if the output path already exists.
        ShifuFileUtils.deleteFile(this.pathFinder.getShuffleDataPath(), source);

        // Submit job and block until completion.
        if(job.waitForCompletion(true)) {
            // TODO copy or move ?? A move would avoid duplicating the data, but copy
            // keeps the shuffled output available under the shuffle path as well.
            ShifuFileUtils.copy(this.pathFinder.getShuffleDataPath(), srcDataPath, source);
        } else {
            // Fixed copy-paste error: this is the data-shuffle job, not correlation computing.
            throw new RuntimeException("MapReduce Data Shuffle Job failed.");
        }
    }

    /**
     * Builds the comma-separated -libjars value with all runtime jars the mappers and
     * reducers need on their classpath.
     *
     * <p>GenericOptionsParser doesn't support *.jar wildcards currently, so each jar is
     * located explicitly via a class it contains.</p>
     *
     * @return jar paths joined by {@link NNConstants#LIB_JAR_SEPARATOR}
     */
    private String addRuntimeJars() {
        List<String> jars = new ArrayList<String>(16);
        // common-codec
        jars.add(JarManager.findContainingJar(Base64.class));
        // commons-compress-*.jar
        jars.add(JarManager.findContainingJar(BZip2CompressorInputStream.class));
        // commons-lang-*.jar
        jars.add(JarManager.findContainingJar(StringUtils.class));
        // common-io-*.jar
        jars.add(JarManager.findContainingJar(org.apache.commons.io.IOUtils.class));
        // common-collections
        jars.add(JarManager.findContainingJar(Predicate.class));
        // guava-*.jar
        jars.add(JarManager.findContainingJar(Splitter.class));
        // guagua-core-*.jar
        jars.add(JarManager.findContainingJar(NumberFormatUtils.class));
        // shifu-*.jar
        jars.add(JarManager.findContainingJar(getClass()));
        // jexl-*.jar
        jars.add(JarManager.findContainingJar(JexlException.class));
        // encog-core-*.jar
        jars.add(JarManager.findContainingJar(MLDataSet.class));
        // jackson-databind-*.jar
        jars.add(JarManager.findContainingJar(ObjectMapper.class));
        // jackson-core-*.jar
        jars.add(JarManager.findContainingJar(JsonParser.class));
        // jackson-annotations-*.jar
        jars.add(JarManager.findContainingJar(JsonIgnore.class));
        return StringUtils.join(jars, NNConstants.LIB_JAR_SEPARATOR);
    }

    /**
     * Decides how many parts (reduce tasks) to shuffle the data into.
     *
     * <p>Resolution order:</p>
     * <ol>
     *   <li>a user-fixed size from {@link Constants#SHIFU_NORM_SHUFFLE_SIZE}, used verbatim;</li>
     *   <li>the total data size divided by the preferred part size
     *       ({@link Constants#SHIFU_NORM_PREFER_PART_SIZE}), rounded up;</li>
     *   <li>otherwise the current number of file parts under {@code srcDataPath}.</li>
     * </ol>
     *
     * <p>The computed branches are clamped to at least 1 so an empty or tiny input
     * can never yield zero reduce tasks (which would produce no output at all).</p>
     *
     * @param srcDataPath path whose size / part count drives the computation
     * @param sourceType  HDFS or local source type
     * @return number of shuffle parts, &gt;= 1 unless the user explicitly fixed another value
     * @throws IOException on file-system access failure
     */
    private int getDataShuffleSize(String srcDataPath, RawSourceData.SourceType sourceType) throws IOException {
        // If user set fixed data shuffle size, then use it as-is.
        Integer fsize = Environment.getInt(Constants.SHIFU_NORM_SHUFFLE_SIZE);
        if(fsize != null) {
            return fsize;
        }

        // Calculate data shuffle size based on user's preferred part size.
        Long preferPartSize = Environment.getLong(Constants.SHIFU_NORM_PREFER_PART_SIZE);
        Long actualFileSize = ShifuFileUtils.getFileOrDirectorySize(srcDataPath, sourceType);

        if(preferPartSize != null && actualFileSize != null && preferPartSize != 0) {
            int dataShuffleSize = (int) (actualFileSize / preferPartSize);
            int roundedUp = (actualFileSize % preferPartSize == 0) ? dataShuffleSize : (dataShuffleSize + 1);
            // Guard against 0 parts when the input is empty.
            return Math.max(1, roundedUp);
        } else {
            // Guard against 0 parts when the source directory has no files.
            return Math.max(1, ShifuFileUtils.getFilePartCount(srcDataPath, sourceType));
        }
    }
}