package com.skp.experiment.clustering.minhash;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.minhash.MinHashReducer;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.commandline.MinhashOptionCreator;
import org.apache.mahout.math.VectorWritable;
import com.skp.experiment.common.Text2DistributedRowMatrixJob;
/**
 * Hadoop driver that clusters users with MinHash.
 *
 * <p>Input is either a raw CSV of (user, item) purchase records — which is first
 * vectorized via {@link Text2DistributedRowMatrixJob} into a temp path — or an
 * already-vectorized sequence file when {@code --isVectorizedInput true} is given.
 * The MinHash map/reduce then emits cluster assignments; with {@code --debugOutput true}
 * the raw vectors are written as {@code VectorWritable} to a SequenceFile instead of text.
 */
public class MinHashJob extends AbstractJob {
  /** Temp-path suffix holding the vectorized form of a raw CSV input. */
  private static final String VECTORIZED_INPUT_PATH = MinHashJob.class.getName() + ".vectorized";

  private int minClusterSize;       // clusters with fewer users than this are dropped
  private int minVectorSize;        // users with fewer items than this are skipped
  private String hashType;          // one of: linear, polynomial, murmur
  private int numHashFunctions;
  private int keyGroups;
  private int numReduceTasks;
  private boolean debugOutput;      // true => emit VectorWritable to a SequenceFile
  private boolean isVectorizedInput;

  public static void main(String[] args) throws Exception {
    // Propagate the job's status to the shell (standard Hadoop Tool pattern);
    // previously the exit code from ToolRunner.run was silently discarded.
    System.exit(ToolRunner.run(new MinHashJob(), args));
  }

  /**
   * Parses options, optionally vectorizes the input, and runs the MinHash job.
   *
   * @param args command-line arguments (see the registered options)
   * @return 0 on success, non-zero on argument error or job failure
   */
  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(MinhashOptionCreator.MIN_CLUSTER_SIZE,
        "minClusterSize", "minimum number of users belong to cluster.",
        String.valueOf(0));
    addOption(MinhashOptionCreator.MIN_VECTOR_SIZE,
        "minVectorSize", "minimum number of item each user purchased.",
        String.valueOf(0));
    addOption(MinhashOptionCreator.HASH_TYPE, "hashType",
        "hash type: (linear, polynomial, murmur)", "murmur");
    addOption(MinhashOptionCreator.NUM_HASH_FUNCTIONS, "numHashFunctions",
        "number of hash functions.", String.valueOf(20));
    addOption(MinhashOptionCreator.KEY_GROUPS, "keyGroups",
        "number of key groups.", String.valueOf(4));
    addOption(MinhashOptionCreator.NUM_REDUCERS, "numReduceTasks",
        "number of reducer tasks.", String.valueOf(10));
    addOption(MinhashOptionCreator.DEBUG_OUTPUT, "debugOutput",
        "true if want to print out debug.", false);
    addOption(DefaultOptionCreator.OVERWRITE_OPTION, "overwrite",
        "true if want to overwrite.", false);
    addOption("isVectorizedInput", "vinput",
        "true if input is vectorized before. default is csv format input", false);
    if (parseArguments(args) == null) {
      return -1;
    }

    isVectorizedInput = Boolean.parseBoolean(getOption("isVectorizedInput"));
    Path input;
    if (!isVectorizedInput) {
      // Raw CSV input: vectorize into a temp path first (row index col 0,
      // column index col 1, value col 1, text keys).
      Path tmpOut = getTempPath(VECTORIZED_INPUT_PATH);
      int vectorizeStatus = ToolRunner.run(new Text2DistributedRowMatrixJob(), new String[]{
          "-i", getInputPath().toString(), "-o", tmpOut.toString(),
          "-ri", "0", "-ci", "1", "-vi", "1", "--outKeyType", "text"
      });
      if (vectorizeStatus != 0) {
        // Abort instead of running MinHash over a missing/partial vectorized input.
        return vectorizeStatus;
      }
      input = tmpOut;
    } else {
      input = getInputPath();
    }

    Path output = getOutputPath();
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
      HadoopUtil.delete(getConf(), output);
    }

    minClusterSize = Integer.parseInt(getOption(MinhashOptionCreator.MIN_CLUSTER_SIZE));
    minVectorSize = Integer.parseInt(getOption(MinhashOptionCreator.MIN_VECTOR_SIZE));
    hashType = getOption(MinhashOptionCreator.HASH_TYPE);
    numHashFunctions = Integer.parseInt(getOption(MinhashOptionCreator.NUM_HASH_FUNCTIONS));
    keyGroups = Integer.parseInt(getOption(MinhashOptionCreator.KEY_GROUPS));
    numReduceTasks = Integer.parseInt(getOption(MinhashOptionCreator.NUM_REDUCERS));
    // BUG FIX: previously parsed the option *name* constant (always "false"-ish),
    // so --debugOutput never took effect. Parse the option's value instead;
    // parseBoolean(null) is false when the option was not supplied.
    debugOutput = Boolean.parseBoolean(getOption(MinhashOptionCreator.DEBUG_OUTPUT));

    // Push the parsed options into the job configuration for mapper/reducer use.
    Configuration conf = getConf();
    conf.setInt(MinhashOptionCreator.MIN_CLUSTER_SIZE, minClusterSize);
    conf.setInt(MinhashOptionCreator.MIN_VECTOR_SIZE, minVectorSize);
    conf.set(MinhashOptionCreator.HASH_TYPE, hashType);
    conf.setInt(MinhashOptionCreator.NUM_HASH_FUNCTIONS, numHashFunctions);
    conf.setInt(MinhashOptionCreator.KEY_GROUPS, keyGroups);
    conf.setBoolean(MinhashOptionCreator.DEBUG_OUTPUT, debugOutput);

    // Debug mode keeps the vectors binary (SequenceFile of VectorWritable);
    // normal mode emits human-readable text.
    Class<? extends Writable> outputClass = debugOutput ? VectorWritable.class : Text.class;
    Class<? extends OutputFormat> outputFormatClass =
        debugOutput ? SequenceFileOutputFormat.class : TextOutputFormat.class;

    Job job = prepareJob(input, output, SequenceFileInputFormat.class,
        MinHashMapper.class, Text.class, outputClass,
        MinHashReducer.class, Text.class, outputClass, outputFormatClass
    );
    job.setNumReduceTasks(numReduceTasks);

    // Surface MapReduce failure to the caller instead of unconditionally returning success.
    return job.waitForCompletion(true) ? 0 : -1;
  }
}