package dima;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.google.common.base.Preconditions;
import com.google.common.primitives.Ints;
import org.apache.commons.cli.ParseException;
import org.apache.mahout.common.distance.CosineDistanceMeasure;
import org.apache.mahout.math.Vector;
import org.apache.pig.EvalFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.builtin.OutputSchema;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import java.io.IOException;
import java.util.List;
/**
 * Pig UDF that computes the cosine distance between two vectors using Mahout's
 * {@link CosineDistanceMeasure}.
 *
 * <p>The function takes exactly two arguments. Each argument is a tuple of the form
 * {@code (cardinality: int, entries: {entry: (index: int, value: double)})}; its first
 * field is read as the vector size and the whole tuple is handed to
 * {@link MahoutVectorConverter} to build a Mahout {@link Vector}.
 *
 * <p>Supported options (passed as a single whitespace-separated string to the constructor):
 * <ul>
 *   <li>{@code -skipValue X} - return {@code null} instead of the distance when the
 *       (offset-adjusted) distance is exactly equal to {@code X}</li>
 *   <li>{@code -offset X} - value combined with the computed distance before the skip
 *       check (see the note on the {@code offset} field)</li>
 * </ul>
 *
 * Date: 20.03.13
 * Time: 21:24
 *
 * @author Johannes Kirschnick
 */
@OutputSchema("distance:double")
public class CosineDistancePigFunction extends EvalFunc<Double> {

    /** Converts the incoming Pig tuples into Mahout vectors. */
    private final MahoutVectorConverter vectorConverter;

    /** Distance measure used for every call to {@link #exec(Tuple)}. */
    CosineDistanceMeasure cosineDistanceMeasure = new CosineDistanceMeasure();

    /** {@code true} when a {@code -skipValue} option was supplied. */
    boolean skip = false;

    /** Distance value that should be suppressed (returned as {@code null}); {@code null} if unset. */
    @Parameter(names = {"-skipValue"}, description = "Skip values which are equal to X", required = false)
    Double skipValue;

    // NOTE(review): the option description says "Subtract", but exec() ADDS this value to the
    // distance. Behavior and description are left untouched here; confirm which one is intended.
    @Parameter(names = {"-offset"}, description = "Subtract this from resulting distance function (before skip evaluation)", required = false)
    double offset = 0;

    /** Creates the function without any options. */
    public CosineDistancePigFunction() {
        this(""); // no options
    }

    /**
     * Creates the function and parses the given option string.
     *
     * @param options whitespace-separated options, e.g. {@code "-skipValue 1.0 -offset 0.5"};
     *                {@code null} or a blank string means "no options"
     * @throws RuntimeException         if the options cannot be parsed; the message contains the
     *                                  usage text and the original exception is kept as the cause
     * @throws IllegalArgumentException if a {@link ParseException} is raised during construction
     */
    public CosineDistancePigFunction(String options) {
        JCommander jCommander = new JCommander(this);
        try {
            // parse options
            jCommander.parse(tokenize(options));
            skip = (skipValue != null);
            vectorConverter = new MahoutVectorConverter();
        } catch (ParameterException e) {
            StringBuilder out = new StringBuilder();
            jCommander.setProgramName(this.getClass().getSimpleName());
            jCommander.usage(out);
            // Surface bad options as an unchecked exception that carries the usage text,
            // keeping the original exception as the cause for diagnostics.
            throw new RuntimeException(e.getMessage() + "\n" + "In: " + options + "\n" + out.toString(), e);
        } catch (ParseException e) {
            throw new IllegalArgumentException(e);
        }
    }

    /**
     * Splits an option string into tokens for JCommander.
     *
     * <p>Leading/trailing whitespace is ignored and runs of whitespace count as a single
     * separator, so no empty tokens are ever produced.
     *
     * @param options raw option string, may be {@code null}
     * @return the tokens, or an empty array when there are none
     */
    private static String[] tokenize(String options) {
        if (options == null) {
            return new String[0];
        }
        String trimmed = options.trim();
        if (trimmed.isEmpty()) {
            return new String[0];
        }
        return trimmed.split("\\s+");
    }

    /**
     * Computes the cosine distance between the two vectors contained in {@code input}.
     *
     * @param input tuple holding exactly two vector tuples, each shaped like
     *              {@code (cardinality: int, entries: {entry: (index: int, value: double)})}
     * @return the cosine distance plus the configured offset, or {@code null} when a skip value
     *         is configured and the result is exactly equal to it
     * @throws IllegalArgumentException if {@code input} does not have two fields or the two
     *                                  vectors report different sizes
     * @throws IOException              if a field of the tuple cannot be read
     */
    @Override
    public Double exec(Tuple input) throws IOException {
        // we want something like this
        // (cardinality: int, entries: {entry: (index: int, value: double)})
        // for each vector
        // Template form: the message is only formatted when the check actually fails.
        Preconditions.checkArgument(input.size() == 2, "We need 2 arguments, not %s", input.size());

        Tuple tuple1 = (Tuple) input.get(0);
        Integer sizeV1 = (Integer) tuple1.get(0);
        Tuple tuple2 = (Tuple) input.get(1);
        Integer sizeV2 = (Integer) tuple2.get(0);
        Preconditions.checkArgument(Ints.compare(sizeV1, sizeV2) == 0,
                "Vector sizes are different %s != %s", sizeV1, sizeV2);

        Vector vector = vectorConverter.toVector(tuple1);
        Vector vector2 = vectorConverter.toVector(tuple2);
        double distance = cosineDistanceMeasure.distance(vector, vector2) + offset;

        // skip is only true when skipValue is non-null, so the short-circuit
        // evaluation prevents unboxing a null skipValue.
        if (skip && distance == skipValue.doubleValue()) {
            // ignore
            return null;
        }
        return distance;
    }

    /**
     * Returns the argument-to-function mapping; simply delegates to the superclass.
     *
     * @return whatever {@link EvalFunc#getArgToFuncMapping()} returns
     * @throws FrontendException propagated from the superclass
     */
    @Override
    public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
        return super.getArgToFuncMapping();
    }
}