package mil.nga.giat.geowave.analytic.mapreduce.kmeans.runner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import mil.nga.giat.geowave.analytic.PropertyManagement;
import mil.nga.giat.geowave.analytic.clustering.DistortionGroupManagement.DistortionDataAdapter;
import mil.nga.giat.geowave.analytic.clustering.DistortionGroupManagement.DistortionEntry;
import mil.nga.giat.geowave.analytic.clustering.NestedGroupCentroidAssignment;
import mil.nga.giat.geowave.analytic.mapreduce.CountofDoubleWritable;
import mil.nga.giat.geowave.analytic.mapreduce.GeoWaveAnalyticJobRunner;
import mil.nga.giat.geowave.analytic.mapreduce.kmeans.KMeansDistortionMapReduce;
import mil.nga.giat.geowave.analytic.param.CentroidParameters;
import mil.nga.giat.geowave.analytic.param.ClusteringParameters;
import mil.nga.giat.geowave.analytic.param.GlobalParameters;
import mil.nga.giat.geowave.analytic.param.JumpParameters;
import mil.nga.giat.geowave.analytic.param.ParameterEnum;
import mil.nga.giat.geowave.core.store.operations.remote.options.DataStorePluginOptions;
import mil.nga.giat.geowave.mapreduce.input.GeoWaveInputFormat;
import mil.nga.giat.geowave.mapreduce.output.GeoWaveOutputFormat;
import mil.nga.giat.geowave.mapreduce.output.GeoWaveOutputKey;
/**
 * Job runner that configures and launches the K-Means distortion map-reduce
 * job whose mapper, combiner and reducer are declared in
 * {@link KMeansDistortionMapReduce}.
 *
 * See Catherine A. Sugar and Gareth M. James (2003).
 * "Finding the number of clusters in a data set: An information theoretic approach"
 * Journal of the American Statistical Association 98 (January): 750–763
 */
public class KMeansDistortionJobRunner extends
        GeoWaveAnalyticJobRunner
{
    /** Reducer count installed by the constructor. */
    private static final int INITIAL_REDUCER_COUNT = 8;

    /** Task timeout written to the job configuration: 15 minutes in milliseconds. */
    private static final long TASK_TIMEOUT_MILLIS = 1000L * 60L * 15L;

    /** Name reported by {@link #getJobName()}. */
    private static final String JOB_NAME = "K-Means Distortion";

    /** Store options handed to the input format in {@link #configure(Job)}. */
    private DataStorePluginOptions dataStoreOptions;

    /** Centroid count recorded in the job configuration; 1 unless overridden. */
    private int centroidCount = 1;

    /**
     * Creates a runner whose reducer count starts at 8.
     */
    public KMeansDistortionJobRunner() {
        setReducerCount(INITIAL_REDUCER_COUNT);
    }

    /**
     * Stores the data store options that {@link #configure(Job)} later passes
     * to {@link GeoWaveInputFormat}.
     *
     * @param dataStoreOptions
     *            the data store options to use
     */
    public void setDataStoreOptions(
            final DataStorePluginOptions dataStoreOptions ) {
        this.dataStoreOptions = dataStoreOptions;
    }

    /**
     * Stores the centroid count that {@link #configure(Job)} records under
     * {@code JumpParameters.Jump.COUNT_OF_CENTROIDS}.
     *
     * @param k
     *            the number of centroids
     */
    public void setCentroidsCount(
            final int k ) {
        centroidCount = k;
    }

    /**
     * Wires the distortion mapper, combiner and reducer into the job, selects
     * the GeoWave output format, and writes the task timeout, centroid count,
     * store options and distortion adapter into the job's configuration.
     *
     * @param job
     *            the job being prepared
     * @throws Exception
     *             propagated from the configuration calls
     */
    @Override
    public void configure(
            final Job job )
            throws Exception {
        // Map side.
        job.setMapperClass(KMeansDistortionMapReduce.KMeansDistortionMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CountofDoubleWritable.class);

        // Combine and reduce side.
        job.setCombinerClass(KMeansDistortionMapReduce.KMeansDistorationCombiner.class);
        job.setReducerClass(KMeansDistortionMapReduce.KMeansDistortionReduce.class);

        // Job output.
        job.setOutputFormatClass(GeoWaveOutputFormat.class);
        job.setOutputKeyClass(GeoWaveOutputKey.class);
        job.setOutputValueClass(DistortionEntry.class);

        final Configuration jobConf = job.getConfiguration();

        // Extend the task wait time to 15 minutes (default: 600 seconds).
        jobConf.setLong(
                "mapred.task.timeout",
                TASK_TIMEOUT_MILLIS);

        final ParameterEnum<Integer> centroidCountParam = (ParameterEnum<Integer>) JumpParameters.Jump.COUNT_OF_CENTROIDS;
        centroidCountParam.getHelper().setValue(
                jobConf,
                KMeansDistortionMapReduce.class,
                Integer.valueOf(centroidCount));

        // The Mapper looks up the adapter through the input format
        // parameters, so the store options must be set on the input side.
        GeoWaveInputFormat.setStoreOptions(
                jobConf,
                dataStoreOptions);

        GeoWaveOutputFormat.addDataAdapter(
                jobConf,
                new DistortionDataAdapter());
    }

    /**
     * @return the class used as the scope for this runner's configuration
     *         entries, {@link KMeansDistortionMapReduce}
     */
    @Override
    public Class<?> getScope() {
        return KMeansDistortionMapReduce.class;
    }

    /**
     * Updates the reducer count from the
     * {@code ClusteringParameters.Clustering.MAX_REDUCER_COUNT} run-time
     * property (the current reducer count is supplied as the second argument
     * of the lookup), copies the centroid extractor, wrapper factory and
     * parent batch id parameters into the configuration under this runner's
     * scope, sets the {@link NestedGroupCentroidAssignment} parameters, and
     * then delegates to the superclass.
     *
     * @param config
     *            the Hadoop configuration to populate
     * @param runTimeProperties
     *            the run-time properties to read from
     * @return the value returned by the superclass {@code run}
     * @throws Exception
     *             propagated from the parameter setup or the superclass run
     */
    @Override
    public int run(
            final Configuration config,
            final PropertyManagement runTimeProperties )
            throws Exception {
        final int currentReducers = super.getReducerCount();
        final int requestedReducers = runTimeProperties.getPropertyAsInt(
                ClusteringParameters.Clustering.MAX_REDUCER_COUNT,
                currentReducers);
        setReducerCount(requestedReducers);

        runTimeProperties.setConfig(
                new ParameterEnum[] {
                    CentroidParameters.Centroid.EXTRACTOR_CLASS,
                    CentroidParameters.Centroid.WRAPPER_FACTORY_CLASS,
                    GlobalParameters.Global.PARENT_BATCH_ID
                },
                config,
                getScope());

        NestedGroupCentroidAssignment.setParameters(
                config,
                getScope(),
                runTimeProperties);

        return super.run(
                config,
                runTimeProperties);
    }

    /**
     * @return the display name of this job
     */
    @Override
    protected String getJobName() {
        return JOB_NAME;
    }
}