package mil.nga.giat.geowave.analytic.mapreduce.kmeans.runner;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import mil.nga.giat.geowave.analytic.PropertyManagement;
import mil.nga.giat.geowave.analytic.SimpleFeatureItemWrapperFactory;
import mil.nga.giat.geowave.analytic.clustering.CentroidManagerGeoWave;
import mil.nga.giat.geowave.analytic.clustering.NestedGroupCentroidAssignment;
import mil.nga.giat.geowave.analytic.distance.FeatureCentroidDistanceFn;
import mil.nga.giat.geowave.analytic.extract.SimpleFeatureCentroidExtractor;
import mil.nga.giat.geowave.analytic.extract.SimpleFeatureGeometryExtractor;
import mil.nga.giat.geowave.analytic.mapreduce.MapReduceJobController;
import mil.nga.giat.geowave.analytic.mapreduce.MapReduceJobRunner;
import mil.nga.giat.geowave.analytic.param.CentroidParameters;
import mil.nga.giat.geowave.analytic.param.ClusteringParameters;
import mil.nga.giat.geowave.analytic.param.CommonParameters;
import mil.nga.giat.geowave.analytic.param.FormatConfiguration;
import mil.nga.giat.geowave.analytic.param.GlobalParameters;
import mil.nga.giat.geowave.analytic.param.MapReduceParameters;
import mil.nga.giat.geowave.analytic.param.ParameterEnum;
import mil.nga.giat.geowave.analytic.param.SampleParameters;
import mil.nga.giat.geowave.analytic.param.StoreParameters;
import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/*
* Loop and sample multiple sets of K centers.
*
* Fulfills steps 3 through 5 of the K-means Parallel initialization (Algorithm 2), as documented in section 3.3 in
*
* Bahmani, Kumar, Moseley, Vassilvitskii and Vattani. Scalable K-means++. VLDB
* Endowment Vol. 5, No. 7. 2012.
*
* The number of iterations is assumed to be log(psi), according to the paper.
*
* As an added bonus, remove those centers that did not have a sufficient number of matches, leaving the top sampleSize/iterations.
*
*/
/**
 * Job controller that samples candidate centroids in two stages.
 * <p>
 * Stage 1 registers the initial sampler, the centroid cost updater and the
 * iteration count calculator. Stage 2 registers the rank sampler, the centroid
 * cost updater and the weak centroid stripper, and is executed once per
 * iteration. Both stages are executed through the inherited
 * {@code run(Configuration, PropertyManagement)} of
 * {@link MapReduceJobController}.
 *
 * @param <T>
 *            item type handed to the centroid stripping and iteration counting
 *            runners
 */
public class SampleMultipleSetsJobRunner<T> extends
        MapReduceJobController implements
        MapReduceJobRunner
{
    protected static final Logger LOGGER = LoggerFactory.getLogger(SampleMultipleSetsJobRunner.class);

    /** Smallest number of stage 2 passes; also the value each run starts from. */
    private static final int MIN_ITERATIONS = 1;

    private final KSamplerJobRunner initialSampleRunner = new KSamplerJobRunner();
    private final UpdateCentroidCostJobRunner updateCostRunner = new UpdateCentroidCostJobRunner();
    private final RankSamplerJobRunner jobGrowSampleRunner = new RankSamplerJobRunner();
    private final StripWeakCentroidsRunner<T> stripWeakCentroidsRunner = new StripWeakCentroidsRunner<T>();
    private final IterationCountCalculateRunner<T> iterationCountCalculateRunner = new IterationCountCalculateRunner<T>();

    /** Number of stage 2 passes for the run in progress; only ever raised by {@link #setIterations(int)}. */
    private int iterations = MIN_ITERATIONS;

    /** Zoom level written to {@code CentroidParameters.Centroid.ZOOM_LEVEL} at the start of each run. */
    private int zoomLevel = 1;

    /**
     * Creates the runner with the stage 1 job sequence registered.
     */
    public SampleMultipleSetsJobRunner() {
        stage1Setup();
    }

    /**
     * Registers the stage 1 sequence: initial sample, centroid cost update and
     * iteration count calculation, each paired with the no-op post task.
     */
    private void stage1Setup() {
        init(
                new MapReduceJobRunner[] {
                    initialSampleRunner,
                    updateCostRunner,
                    iterationCountCalculateRunner
                },
                new PostOperationTask[] {
                    DoNothingTask,
                    DoNothingTask,
                    DoNothingTask
                });
    }

    /**
     * @return the current centroid count as reported by the weak centroid
     *         stripping runner
     */
    public int getCurrentCentroidCount() {
        return stripWeakCentroidsRunner.getCurrentCentroidCount();
    }

    /**
     * Registers the stage 2 sequence: rank sample, centroid cost update and weak
     * centroid stripping. Before registering, the iteration count is raised to
     * the value produced by the iteration count calculator, if that is larger
     * than the count currently in effect.
     *
     * @param runTimeProperties
     *            properties consulted for the maximum reducer count
     */
    private void stage2Setup(
            final PropertyManagement runTimeProperties ) {
        setIterations(iterationCountCalculateRunner.getIterationsCount());
        init(
                new MapReduceJobRunner[] {
                    jobGrowSampleRunner,
                    updateCostRunner,
                    stripWeakCentroidsRunner
                },
                new PostOperationTask[] {
                    DoNothingTask,
                    DoNothingTask,
                    new PostOperationTask() {
                        @Override
                        public void runTask(
                                final Configuration config,
                                final MapReduceJobRunner runner ) {
                            // Task paired with the stripping runner: size the
                            // cost updater's reducers to the current centroid
                            // count, capped by the configured maximum
                            // (default 32).
                            updateCostRunner.setReducerCount(Math.min(
                                    stripWeakCentroidsRunner.getCurrentCentroidCount(),
                                    runTimeProperties.getPropertyAsInt(
                                            ClusteringParameters.Clustering.MAX_REDUCER_COUNT,
                                            32)));
                        }
                    }
                });
    }

    /**
     * Runs stage 1 once and then stage 2 once per iteration.
     * <p>
     * Before the jobs start, the sample size is set to the configured maximum
     * sample size, default wrapper/distance/extractor classes are stored where
     * none are configured, the sample data type and index ids are populated via
     * {@code copy} from the centroid parameters, and the zoom level is stored.
     *
     * @param config
     *            Hadoop configuration handed to each stage
     * @param runTimeProperties
     *            properties read and updated by this run
     * @return {@code 0} when every stage returned {@code 0}; otherwise the first
     *         non-zero status, after which no further stage is executed
     * @throws Exception
     *             propagated from the underlying job runs
     */
    @Override
    public int run(
            final Configuration config,
            final PropertyManagement runTimeProperties )
            throws Exception {
        // setIterations() only ever raises the counter, so without this reset a
        // reused runner would keep the largest count of any earlier run instead
        // of the count configured and calculated for this run.
        iterations = MIN_ITERATIONS;

        // run stage 1
        updateCostRunner.setReducerCount(1);

        final int minSampleSize = runTimeProperties.getPropertyAsInt(
                SampleParameters.Sample.MIN_SAMPLE_SIZE,
                2);
        final int maxSampleSize = runTimeProperties.getPropertyAsInt(
                SampleParameters.Sample.MAX_SAMPLE_SIZE,
                1000);
        stripWeakCentroidsRunner.setRange(
                minSampleSize,
                maxSampleSize);
        runTimeProperties.store(
                SampleParameters.Sample.SAMPLE_SIZE,
                maxSampleSize);

        setIterations(runTimeProperties.getPropertyAsInt(
                SampleParameters.Sample.SAMPLE_ITERATIONS,
                1));

        // Defaults apply only where the caller has not configured a class.
        runTimeProperties.storeIfEmpty(
                CentroidParameters.Centroid.WRAPPER_FACTORY_CLASS,
                SimpleFeatureItemWrapperFactory.class);
        runTimeProperties.storeIfEmpty(
                CommonParameters.Common.DISTANCE_FUNCTION_CLASS,
                FeatureCentroidDistanceFn.class);
        runTimeProperties.storeIfEmpty(
                CentroidParameters.Centroid.EXTRACTOR_CLASS,
                SimpleFeatureCentroidExtractor.class);
        runTimeProperties.storeIfEmpty(
                CommonParameters.Common.DIMENSION_EXTRACT_CLASS,
                SimpleFeatureGeometryExtractor.class);

        runTimeProperties.copy(
                CentroidParameters.Centroid.DATA_TYPE_ID,
                SampleParameters.Sample.DATA_TYPE_ID);
        runTimeProperties.copy(
                CentroidParameters.Centroid.INDEX_ID,
                SampleParameters.Sample.INDEX_ID);
        runTimeProperties.store(
                CentroidParameters.Centroid.ZOOM_LEVEL,
                zoomLevel);

        // A previous run leaves the stage 2 sequence registered, so stage 1 is
        // registered again here rather than relying on the constructor.
        stage1Setup();
        final int status1 = super.run(
                config,
                runTimeProperties);
        if (status1 != 0) {
            return status1;
        }

        stage2Setup(runTimeProperties);
        for (int i = 0; i < iterations; i++) {
            final int status2 = super.run(
                    config,
                    runTimeProperties);
            if (status2 != 0) {
                return status2;
            }
        }
        return 0;
    }

    /**
     * Collects the parameters listed by this runner together with those reported
     * by {@link MapReduceParameters}, {@link NestedGroupCentroidAssignment},
     * {@link CentroidManagerGeoWave} and the initial sample runner.
     *
     * @return a newly created set of parameters
     */
    @Override
    public Collection<ParameterEnum<?>> getParameters() {
        final Set<ParameterEnum<?>> params = new HashSet<ParameterEnum<?>>();
        params.addAll(Arrays.asList(new ParameterEnum<?>[] {
            SampleParameters.Sample.MAX_SAMPLE_SIZE,
            SampleParameters.Sample.SAMPLE_ITERATIONS,
            SampleParameters.Sample.MIN_SAMPLE_SIZE,
            CentroidParameters.Centroid.WRAPPER_FACTORY_CLASS,
            CentroidParameters.Centroid.INDEX_ID,
            CentroidParameters.Centroid.DATA_TYPE_ID,
            CentroidParameters.Centroid.DATA_NAMESPACE_URI,
            CentroidParameters.Centroid.EXTRACTOR_CLASS,
            CommonParameters.Common.DISTANCE_FUNCTION_CLASS,
            CommonParameters.Common.DIMENSION_EXTRACT_CLASS,
            StoreParameters.StoreParam.INPUT_STORE,
            GlobalParameters.Global.BATCH_ID
        }));
        params.addAll(MapReduceParameters.getParameters());
        params.addAll(NestedGroupCentroidAssignment.getParameters());
        params.addAll(CentroidManagerGeoWave.getParameters());
        params.addAll(initialSampleRunner.getParameters());
        return params;
    }

    /**
     * Forwards the input format configuration to the initial sample, cost update
     * and rank sample runners.
     *
     * @param inputFormatConfiguration
     *            configuration to forward
     */
    public void setInputFormatConfiguration(
            final FormatConfiguration inputFormatConfiguration ) {
        initialSampleRunner.setInputFormatConfiguration(inputFormatConfiguration);
        updateCostRunner.setInputFormatConfiguration(inputFormatConfiguration);
        jobGrowSampleRunner.setInputFormatConfiguration(inputFormatConfiguration);
    }

    /**
     * Raises the stage 2 iteration count to {@code iterations} when that exceeds
     * the count currently in effect; a smaller value is ignored.
     *
     * @param iterations
     *            candidate iteration count
     */
    private void setIterations(
            final int iterations ) {
        this.iterations = Math.max(
                this.iterations,
                iterations);
    }

    /**
     * Records the zoom level for subsequent runs and forwards it to the initial
     * sample and rank sample runners.
     *
     * @param zoomLevel
     *            zoom level to apply
     */
    public void setZoomLevel(
            final int zoomLevel ) {
        this.zoomLevel = zoomLevel;
        initialSampleRunner.setZoomLevel(zoomLevel);
        jobGrowSampleRunner.setZoomLevel(zoomLevel);
    }
}