/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.test.recordJobs.kmeans.udfs;
import java.io.Serializable;
import java.util.Iterator;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFields;
import eu.stratosphere.api.java.record.functions.ReduceFunction;
import eu.stratosphere.api.java.record.operators.ReduceOperator.Combinable;
import eu.stratosphere.types.DoubleValue;
import eu.stratosphere.types.IntValue;
import eu.stratosphere.types.Record;
import eu.stratosphere.util.Collector;
/**
* Reduce PACT determines the closes cluster center for a data point. This
* is a minimum aggregation. Hence, a Combiner can be easily implemented.
*/
@Combinable
@ConstantFields(1)
public class FindNearestCenter extends ReduceFunction implements Serializable {
private static final long serialVersionUID = 1L;
private final IntValue centerId = new IntValue();
private final CoordVector position = new CoordVector();
private final IntValue one = new IntValue(1);
private final Record result = new Record(3);
/**
* Computes a minimum aggregation on the distance of a data point to
* cluster centers.
*
* Output Format:
* 0: centerID
* 1: pointVector
* 2: constant(1) (to enable combinable average computation in the following reducer)
*/
@Override
public void reduce(Iterator<Record> pointsWithDistance, Collector<Record> out) {
double nearestDistance = Double.MAX_VALUE;
int nearestClusterId = 0;
// check all cluster centers
while (pointsWithDistance.hasNext()) {
Record res = pointsWithDistance.next();
double distance = res.getField(3, DoubleValue.class).getValue();
// compare distances
if (distance < nearestDistance) {
// if distance is smaller than smallest till now, update nearest cluster
nearestDistance = distance;
nearestClusterId = res.getField(2, IntValue.class).getValue();
res.getFieldInto(1, this.position);
}
}
// emit a new record with the center id and the data point. add a one to ease the
// implementation of the average function with a combiner
this.centerId.setValue(nearestClusterId);
this.result.setField(0, this.centerId);
this.result.setField(1, this.position);
this.result.setField(2, this.one);
out.collect(this.result);
}
// ----------------------------------------------------------------------------------------
private final Record nearest = new Record();
/**
* Computes a minimum aggregation on the distance of a data point to
* cluster centers.
*/
@Override
public void combine(Iterator<Record> pointsWithDistance, Collector<Record> out) {
double nearestDistance = Double.MAX_VALUE;
// check all cluster centers
while (pointsWithDistance.hasNext()) {
Record res = pointsWithDistance.next();
double distance = res.getField(3, DoubleValue.class).getValue();
// compare distances
if (distance < nearestDistance) {
nearestDistance = distance;
res.copyTo(this.nearest);
}
}
// emit nearest one
out.collect(this.nearest);
}
}