/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/

package eu.stratosphere.test.recordJobs.kmeans.udfs;

import java.io.Serializable;
import java.util.Iterator;

import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFields;
import eu.stratosphere.api.java.record.functions.ReduceFunction;
import eu.stratosphere.api.java.record.operators.ReduceOperator.Combinable;
import eu.stratosphere.types.IntValue;
import eu.stratosphere.types.Record;
import eu.stratosphere.util.Collector;

/**
 * Reduce PACT computes the new position (coordinate vector) of a cluster
 * center. This is an average computation. Hence, Combinable is annotated
 * and the combine method implemented.
 *
 * Input records are read as:
 * 1: coordinate vector (a {@link CoordVector}), either a single point or a partial sum
 * 2: number of points that vector stands for (an {@link IntValue})
 *
 * Output Format:
 * 0: clusterID
 * 1: clusterVector
 */
@Combinable
@ConstantFields(0)
public class RecomputeClusterCenter extends ReduceFunction implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Reusable holder for the partial count that {@link #combine} writes to field 2. */
    private final IntValue count = new IntValue();

    /**
     * Compute the new position (coordinate vector) of a cluster center.
     *
     * All coordinate vectors of the group are summed up and divided by the
     * total count. The last record read from the iterator is reused as the
     * output record: field 1 is overwritten with the averaged vector, field 2
     * is set to null and field 0 is left untouched.
     *
     * @param dataPoints
     *        The records of one group. If the iterator is empty, nothing is emitted.
     * @param out
     *        The collector that receives the single result record.
     */
    @Override
    public void reduce(Iterator<Record> dataPoints, Collector<Record> out) {
        PartialSum partial = sumUp(dataPoints);
        if (partial == null) {
            // no input records, hence no cluster center to emit
            return;
        }

        // compute new coordinate vector (position) of cluster center;
        // the division is carried out in double precision because the sum is a double[]
        double[] center = partial.coordinateSum;
        for (int i = 0; i < center.length; i++) {
            center[i] /= partial.count;
        }

        CoordVector coordinates = new CoordVector();
        coordinates.setCoordinates(center);

        Record result = partial.lastRecord;
        result.setField(1, coordinates);
        result.setNull(2);

        // emit new position of cluster center
        out.collect(result);
    }

    /**
     * Computes a pre-aggregated average value of a coordinate vector.
     *
     * Unlike {@link #reduce}, no division takes place: the emitted record
     * carries the coordinate-wise sum in field 1 and the summed count in
     * field 2, so that it can be aggregated further. The last record read
     * from the iterator is reused as the output record.
     *
     * @param dataPoints
     *        The records to pre-aggregate. If the iterator is empty, nothing is emitted.
     * @param out
     *        The collector that receives the single partial result record.
     */
    @Override
    public void combine(Iterator<Record> dataPoints, Collector<Record> out) {
        PartialSum partial = sumUp(dataPoints);
        if (partial == null) {
            // no input records, hence no partial aggregate to emit
            return;
        }

        CoordVector coordinates = new CoordVector();
        coordinates.setCoordinates(partial.coordinateSum);
        this.count.setValue(partial.count);

        Record result = partial.lastRecord;
        result.setField(1, coordinates);
        result.setField(2, this.count);

        // emit partial sum and partial count for average computation
        out.collect(result);
    }

    /**
     * Sums up the coordinate vectors (field 1) and counts (field 2) of all
     * given records. Shared by {@link #reduce} and {@link #combine}.
     *
     * The sum always starts from a fresh zero-filled array sized after the
     * first record's vector, so the result does not depend on the initial
     * state of any {@link CoordVector} instance.
     *
     * @param dataPoints
     *        The records to aggregate.
     * @return The aggregate together with the last record read, or
     *         {@code null} if the iterator had no elements.
     * @throws IllegalArgumentException
     *         If the coordinate vectors of the records differ in length.
     */
    private PartialSum sumUp(Iterator<Record> dataPoints) {
        if (!dataPoints.hasNext()) {
            return null;
        }

        PartialSum partial = new PartialSum();
        while (dataPoints.hasNext()) {
            Record next = dataPoints.next();
            partial.lastRecord = next;

            // get the coordinates and the count from the record
            double[] thisCoords = next.getField(1, CoordVector.class).getCoordinates();
            int thisCount = next.getField(2, IntValue.class).getValue();

            if (partial.coordinateSum == null) {
                // the first record determines the dimensionality of the sum
                partial.coordinateSum = new double[thisCoords.length];
            }

            addToCoordVector(partial.coordinateSum, thisCoords);
            partial.count += thisCount;
        }
        return partial;
    }

    /**
     * Adds two coordinate vectors by summing up each of their coordinates.
     *
     * @param cvToAddTo
     *        The coordinate vector to which the other vector is added.
     *        This vector is modified in place.
     * @param cvToBeAdded
     *        The coordinate vector which is added to the other vector.
     *        This vector is not modified.
     * @throws IllegalArgumentException
     *         If the two vectors are not of equal length.
     */
    private void addToCoordVector(double[] cvToAddTo, double[] cvToBeAdded) {
        // check if both vectors have same length
        if (cvToAddTo.length != cvToBeAdded.length) {
            throw new IllegalArgumentException("The given coordinate vectors are not of equal length.");
        }

        // sum coordinate vectors coordinate-wise
        for (int i = 0; i < cvToAddTo.length; i++) {
            cvToAddTo[i] += cvToBeAdded[i];
        }
    }

    /**
     * Mutable result of one aggregation pass: the coordinate-wise sum, the
     * summed count and the last record that was read (reused for output).
     */
    private static final class PartialSum {
        private double[] coordinateSum;
        private int count;
        private Record lastRecord;
    }
}