package skywriting.examples.skyhout.kmeans;

import java.io.EOFException;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.io.serializer.WritableSerialization;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.mahout.clustering.kmeans.Cluster;
import org.apache.mahout.clustering.kmeans.KMeansInfo;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;

import com.asgow.ciel.executor.Ciel;
import com.asgow.ciel.references.Reference;
import com.asgow.ciel.references.WritableReference;
import com.asgow.ciel.tasks.FirstClassJavaTask;

import skywriting.examples.skyhout.common.SkywritingTaskFileSystem;
import skywriting.examples.skyhout.common.SortedPartitionedOutputCollector;

import uk.co.mrry.mercator.task.JarTaskLoader;
import uk.co.mrry.mercator.task.Task;

/**
 * Reduce step of an iterative k-means job on the Ciel/Skywriting runtime.
 *
 * <p>Merges the per-partition partial sums produced by {@code KMeansMapTask}
 * with the previous iteration's cluster centres, writes the updated centres to
 * a new "clusters" object, and — unless every cluster has converged or the
 * iteration cap is reached — spawns the next round of map tasks plus a
 * tail-recursive successor of itself.</p>
 */
public class KMeansReduceTask implements FirstClassJavaTask {

	// Hard cap on k-means rounds; the original hard-coded 10 inline.
	private static final int MAX_ITERATIONS = 10;

	private DistanceMeasure measure;
	private final Reference[] dataPartitionsRefs;
	private final Reference[] partialSumsRefs;
	private final Reference oldClustersRef;
	private final int iteration;
	private final double convergenceDelta;
	private final boolean doCache;

	/**
	 * @param dataPartitionRefs references to the raw input partitions (used to
	 *        spawn the next iteration's map tasks)
	 * @param partialSumsRefs   one partial-sums reference per map task of the
	 *        current iteration
	 * @param oldClustersRef    cluster centres from the previous iteration
	 * @param iteration         1-based index of the current iteration
	 * @param convergenceDelta  per-cluster convergence threshold
	 * @param doCache           whether map tasks should cache their input
	 */
	public KMeansReduceTask(Reference[] dataPartitionRefs, Reference[] partialSumsRefs,
			Reference oldClustersRef, int iteration, double convergenceDelta, boolean doCache) {
		this.dataPartitionsRefs = dataPartitionRefs;
		this.partialSumsRefs = partialSumsRefs;
		this.oldClustersRef = oldClustersRef;
		this.iteration = iteration;
		this.convergenceDelta = convergenceDelta;
		this.doCache = doCache;
	}

	@Override
	public void invoke() throws Exception {
		Configuration conf = new Configuration();
		conf.setClassLoader(Ciel.CLASSLOADER);
		conf.setClass("io.serializations", WritableSerialization.class, Serialization.class);
		// FIX: removed dead statement `new WritableSerialization();` — the
		// instance was created and immediately discarded.

		// Inputs: one stream per partial-sums ref, plus the old clusters last.
		FileInputStream[] fis = new FileInputStream[this.partialSumsRefs.length + 1];
		for (int i = 0; i < this.partialSumsRefs.length; ++i) {
			fis[i] = new FileInputStream(Ciel.RPC.getFilenameForReference(this.partialSumsRefs[i]));
		}
		fis[fis.length - 1] = new FileInputStream(Ciel.RPC.getFilenameForReference(this.oldClustersRef));

		// Single output: the updated cluster centres.
		WritableReference clustersOut = Ciel.RPC.getNewObjectFilename("clusters");
		OutputStream[] fos = new OutputStream[] { clustersOut.open() };

		SkywritingTaskFileSystem fs = new SkywritingTaskFileSystem(fis, fos, conf);
		this.measure = new SquaredEuclideanDistanceMeasure();

		// FIX: the original asserted numInputs() == 2 and numOutputs() == 2.
		// numInputs() is only 2 for a single partial-sums ref, and there is
		// exactly one output stream, so numOutputs() == 2 would always fail
		// under -ea. Assert the actual invariants instead.
		assert fs.numInputs() == this.partialSumsRefs.length + 1;
		assert fs.numOutputs() == 1;

		// Load the previous iteration's clusters, keyed by cluster identifier.
		HashMap<String, Cluster> oldClusterMap = new HashMap<String, Cluster>();
		SequenceFile.Reader oldClusterReader =
				new SequenceFile.Reader(fs, new Path("/in/" + (fs.numInputs() - 1)), conf);
		try {
			while (true) {
				Text id = new Text();
				Cluster curr = new Cluster();
				try {
					if (!oldClusterReader.next(id, curr)) {
						break;
					}
				} catch (EOFException eofe) {
					// Truncated stream marks end-of-data in this pipeline.
					break;
				}
				oldClusterMap.put(curr.getIdentifier(), curr);
			}
		} finally {
			// FIX: close even if reading throws (original closed only on success).
			oldClusterReader.close();
		}

		KMeansReducerCombiner kmrc =
				new KMeansReducerCombiner(oldClusterMap, measure, convergenceDelta);
		SortedPartitionedOutputCollector<Text, KMeansInfo, KMeansInfo, Cluster> inputCollector =
				new SortedPartitionedOutputCollector<Text, KMeansInfo, KMeansInfo, Cluster>(
						fs, new HashPartitioner<Text, KMeansInfo>(), kmrc,
						Text.class, Cluster.class, 1);

		// Feed every partial-sums input (all streams except the last, which is
		// the old clusters) through the combining collector.
		KMeansInfo currentReduceValue = new KMeansInfo();
		for (int i = 0; i < fis.length - 1; ++i) {
			SequenceFile.Reader reduceInputReader =
					new SequenceFile.Reader(fs, new Path("/in/" + i), conf);
			try {
				while (true) {
					Text currentReduceKey = new Text();
					try {
						if (!reduceInputReader.next(currentReduceKey, currentReduceValue)) {
							break;
						}
					} catch (EOFException eofe) {
						break;
					}
					inputCollector.collect(currentReduceKey, currentReduceValue);
				}
			} finally {
				// FIX: the original never closed this reader (resource leak).
				reduceInputReader.close();
			}
		}
		inputCollector.close();

		if (!kmrc.areAllConverged() && this.iteration <= MAX_ITERATIONS) {
			// Not converged: spawn one map task per data partition against the
			// new centres, then tail-spawn the next reduce iteration.
			Reference newClustersRef = clustersOut.getCompletedRef();
			Reference[] newPartialSumsRefs = new Reference[this.dataPartitionsRefs.length];
			for (int i = 0; i < newPartialSumsRefs.length; ++i) {
				System.out.println("dpr[i]" + this.dataPartitionsRefs[i]);
				System.out.println("ncr " + newClustersRef);
				newPartialSumsRefs[i] = Ciel.spawn(
						new KMeansMapTask(this.dataPartitionsRefs[i], newClustersRef, this.doCache),
						null, 1)[0];
			}
			Ciel.tailSpawn(
					new KMeansReduceTask(this.dataPartitionsRefs, newPartialSumsRefs,
							newClustersRef, this.iteration + 1, this.convergenceDelta, this.doCache),
					null);
		} else {
			Ciel.returnPlainString("Finished!");
		}
	}

	/**
	 * Dependencies are all partial-sums references followed by the previous
	 * iteration's cluster reference — the same order {@link #invoke()} opens them.
	 */
	@Override
	public Reference[] getDependencies() {
		Reference[] ret = new Reference[this.partialSumsRefs.length + 1];
		for (int i = 0; i < this.partialSumsRefs.length; ++i) {
			ret[i] = this.partialSumsRefs[i];
		}
		ret[ret.length - 1] = this.oldClustersRef;
		return ret;
	}

	@Override
	public void setup() {
		// No per-task setup required.
	}
}