KMeansArbitraryDimension.java example

Explorer

flink-perf-master
- flink-jobs
  - src
    - main
      - java
        com
        github
        projectflink
        als
        ALSBroadcastJava.java
        ALSSampleData.java
        avro
        AvroPerf.java
        CompareJob.java
        GenerateLineitems.java
        Prepare.java
        generators
        KMeansDataGenerator.java
        Logdata.java
        Text.java
        Utils.java
        grep
        GrepJob.java
        GrepJobOptimized.java
        javaTestPlan
        AvroFileGenerator.java
        CsvOutputFormatTest.java
        LargeTestPlan.java
        pagerank
        AdaptivePageRank.java
        AdjacencyBuilder.java
        PageRankStephan.java
        streaming
        ChainingSpeed.java
        FlinkKafkaReader.java
        ForwardThroughput.java
        KafkaGenerator.java
        Latency.java
        StandaloneKafkaReader.java
        Throughput.java
        utils
        FixedPartitioning.java
        KafkaPartitioner.java
        PimpedKafkaSink.java
        TestFixedPartitioning.java
        testPlan
        KMeansArbitraryDimension.java
        KryoTest.java
        LargeTestPlan.java
        Order.java
        Readonly.java
        SequenceFileGenerator.java
        SequenceFileTest.java
        TPCHQuery3.java
        WordCountHashAgg.java
        WordCountWithoutCombine.java
        utils
        IterationParser.java
      - parking
        tpch
        generators
        core
        DistributedTPCH.java
        TPCHGeneratorSplittableIterator.java
        TpchEntityFormatter.java
        programs
        TPCHGenerator.java
        TPCHGeneratorExample.java
- hadoop-jobs
  - src
    - main
      - java
        com
        github
        projectflink
        hadoop
        GrepDriver.java
        KMeansDriver.java
        PageRankDriver.java
- perf-common
  - src
    - main
      - java
        com
        github
        projectflink
        common
        AnalyzeTool.java
        KafkaFT.java
- spark-jobs
  - src
    - main
      - java
        com
        github
        projectflink
        spark
        Grep.java
        GrepCaching.java
        KMeansArbitraryDimension.java
        MyRegistrator.java
        Readonly.java
        TPCH3Spark.java
        TPCH3Sql.java
        WordCount.java
        WordCountGrouping.java
        streaming
        ForwardThroughput.java
        Throughput.java
- storm-jobs
  - src
    - jvm
      - experiments
        ForwardThroughput.java
        KafkaThroughput.java
        Latency.java
        Throughput.java
        ThroughputHostsTracking.java
        TridentForwardThroughput.java
        TridentThroughput.java
      - storm
        starter
        WordCountTopology.java
        WordCountTopologyNode.java
        spout
        RandomSentenceSpout.java
        tools
        NthLastModifiedTimeTracker.java
        Rankable.java
        RankableObjectWithFields.java
        Rankings.java
        SlidingWindowCounter.java
        SlotBasedCounter.java
        trident
        TridentReach.java
        TridentWordCount.java
        util
        StormRunner.java
- streaming-state-machine
  - src
    - main
      - java
        newconsumer
        FlinkKafkaConsumer.java
        Tests.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.github.projectflink.testPlan;

import java.io.Serializable;
import java.util.Collection;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.operators.IterativeDataSet;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.util.StringUtils;

/**
 * This example implements a basic K-Means clustering algorithm.
 *
 * <p>
 * K-Means is an iterative clustering algorithm and works as follows:<br>
 * K-Means is given a set of data points to be clustered and an initial set of <i>K</i> cluster centers.
 * In each iteration, the algorithm computes the distance of each data point to each cluster center.
 * Each point is assigned to the cluster center which is closest to it.
 * Subsequently, each cluster center is moved to the center (<i>mean</i>) of all points that have been assigned to it.
 * The moved cluster centers are fed into the next iteration. 
 * The algorithm terminates after a fixed number of iterations (as in this implementation) 
 * or if cluster centers do not (significantly) move in an iteration.<br>
 * This is the Wikipedia entry for the <a href="http://en.wikipedia.org/wiki/K-means_clustering">K-Means Clustering algorithm</a>.
 * Input files are plain text files and must be formatted as follows:
 * <ul>
 * <li>Data points are represented as double values separated by a blank character.
 * Data points are separated by newline characters.<br>
 * For example <code>"1.2 2.3\n5.3 7.2\n"</code> gives two data points (x=1.2, y=2.3) and (x=5.3, y=7.2).
 * <li>Cluster centers are represented by an integer id and a point value.<br>
 * For example <code>"1 6.2 3.2\n2 2.9 5.7\n"</code> gives two centers (id=1, x=6.2, y=3.2) and (id=2, x=2.9, y=5.7).
 * </ul>
 *
 * <p>
 * Usage: <code>KMeans <points path> <centers path> <result path> <num iterations></code><br>
 *
 * <p>
 * This example shows how to use:
 * <ul>
 * <li>Bulk iterations
 * <li>Broadcast variables in bulk iterations
 * <li>Custom Java objects (PoJos)
 * </ul>
 */
@SuppressWarnings("serial")
public class KMeansArbitraryDimension {

	// *************************************************************************
	//     PROGRAM
	// *************************************************************************

	public static void main(String[] args) throws Exception {

		if(!parseParameters(args)) {
			return;
		}

		// set up execution environment
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// get input data
		DataSet<Point> points = env
			.readTextFile(pointsPath)
			.map(new ConvertToPoint());


		DataSet<Centroid> centroids = env
			.readTextFile(centersPath)
			.map(new ConvertToCentroid());


		// set number of bulk iterations for KMeans algorithm
		IterativeDataSet<Centroid> loop = centroids.iterate(numIterations);

		DataSet<Centroid> newCentroids = points
			// compute closest centroid for each point
			.map(new SelectNearestCenter()).withBroadcastSet(loop, "centroids")
			// count and sum point coordinates for each centroid
			.map(new CountAppender())
			.groupBy(0).reduce(new CentroidAccumulator())
			// compute new centroids from point counts and coordinate sums
			.map(new CentroidAverager());
		// feed new centroids back into next iteration
		DataSet<Centroid> finalCentroids = loop.closeWith(newCentroids);

		DataSet<Tuple2<Integer, Point>> clusteredPoints = points
			// assign points to final clusters
			.map(new SelectNearestCenter()).withBroadcastSet(finalCentroids, "centroids");

		// emit result
		//clusteredPoints.writeAsCsv(outputPath, "\n", " ", FileSystem.WriteMode.OVERWRITE);
		clusteredPoints.writeAsText(outputPath, FileSystem.WriteMode.OVERWRITE);
		// execute program
		env.setParallelism(dop);
		env.execute("KMeans Multi-Dimension");

	}

	// *************************************************************************
	//     DATA TYPES
	// *************************************************************************

	public static class Point implements Serializable {

		public double [] points;

		public Point() {}

		public Point(double [] points) {
			this.points = points;
		}

		public Point add(Point other) {
			for (int i = 0; i < points.length; i++) {
				points[i] =  points[i] + other.points[i];
			}
			return this;
		}

		public Point div(long val) {
			for (int i = 0; i < points.length; i++) {
				points[i] = points[i] / val;
			}
			return this;
		}

		public double euclideanDistance(Point other) {
			double sum = 0;
			for (int i = 0; i < points.length; i++) {
				sum = sum + (points[i] - other.points[i]) * (points[i] - other.points[i]);
			}
			return Math.sqrt(sum);
		}

		@Override
		public String toString() {
			return StringUtils.arrayToString(points);
		}
	}

	public static class Centroid extends Point {

		public int id;

		public Centroid() {}

		public Centroid(int id, double [] points) {
			super(points);
			this.id = id;
		}

		public Centroid(int id, Point p) {
			super(p.points);
			this.id = id;
		}

		@Override
		public String toString() {
			return id + "," + super.toString();
		}
	}

	// *************************************************************************
	//     USER FUNCTIONS
	// *************************************************************************


	/** Convert String value into data point **/
	public static final class ConvertToPoint implements MapFunction<String, Point> {

		@Override
		public Point map(String s) throws Exception {
			String [] line = s.split(" ");
			double [] points = new double[line.length];
			for (int i = 0; i < line.length; i++) {
				points[i] = Double.parseDouble(line[i]);
			}
			return new Point(points);
		}
	}

	/** Convert String value into data centroid **/
	public static final class ConvertToCentroid implements MapFunction<String, Centroid> {

		@Override
		public Centroid map(String s) throws Exception {
			String [] line = s.split(" ");
			int id = Integer.parseInt(line[0]);
			double [] points = new double[line.length - 1];
			for (int i = 1; i < line.length; i++) {
				points[i - 1] = Double.parseDouble(line[i]);
			}
			return new Centroid(id, points);
		}
	}

	/** Determines the closest cluster center for a data point. */
	public static final class SelectNearestCenter extends RichMapFunction<Point, Tuple2<Integer, Point>> {
		private Collection<Centroid> centroids;

		/** Reads the centroid values from a broadcast variable into a collection. */
		@Override
		public void open(Configuration parameters) throws Exception {
			this.centroids = getRuntimeContext().getBroadcastVariable("centroids");
		}

		@Override
		public Tuple2<Integer, Point> map(Point p) throws Exception {

			double minDistance = Double.MAX_VALUE;
			int closestCentroidId = -1;

			// check all cluster centers
			for (Centroid centroid : centroids) {
				// compute distance
				double distance = p.euclideanDistance(centroid);

				// update nearest cluster if necessary 
				if (distance < minDistance) {
					minDistance = distance;
					closestCentroidId = centroid.id;
				}
			}

			// emit a new record with the center id and the data point.
			return new Tuple2<Integer, Point>(closestCentroidId, p);
		}
	}

	/** Appends a count variable to the tuple. */
	public static final class CountAppender implements MapFunction<Tuple2<Integer, Point>, Tuple3<Integer, Point, Long>> {

		@Override
		public Tuple3<Integer, Point, Long> map(Tuple2<Integer, Point> t) {
			Tuple3<Integer, Point, Long> r = new Tuple3<Integer, Point, Long>(t.f0, t.f1, 1L);
			return r;
		}
	}

	/** Sums and counts point coordinates. */
	public static final class CentroidAccumulator implements ReduceFunction<Tuple3<Integer, Point, Long>> {

		@Override
		public Tuple3<Integer, Point, Long> reduce(Tuple3<Integer, Point, Long> val1, Tuple3<Integer, Point, Long> val2) {
			Tuple3<Integer, Point, Long> r = new Tuple3<Integer, Point, Long>(val1.f0, val1.f1.add(val2.f1), val1.f2 + val2.f2);
			return r;
		}
	}

	/** Computes new centroid from coordinate sum and count of points. */
	public static final class CentroidAverager implements MapFunction<Tuple3<Integer, Point, Long>, Centroid> {

		@Override
		public Centroid map(Tuple3<Integer, Point, Long> value) {
			Centroid c= new Centroid(value.f0, value.f1.div(value.f2));
			return c;
		}
	}

	// *************************************************************************
	//     UTIL METHODS
	// *************************************************************************

	private static String pointsPath = null;
	private static String centersPath = null;
	private static String outputPath = null;
	private static int numIterations = 10;
	private static int dop;

	private static boolean parseParameters(String[] programArguments) {
		// parse input arguments
		if(programArguments.length == 5) {
			pointsPath = programArguments[0];
			centersPath = programArguments[1];
			outputPath = programArguments[2];
			numIterations = Integer.parseInt(programArguments[3]);
			dop = Integer.parseInt(programArguments[4]);
		} else {
			System.err.println("Usage: KMeans <points path> <centers path> <result path> <num iterations>");
			return false;
		}
		return true;
	}

}