/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.clustering.streaming.cluster;
import java.util.List;
import com.google.common.collect.Lists;
import org.apache.mahout.common.Pair;
import org.apache.mahout.math.Centroid;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.random.MultiNormal;
/**
 * A collection of miscellaneous utility functions for working with data to be clustered.
 * Currently provides methods for generating synthetic test data sampled around the vertices of
 * a unit hypercube.
 */
public final class DataUtils {

  /**
   * Largest supported number of dimensions. The number of hypercube vertices (2^numDimensions) is
   * held in an {@code int}, and 2^30 is the largest power of two that an {@code int} can hold.
   */
  private static final int MAX_DIMENSIONS = 30;

  /** Distribution radius used by the two-argument overload. */
  private static final double DEFAULT_DISTRIBUTION_RADIUS = 0.01;

  private DataUtils() {
    // Static utility class; not meant to be instantiated.
  }

  /**
   * Samples numDatapoints vectors of numDimensions cardinality centered around the vertices of a
   * numDimensions order hypercube. The distribution of points around these vertices is
   * multinormal with a radius of distributionRadius.
   * A hypercube of numDimensions has 2^numDimensions vertices. Keep this in mind when clustering
   * the data: one sampler and one mean are created per vertex, so memory use grows exponentially
   * with numDimensions.
   *
   * Data points are assigned to the vertices in round-robin order, i.e. data point {@code i} is
   * drawn from the sampler of vertex {@code i % 2^numDimensions}.
   *
   * Note that it is almost always the case that you want to call RandomUtils.useTestSeed() before
   * generating test data. This means that you can't generate data in the declaration of a static
   * variable because such initializations happen before any @BeforeClass or @Before setup methods
   * are called.
   *
   * @param numDimensions number of dimensions of the vectors to be generated; must be between 0
   *                      and 30 inclusive.
   * @param numDatapoints number of data points to be generated; must not be negative.
   * @param distributionRadius radius of the distribution around the hypercube vertices.
   * @return a pair of lists, whose first element is the sampled points and whose second element
   * is the list of hypercube vertices that are the means of each distribution.
   * @throws IllegalArgumentException if numDimensions is outside [0, 30] or numDatapoints is
   * negative.
   */
  public static Pair<List<Centroid>, List<Centroid>> sampleMultiNormalHypercube(
      int numDimensions, int numDatapoints, double distributionRadius) {
    // Java only uses the low five bits of an int shift distance, so without this check
    // "1 << 32" would silently evaluate to 1 and "1 << 31" (or "1 << -1") to a negative count.
    if (numDimensions < 0 || numDimensions > MAX_DIMENSIONS) {
      throw new IllegalArgumentException("numDimensions must be in [0, " + MAX_DIMENSIONS
          + "] but was " + numDimensions);
    }
    if (numDatapoints < 0) {
      throw new IllegalArgumentException(
          "numDatapoints must not be negative but was " + numDatapoints);
    }

    int numVertices = 1 << numDimensions;

    // Construct data samplers centered on the corners of a unit hypercube.
    // Additionally, keep the means of the distributions that will be generated so we can compare
    // these to the ideal cluster centers.
    List<Centroid> mean = Lists.newArrayListWithCapacity(numVertices);
    List<MultiNormal> rowSamplers = Lists.newArrayListWithCapacity(numVertices);
    for (int i = 0; i < numVertices; i++) {
      Vector v = new DenseVector(numDimensions);
      // The binary representation of i selects the vertex: coordinate j is 1 when the
      // corresponding bit of i is set and 0 otherwise. Coordinate 0 maps to the most significant
      // of the numDimensions bits.
      for (int j = 0; j < numDimensions; ++j) {
        int bitMask = 1 << (numDimensions - 1 - j);
        v.set(j, (i & bitMask) == 0 ? 0.0 : 1.0);
      }
      // The same vector instance is handed to both the mean centroid and its sampler.
      mean.add(new Centroid(i, v, 1));
      rowSamplers.add(new MultiNormal(distributionRadius, v));
    }

    // Sample the requested number of data points, cycling through the vertices.
    List<Centroid> data = Lists.newArrayListWithCapacity(numDatapoints);
    for (int i = 0; i < numDatapoints; ++i) {
      data.add(new Centroid(i, rowSamplers.get(i % numVertices).sample(), 1));
    }
    return new Pair<List<Centroid>, List<Centroid>>(data, mean);
  }

  /**
   * Calls {@link #sampleMultiNormalHypercube(int, int, double)} with a distribution radius of
   * 0.01.
   *
   * @param numDimensions number of dimensions of the vectors to be generated; must be between 0
   *                      and 30 inclusive.
   * @param numDatapoints number of data points to be generated; must not be negative.
   * @return a pair of lists, whose first element is the sampled points and whose second element
   * is the list of hypercube vertices that are the means of each distribution.
   * @throws IllegalArgumentException if numDimensions is outside [0, 30] or numDatapoints is
   * negative.
   * @see DataUtils#sampleMultiNormalHypercube(int, int, double)
   */
  public static Pair<List<Centroid>, List<Centroid>> sampleMultiNormalHypercube(int numDimensions,
                                                                                int numDatapoints) {
    return sampleMultiNormalHypercube(numDimensions, numDatapoints, DEFAULT_DISTRIBUTION_RADIUS);
  }
}