/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier.df.data;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Random;

import com.google.common.base.Charsets;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.classifier.df.data.Dataset.Attribute;

/**
 * Helper methods used by the tests: random descriptor/data generation, conversion of numeric
 * rows to comma-separated text, and writing fixtures under the {@code testdata} directory.
 */
public final class Utils {

  /** Used when generating random CATEGORICAL values */
  private static final int CATEGORICAL_RANGE = 100;

  private Utils() {
  }

  /**
   * Generates a random list of tokens.
   * <ul>
   * <li>a token is IGNORED ('I') when the random draw is below 0.1</li>
   * <li>a token is CATEGORICAL ('C') when the random draw is at least 0.5</li>
   * <li>a token is NUMERICAL ('N') otherwise</li>
   * <li>one randomly chosen token is then overwritten with the LABEL ('L')</li>
   * </ul>
   *
   * @param rng Random number generator
   * @param nbTokens number of tokens to generate; must be at least 1 because the label position
   *        is drawn with {@code rng.nextInt(nbTokens)}
   * @return the generated tokens, exactly one of which is 'L'
   */
  public static char[] randomTokens(Random rng, int nbTokens) {
    char[] result = new char[nbTokens];

    for (int token = 0; token < nbTokens; token++) {
      double rand = rng.nextDouble();
      if (rand < 0.1) {
        result[token] = 'I'; // IGNORED
      } else if (rand >= 0.5) {
        result[token] = 'C'; // CATEGORICAL
      } else {
        result[token] = 'N'; // NUMERICAL
      }
    }

    // choose the label
    result[rng.nextInt(nbTokens)] = 'L';

    return result;
  }

  /**
   * Generates a space-separated String that contains all the tokens. Every token, including the
   * last one, is followed by a single space.
   *
   * @param tokens tokens to concatenate
   * @return the descriptor string
   */
  public static String generateDescriptor(char[] tokens) {
    StringBuilder builder = new StringBuilder();

    for (char token : tokens) {
      builder.append(token).append(' ');
    }

    return builder.toString();
  }

  /**
   * Generates a random descriptor by formatting the output of
   * {@link #randomTokens(Random, int)} with {@link #generateDescriptor(char[])}.
   *
   * @param rng Random number generator
   * @param nbAttributes number of attributes (tokens) in the descriptor
   * @return the descriptor string
   */
  public static String randomDescriptor(Random rng, int nbAttributes) {
    return generateDescriptor(randomTokens(rng, nbAttributes));
  }

  /**
   * Generates random data for a freshly generated random descriptor.
   *
   * @param rng Random number generator
   * @param nbAttributes number of attributes
   * @param regression true if the label is numerical
   * @param number number of data lines to generate
   * @return {@code number} random vectors
   * @throws DescriptorException if the generated descriptor cannot be parsed
   */
  public static double[][] randomDoubles(Random rng, int nbAttributes, boolean regression, int number)
    throws DescriptorException {
    String descriptor = randomDescriptor(rng, nbAttributes);
    // same steps as the descriptor-based overload, so delegate instead of duplicating them
    return randomDoubles(rng, descriptor, regression, number);
  }

  /**
   * Generates random data based on the given descriptor.
   *
   * @param rng Random number generator
   * @param descriptor attributes description
   * @param regression true if the label is numerical
   * @param number number of data lines to generate
   * @return {@code number} random vectors, one value per attribute of the descriptor
   * @throws DescriptorException if the descriptor cannot be parsed
   */
  public static double[][] randomDoubles(Random rng, CharSequence descriptor, boolean regression, int number)
    throws DescriptorException {
    Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);

    double[][] data = new double[number][];
    for (int index = 0; index < number; index++) {
      data[index] = randomVector(rng, attrs, regression);
    }

    return data;
  }

  /**
   * Generates random data: a random descriptor, random vectors for it, and the {@link Data}
   * loaded from their string form.
   *
   * @param rng Random number generator
   * @param nbAttributes number of attributes
   * @param regression true if the label should be numerical
   * @param size data size
   * @return the loaded data
   * @throws DescriptorException if the generated descriptor cannot be parsed
   */
  public static Data randomData(Random rng, int nbAttributes, boolean regression, int size)
    throws DescriptorException {
    String descriptor = randomDescriptor(rng, nbAttributes);
    double[][] source = randomDoubles(rng, descriptor, regression, size);
    String[] sData = double2String(source);
    Dataset dataset = DataLoader.generateDataset(descriptor, regression, sData);

    return DataLoader.loadData(dataset, sData);
  }

  /**
   * Generates a random vector based on the given attributes.<br>
   * The attributes' values are generated as follows:<br>
   * <ul>
   * <li>each IGNORED attribute receives a Double.NaN</li>
   * <li>each NUMERICAL attribute receives a random double</li>
   * <li>each CATEGORICAL attribute receives a random integer in the range
   * [0, CATEGORICAL_RANGE[</li>
   * <li>any other attribute (the LABEL) receives a random double when {@code regression} is
   * true, a random integer in the range [0, CATEGORICAL_RANGE[ otherwise</li>
   * </ul>
   *
   * @param rng Random number generator
   * @param attrs attributes description
   * @param regression true if the label is numerical
   * @return one value per attribute
   */
  private static double[] randomVector(Random rng, Attribute[] attrs, boolean regression) {
    double[] vector = new double[attrs.length];

    for (int attr = 0; attr < attrs.length; attr++) {
      Attribute attribute = attrs[attr];
      if (attribute.isIgnored()) {
        vector[attr] = Double.NaN;
      } else if (attribute.isNumerical()) {
        vector[attr] = rng.nextDouble();
      } else if (attribute.isCategorical()) {
        vector[attr] = rng.nextInt(CATEGORICAL_RANGE);
      } else if (regression) {
        // LABEL of a regression problem
        vector[attr] = rng.nextDouble();
      } else {
        // LABEL of a classification problem
        vector[attr] = rng.nextInt(CATEGORICAL_RANGE);
      }
    }

    return vector;
  }

  /**
   * Converts a double array to a comma-separated string. Every value, including the last one,
   * is followed by a comma.
   *
   * @param v double array
   * @return comma-separated string
   */
  private static String double2String(double[] v) {
    StringBuilder builder = new StringBuilder();

    for (double value : v) {
      builder.append(value).append(',');
    }

    return builder.toString();
  }

  /**
   * Converts an array of double arrays to an array of comma-separated strings.
   *
   * @param source array of double arrays
   * @return array of comma-separated strings, one per input row
   */
  public static String[] double2String(double[][] source) {
    String[] output = new String[source.length];

    for (int index = 0; index < source.length; index++) {
      output[index] = double2String(source[index]);
    }

    return output;
  }

  /**
   * Generates random data with the same label value on every line.
   *
   * @param rng Random number generator
   * @param descriptor attributes description; expected to contain a LABEL attribute
   * @param regression true if the label is numerical
   * @param number data size
   * @param value label value
   * @return {@code number} random vectors whose label column is set to {@code value}
   * @throws DescriptorException if the descriptor cannot be parsed
   */
  public static double[][] randomDoublesWithSameLabel(Random rng,
                                                      CharSequence descriptor,
                                                      boolean regression,
                                                      int number,
                                                      int value) throws DescriptorException {
    int label = findLabel(descriptor);

    double[][] source = randomDoubles(rng, descriptor, regression, number);

    for (int index = 0; index < number; index++) {
      source[index][label] = value;
    }

    return source;
  }

  /**
   * Finds the label attribute's index.
   *
   * @param descriptor attributes description
   * @return the position of {@code Attribute.LABEL} among the parsed attributes, as reported by
   *         {@code ArrayUtils.indexOf}
   * @throws DescriptorException if the descriptor cannot be parsed
   */
  public static int findLabel(CharSequence descriptor) throws DescriptorException {
    Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);
    return ArrayUtils.indexOf(attrs, Attribute.LABEL);
  }

  /**
   * Writes every string of {@code sData} to the file at {@code path}, UTF-8 encoded, each one
   * followed by a newline character.
   *
   * @param sData lines to write
   * @param path destination; converted with {@code path.toString()} to a {@link File}
   * @throws IOException if writing, or closing the writer after a successful write, fails
   */
  private static void writeDataToFile(String[] sData, Path path) throws IOException {
    BufferedWriter output = Files.newWriter(new File(path.toString()), Charsets.UTF_8);
    // Closing a buffered writer flushes it: a failure there means the file is incomplete, so it
    // must not be swallowed unless another exception is already propagating.
    boolean threw = true;
    try {
      for (String line : sData) {
        output.write(line);
        output.write('\n');
      }
      threw = false;
    } finally {
      Closeables.close(output, threw);
    }
  }

  /**
   * Writes the data lines to {@code testdata/Data/DataLoaderTest.data}, creating the directory
   * if it does not exist.
   *
   * @param sData lines to write
   * @return path of the written file
   * @throws IOException if the directory cannot be created or the file cannot be written
   */
  public static Path writeDataToTestFile(String[] sData) throws IOException {
    Path testData = new Path("testdata/Data");
    FileSystem fs = testData.getFileSystem(new Configuration());
    if (!fs.exists(testData)) {
      fs.mkdirs(testData);
    }

    Path path = new Path(testData, "DataLoaderTest.data");

    writeDataToFile(sData, path);

    return path;
  }

  /**
   * Writes the dataset to {@code testdata/Dataset/dataset.info}, creating the directory if it
   * does not exist.
   *
   * @param dataset dataset to serialize with {@code dataset.write(out)}
   * @return path of the written file
   * @throws IOException if the directory cannot be created, or if writing or closing the stream
   *         after a successful write fails
   */
  public static Path writeDatasetToTestFile(Dataset dataset) throws IOException {
    Path testData = new Path("testdata/Dataset");
    FileSystem fs = testData.getFileSystem(new Configuration());
    if (!fs.exists(testData)) {
      fs.mkdirs(testData);
    }

    Path datasetPath = new Path(testData, "dataset.info");
    FSDataOutputStream out = fs.create(datasetPath);

    // Only swallow a close failure when the write itself already failed.
    boolean threw = true;
    try {
      dataset.write(out);
      threw = false;
    } finally {
      Closeables.close(out, threw);
    }

    return datasetPath;
  }

  /**
   * Splits the data into numMaps splits. Every split holds
   * {@code sData.length / numMaps} consecutive lines, except the last one which also receives
   * the remaining lines.
   *
   * @param sData lines to split
   * @param numMaps number of splits; must be positive
   * @return the splits, in the original line order
   * @throws IllegalArgumentException if {@code numMaps} is not positive
   */
  public static String[][] splitData(String[] sData, int numMaps) {
    if (numMaps <= 0) {
      throw new IllegalArgumentException("numMaps must be positive, got " + numMaps);
    }

    int nbInstances = sData.length;
    int partitionSize = nbInstances / numMaps;

    String[][] splits = new String[numMaps][];

    for (int partition = 0; partition < numMaps; partition++) {
      int from = partition * partitionSize;
      // the last partition absorbs the remainder of the integer division
      int to = partition == (numMaps - 1) ? nbInstances : (partition + 1) * partitionSize;

      splits[partition] = Arrays.copyOfRange(sData, from, to);
    }

    return splits;
  }
}