/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.ga.watchmaker.cd.tool; import com.google.common.collect.Lists; import com.google.common.io.Closeables; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.mahout.common.HadoopUtil; import org.apache.mahout.common.RandomUtils; import org.apache.commons.lang.ArrayUtils; import org.apache.mahout.examples.MahoutTestCase; import org.junit.Before; import org.junit.Test; import java.io.BufferedWriter; import java.io.IOException; import java.io.OutputStreamWriter; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Random; public final class CDInfosToolTest extends MahoutTestCase { /** max number of distinct values for any nominal attribute */ private static final int MAX_NOMINAL_VALUES = 50; private Random rng; @Override @Before public void setUp() throws Exception { super.setUp(); rng = RandomUtils.getRandom(); } private Descriptors randomDescriptors(int nbattributes, double numRate, double catRate) { char[] descriptors = new char[nbattributes]; for (int index = 0; index < nbattributes; index++) { double rnd = rng.nextDouble(); if (rnd < numRate) { // numerical attribute descriptors[index] = 'N'; } else if (rnd < (numRate + catRate)) { // categorical attribute descriptors[index] = 'C'; } else { // ignored attribute descriptors[index] = 'I'; } } return new Descriptors(descriptors); } /** * generate random descriptions given the attibutes descriptors.<br> - * numerical attributes: generate random min and max values<br> - nominal * attributes: generate a random list of values */ private Object[][] randomDescriptions(Descriptors descriptors) { int nbattrs = descriptors.size(); Object[][] descriptions = new Object[nbattrs][]; for (int index = 0; index < nbattrs; index++) { if (descriptors.isNumerical(index)) { // numerical attribute // srowen: I 'fixed' this to not use Double.{MAX,MIN}_VALUE since // it does not seem like that has the desired effect double min = rng.nextDouble() * ((long) Integer.MAX_VALUE - Integer.MIN_VALUE) + Integer.MIN_VALUE; double max = rng.nextDouble() * (Integer.MAX_VALUE - min) + min; descriptions[index] = new Double[] { min, max }; } else if (descriptors.isNominal(index)) { // categorical attribute int nbvalues = rng.nextInt(MAX_NOMINAL_VALUES) + 1; descriptions[index] = new Object[nbvalues]; for (int vindex = 0; vindex < nbvalues; vindex++) { descriptions[index][vindex] = "val_" + index + '_' + vindex; } } } return descriptions; } private void randomDataset(FileSystem fs, Path input, Descriptors descriptors, Object[][] descriptions) throws IOException { boolean[][] appeared = new boolean[descriptions.length][]; for (int desc = 0; desc < descriptors.size(); desc++) { // appeared is used only by nominal attributes if (descriptors.isNominal(desc)) { appeared[desc] = new boolean[descriptions[desc].length]; } } int nbfiles = rng.nextInt(20) + 1; for (int floop = 0; floop < nbfiles; floop++) { FSDataOutputStream out = fs.create(new Path(input, "file." + floop)); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out)); try { // make sure we have enough room to allow all nominal values to appear in the data int nblines = rng.nextInt(200) + MAX_NOMINAL_VALUES; for (int line = 0; line < nblines; line++) { writer.write(randomLine(descriptors, descriptions, appeared)); writer.newLine(); } } finally { Closeables.closeQuietly(writer); } } } /** * generates a random line using the given information * * @param descriptors attributes descriptions * @param descriptions detailed attributes descriptions:<br> - min and max * values for numerical attributes<br> - all distinct values for * nominal attributes * @param appeared used to make sure that each nominal attribute's value * appears at least once in the dataset */ private String randomLine(Descriptors descriptors, Object[][] descriptions, boolean[][] appeared) { StringBuilder buffer = new StringBuilder(); for (int index = 0; index < descriptors.size(); index++) { if (descriptors.isNumerical(index)) { // numerical attribute double min = (Double) descriptions[index][0]; double max = (Double) descriptions[index][1]; double value = rng.nextDouble() * (max - min) + min; buffer.append(value); } else if (descriptors.isNominal(index)) { // categorical attribute int nbvalues = descriptions[index].length; // chose a random value int vindex; if (ArrayUtils.contains(appeared[index], false)) { // if some values never appeared in the dataset, start with them do { vindex = rng.nextInt(nbvalues); } while (appeared[index][vindex]); } else { // chose any value vindex = rng.nextInt(nbvalues); } buffer.append(descriptions[index][vindex]); appeared[index][vindex] = true; } else { // ignored attribute (any value is correct) buffer.append('I'); } if (index < descriptors.size() - 1) { buffer.append(','); } } return buffer.toString(); } private static int nbNonIgnored(Descriptors descriptors) { int nbattrs = 0; for (int index = 0; index < descriptors.size(); index++) { if (!descriptors.isIgnored(index)) { nbattrs++; } } return nbattrs; } @Test public void testGatherInfos() throws Exception { int n = 1; // put a greater value when you search for some nasty bug for (int nloop = 0; nloop < n; nloop++) { int maxattr = 100; // max number of attributes int nbattrs = rng.nextInt(maxattr) + 1; // random descriptors double numRate = rng.nextDouble(); double catRate = rng.nextDouble() * (1.0 - numRate); Descriptors descriptors = randomDescriptors(nbattrs, numRate, catRate); // random descriptions Object[][] descriptions = randomDescriptions(descriptors); // random dataset Path inpath = getTestTempDirPath("input"); Path output = getTestTempDirPath("output"); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(inpath.toUri(), conf); HadoopUtil.delete(conf, inpath); randomDataset(fs, inpath, descriptors, descriptions); // Start the tool List<String> result = Lists.newArrayList(); fs.delete(output, true); // It's unhappy if this directory exists CDInfosTool.gatherInfos(descriptors, inpath, output, result); // check the results Collection<String> target = Lists.newArrayList(); assertEquals(nbNonIgnored(descriptors), result.size()); int rindex = 0; for (int index = 0; index < nbattrs; index++) { if (descriptors.isIgnored(index)) { continue; } String description = result.get(rindex++); if (descriptors.isNumerical(index)) { // numerical attribute double min = (Double) descriptions[index][0]; double max = (Double) descriptions[index][1]; double[] range = DescriptionUtils.extractNumericalRange(description); assertTrue("bad min value for attribute (" + index + ')', min <= range[0]); assertTrue("bad max value for attribute (" + index + ')', max >= range[1]); } else if (descriptors.isNominal(index)) { // categorical attribute Object[] values = descriptions[index]; target.clear(); DescriptionUtils.extractNominalValues(description, target); assertEquals(values.length, target.size()); assertTrue(target.containsAll(Arrays.asList(values))); } } } } }