/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.learner.tree;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import com.rapidminer.core.concurrency.ConcurrencyContext;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Statistics;
import com.rapidminer.example.table.NominalMapping;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.studio.internal.Resources;
/**
* This is a fast, read-only representation of an {@link ExampleSet} with emphasis on reading the
* values by attribute. It is useful for algorithms that iterate over (subsets of) the example set
* multiple times. It is not useful for algorithms that consist of only one iteration over the
* example set because of the cost for creating this representation.
*
* A {@link ColumnExampleTable} should be viewed as a table consisting of the columns representing
* all nominal attributes followed by the columns representing all numerical attributes. To consider
* only a subset of the attributes, a selection can be represented by the numbers of the columns
* that are selected. Analogously, if only a subset of the examples (rows) should be considered, a
* selection can be represented by the numbers of the selected rows.
*
* The {@link ExampleSet} is stored inside a table of arrays. The values at numerical attributes are
* stored as double values while the values at nominal attributes are stored as byte values coming
* from their {@link NominalMapping} or, if they are missing values, as the size of the mapping. The
* label must not have missing values.
*
* @author Gisa Schaefer
*
*/
public class ColumnExampleTable {

    /** Number of rows between checking for stop */
    private static final int CHECK_FOR_STOP_INTERVAL = 1000;

    /**
     * the table creation is done in parallel if the product of the number of attributes and the
     * number of examples is greater than this number
     */
    private static final int THRESHOLD_PRODUCT_PARALLEL = 3_000_000;

    /** If a nominal attribute has more than this number of different values, it is ignored. */
    private static final int MAXIMAL_NOMINAL_VALUES = 127;

    /** the number of rows of the table */
    private final int numberOfExamples;

    /** the number of regular nominal attributes that are stored in the table */
    private final int numberOfRegularNominalAttributes;

    /** the number of regular non-nominal attributes that are stored in the table */
    private final int numberOfRegularNumericalAttributes;

    /** the regular nominal attributes, in the order of their columns in the nominal column table */
    private final Attribute[] regularNominalAttributes;

    /**
     * the regular non-nominal attributes, in the order of their columns in the numerical column
     * table
     */
    private final Attribute[] regularNumericalAttributes;

    /** the label attribute of the example set */
    private final Attribute label;

    /** the weight attribute of the example set, <code>null</code> if there is none */
    private final Attribute weight;

    /** the column containing the values of the label attribute */
    private final int[] labelColumn;

    /** the column containing the values of the weight attribute - <code>null</code> if there is none */
    private final double[] weightColumn;

    /**
     * table containing the values of the nominal attributes from an example set:
     * nominalColumnTable[c][r] contains the value of the example in the row r at the attribute
     * regularNominalAttributes[c]. nominalColumnTable[c] is the column containing the values for
     * the nominal attribute number c.
     */
    private final byte[][] nominalColumnTable;

    /**
     * table containing the values of the numerical attributes from an example set:
     * numericalColumnTable[c][r] contains the value of the example in the row r at the attribute
     * regularNumericalAttributes[c]. numericalColumnTable[c] is the column containing the values
     * for the numerical attribute number c.
     */
    private final double[][] numericalColumnTable;

    /**
     * The nominal column table is initialized with the values of the regular nominal attributes,
     * the numerical column table with the ones of the regular numeric attributes. The values of the
     * label attribute are stored in the label column and if a weight attribute exists, its values
     * are stored in the weight column. Nominal values are stored by their number in the
     * {@link NominalMapping} as byte if they are not missing, otherwise as the size of the mapping.
     * Regular nominal attributes whose mapping has more than {@value #MAXIMAL_NOMINAL_VALUES}
     * entries are left out of the table.
     *
     * @param exampleSet
     *            the example set to read the values from
     * @param operator
     *            the operator used for the error message, for stop checks and for obtaining the
     *            concurrency context; if it is <code>null</code>, the table is always filled
     *            sequentially and without stop checks
     * @param parallelAllowed
     *            if the table creation can be done in parallel
     * @throws OperatorException
     *             if the label has missing values
     *
     */
    public ColumnExampleTable(ExampleSet exampleSet, Operator operator, boolean parallelAllowed) throws OperatorException {
        numberOfExamples = exampleSet.size();
        label = exampleSet.getAttributes().getLabel();
        weight = exampleSet.getAttributes().getWeight();

        // the label statistics are refreshed first so that the missing value count is up to date
        exampleSet.recalculateAttributeStatistics(label);
        if (exampleSet.getStatistics(label, Statistics.UNKNOWN) > 0) {
            throw new UserError(operator, 162, label.getName());
        }

        // split regular attributes into nominal and numerical and store in arrays
        List<Attribute> regularNominalAttributesList = new ArrayList<>();
        List<Attribute> regularNumericalAttributesList = new ArrayList<>();
        for (Attribute attribute : exampleSet.getAttributes()) { // only regular attributes
            if (attribute.isNominal()) {
                // ignore nominal attributes with too many different values
                if (attribute.getMapping().size() <= MAXIMAL_NOMINAL_VALUES) {
                    regularNominalAttributesList.add(attribute);
                }
            } else {
                regularNumericalAttributesList.add(attribute);
            }
        }
        numberOfRegularNominalAttributes = regularNominalAttributesList.size();
        regularNominalAttributes = regularNominalAttributesList.toArray(new Attribute[numberOfRegularNominalAttributes]);
        numberOfRegularNumericalAttributes = regularNumericalAttributesList.size();
        regularNumericalAttributes = regularNumericalAttributesList
                .toArray(new Attribute[numberOfRegularNumericalAttributes]);

        // initialize tables
        nominalColumnTable = new byte[numberOfRegularNominalAttributes][numberOfExamples];
        numericalColumnTable = new double[numberOfRegularNumericalAttributes][numberOfExamples];
        labelColumn = new int[numberOfExamples];
        weightColumn = weight != null ? new double[numberOfExamples] : null;

        // all fields are assigned at this point, so the fill methods may safely read them
        if (betterParallel(parallelAllowed, operator)) {
            populateParallel(exampleSet, operator);
        } else {
            populate(exampleSet, operator);
        }
    }

    /**
     * Creates the tables from the {@link ExampleSet} by iterating once over all examples. If an
     * operator is given, the status of its concurrency context is checked every
     * {@value #CHECK_FOR_STOP_INTERVAL} rows.
     *
     * @param exampleSet
     *            the example set to read the values from
     * @param operator
     *            the operator used for the status checks, can be <code>null</code>
     */
    private void populate(ExampleSet exampleSet, Operator operator) {
        int row = 0;
        for (Example example : exampleSet) {
            if (row % CHECK_FOR_STOP_INTERVAL == 0 && operator != null) {
                Resources.getConcurrencyContext(operator).checkStatus();
            }
            fillInRow(example, row);
            row++;
        }
    }

    /**
     * Creates the tables from the {@link ExampleSet} in parallel. The rows are split into
     * contiguous blocks, one per unit of parallelism reported by the concurrency context; the
     * first blocks get one row more until the remainder of the division is used up. Every block
     * is filled by its own task.
     *
     * @param exampleSet
     *            the example set to read the values from
     * @param operator
     *            a non-null operator
     * @throws OperatorException
     *             if a task fails with something that is neither a {@link RuntimeException} nor an
     *             {@link Error}
     */
    private void populateParallel(final ExampleSet exampleSet, Operator operator) throws OperatorException {
        final ConcurrencyContext context = Resources.getConcurrencyContext(operator);
        int numberOfThreads = context.getParallelism();
        int blocksize = numberOfExamples / numberOfThreads;
        int rest = numberOfExamples % numberOfThreads;

        List<Callable<Void>> todo = new ArrayList<>(numberOfThreads);
        int start = 0;
        int end = 0;
        while (end < numberOfExamples) {
            start = end;
            end += blocksize;
            // distribute the remainder one row at a time over the first blocks
            if (rest > 0) {
                end++;
                rest--;
            }
            final int startRow = start;
            final int endRow = end;
            todo.add(new Callable<Void>() {

                @Override
                public Void call() {
                    Example example;
                    for (int row = startRow; row < endRow; row++) {
                        // the interval is counted relative to the block start so that every
                        // task checks the status on its first row
                        if ((row - startRow) % CHECK_FOR_STOP_INTERVAL == 0) {
                            context.checkStatus();
                        }
                        example = exampleSet.getExample(row);
                        fillInRow(example, row);
                    }
                    return null;
                }
            });
        }

        try {
            context.call(todo);
        } catch (ExecutionException e) {
            Throwable cause = e.getCause();
            if (cause == null) {
                // a cause is not guaranteed; report the wrapper instead of failing with a
                // NullPointerException that would hide the original problem
                throw new OperatorException(e.getMessage(), e);
            }
            if (cause instanceof RuntimeException) {
                throw (RuntimeException) cause;
            } else if (cause instanceof Error) {
                throw (Error) cause;
            } else {
                throw new OperatorException(cause.getMessage(), cause);
            }
        }
    }

    /**
     * Fills the values of the example into the specified row of all attribute columns, the label
     * column and - if a weight attribute exists - the weight column.
     *
     * @param example
     *            the example to read the values from
     * @param row
     *            the row of the table to write to
     */
    private void fillInRow(Example example, int row) {
        int column = 0;
        for (Attribute attribute : regularNominalAttributes) {
            double value = example.getValue(attribute);
            if (Double.isNaN(value)) {
                // missing values are encoded as the size of the mapping
                value = attribute.getMapping().size();
            }
            // only attributes with at most MAXIMAL_NOMINAL_VALUES mapping entries were selected in
            // the constructor, so the value is expected to fit into a byte
            nominalColumnTable[column][row] = (byte) value;
            column++;
        }
        labelColumn[row] = (int) example.getValue(label);
        column = 0;
        for (Attribute attribute : regularNumericalAttributes) {
            numericalColumnTable[column][row] = example.getValue(attribute);
            column++;
        }
        if (weight != null) {
            weightColumn[row] = example.getValue(weight);
        }
    }

    /**
     * Calculates if it is better to fill the table in parallel. This is the case if parallel
     * execution is allowed, an operator is present, the parallelism of its concurrency context is
     * greater than one and the number of stored cells (attributes times examples) exceeds
     * {@value #THRESHOLD_PRODUCT_PARALLEL}.
     *
     * @param parallelAllowed
     *            if the table creation can be done in parallel
     * @param operator
     *            the operator to obtain the concurrency context from, can be <code>null</code>
     * @return <code>true</code> if the table should be filled in parallel
     */
    private boolean betterParallel(boolean parallelAllowed, Operator operator) {
        return parallelAllowed
                && operator != null
                && Resources.getConcurrencyContext(operator).getParallelism() > 1
                // computed as long so that the product cannot overflow
                && ((long) numberOfRegularNominalAttributes + numberOfRegularNumericalAttributes) * numberOfExamples > THRESHOLD_PRODUCT_PARALLEL;
    }

    /**
     * @return the number of examples in table
     */
    public int getNumberOfExamples() {
        return numberOfExamples;
    }

    /**
     * @return the label attribute
     */
    public Attribute getLabel() {
        return label;
    }

    /**
     * @return a int array containing the values of the nominal label column via the
     *         {@link NominalMapping}. The internal array is returned, not a copy.
     */
    public int[] getLabelColumn() {
        return labelColumn;
    }

    /**
     * @return the weight attribute, if it exists, <code>null</code> otherwise
     */
    public Attribute getWeight() {
        return weight;
    }

    /**
     * @return the values of the weight attribute, if it exists, <code>null</code> otherwise. The
     *         internal array is returned, not a copy.
     */
    public double[] getWeightColumn() {
        return weightColumn;
    }

    /**
     * @param attributeNumber
     *            a number that represents a nominal attribute
     * @return the column containing the values of the represented nominal attribute. The internal
     *         array is returned, not a copy.
     */
    public byte[] getNominalAttributeColumn(int attributeNumber) {
        return nominalColumnTable[attributeNumber];
    }

    /**
     * @param attributeNumber
     *            a number that represents a numerical attribute; the numbers of the numerical
     *            attributes start after the numbers of the nominal attributes
     * @return the column containing the values of the represented numerical attribute. The
     *         internal array is returned, not a copy.
     */
    public double[] getNumericalAttributeColumn(int attributeNumber) {
        return numericalColumnTable[attributeNumber - numberOfRegularNominalAttributes];
    }

    /**
     * @param attributeNumber
     *            the number of an attribute column
     * @return <code>true</code> if the attributeNumber represents a nominal attribute
     */
    public boolean representsNominalAttribute(int attributeNumber) {
        return attributeNumber < numberOfRegularNominalAttributes;
    }

    /**
     * @param attributeNumber
     *            the number of an attribute column
     * @return <code>true</code> if the attributeNumber represents a numerical attribute
     */
    public boolean representsNumericalAttribute(int attributeNumber) {
        return attributeNumber >= numberOfRegularNominalAttributes;
    }

    /**
     * @return the number of regular nominal attributes plus the number of regular numerical
     *         attributes stored in the table
     */
    public int getTotalNumberOfRegularAttributes() {
        return numberOfRegularNominalAttributes + numberOfRegularNumericalAttributes;
    }

    /**
     * Returns the nominal attribute represented by the attributeNumber.
     *
     * @param attributeNumber
     *            a number that represents a nominal attribute
     * @return the nominal attribute stored at this number
     */
    public Attribute getNominalAttribute(int attributeNumber) {
        return regularNominalAttributes[attributeNumber];
    }

    /**
     * Returns the numerical attribute represented by the attributeNumber.
     *
     * @param attributeNumber
     *            a number that represents a numerical attribute; the numbers of the numerical
     *            attributes start after the numbers of the nominal attributes
     * @return the numerical attribute stored at this number
     */
    public Attribute getNumericalAttribute(int attributeNumber) {
        return regularNumericalAttributes[attributeNumber - numberOfRegularNominalAttributes];
    }

    /**
     * @return the number of regular nominal attributes stored in the table
     */
    public int getNumberOfRegularNominalAttributes() {
        return numberOfRegularNominalAttributes;
    }

    /**
     * @return the number of regular numerical attributes stored in the table
     */
    public int getNumberOfRegularNumericalAttributes() {
        return numberOfRegularNumericalAttributes;
    }
}