/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.example;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.SparseDataRow;
import com.rapidminer.operator.UserError;
import java.util.Arrays;
/**
* This class can be used for the efficient generation of sparse example formats. The constructor
* creates a mapping between example set and example table attribute indices which only need to be
* performed once. The sparse data can then be queried efficiently by using the methods
* {@link #getNonDefaultAttributeIndices(Example)} and
* {@link #getNonDefaultAttributeValues(Example)}. Please note that this filter should be
* re-instantiated for new example sets. Furthermore, a gain in performance is only achieved for
* examples with underlying {@link SparseDataRow}s.
*
* @author Julien Nioche, Ingo Mierswa
*/
public class FastExample2SparseTransform {

    /** Marker stored in {@link #mapping} for table columns without a position in the example set. */
    private static final int UNMAPPED = -1;

    /**
     * The mapping from attribute indices in the data row / example table to the attribute
     * positions of the example set given at construction. Entries of table columns that do not
     * belong to that example set hold {@link #UNMAPPED}. This mapping is mainly needed for
     * examples backed by {@link SparseDataRow}s.
     */
    private final int[] mapping;

    /**
     * The attributes of the example set, stored by position. This array is necessary since
     * example sets only provide iterators over their attributes.
     */
    private final Attribute[] attributes;

    /**
     * The complete array of all attribute positions (0, 1, 2, ...). Its length is used to size
     * the scratch buffer for data rows which do not implement {@link SparseDataRow}.
     */
    private final int[] allIndices;

    /**
     * Builds the table giving the equivalence between the positions of the attributes in the
     * example table and the positions of the attributes in the given example set. A value of -1
     * in that table indicates that the table column has no attribute in the example set. This is
     * used in order to optimize the access to sparse data rows (e.g. SVM implementations or for
     * Weka), which is important when the number of attributes is large.
     * <p>
     * As a side effect, {@code trim()} is invoked on the data row of every example of the given
     * example set.
     *
     * @param es
     *            the example set for which the mapping is created
     * @throws UserError
     *             with error code 140 if an attribute of the example set has the table index
     *             {@code Attribute.VIEW_ATTRIBUTE_INDEX}
     */
    public FastExample2SparseTransform(ExampleSet es) throws UserError {
        // init: every table column starts out unmapped
        this.mapping = new int[es.getExampleTable().getNumberOfAttributes()];
        Arrays.fill(this.mapping, UNMAPPED);

        // create mappings
        int attributeCount = es.getAttributes().size();
        this.attributes = new Attribute[attributeCount];
        int[] positions = new int[attributeCount];
        int pos = 0;
        for (Attribute attribute : es.getAttributes()) {
            int tableIndex = attribute.getTableIndex();
            if (tableIndex == Attribute.VIEW_ATTRIBUTE_INDEX) {
                throw new UserError(null, 140);
            }
            this.mapping[tableIndex] = pos;
            this.attributes[pos] = attribute;
            positions[pos] = pos;
            pos++;
        }

        // truncate the positions array to the number of attributes actually mapped
        this.allIndices = Arrays.copyOf(positions, pos);

        // trim is necessary in order to allow fast mapping!
        for (Example e : es) {
            e.getDataRow().trim();
        }
    }

    /**
     * Returns the positions (with respect to the example set given at construction) of the
     * attributes with non-default values. This can be used for a faster construction of sparse
     * dataset representations when the number of attributes is large. The returned positions are
     * sorted in ascending order.
     * <p>
     * For examples backed by a {@link SparseDataRow}, the non-default indices reported by the
     * data row are translated. For all other data rows, every attribute of the example whose
     * value is not 0 is reported. Table indices that are unknown to this transform are skipped.
     *
     * @param example
     *            the example to inspect
     * @return a newly allocated, ascendingly sorted array of attribute positions
     */
    public int[] getNonDefaultAttributeIndices(Example example) {
        DataRow data = example.getDataRow();
        int count = 0;
        int[] buffer;
        if (data instanceof SparseDataRow) {
            int[] nonDefaultTableIndices = ((SparseDataRow) data).getNonDefaultIndices();
            buffer = new int[nonDefaultTableIndices.length];
            // map the table indices onto the corresponding attribute positions
            for (int tableIndex : nonDefaultTableIndices) {
                int position = positionOf(tableIndex);
                if (position != UNMAPPED) {
                    buffer[count++] = position;
                }
            }
        } else {
            buffer = new int[allIndices.length];
            for (Attribute a : example.getAttributes()) {
                int position = positionOf(a.getTableIndex());
                // check for unmapped (e.g. view) attributes and zero values;
                // the default value should not be used, since both libsvm and fast large margin
                // solve LPs, so a value other than 0 would make an impact
                if (position != UNMAPPED && example.getValue(a) != 0) {
                    buffer[count++] = position;
                }
            }
        }
        return trimAndSort(buffer, count);
    }

    /**
     * Returns an array of non-default values of the given example. Simply invokes
     * {@link #getNonDefaultAttributeValues(Example, int[])} with the array of non-default indices
     * for the given example.
     *
     * @param example
     *            the example whose values are read
     * @return the values at the positions returned by
     *         {@link #getNonDefaultAttributeIndices(Example)}, in the same order
     */
    public double[] getNonDefaultAttributeValues(Example example) {
        return getNonDefaultAttributeValues(example, getNonDefaultAttributeIndices(example));
    }

    /**
     * Returns the values of the given example at the given attribute positions. The size of the
     * returned array is the same as the size of the given indices array.
     *
     * @param example
     *            the example whose values are read
     * @param nonDefaultIndices
     *            attribute positions as returned by
     *            {@link #getNonDefaultAttributeIndices(Example)}
     * @return one value per given position, in the same order
     */
    public double[] getNonDefaultAttributeValues(Example example, int[] nonDefaultIndices) {
        double[] result = new double[nonDefaultIndices.length];
        for (int i = 0; i < result.length; i++) {
            result[i] = example.getValue(this.attributes[nonDefaultIndices[i]]);
        }
        return result;
    }

    /**
     * Looks up the attribute position for a table index. Indices outside of the mapping table
     * (negative, or beyond the number of table attributes seen at construction) are reported as
     * {@link #UNMAPPED} instead of causing an {@link ArrayIndexOutOfBoundsException}.
     */
    private int positionOf(int tableIndex) {
        if (tableIndex < 0 || tableIndex >= mapping.length) {
            return UNMAPPED;
        }
        return mapping[tableIndex];
    }

    /**
     * Copies the first {@code count} entries of the given buffer into a new array and sorts it
     * ascendingly; the positions have to be sorted for the sparse data.
     */
    private static int[] trimAndSort(int[] buffer, int count) {
        int[] result = Arrays.copyOf(buffer, count);
        Arrays.sort(result);
        return result;
    }
}