/*
* Encog(tm) Core v3.4 - Java Version
* http://www.heatonresearch.com/encog/
* https://github.com/encog/encog-java-core
* Copyright 2008-2016 Heaton Research, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* For more information on Heaton Research copyrights, licenses
* and trademarks visit:
* http://www.heatonresearch.com/copyright
*/
package org.encog.util.normalize;
import java.io.File;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.encog.NullStatusReportable;
import org.encog.StatusReportable;
import org.encog.ml.data.MLData;
import org.encog.ml.data.MLDataPair;
import org.encog.ml.data.MLDataSet;
import org.encog.ml.data.basic.BasicMLData;
import org.encog.util.csv.CSVFormat;
import org.encog.util.csv.ReadCSV;
import org.encog.util.normalize.input.HasFixedLength;
import org.encog.util.normalize.input.InputField;
import org.encog.util.normalize.input.InputFieldCSV;
import org.encog.util.normalize.input.InputFieldCSVText;
import org.encog.util.normalize.input.InputFieldMLDataSet;
import org.encog.util.normalize.input.MLDataFieldHolder;
import org.encog.util.normalize.output.OutputField;
import org.encog.util.normalize.output.OutputFieldGroup;
import org.encog.util.normalize.output.OutputFieldGrouped;
import org.encog.util.normalize.output.RequireTwoPass;
import org.encog.util.normalize.segregate.Segregator;
import org.encog.util.normalize.target.NormalizationStorage;
import org.encog.util.obj.ReflectionUtil;
/**
* This class is used to normalize both input and ideal data for neural
* networks. This class can accept input from a variety of sources and output to
* a variety of targets. Normalization is a process by which input data is
* normalized so that it falls in specific ranges. Neural networks typically
* require input to be in the range of 0 to 1, or -1 to 1, depending on how the
* network is structured.
*
 * The Normalize class is typically given several different types of objects
 * to tell it how to process data.
*
* Input Fields:
*
* Input fields specify the raw data that will be read by the Normalize class.
* Input fields are added to the Normalize class by calling addInputField
* method. Input fields must implement the InputField interface. There are a
* number of different input fields provided. Input data can be read from
* several different sources. For example, you can read the "neural network
* input" data from one CSV file and the "ideal neural network output" from
* another.
*
*
* Output Fields:
*
* The output fields are used to specify the final output from the Normalize
* class. The output fields specify both the "neural network input" and "ideal
* output". The output fields are flagged as either input our ideal. The output
* fields are not necessarily one-to-one with the input fields. For example,
* several input fields may combine to produce a single output field. Further
* some input fields may be used only to segregate data, whereas other input
* fields may be ignored all together. The type of output field that you specify
* determines the type of processing that will be done on that field. An
* OutputField is added by calling the addOutputField method.
*
*
* Segregators:
*
* Segregators are used generally for two related purposes. First, segregators
* can be used to exclude rows of data based on certain input values. Perhaps
* the data includes several classes of data, and you only want to train on one
* class. Secondly, segregators can be used to segregate data into training and
* evaluation sets. You may choose to use 80% of your data for training and 20%
* for evaluation. A segregator is added by calling the addSegregator method.
*
*
* Target Storage:
*
* The data created by the Normalization class must be stored somewhere. The
* storage targets allow this to be specified. The output can be sent to a CSV
* file, a NeuralDataSet, or any other target supported by a
* NormalizationStorage derived class. The target is specified by calling the
* setTarget method.
*
* The normalization process can take some time. The progress can be reported to
* a StatusReportable object.
*
* The normalization is a two pass process. The first pass counts the number of
* records and computes important statistics that will be used to normalize the
* output. The second pass actually performs the normalization and writes to the
* target. Both passes are performed when the process method is called.
*
*/
public class DataNormalization implements Serializable {

    /**
     * The serial ID.
     */
    private static final long serialVersionUID = 4387885013771660300L;

    /**
     * The input fields.
     */
    private final List<InputField> inputFields =
            new ArrayList<InputField>();

    /**
     * The output fields.
     */
    private final List<OutputField> outputFields =
            new ArrayList<OutputField>();

    /**
     * Keep a collection of all of the ReadCSV classes to support all of the
     * distinct CSV files that are to be read.
     */
    private transient Collection<ReadCSV> readCSV;

    /**
     * Hold a map between the InputFieldCSV objects and the corresponding
     * ReadCSV object. There will likely be many fields read from a single file.
     * This allows only one ReadCSV object to need to be created per actual CSV
     * file.
     */
    private transient Map<InputField, ReadCSV> csvMap;

    /**
     * For each InputFieldMLDataSet input field an Iterator must be kept to
     * actually access the data. Only one Iterator should be kept per data set
     * actually used.
     */
    private transient Collection<Iterator<MLDataPair>> readDataSet;

    /**
     * Map each of the input fields to an internally-built MLDataFieldHolder
     * object. The MLDataFieldHolder object holds an Iterator, InputField
     * and last MLDataPair object loaded.
     */
    private transient Map<InputField, MLDataFieldHolder> dataSetFieldMap;

    /**
     * Map each of the MLDataSet Iterators to an internally-built
     * MLDataFieldHolder object. The MLDataFieldHolder object holds an
     * Iterator, InputField and last MLDataPair object loaded.
     */
    private transient Map<Iterator<MLDataPair>,
            MLDataFieldHolder> dataSetIteratorMap;

    /**
     * Output fields can be grouped together, if the value of one output field
     * might affect all of the others. This collection holds a list of all of
     * the output field groups.
     */
    private final Set<OutputFieldGroup> groups =
            new HashSet<OutputFieldGroup>();

    /**
     * A list of the segregators.
     */
    private final List<Segregator> segregators =
            new ArrayList<Segregator>();

    /**
     * Where the final output from the normalization is sent.
     */
    private NormalizationStorage storage;

    /**
     * The object to report the progress of the normalization to.
     */
    private transient StatusReportable report = new NullStatusReportable();

    /**
     * The number of records that were found in the first pass.
     */
    private int recordCount;

    /**
     * The current record's index.
     */
    private int currentIndex;

    /**
     * The format to use for all CSV files.
     */
    private CSVFormat csvFormat = CSVFormat.ENGLISH;

    /**
     * How long has it been since the last report. This filters so that every
     * single record does not produce a message.
     */
    private int lastReport;

    /**
     * Add an input field.
     *
     * @param f
     *            The input field to add.
     */
    public void addInputField(final InputField f) {
        this.inputFields.add(f);
    }

    /**
     * Add an output field. This output field will be added as a "ML network
     * input field", not an "ideal output field".
     *
     * @param outputField
     *            The output field to add.
     */
    public void addOutputField(final OutputField outputField) {
        addOutputField(outputField, false);
    }

    /**
     * Add a field and allow it to be specified as an "ideal output field". An
     * "ideal" field is the expected output that the ML network is training
     * towards.
     *
     * @param outputField
     *            The output field.
     * @param ideal
     *            True if this is an ideal field.
     */
    public void addOutputField(final OutputField outputField,
            final boolean ideal) {
        this.outputFields.add(outputField);
        outputField.setIdeal(ideal);
        // grouped fields also register their group so it can be row-initialized
        if (outputField instanceof OutputFieldGrouped) {
            final OutputFieldGrouped ofg = (OutputFieldGrouped) outputField;
            this.groups.add(ofg.getGroup());
        }
    }

    /**
     * Add a segregator.
     *
     * @param segregator
     *            The segregator to add.
     */
    public void addSegregator(final Segregator segregator) {
        this.segregators.add(segregator);
        segregator.init(this);
    }

    /**
     * Called internally to allow each of the input fields to update their
     * min/max values in the first pass.
     */
    private void applyMinMax() {
        for (final InputField field : this.inputFields) {
            final double value = field.getCurrentValue();
            field.applyMinMax(value);
        }
    }

    /**
     * Build "input data for a neural network" based on the input values
     * provided. This allows input for a neural network to be normalized. This
     * is typically used when data is to be presented to a trained neural
     * network.
     *
     * @param data
     *            The input values to be normalized.
     * @return The data to be sent to the neural network.
     */
    public MLData buildForNetworkInput(final double[] data) {
        // feed the input fields
        int index = 0;
        for (final InputField field : this.inputFields) {
            if (field.getUsedForNetworkInput()) {
                if (index >= data.length) {
                    throw new NormalizationError(
                            "Can't build data, input fields used for neural input, must match provided data("
                                    + data.length + ").");
                }
                field.setCurrentValue(data[index++]);
            }
        }

        // process the output fields
        initForOutput();

        // the non-ideal output fields make up the network input vector
        final MLData result = new BasicMLData(getNetworkInputLayerSize());

        // write the value
        int outputIndex = 0;
        for (final OutputField ofield : this.outputFields) {
            if (!ofield.isIdeal()) {
                for (int sub = 0; sub < ofield.getSubfieldCount(); sub++) {
                    result.setData(outputIndex++, ofield.calculate(sub));
                }
            }
        }

        return result;
    }

    /**
     * Called internally to obtain the current value for an input field.
     *
     * @param field
     *            The input field to determine.
     * @param index
     *            The current index.
     * @return The value for this input field.
     */
    private double determineInputFieldValue(final InputField field,
            final int index) {
        double result = 0;

        // NOTE: InputFieldCSVText extends InputFieldCSV, so the subtype test
        // must come first or the text mapping would never be applied.
        if (field instanceof InputFieldCSVText) {
            final InputFieldCSVText fieldCSV = (InputFieldCSVText) field;
            final ReadCSV csv = this.csvMap.get(field);
            final String v = csv.get(fieldCSV.getOffset());
            if (!fieldCSV.getMappings().containsKey(v)) {
                throw new NormalizationError("Undefined class value: " + v);
            } else {
                result = fieldCSV.getMappings().get(v);
            }
        } else if (field instanceof InputFieldCSV) {
            final InputFieldCSV fieldCSV = (InputFieldCSV) field;
            final ReadCSV csv = this.csvMap.get(field);
            result = csv.getDouble(fieldCSV.getOffset());
        } else if (field instanceof InputFieldMLDataSet) {
            final InputFieldMLDataSet neuralField =
                    (InputFieldMLDataSet) field;
            final MLDataFieldHolder holder = this.dataSetFieldMap
                    .get(field);
            final MLDataPair pair = holder.getPair();
            int offset = neuralField.getOffset();
            // offsets past the input vector address the ideal vector
            if (offset < pair.getInput().size()) {
                result = pair.getInput().getData(offset);
            } else {
                offset -= pair.getInput().size();
                result = pair.getIdeal().getData(offset);
            }
        } else {
            result = field.getValue(index);
        }

        field.setCurrentValue(result);
        return result;
    }

    /**
     * Called internally to determine all of the input field values.
     *
     * @param index
     *            The current index.
     */
    private void determineInputFieldValues(final int index) {
        for (final InputField field : this.inputFields) {
            determineInputFieldValue(field, index);
        }
    }

    /**
     * Find an input field by its class.
     *
     * @param clazz
     *            The input field class type you are looking for.
     * @param count
     *            The instance of the input field needed, 0 for the first.
     * @return The input field if found, otherwise null.
     */
    public InputField findInputField(final Class< ? > clazz, final int count) {
        int i = 0;
        for (final InputField field : this.inputFields) {
            if (ReflectionUtil.isInstanceOf(field.getClass(), clazz)) {
                if (i == count) {
                    return field;
                }
                i++;
            }
        }
        return null;
    }

    /**
     * Find an output field by its class.
     *
     * @param clazz
     *            The output field class type you are looking for.
     * @param count
     *            The instance of the output field needed, 0 for the first.
     * @return The output field if found, otherwise null.
     */
    public OutputField findOutputField(final Class< ? > clazz,
            final int count) {
        int i = 0;
        for (final OutputField field : this.outputFields) {
            if (ReflectionUtil.isInstanceOf(field.getClass(), clazz)) {
                if (i == count) {
                    return field;
                }
                i++;
            }
        }
        return null;
    }

    /**
     * First pass, count everything, establish min/max.
     */
    private void firstPass() {
        openCSV();
        openDataSet();

        this.currentIndex = -1;
        this.recordCount = 0;
        this.report.report(0, 0, "Analyzing file");
        this.lastReport = 0;
        int index = 0;

        initForPass();

        // loop over all of the records
        while (next()) {
            determineInputFieldValues(index);

            if (shouldInclude()) {
                applyMinMax();
                this.recordCount++;
                reportResult("First pass, analyzing file", 0, this.recordCount);
            }
            index++;
        }
    }

    /**
     * @return The CSV format being used.
     */
    public CSVFormat getCSVFormat() {
        return this.csvFormat;
    }

    /**
     * @return The object groups.
     */
    public Set<OutputFieldGroup> getGroups() {
        return this.groups;
    }

    /**
     * @return The input fields.
     */
    public List<InputField> getInputFields() {
        return this.inputFields;
    }

    /**
     * @return The number of output fields that are not used as ideal values,
     *         these will be the input to the neural network. This is the input
     *         layer size for the neural network.
     */
    public int getNetworkInputLayerSize() {
        int result = 0;
        for (final OutputField field : this.outputFields) {
            if (!field.isIdeal()) {
                result += field.getSubfieldCount();
            }
        }
        return result;
    }

    /**
     * @return The number of output fields that are used as ideal values, these
     *         will be the ideal output from the neural network. This is the
     *         output layer size for the neural network.
     */
    public int getNetworkOutputLayerSize() {
        int result = 0;
        for (final OutputField field : this.outputFields) {
            if (field.isIdeal()) {
                result += field.getSubfieldCount();
            }
        }
        return result;
    }

    /**
     * @return The total size of all output fields. This takes into account
     *         output fields that generate more than one value.
     */
    public int getOutputFieldCount() {
        int result = 0;
        for (final OutputField field : this.outputFields) {
            result += field.getSubfieldCount();
        }
        return result;
    }

    /**
     * @return The output fields.
     */
    public List<OutputField> getOutputFields() {
        return this.outputFields;
    }

    /**
     * @return The record count.
     */
    public int getRecordCount() {
        return this.recordCount;
    }

    /**
     * @return The class that progress will be reported to.
     */
    public StatusReportable getReport() {
        return this.report;
    }

    /**
     * @return The segregators in use.
     */
    public List<Segregator> getSegregators() {
        return this.segregators;
    }

    /**
     * @return The place that the normalization output will be stored.
     */
    public NormalizationStorage getStorage() {
        return this.storage;
    }

    /**
     * Setup the row for output.
     */
    public void initForOutput() {

        // init groups
        for (final OutputFieldGroup group : this.groups) {
            group.rowInit();
        }

        // init output fields
        for (final OutputField field : this.outputFields) {
            field.rowInit();
        }
    }

    /**
     * Setup for a new pass over the data.
     */
    public void initForPass() {

        // init segregators
        for (final Segregator segregator : this.segregators) {
            segregator.passInit();
        }
    }

    /**
     * Called internally to advance to the next row.
     *
     * @return True if there are more rows to read.
     */
    private boolean next() {

        // see if any of the CSV readers want to stop
        for (final ReadCSV csv : this.readCSV) {
            if (!csv.next()) {
                return false;
            }
        }

        // see if any of the data sets want to stop
        for (final Iterator<MLDataPair> iterator : this.readDataSet) {
            if (!iterator.hasNext()) {
                return false;
            }
            final MLDataFieldHolder holder = this.dataSetIteratorMap
                    .get(iterator);
            final MLDataPair pair = iterator.next();
            holder.setPair(pair);
        }

        // see if any of the arrays want to stop
        for (final InputField field : this.inputFields) {
            if (field instanceof HasFixedLength) {
                final HasFixedLength fixed = (HasFixedLength) field;
                if ((this.currentIndex + 1) >= fixed.length()) {
                    return false;
                }
            }
        }

        this.currentIndex++;

        return true;
    }

    /**
     * Called internally to open the CSV file.
     */
    private void openCSV() {
        // close and discard any CSV readers left over from a previous pass,
        // otherwise the underlying file handles would leak
        for (final ReadCSV csv : this.readCSV) {
            csv.close();
        }
        this.csvMap.clear();
        this.readCSV.clear();

        // only add each CSV once
        final Map<File, ReadCSV> uniqueFiles = new HashMap<File, ReadCSV>();

        // find the unique files
        for (final InputField field : this.inputFields) {
            if (field instanceof InputFieldCSV) {
                final InputFieldCSV csvField = (InputFieldCSV) field;
                final File file = csvField.getFile();
                if (!uniqueFiles.containsKey(file)) {
                    final ReadCSV csv = new ReadCSV(file.toString(), false,
                            this.csvFormat);
                    uniqueFiles.put(file, csv);
                    this.readCSV.add(csv);
                }
                this.csvMap.put(csvField, uniqueFiles.get(file));
            }
        }
    }

    /**
     * Open any datasets that were used by the input layer.
     */
    private void openDataSet() {
        // clear out any data sets already there
        this.readDataSet.clear();
        this.dataSetFieldMap.clear();
        this.dataSetIteratorMap.clear();

        // only add each iterator once
        final Map<MLDataSet, MLDataFieldHolder> uniqueSets =
                new HashMap<MLDataSet, MLDataFieldHolder>();

        // find the unique data sets
        for (final InputField field : this.inputFields) {
            if (field instanceof InputFieldMLDataSet) {
                final InputFieldMLDataSet dataSetField =
                        (InputFieldMLDataSet) field;
                final MLDataSet dataSet = dataSetField.getNeuralDataSet();
                if (!uniqueSets.containsKey(dataSet)) {
                    final Iterator<MLDataPair> iterator = dataSet
                            .iterator();
                    final MLDataFieldHolder holder =
                            new MLDataFieldHolder(
                                    iterator, dataSetField);
                    uniqueSets.put(dataSet, holder);
                    this.readDataSet.add(iterator);
                }

                final MLDataFieldHolder holder = uniqueSets.get(dataSet);

                this.dataSetFieldMap.put(dataSetField, holder);
                this.dataSetIteratorMap.put(holder.getIterator(), holder);
            }
        }
    }

    /**
     * (Re)create the transient collections. Needed both before processing and
     * after deserialization, since the transient fields are not restored.
     */
    public void init() {
        this.readCSV = new ArrayList<ReadCSV>();
        this.csvMap =
                new HashMap<InputField, ReadCSV>();
        this.readDataSet =
                new ArrayList<Iterator<MLDataPair>>();
        this.dataSetFieldMap =
                new HashMap<InputField, MLDataFieldHolder>();
        this.dataSetIteratorMap =
                new HashMap<Iterator<MLDataPair>, MLDataFieldHolder>();
        // the report target is transient, so it can be null after deserialization
        if (this.report == null) {
            this.report = new NullStatusReportable();
        }
    }

    /**
     * Call this method to begin the normalization process. Any status updates
     * will be sent to the class specified in the constructor.
     */
    public void process() {
        init();
        if (twoPassesNeeded()) {
            firstPass();
        }
        secondPass();
    }

    /**
     * Report on the current progress.
     *
     * @param message
     *            The message to report.
     * @param total
     *            The total number of records to process, 0 for unknown.
     * @param current
     *            The current record.
     */
    private void reportResult(final String message, final int total,
            final int current) {
        // count the records, report status every 10,000 rows so that every
        // single record does not produce a message
        this.lastReport++;
        if (this.lastReport >= 10000) {
            this.report.report(total, current, message);
            this.lastReport = 0;
        }
    }

    /**
     * The second pass actually writes the data to the output files.
     */
    private void secondPass() {
        final boolean twopass = twoPassesNeeded();

        // move any CSV and dataset files back to the beginning.
        openCSV();
        openDataSet();
        initForPass();

        this.currentIndex = -1;

        // process the records
        final int size = getOutputFieldCount();
        final double[] output = new double[size];

        if (this.storage == null) {
            throw new NormalizationError("Must define storage target.");
        }

        this.storage.open(this);
        this.lastReport = 0;
        int index = 0;
        int current = 0;
        while (next()) {
            // read the value
            for (final InputField field : this.inputFields) {
                determineInputFieldValue(field, index);
            }

            if (shouldInclude()) {
                // handle groups
                initForOutput();

                // write the value
                int outputIndex = 0;
                for (final OutputField ofield : this.outputFields) {
                    for (int sub = 0; sub < ofield.getSubfieldCount(); sub++) {
                        output[outputIndex++] = ofield.calculate(sub);
                    }
                }

                if (twopass) {
                    reportResult("Second pass, normalizing data",
                            this.recordCount, ++current);
                } else {
                    reportResult("Processing data (single pass)",
                            this.recordCount, ++current);
                }

                this.storage.write(output, 0);
            }

            index++;
        }
        this.storage.close();

        // release the file handles held by the CSV readers
        for (final ReadCSV csv : this.readCSV) {
            csv.close();
        }
        this.readCSV.clear();
        this.csvMap.clear();
    }

    /**
     * Set the CSV format to use.
     *
     * @param csvFormat
     *            The CSV format to use.
     */
    public void setCSVFormat(final CSVFormat csvFormat) {
        this.csvFormat = csvFormat;
    }

    /**
     * Set the object that this one is reporting to.
     *
     * @param report
     *            The object that progress reports should be sent to.
     */
    public void setReport(final StatusReportable report) {
        this.report = report;
    }

    /**
     * Determines where the normalized data will be sent.
     *
     * @param target
     *            The target.
     */
    public void setTarget(final NormalizationStorage target) {
        this.storage = target;
    }

    /**
     * Should this row be included? Check the segregators.
     *
     * @return True if the row should be included.
     */
    private boolean shouldInclude() {
        // If no segregators, then include
        if (this.segregators.isEmpty()) {
            return true;
        }

        // Include if at least one segregator says to include. NOTE: every
        // segregator is deliberately invoked (no short-circuit return), in
        // case an implementation tracks per-row state in shouldInclude().
        boolean included = false;
        for (final Segregator segregator : this.segregators) {
            if (segregator.shouldInclude()) {
                included = true;
            }
        }
        return included;
    }

    /**
     * @return True, if two passes are needed.
     */
    public boolean twoPassesNeeded() {
        for (final OutputField field : this.outputFields) {
            if (field instanceof RequireTwoPass) {
                return true;
            }
        }
        return false;
    }
}