/*
* Encog(tm) Core v3.4 - Java Version
* http://www.heatonresearch.com/encog/
* https://github.com/encog/encog-java-core
* Copyright 2008-2016 Heaton Research, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* For more information on Heaton Research copyrights, licenses
* and trademarks visit:
* http://www.heatonresearch.com/copyright
*/
package org.encog.app.analyst.csv;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import org.encog.app.analyst.EncogAnalyst;
import org.encog.app.analyst.csv.basic.BasicFile;
import org.encog.app.analyst.csv.basic.LoadedRow;
import org.encog.app.analyst.csv.normalize.AnalystNormalizeCSV;
import org.encog.app.analyst.util.CSVHeaders;
import org.encog.app.quant.QuantError;
import org.encog.ml.MLCluster;
import org.encog.ml.data.MLData;
import org.encog.ml.data.basic.BasicMLData;
import org.encog.ml.data.basic.BasicMLDataPair;
import org.encog.ml.data.basic.BasicMLDataSet;
import org.encog.ml.kmeans.KMeansClustering;
import org.encog.util.csv.CSVFormat;
import org.encog.util.csv.ReadCSV;
/**
 * Used by the analyst to cluster a CSV file.
 *
 * Typical use is two-phase: {@link #analyze} loads the input CSV into an
 * in-memory data set, then {@link #process} runs KMeans over that data set
 * and writes every row, followed by its cluster number, to an output file.
 */
public class AnalystClusterCSV extends BasicFile {

    /**
     * The analyst to use.
     */
    private EncogAnalyst analyst;

    /**
     * The headers.
     */
    private CSVHeaders analystHeaders;

    /**
     * The training data used to send to KMeans.
     */
    private BasicMLDataSet data;

    /**
     * Analyze the data. This counts the records and prepares the data to be
     * processed. Every row read is converted to a numeric array and stored in
     * the in-memory data set that {@link #process} later clusters.
     *
     * The CSV reader is closed even when reading fails part way through.
     *
     * @param theAnalyst The analyst to use.
     * @param inputFile The input file to analyze.
     * @param headers True, if the input file has headers.
     * @param format The format of the input file.
     */
    public void analyze(final EncogAnalyst theAnalyst,
            final File inputFile,
            final boolean headers, final CSVFormat format) {
        this.setInputFilename(inputFile);
        setExpectInputHeaders(headers);
        setInputFormat(format);

        setAnalyzed(true);
        this.analyst = theAnalyst;

        this.data = new BasicMLDataSet();
        resetStatus();
        int recordCount = 0;

        final int outputLength = this.analyst.determineTotalColumns();
        final ReadCSV csv = new ReadCSV(this.getInputFilename().toString(),
                this.isExpectInputHeaders(), this.getFormat());
        try {
            readHeaders(csv);

            this.analystHeaders = new CSVHeaders(this.getInputHeadings());

            while (csv.next() && !shouldStop()) {
                updateStatus(true);
                final double[] inputArray = AnalystNormalizeCSV.extractFields(
                        analyst, this.analystHeaders, csv, outputLength, true);
                final MLData input = new BasicMLData(inputArray);
                this.data.add(new BasicMLDataPair(input));

                recordCount++;
            }
            setRecordCount(recordCount);
            this.setColumnCount(csv.getColumnCount());

            // NOTE(review): headers are read a second time after the rows
            // have been consumed; presumably this refreshes the headings once
            // the real column count is known -- confirm against BasicFile.
            readHeaders(csv);
        } finally {
            // Release the underlying file even if a row failed to parse.
            csv.close();
        }
        reportDone(true);
    }

    /**
     * Prepare the output file, write headers if needed. When headers are
     * produced, every input heading is written quoted, followed by a final
     * {@code "cluster"} column.
     *
     * If anything fails after the file has been opened, the writer is closed
     * before the exception propagates so the file handle is not leaked.
     *
     * @param outputFile The output file.
     * @param input The number of input columns (not used by this method).
     * @param output The number of output columns (not used by this method).
     * @return The file to be written to; the caller is responsible for
     *         closing it.
     * @throws QuantError If the output file cannot be opened.
     */
    private PrintWriter prepareOutputFile(
            final File outputFile,
            final int input,
            final int output) {
        PrintWriter tw = null;
        boolean success = false;
        try {
            tw = new PrintWriter(new FileWriter(outputFile));

            // write headers, if needed
            if (isProduceOutputHeaders()) {
                final StringBuilder line = new StringBuilder();

                // handle provided fields, not all may be used, but all should
                // be displayed
                for (final String heading : this.getInputHeadings()) {
                    BasicFile.appendSeparator(line, getFormat());
                    line.append("\"");
                    line.append(heading);
                    line.append("\"");
                }

                // now the output fields that will be generated
                BasicFile.appendSeparator(line, getFormat());
                line.append("\"cluster\"");

                tw.println(line.toString());
            }

            success = true;
            return tw;
        } catch (final IOException e) {
            throw new QuantError(e);
        } finally {
            // Only close here on failure; on success ownership passes to the
            // caller.
            if (!success && tw != null) {
                tw.close();
            }
        }
    }

    /**
     * Process the file and cluster. Runs KMeans over the data loaded by
     * {@link #analyze} and writes each data item to the output file with its
     * zero-based cluster number appended as the last column.
     *
     * {@link #analyze} must have been called first: this method reads the
     * analyst and data set stored by that call. The output writer is closed
     * even when clustering or row writing fails.
     *
     * @param outputFile The output file.
     * @param clusters The number of clusters.
     * @param theAnalyst The analyst to use. Note that this parameter is not
     *            read by this method; the analyst passed to
     *            {@link #analyze} is the one used.
     * @param iterations The number of iterations to use.
     */
    public void process(final File outputFile, final int clusters,
            final EncogAnalyst theAnalyst, final int iterations) {

        final PrintWriter tw = this.prepareOutputFile(outputFile, analyst
                .getScript().getNormalize().countActiveFields() - 1, 1);

        try {
            resetStatus();

            final KMeansClustering cluster = new KMeansClustering(clusters,
                    this.data);
            cluster.iteration(iterations);

            int clusterNum = 0;
            for (final MLCluster cl : cluster.getClusters()) {
                for (final MLData item : cl.getData()) {
                    // The cluster number goes in the one extra slot requested
                    // from LoadedRow, directly after the item's own values.
                    final int clsIndex = item.size();
                    final LoadedRow lr = new LoadedRow(this.getFormat(),
                            item.getData(), 1);
                    lr.getData()[clsIndex] = "" + clusterNum;
                    writeRow(tw, lr);
                }
                clusterNum++;
            }

            reportDone(false);
        } finally {
            // Flush and release the output file on both success and failure.
            tw.close();
        }
    }
}