/*
 * Encog(tm) Core v3.4 - Java Version
 * http://www.heatonresearch.com/encog/
 * https://github.com/encog/encog-java-core
 * Copyright 2008-2016 Heaton Research, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * For more information on Heaton Research copyrights, licenses
 * and trademarks visit:
 * http://www.heatonresearch.com/copyright
 */
package org.encog.app.analyst.csv.normalize;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

import org.encog.EncogError;
import org.encog.app.analyst.EncogAnalyst;
import org.encog.app.analyst.csv.TimeSeriesUtil;
import org.encog.app.analyst.csv.basic.BasicFile;
import org.encog.app.analyst.missing.HandleMissingValues;
import org.encog.app.analyst.script.normalize.AnalystField;
import org.encog.app.analyst.util.CSVHeaders;
import org.encog.app.quant.QuantError;
import org.encog.util.arrayutil.NormalizationAction;
import org.encog.util.csv.CSVFormat;
import org.encog.util.csv.NumberList;
import org.encog.util.csv.ReadCSV;
import org.encog.util.logging.EncogLogging;

/**
 * Normalize, or denormalize, a CSV file.
 */
public class AnalystNormalizeCSV extends BasicFile {

	/**
	 * Extract fields from the current CSV row into a numeric array for
	 * machine learning.
	 *
	 * Fields whose action is {@link NormalizationAction#Ignore} are skipped,
	 * as are output fields when {@code skipOutput} is true. A value that is
	 * empty or equal to "?" (after trimming) is treated as missing and is
	 * delegated to the missing-value handler configured in the analyst
	 * script.
	 *
	 * @param analyst
	 *            The analyst to use.
	 * @param headers
	 *            The headers for the input data.
	 * @param csv
	 *            The CSV that holds the input data, positioned on the row to
	 *            extract.
	 * @param outputLength
	 *            The length of the returned array; it must be large enough to
	 *            hold every encoded column.
	 * @param skipOutput
	 *            True if the output fields should be skipped.
	 * @return The encoded data, or {@code null} if the missing-value handler
	 *         returned {@code null}, meaning the entire row should be
	 *         skipped.
	 */
	public static final double[] extractFields(final EncogAnalyst analyst,
			final CSVHeaders headers, final ReadCSV csv,
			final int outputLength, final boolean skipOutput) {
		final double[] output = new double[outputLength];
		int outputIndex = 0;

		for (final AnalystField stat : analyst.getScript().getNormalize()
				.getNormalizedFields()) {

			stat.init();

			final NormalizationAction action = stat.getAction();

			if (action == NormalizationAction.Ignore) {
				continue;
			}

			if (stat.isOutput() && skipOutput) {
				continue;
			}

			final int index = headers.find(stat.getName());
			final String str = csv.get(index).trim();

			if (str.equals("?") || str.length() == 0) {
				// unknown value, ask the configured handler what to substitute
				final HandleMissingValues handler = analyst.getScript()
						.getNormalize().getMissingValues();
				final double[] replacement = handler.handleMissing(analyst,
						stat);

				// a null replacement means the entire row should be skipped
				if (replacement == null) {
					return null;
				}

				// copy the returned values in place of the missing values
				System.arraycopy(replacement, 0, output, outputIndex,
						replacement.length);
				outputIndex += replacement.length;
			} else if (action == NormalizationAction.Normalize
					|| action == NormalizationAction.PassThrough) {
				// known numeric value: a single column, normalized if asked
				double value = csv.getFormat().parse(str);
				if (action == NormalizationAction.Normalize) {
					value = stat.normalize(value);
				}
				output[outputIndex++] = value;
			} else {
				// known class value: may expand into several columns
				final double[] encoded = stat.encode(str);
				for (final double element : encoded) {
					output[outputIndex++] = element;
				}
			}
		}

		return output;
	}

	/**
	 * The analyst to use.
	 */
	private EncogAnalyst analyst;

	/**
	 * Used to process time series.
	 */
	private TimeSeriesUtil series;

	/**
	 * The headers.
	 */
	private CSVHeaders analystHeaders;

	/**
	 * Analyze the file. This records the input settings, reads the headers
	 * of the input file, initializes every normalized field of the analyst
	 * script and prepares the time-series helper.
	 *
	 * @param inputFilename
	 *            The input file.
	 * @param expectInputHeaders
	 *            True, if input headers are present.
	 * @param inputFormat
	 *            The format.
	 * @param theAnalyst
	 *            The analyst to use.
	 */
	public void analyze(final File inputFilename,
			final boolean expectInputHeaders, final CSVFormat inputFormat,
			final EncogAnalyst theAnalyst) {
		this.setInputFilename(inputFilename);
		this.setInputFormat(inputFormat);
		this.setExpectInputHeaders(expectInputHeaders);
		this.analyst = theAnalyst;
		this.setAnalyzed(true);

		this.analystHeaders = new CSVHeaders(inputFilename,
				expectInputHeaders, inputFormat);

		for (final AnalystField field : analyst.getScript().getNormalize()
				.getNormalizedFields()) {
			field.init();
		}

		this.series = new TimeSeriesUtil(analyst, true,
				this.analystHeaders.getHeaders());
	}

	/**
	 * Normalize the input file. Write to the specified file.
	 *
	 * Each input row is converted with
	 * {@link #extractFields(EncogAnalyst, CSVHeaders, ReadCSV, int, boolean)}
	 * and, when the time-series depth is greater than one, passed through the
	 * time-series helper. A {@code null} result from either step causes the
	 * row to be omitted from the output.
	 *
	 * @param file
	 *            The file to write to.
	 * @throws EncogError
	 *             If the file has not been analyzed yet.
	 * @throws QuantError
	 *             If an I/O error occurs while reading or writing.
	 */
	public void normalize(final File file) {
		if (this.analyst == null) {
			throw new EncogError(
					"Can't normalize yet, file has not been analyzed.");
		}

		ReadCSV csv = null;
		PrintWriter tw = null;

		try {
			csv = new ReadCSV(getInputFilename().toString(),
					isExpectInputHeaders(), getFormat());

			tw = new PrintWriter(new FileWriter(file));

			// write headers, if needed
			if (isProduceOutputHeaders()) {
				writeHeaders(tw);
			}

			resetStatus();
			final int outputLength = this.analyst.determineTotalColumns();

			// write file contents
			while (csv.next() && !shouldStop()) {
				updateStatus(false);

				double[] output = AnalystNormalizeCSV.extractFields(
						this.analyst, this.analystHeaders, csv, outputLength,
						false);

				// A null row means "skip" (missing values); it must not be
				// handed to the time-series helper.
				if (output != null && this.series.getTotalDepth() > 1) {
					output = this.series.process(output);
				}

				if (output != null) {
					final StringBuilder line = new StringBuilder();
					NumberList.toList(getFormat(), line, output);
					tw.println(line);
				}
			}

			// PrintWriter never throws on write failures; surface them
			// instead of silently leaving a truncated output file.
			if (tw.checkError()) {
				throw new IOException("Error writing to file: " + file);
			}
		} catch (final IOException e) {
			throw new QuantError(e);
		} finally {
			reportDone(false);
			if (csv != null) {
				try {
					csv.close();
				} catch (final Exception ex) {
					EncogLogging.log(ex);
				}
			}

			if (tw != null) {
				try {
					tw.close();
				} catch (final Exception ex) {
					EncogLogging.log(ex);
				}
			}
		}
	}

	/**
	 * Set the source file. This is useful if you want to use pre-existing
	 * stats to normalize something and skip the analyze step.
	 *
	 * @param file
	 *            The file to use.
	 * @param headers
	 *            True, if headers are to be expected.
	 * @param format
	 *            The format of the CSV file.
	 */
	public void setSourceFile(final File file, final boolean headers,
			final CSVFormat format) {
		setInputFilename(file);
		setExpectInputHeaders(headers);
		setInputFormat(format);
	}

	/**
	 * Write the headers. One quoted column name is written for every column
	 * that each normalized field needs.
	 *
	 * @param tw
	 *            The output stream.
	 */
	private void writeHeaders(final PrintWriter tw) {
		final StringBuilder line = new StringBuilder();
		for (final AnalystField stat : this.analyst.getScript().getNormalize()
				.getNormalizedFields()) {
			final int needed = stat.getColumnsNeeded();

			for (int i = 0; i < needed; i++) {
				BasicFile.appendSeparator(line, getFormat());
				line.append('\"');
				line.append(CSVHeaders.tagColumn(stat.getName(), i,
						stat.getTimeSlice(), needed > 1));
				line.append('\"');
			}
		}
		tw.println(line.toString());
	}
}