/*
 * Encog(tm) Core v3.4 - Java Version
 * http://www.heatonresearch.com/encog/
 * https://github.com/encog/encog-java-core
 * Copyright 2008-2016 Heaton Research, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * For more information on Heaton Research copyrights, licenses
 * and trademarks visit:
 * http://www.heatonresearch.com/copyright
 */
package org.encog.app.analyst.csv.shuffle;

import java.io.File;
import java.io.PrintWriter;

import org.encog.app.analyst.csv.basic.BasicFile;
import org.encog.app.analyst.csv.basic.LoadedRow;
import org.encog.mathutil.randomize.RangeRandomizer;
import org.encog.util.csv.CSVFormat;
import org.encog.util.csv.ReadCSV;

/**
 * Randomly shuffle the lines of a CSV file.
 */
public class ShuffleCSV extends BasicFile {

	/**
	 * The default buffer size.
	 */
	public static final int DEFAULT_BUFFER_SIZE = 5000;

	/**
	 * The buffer size.
	 */
	private int bufferSize;

	/**
	 * The buffer.
	 */
	private LoadedRow[] buffer;

	/**
	 * The number of rows remaining in the buffer.
	 */
	private int remaining;

	/**
	 * Construct the object.
	 */
	public ShuffleCSV() {
		setBufferSize(DEFAULT_BUFFER_SIZE);
	}

	/**
	 * Analyze the input file.
	 *
	 * @param inputFile
	 *            The input file.
	 * @param headers
	 *            True, if there are headers.
	 * @param format
	 *            The format of the CSV file.
	 */
	public void analyze(final File inputFile, final boolean headers,
			final CSVFormat format) {
		setInputFilename(inputFile);
		setExpectInputHeaders(headers);
		setInputFormat(format);
		setAnalyzed(true);
		performBasicCounts();
	}

	/**
	 * @return The buffer size. This is how many rows of data are loaded (and
	 *         randomized) at a time. The default is 5,000.
	 */
	public int getBufferSize() {
		return this.bufferSize;
	}

	/**
	 * Get the next row from the underlying CSV file.
	 *
	 * @param csv
	 *            The underlying CSV file.
	 * @return The loaded row.
	 */
	private LoadedRow getNextRow(final ReadCSV csv) {
		if (this.remaining == 0) {
			loadBuffer(csv);
		}

		// Pick random slots until an unused row is found; slots that were
		// already returned have been set to null.
		while (this.remaining > 0) {
			final int index = RangeRandomizer.randomInt(0, this.bufferSize - 1);
			if (this.buffer[index] != null) {
				final LoadedRow result = this.buffer[index];
				this.buffer[index] = null;
				this.remaining--;
				return result;
			}
		}
		return null;
	}

	/**
	 * Load the buffer from the underlying file.
	 *
	 * @param csv
	 *            The CSV file to load from.
	 */
	private void loadBuffer(final ReadCSV csv) {
		for (int i = 0; i < this.buffer.length; i++) {
			this.buffer[i] = null;
		}

		int index = 0;
		while (csv.next() && (index < this.bufferSize) && !shouldStop()) {
			final LoadedRow row = new LoadedRow(csv);
			this.buffer[index++] = row;
		}

		this.remaining = index;
	}

	/**
	 * Process, and generate the output file.
	 *
	 * @param outputFile
	 *            The output file.
	 */
	public void process(final File outputFile) {
		validateAnalyzed();

		final ReadCSV csv = new ReadCSV(getInputFilename().toString(),
				isExpectInputHeaders(), getFormat());
		LoadedRow row;

		final PrintWriter tw = prepareOutputFile(outputFile);

		resetStatus();
		while ((row = getNextRow(csv)) != null) {
			writeRow(tw, row);
			updateStatus(false);
		}
		reportDone(false);
		tw.close();
		csv.close();
	}

	/**
	 * Set the buffer size.
	 *
	 * @param s
	 *            The new buffer size.
	 */
	public void setBufferSize(final int s) {
		this.bufferSize = s;
		this.buffer = new LoadedRow[this.bufferSize];
	}
}
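
/*
 * Usage sketch (not part of the original source): a minimal example of how
 * this class is typically driven, assuming an existing "input.csv" with a
 * header row. The file names and buffer size below are illustrative
 * placeholders only.
 *
 *     ShuffleCSV shuffle = new ShuffleCSV();
 *     shuffle.setBufferSize(10000); // optional; defaults to 5,000 rows
 *     shuffle.analyze(new File("input.csv"), true, CSVFormat.ENGLISH);
 *     shuffle.process(new File("input_shuffled.csv"));
 *
 * Note that rows are randomized within one buffer at a time, so the result is
 * a buffered (approximate) shuffle rather than a full-file permutation.
 */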