package org.deeplearning4j.examples.dataexamples; import org.datavec.api.records.reader.RecordReader; import org.datavec.api.records.reader.impl.csv.CSVRecordReader; import org.datavec.api.split.FileSplit; import org.datavec.api.util.ClassPathResource; import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator; import org.nd4j.linalg.dataset.DataSet; import org.nd4j.linalg.dataset.api.iterator.DataSetIterator; import org.nd4j.linalg.dataset.api.preprocessor.NormalizerMinMaxScaler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * This basic example demonstrates how to use the preprocessors available * This example uses the minmax scaler and will work with the 3.10 release and later * Later releases and current master will work with all other preprocessors * Created by susaneraly on 6/8/16. */ public class PreprocessNormalizerExample { private static Logger log = LoggerFactory.getLogger(PreprocessNormalizerExample.class); public static void main(String[] args) throws Exception { //========= This section is to create a dataset and a dataset iterator from the iris dataset stored in csv ============= // Refer to the csv example for details int numLinesToSkip = 0; String delimiter = ","; RecordReader recordReader = new CSVRecordReader(numLinesToSkip,delimiter); RecordReader recordReaderA = new CSVRecordReader(numLinesToSkip,delimiter); RecordReader recordReaderB = new CSVRecordReader(numLinesToSkip,delimiter); recordReader.initialize(new FileSplit(new ClassPathResource("iris.txt").getFile())); recordReaderA.initialize(new FileSplit(new ClassPathResource("iris.txt").getFile())); recordReaderB.initialize(new FileSplit(new ClassPathResource("iris.txt").getFile())); int labelIndex = 4; int numClasses = 3; DataSetIterator iteratorA = new RecordReaderDataSetIterator(recordReaderA,10,labelIndex,numClasses); DataSetIterator iteratorB = new RecordReaderDataSetIterator(recordReaderB,10,labelIndex,numClasses); DataSetIterator fulliterator = new RecordReaderDataSetIterator(recordReader,150,labelIndex,numClasses); DataSet datasetX = fulliterator.next(); DataSet datasetY = datasetX.copy(); // We now have datasetX, datasetY, iteratorA, iteratorB all of which have the iris dataset loaded // iteratorA and iteratorB have batchsize of 10. So the full dataset is 150/10 = 15 batches //===================================================================================================================== log.info("All preprocessors have to be fit to the intended metrics before they can be used to transform"); log.info("To have a transformation occur when next on an iterator is called use the 'setpreprocessor', example at the very end here\n"); log.info("This example demonstrates preprocessor use with the min max normalizer."); log.info("A standardizing preprocessor is also available."); log.info("Usage for all preprocessors are the same - fit then transform a dataset or set as preprocessor to an iterator"); log.info("Instantiating a preprocessor...\n"); NormalizerMinMaxScaler preProcessor = new NormalizerMinMaxScaler(); log.info("During 'fit' the preprocessor calculates the metrics (std dev and mean for the standardizer, min and max for minmaxscaler) from the data given"); log.info("Fit can take a dataset or a dataset iterator\n"); //Fitting a preprocessor with a dataset log.info("Fitting with a dataset..............."); preProcessor.fit(datasetX); log.info("Calculated metrics"); log.info("Min: {}",preProcessor.getMin()); log.info("Max: {}",preProcessor.getMax()); log.info("Once fit the preprocessor can be used to transform data wrt to the metrics of the dataset it was fit to"); log.info("Transform takes a dataset and modifies it in place"); log.info("Transforming a dataset, printing only the first ten....."); preProcessor.transform(datasetX); log.info("\n{}\n",datasetX.getRange(0,9)); log.info("Transformed datasets can be reverted back as well..."); log.info("Note the reverting happens in place."); log.info("Reverting back the dataset, printing only the first ten....."); preProcessor.revert(datasetX); log.info("\n{}\n",datasetX.getRange(0,9)); //Setting a preprocessor in an iterator log.info("Fitting a preprocessor with iteratorB......"); NormalizerMinMaxScaler preProcessorIter = new NormalizerMinMaxScaler(); preProcessorIter.fit(iteratorB); log.info("A fitted preprocessor can be set to an iterator so each time next is called the transform step happens automatically"); log.info("Setting a preprocessor for iteratorA"); iteratorA.setPreProcessor(preProcessorIter); while (iteratorA.hasNext()) { log.info("Calling next on iterator A that has a preprocessor on it"); log.info("\n{}",iteratorA.next()); log.info("Calling next on iterator B that has no preprocessor on it"); DataSet firstBatch = iteratorB.next(); log.info("\n{}",firstBatch); log.info("Note the data is different - iteratorA is preprocessed, iteratorB is not"); log.info("Now using transform on the next datset on iteratorB"); iteratorB.reset(); firstBatch = iteratorB.next(); preProcessorIter.transform(firstBatch); log.info("\n{}",firstBatch); log.info("Note that this now gives the same results"); break; } log.info("If you are using batches and an iterator, set the preprocessor on your iterator to transform data automatically when next is called"); log.info("Use the .transform function only if you are working with a small dataset and no iterator"); log.info("MinMax scaler also takes a min-max range to scale to."); log.info("Instantiating a new preprocessor and setting it's min-max scale to {-1,1}"); NormalizerMinMaxScaler preProcessorRange = new NormalizerMinMaxScaler(-1,1); log.info("Fitting to dataset"); preProcessorRange.fit(datasetY); log.info("First ten before transforming"); log.info("\n{}",datasetY.getRange(0,9)); log.info("First ten after transforming"); preProcessorRange.transform(datasetY); log.info("\n{}",datasetY.getRange(0,9)); } }