package aima.core.learning.neural; import java.io.BufferedReader; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import aima.core.learning.data.DataResource; import aima.core.learning.framework.DataSet; import aima.core.learning.framework.Example; import aima.core.util.Util; import aima.core.util.datastructure.Pair; /** * @author Ravi Mohan * */ public abstract class NNDataSet { /* * This class represents a source of examples to the rest of the nn * framework. Assumes only one function approximator works on an instance at * a given point in time */ /* * the parsed and preprocessed form of the dataset. */ private List<NNExample> dataset; /* * a copy from which examples are drawn. */ private List<NNExample> presentlyProcessed = new ArrayList<NNExample>(); /* * list of mean Values for all components of raw data set */ private List<Double> means; /* * list of stdev Values for all components of raw data set */ private List<Double> stdevs; /* * the normalized data set */ protected List<List<Double>> nds; /* * the column numbers of the "target" */ protected List<Integer> targetColumnNumbers; /* * population delegated to subclass because only subclass knows which * column(s) is target */ public abstract void setTargetColumns(); /* * create a normalized data "table" from the data in the file. At this * stage, the data isnot split into input pattern and tragets */ public void createNormalizedDataFromFile(String filename) throws Exception { List<List<Double>> rds = new ArrayList<List<Double>>(); // create raw data set try (BufferedReader reader = new BufferedReader(new InputStreamReader( DataResource.class.getResourceAsStream(filename + ".csv")))) { String line; while ((line = reader.readLine()) != null) { rds.add(exampleFromString(line, ",")); } } // normalize raw dataset nds = normalize(rds); } /* * create a normalized data "table" from the DataSet using numerizer. At * this stage, the data isnot split into input pattern and targets TODO * remove redundancy of recreating the target columns. the numerizer has * already isolated the targets */ public void createNormalizedDataFromDataSet(DataSet ds, Numerizer numerizer) throws Exception { List<List<Double>> rds = rawExamplesFromDataSet(ds, numerizer); // normalize raw dataset nds = normalize(rds); } /* * Gets (and removes) a random example from the 'presentlyProcessed' */ public NNExample getExampleAtRandom() { int i = Util.randomNumberBetween(0, (presentlyProcessed.size() - 1)); return presentlyProcessed.remove(i); } /* * Gets (and removes) a random example from the 'presentlyProcessed' */ public NNExample getExample(int index) { return presentlyProcessed.remove(index); } /* * check if any more examples remain to be processed */ public boolean hasMoreExamples() { return presentlyProcessed.size() > 0; } /* * check how many examples remain to be processed */ public int howManyExamplesLeft() { return presentlyProcessed.size(); } /* * refreshes the presentlyProcessed dataset so it can be used for a new * epoch of training. */ public void refreshDataset() { presentlyProcessed = new ArrayList<NNExample>(); for (NNExample e : dataset) { presentlyProcessed.add(e.copyExample()); } } /* * method called by clients to set up data set and make it ready for * processing */ public void createExamplesFromFile(String filename) throws Exception { createNormalizedDataFromFile(filename); setTargetColumns(); createExamples(); } /* * method called by clients to set up data set and make it ready for * processing */ public void createExamplesFromDataSet(DataSet ds, Numerizer numerizer) throws Exception { createNormalizedDataFromDataSet(ds, numerizer); setTargetColumns(); createExamples(); } public List<List<Double>> getNormalizedData() { return nds; } public List<Double> getMeans() { return means; } public List<Double> getStdevs() { return stdevs; } // // PRIVATE METHODS // /* * create Example instances from a normalized data "table". */ private void createExamples() { dataset = new ArrayList<NNExample>(); for (List<Double> dataLine : nds) { List<Double> input = new ArrayList<Double>(); List<Double> target = new ArrayList<Double>(); for (int i = 0; i < dataLine.size(); i++) { if (targetColumnNumbers.contains(i)) { target.add(dataLine.get(i)); } else { input.add(dataLine.get(i)); } } dataset.add(new NNExample(input, target)); } refreshDataset();// to populate the preentlyProcessed dataset } private List<List<Double>> normalize(List<List<Double>> rds) { int rawDataLength = rds.get(0).size(); List<List<Double>> nds = new ArrayList<List<Double>>(); means = new ArrayList<Double>(); stdevs = new ArrayList<Double>(); List<List<Double>> normalizedColumns = new ArrayList<List<Double>>(); // clculate means for each coponent of example data for (int i = 0; i < rawDataLength; i++) { List<Double> columnValues = new ArrayList<Double>(); for (List<Double> rawDatum : rds) { columnValues.add(rawDatum.get(i)); } double mean = Util.calculateMean(columnValues); means.add(mean); double stdev = Util.calculateStDev(columnValues, mean); stdevs.add(stdev); normalizedColumns.add(Util.normalizeFromMeanAndStdev(columnValues, mean, stdev)); } // re arrange data from columns // TODO Assert normalized columns have same size etc int columnLength = normalizedColumns.get(0).size(); int numberOfColumns = normalizedColumns.size(); for (int i = 0; i < columnLength; i++) { List<Double> lst = new ArrayList<Double>(); for (int j = 0; j < numberOfColumns; j++) { lst.add(normalizedColumns.get(j).get(i)); } nds.add(lst); } return nds; } private List<Double> exampleFromString(String line, String separator) { // assumes all values for inout and target are doubles List<Double> rexample = new ArrayList<Double>(); List<String> attributeValues = Arrays.asList(line.split(separator)); for (String valString : attributeValues) { rexample.add(Double.parseDouble(valString)); } return rexample; } private List<List<Double>> rawExamplesFromDataSet(DataSet ds, Numerizer numerizer) { // assumes all values for inout and target are doubles List<List<Double>> rds = new ArrayList<List<Double>>(); for (int i = 0; i < ds.size(); i++) { List<Double> rexample = new ArrayList<Double>(); Example e = ds.getExample(i); Pair<List<Double>, List<Double>> p = numerizer.numerize(e); List<Double> attributes = p.getFirst(); for (Double d : attributes) { rexample.add(d); } List<Double> targets = p.getSecond(); for (Double d : targets) { rexample.add(d); } rds.add(rexample); } return rds; } }