/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. S�nchez (luciano@uniovi.es) J. Alcal�-Fdez (jalcala@decsai.ugr.es) S. Garc�a (sglopez@ujaen.es) A. Fern�ndez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ package keel.Algorithms.Neural_Networks.NNEP_Common.data; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.StringTokenizer; import net.sf.jclec.util.range.Closure; import net.sf.jclec.util.range.Interval; /** * <p> * @author Written by Amelia Zafra, Sebastian Ventura (University of Cordoba) 17/07/2007 * @version 0.1 * @since JDK1.5 * </p> */ public class KeelDataSet extends FileDataset{ /** * <p> * KeelDataSet implementation (keel dataset) * </p> */ ///////////////////////////////////////////////////////////////// // --------------------------------------- Serialization constant ///////////////////////////////////////////////////////////////// /** Generated by Eclipse */ private static final long serialVersionUID = 1L; ///////////////////////////////////////////////////////////////////////// // ------------------------------------------------- Internal Variables ///////////////////////////////////////////////////////////////////////// /** The keyword used to denote the relation name */ static String KEEL_RELATION = "@relation"; /** The keyword used to denote the attribute description */ static String KEEL_ATTRIBUTE = "@attribute"; /** The keyword used to denote the start of the arff data section */ static String KEEL_DATA = "@data"; /** The keyword used to denote the output attribute */ static String KEEL_OUTPUTS = "@outputs"; /** The keyword used to denote the input attribute */ static String KEEL_INPUTS = "@inputs"; /** Symbol which represents missed values */ protected String missedValue; /** Symbol which represents commentted values */ protected String commentedValue; /** Symbol which represents the separation between values */ protected String separationValue; /** Buffer Instance */ protected String bufferInstance = new String(); ///////////////////////////////////////////////////////////////// // ------------------------------------------------ Constructor ///////////////////////////////////////////////////////////////// /** * <p> * Constructor with the filename and the specification file * </p> * @param fileName Name of the dataset file * @param specificationFile Specification file */ public KeelDataSet(String fileName, String ...specificationFile){ super(fileName); missedValue = "?"; separationValue = ","; commentedValue = "%"; } /** * <p> * Constructor without arguments * </p> */ public KeelDataSet( ){ super(); missedValue = "?"; separationValue = ","; commentedValue = "%"; } ///////////////////////////////////////////////////////////////// // ------------------------- Overwriting FileDataset methods ///////////////////////////////////////////////////////////////// /** * <p> * Open dataset * </p> * @throws DatasetException If dataset can't be opened */ @Override public void open(){ // Generate the specification from header of data source file obtainMetadata(fileName); // Initialize variables cursorPosition = 0; cursorInstance = new AbstractDataset.Instance(); // Intervals for non specified attributes extractIntervalsFromData(); } /** * <p> * Reset dataset * </p> * @throws DatasetException if a source access error occurs */ @Override public void reset(){ try { fileReader.close(); fileReader = new BufferedReader(new FileReader(new File(fileName))); //Read until finding the sentence @DATA String line = ((BufferedReader) fileReader).readLine(); while (!line.equalsIgnoreCase(KEEL_DATA)){ line = ((BufferedReader) fileReader).readLine(); } bufferInstance = ((BufferedReader) fileReader).readLine(); while(bufferInstance.startsWith(commentedValue) || bufferInstance.equalsIgnoreCase("")){ bufferInstance = ((BufferedReader) fileReader).readLine(); } cursorPosition = 0; } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (Exception e){ e.printStackTrace(); } } /** * <p> * Return the next instance * </p> * @return The next instance * @throws DatasetException if a source access error occurs */ @Override public boolean next() throws DatasetException { if(bufferInstance != null){ try{ cursorPosition++; //Get the attributes of this instance StringTokenizer token = new StringTokenizer(bufferInstance, separationValue); int numAttributes = 0; //AbstractDataset.Instance instance = new AbstractDataset.Instance(); while(token.hasMoreTokens()){ IAttribute attribute = metadata.getAttribute(numAttributes); String tok = token.nextToken().trim(); if(tok.equals("<null>")) cursorInstance.setValue(numAttributes, Double.NaN); else{ double value = attribute.parse(tok); cursorInstance.setValue(numAttributes, value); } numAttributes++; } //cursorInstance = instance; prepareNextInstance(); }catch(Exception e){ e.printStackTrace();} return true; } else return false; } /** * <p> * Returns cursor instance * </p> * @return Actual instance (if exists) * @throws DatasetException if a source access error occurs */ @Override public AbstractDataset.Instance read() throws DatasetException { return cursorInstance; } /** * <p> * Close dataset * </p> * @throws DatasetException If dataset can't be closed */ @Override public void close() throws DatasetException { try { fileReader.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } ///////////////////////////////////////////////////////////////// // ----------------------------------------------------- Methods ///////////////////////////////////////////////////////////////// /** * <p> * Generate the dataset specification * </p> * @param file Name of data source file */ private void obtainMetadata(String file){ File f1 = new File(file); metadata = new Metadata(); try { fileReader = new BufferedReader(new FileReader(f1)); //Read until finding the sentence @DATA String line = ((BufferedReader) fileReader).readLine(); int indexAttribute = 0; line = line.replace("real[","real ["); line = line.replace("integer[","integer ["); line = line.replace("{"," {"); StringTokenizer elementLine = new StringTokenizer(line); String element = elementLine.nextToken(); while (!element.equalsIgnoreCase(KEEL_DATA)){ if(element.equalsIgnoreCase(KEEL_ATTRIBUTE)){ //The next attribute indexAttribute++; String name = elementLine.nextToken(); String type = elementLine.nextToken(); if(type.equalsIgnoreCase("REAL") || type.equalsIgnoreCase("INTEGER")){ addAttributeToSpecification(type, line, name); } else addAttributeToSpecification("STRING", line, name); } if(element.equalsIgnoreCase(KEEL_RELATION)){ setName(elementLine.nextToken()); } //Next line of the file line = ((BufferedReader) fileReader).readLine(); while(line.startsWith(commentedValue) || line.equalsIgnoreCase("")) line = ((BufferedReader) fileReader).readLine(); line = line.replace("real[","real ["); line = line.replace("integer[","integer ["); line = line.replace("{"," {"); elementLine = new StringTokenizer(line); element = elementLine.nextToken(); } bufferInstance = ((BufferedReader) fileReader).readLine(); while(bufferInstance.startsWith(commentedValue) || bufferInstance.equalsIgnoreCase("")){ bufferInstance = ((BufferedReader) fileReader).readLine(); } } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (Exception e){ e.printStackTrace(); } } /** * <p> * Store the next instance in bufferInstance * </p> */ private void prepareNextInstance(){ try { //Get the next instance String lineInstance = ((BufferedReader) fileReader).readLine(); while(lineInstance.startsWith(commentedValue) || lineInstance.equalsIgnoreCase("")){ lineInstance = ((BufferedReader) fileReader).readLine(); } bufferInstance = lineInstance; }catch(Exception e){ bufferInstance = null; } } /** * <p> * Add new attribute to the dataset specification * </p> * @param type Attribute type * @param interval Intervals value * @param name Attribute name */ private void addAttributeToSpecification(String type, String interval, String name){ // If the attribute is numerical if(type.equalsIgnoreCase("REAL")){ RealNumericalAttribute attribute = new RealNumericalAttribute(); attribute.setName(name); // If an interval is specified if(interval.contains("[") && interval.contains("]")){ // Obtain the intervals int minIndex = interval.indexOf("["); int maxIndex = interval.indexOf("]"); interval = interval.substring(minIndex+1, maxIndex); if(minIndex < maxIndex){ StringTokenizer tkInterval = new StringTokenizer(interval, ","); Interval intervals = new Interval(); intervals.setClosure(Closure.ClosedClosed); intervals.setLeft(Double.valueOf((String) tkInterval.nextElement())); intervals.setRight(Double.valueOf((String) tkInterval.nextToken())); attribute.addInterval(intervals); //Add new attribute to the specification metadata.addAttribute(attribute); } } // If the interval is not specified it must be extracted from data else { Interval intervals = new Interval(); intervals.setClosure(Closure.ClosedClosed); intervals.setLeft(Double.MIN_VALUE); intervals.setRight(Double.MAX_VALUE); attribute.addInterval(intervals); //Add new attribute to the specification metadata.addAttribute(attribute); } } else if(type.equalsIgnoreCase("INTEGER")){ IntegerNumericalAttribute attribute = new IntegerNumericalAttribute(); attribute.setName(name); // If an interval is specified if(interval.contains("[") && interval.contains("]")){ // Obtain the intervals int minIndex = interval.indexOf("["); int maxIndex = interval.indexOf("]"); interval = interval.substring(minIndex+1, maxIndex); if(minIndex < maxIndex){ StringTokenizer tkInterval = new StringTokenizer(interval, ","); net.sf.jclec.util.intset.Interval intervals = new net.sf.jclec.util.intset.Interval(); intervals.setClosure(net.sf.jclec.util.intset.Closure.ClosedClosed); intervals.setLeft(Integer.valueOf((String) tkInterval.nextElement())); intervals.setRight(Integer.valueOf((String) tkInterval.nextToken().trim())); attribute.addInterval(intervals); //Add new attribute to the specification metadata.addAttribute(attribute); } } // If the interval is not specified it must be extracted from data else { net.sf.jclec.util.intset.Interval intervals = new net.sf.jclec.util.intset.Interval(); intervals.setClosure(net.sf.jclec.util.intset.Closure.ClosedClosed); intervals.setLeft(Integer.MIN_VALUE); intervals.setRight(Integer.MAX_VALUE); attribute.addInterval(intervals); //Add new attribute to the specification metadata.addAttribute(attribute); } } else { //Obtain the categorical values int minIndex = interval.indexOf("{"); int maxIndex = interval.indexOf("}"); interval = interval.substring(minIndex+1, maxIndex); if(minIndex < maxIndex){ CategoricalAttribute attribute = new CategoricalAttribute(); attribute.setName(name); StringTokenizer categories = new StringTokenizer(interval, ","); while(categories.hasMoreTokens()) attribute.addValue(categories.nextToken().trim()); //Add new attribute to the specification metadata.addAttribute(attribute); } } } /** * <p> * Extract the interval of a Real or Integer Attribute directly from data * </p> * @return String Interval of the attribute */ private void extractIntervalsFromData(){ try { double[] min = new double[metadata.numberOfAttributes()]; double[] max = new double[metadata.numberOfAttributes()]; boolean[] nonSpecified = new boolean[metadata.numberOfAttributes()]; for(int i=0; i<metadata.numberOfAttributes(); i++){ min[i] = Double.MAX_VALUE; max[i] = Double.MIN_VALUE; if(metadata.getAttribute(i).getType() == AttributeType.DoubleNumerical){ RealNumericalAttribute attribute = (RealNumericalAttribute) metadata.getAttribute(i); nonSpecified[i] = attribute.intervalValues().getLeft() == Double.MIN_VALUE && attribute.intervalValues().getRight() == Double.MAX_VALUE; } else if(metadata.getAttribute(i).getType() == AttributeType.IntegerNumerical){ IntegerNumericalAttribute attribute = (IntegerNumericalAttribute) metadata.getAttribute(i); nonSpecified[i] = attribute.intervalValues().getLeft() == Integer.MIN_VALUE && attribute.intervalValues().getRight() == Integer.MAX_VALUE; } } while(this.next()){ IDataset.IInstance instancia = this.read(); // Extract interval for non specified attributes for(int i=0; i<metadata.numberOfAttributes(); i++){ if(nonSpecified[i]){ double value = instancia.getValue(i); if(value < min[i]) min[i] = value; if(value > max[i]) max[i] = value; } } } for(int i=0; i<metadata.numberOfAttributes(); i++){ if(nonSpecified[i]){ if(metadata.getAttribute(i).getType() == AttributeType.DoubleNumerical){ RealNumericalAttribute attribute = (RealNumericalAttribute) metadata.getAttribute(i); attribute.intervalValues().setLeft(min[i]); attribute.intervalValues().setRight(max[i]); } else if(metadata.getAttribute(i).getType() == AttributeType.IntegerNumerical){ IntegerNumericalAttribute attribute = (IntegerNumericalAttribute) metadata.getAttribute(i); attribute.intervalValues().setLeft((int)min[i]); attribute.intervalValues().setRight((int)max[i]); } } } this.reset(); }catch (DatasetException e) { e.printStackTrace(); } } }