/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. S�nchez (luciano@uniovi.es) J. Alcal�-Fdez (jalcala@decsai.ugr.es) S. Garc�a (sglopez@ujaen.es) A. Fern�ndez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ package keel.Dataset; import java.util.*; import java.io.*; /** * <p> * <b> InstanceSet </b> * </p> * * The instance set class mantains a pool of instances read from the keel * formated data file. It provides a set of methods that permit to get * each instance, get the whole set of instances, get the number of instances, * etc. * * @author Albert Orriols Puig * @version keel0.1 * @see Instance * @see Attributes */ public class InstanceSet { ///////////////////////////////////////////////////////////////////////////// //////////////// ATTRIBUTES OF THE INSTANCESET CLASS //////////////////////// ///////////////////////////////////////////////////////////////////////////// /** * Attribute where all the instances of the DB are stored. */ private Instance[] instanceSet; /** * String where the header of the file is stored. */ private String header; /** * String where only the attributes definition header is stored */ private String attHeader; /** * Object that collects all the errors happened while reading the test and * train datasets. */ static FormatErrorKeeper errorLogger = new FormatErrorKeeper(); /** * This object contains the attributes definitions */ private InstanceAttributes attributes; /** * It indicates if the attributes has not be stored as non-static, permiting * the load of different datasets */ private boolean storeAttributesAsNonStatic; /** * It indicates that the output attribute has been infered as the last one */ private boolean outputInfered; ///////////////////////////////////////////////////////////////////////////// ///////////////// METHODS OF THE INSTANCESET CLASS ////////////////////////// ///////////////////////////////////////////////////////////////////////////// /** * It instances a new instance of InstanceSet */ public InstanceSet(){ storeAttributesAsNonStatic = false; attributes = null; }//end InstanceSet /** * InstanceSet * * This constructor permit define if the attribute's definition need to be * stored as non-static (nonStaticAttributes = true). Otherwise, if * nonStaticAttributes = false, using this constructor is equivalent to use * the constructor by default. */ public InstanceSet (boolean nonStaticAttributes ){ storeAttributesAsNonStatic = nonStaticAttributes; //if ( storeAttributesAsNonStatic ) Attributes.clearAll(); attributes = null; }//end InstanceSet /** * Creates a new InstanceSet with the header and Instances from the passed object * It performs a deep (new allocated) copy. * @param is Original InstanceSet */ public InstanceSet(InstanceSet is){ this.instanceSet = Arrays.copyOf(is.instanceSet, is.instanceSet.length); this.header = new String(is.header); this.attHeader = new String(is.attHeader); this.attributes = new InstanceAttributes(is.attributes); this.storeAttributesAsNonStatic = is.storeAttributesAsNonStatic; } /** * setAttributesAsNonStatic * * It stores the static-defined attributes in the class Attributes as * non static in the object attributes. After this it does not remove the * static-definition of the Attributes; this is in that way to permit to * call this functions for differents datasets from the same problem, such * as, a train dataset and the correspondent test dataset. */ public void setAttributesAsNonStatic (){ attributes = new InstanceAttributes(); attributes.copyStaticAttributes(); storeAttributesAsNonStatic = true; }//end setAttributesAsNonStatic /** * getAttributeDefinitions * * It does return the definition of the attibutes contained in the dataset. * * @return InstanceAttributes contains the attribute's definitions. */ public InstanceAttributes getAttributeDefinitions (){ return attributes; }//end InstanceAttributes /** * This method reads all the information in a DB and load it to memory. * @param fileName is the database file name. * @param isTrain is a flag that indicate if the database is for a train or for a test. * @throws DatasetException if there is any semantical error in the input file. * @throws HeaderFormatException if there is any lexical or sintactical error in the * header of the input file */ public void readSet( String fileName,boolean isTrain ) throws DatasetException, HeaderFormatException{ String line; System.out.println ("Opening the file: "+fileName+"."); //Parsing the header of the DB. errorLogger = new FormatErrorKeeper(); //Declaring an instance parser InstanceParser parser = new InstanceParser( fileName, isTrain ); // Reading information in the header, i.e., @relation, @attribute, @inputs and @outputs parseHeader ( parser, isTrain ); System.out.println ( " The number of output attributes is: " + Attributes.getOutputNumAttributes() ); //The attributes statistics are init if we are in train mode. if (isTrain && Attributes.getOutputNumAttributes() == 1){ Attributes.initStatistics(); } //A temporal vector is used to store the instances read. System.out.println ( "\n\n > Reading the data "); Vector tempSet=new Vector(1000,100000); while((line=parser.getLine())!=null) { //System.out.println (" > Data line: " + line ); tempSet.addElement( new Instance( line, isTrain, tempSet.size()) ); } //The vector of instances is converted to an array of instances. int sizeInstance=tempSet.size(); System.out.println (" > Number of instances read: "+tempSet.size()); instanceSet=new Instance[sizeInstance]; for (int i=0; i<sizeInstance; i++) { instanceSet[i]=(Instance)tempSet.elementAt(i); } //System.out.println("After converting all instances"); //System.out.println("The error logger has any error: "+errorLogger.getNumErrors()); if (errorLogger.getNumErrors() > 0){ System.out.println ("There has been "+errorLogger.getAllErrors().size()+ " errors in the Dataset format."); for (int k=0;k<errorLogger.getNumErrors();k++){ errorLogger.getError(k).print(); } throw new DatasetException("There has been "+errorLogger.getAllErrors().size()+ " errors in the Dataset format", errorLogger.getAllErrors()); } System.out.println ("\n > Finishing the statistics: (isTrain)"+isTrain+", (# out attributes)"+Attributes.getOutputNumAttributes()); //If being on a train dataset, the statistics are finished if (isTrain && Attributes.getOutputNumAttributes() == 1){ Attributes.finishStatistics(); } //close the stream parser.close(); System.out.println (" >> File LOADED CORRECTLY!!"); }//end of InstanceSet constructor. /** * It reads the information in the header of the file. * It reads relation's name, attributes' names, and inputs and outputs. * * @param parser is the parser of the data set * @param isTrain is a boolean indicating if this is a train set (and so * parameters information must be read) or a test set (parameters information * has not to be read). */ public void parseHeader ( InstanceParser parser, boolean isTrain ){ // 1. Declaration of variables Vector inputAttrNames = new Vector(); Vector outputAttrNames = new Vector(); boolean inputsDef = false; boolean outputsDef = false; String line, aux; header = ""; int attCount = 0, lineCount = 0; attHeader = null; while ( !(line = parser.getLine().trim()).equalsIgnoreCase("@data") ){ line = line.trim(); //System.out.println (" > Line read: " + line +"." ); lineCount ++; if ( line.toLowerCase().indexOf("@relation") != -1 ){ if ( isTrain ) Attributes.setRelationName ( line.replaceAll("@relation","") ); } if ( line.toLowerCase().indexOf("@attribute") != -1 ){ if (isTrain) insertAttribute ( line ); attCount ++; } if ( line.toLowerCase().indexOf("@inputs") != -1 ){ attHeader = header; inputsDef = true; aux = line.substring( 8 ); if ( isTrain ) insertInputOutput ( aux, lineCount, inputAttrNames, "inputs", isTrain ); } if ( line.toLowerCase().indexOf("@outputs") != -1 ){ if ( attHeader == null ) attHeader = header; outputsDef = true; //System.out.println ( " >>> Defining the output !!!"); aux = line.substring( 8 ); if ( isTrain ) insertInputOutput ( aux, lineCount, outputAttrNames, "outputs", isTrain ); System.out.println (" >> Size of the output is: "+ outputAttrNames.size() ); } header += line + "\n"; } if ( attHeader == null ) attHeader = header; processInputsAndOutputs ( isTrain, inputsDef, outputsDef, outputAttrNames, inputAttrNames ); }//end headerParse void insertAttribute ( String line ){ int indexL, indexR; String type; //Treating string and declaring a string tokenizer line.replace ("{"," {"); //line.replace ("["," ["); //System.out.println (" > Processing line: "+ line ); StringTokenizer st = new StringTokenizer ( line, " [{\t" ); //Disregarding the first token. It is @attribute st.nextToken(); Attribute at = new Attribute (); at.setName ( st.nextToken().trim() ); //System.out.println ( " > Attribute name: "+ at.getName() ); //Next action depends on the type of attribute: continuous or nominal if ( !st.hasMoreTokens() ) { // Parsing a nominal attribute with no definition of values //System.out.println (" > Parsing nominal attribute without values "); at.setType( Attribute.NOMINAL ); } else if ( line.indexOf("{") != -1 ) { // Parsing a nominal attribute //System.out.println (" > Parsing nominal attribute with values: "+line ); at.setType( Attribute.NOMINAL ); at.setFixedBounds ( true ); indexL = line.indexOf ("{"); indexR = line.indexOf ("}"); //System.out.println ( " > The Nominal values are: " + line.substring( indexL+1, indexR) ); StringTokenizer st2 = new StringTokenizer ( line.substring( indexL+1, indexR ), "," ); while ( st2.hasMoreTokens() ){ at.addNominalValue ( st2.nextToken().trim() ); } } else { //Parsing an integer or real type = st.nextToken().trim(); //System.out.println (" > Parsing "+ type + " attributes"); if ( type.equalsIgnoreCase("integer") ) at.setType( Attribute.INTEGER ); if ( type.equalsIgnoreCase("real") ) at.setType( Attribute.REAL ); indexL = line.indexOf ("["); indexR = line.indexOf ("]"); if ( indexL != -1 && indexR != - 1 ){ //System.out.println ( " > The real values are: " + line.substring( indexL+1, indexR) ); StringTokenizer st2 = new StringTokenizer ( line.substring( indexL+1, indexR ), "," ); double min = Double.parseDouble ( st2.nextToken().trim() ); double max = Double.parseDouble ( st2.nextToken().trim() ); at.setBounds ( min, max ); } } Attributes.addAttribute ( at ); }//end insertAttribute void insertInputOutput ( String line, int lineCount, Vector collection, String type, boolean isTrain ){ String attName; System.out.println( " >> processing: " + line ); //Declaring StringTokenizer StringTokenizer st = new StringTokenizer ( line, "," ); while ( st.hasMoreTokens() ) { attName = st.nextToken().trim(); if ( Attributes.getAttribute ( attName ) == null ) { // If this attribute has not been declared, generate error ErrorInfo er = new ErrorInfo( ErrorInfo.InputTestAttributeNotDefined, 0, lineCount, 0, 0, isTrain, ( "The attribute " + attName + " defined in @" + type + " in test, it has not been defined in @inputs in its train dataset. It will be ignored")); InstanceSet.errorLogger.setError(er); } else { System.out.println (" > " + type + " attribute considered: " + attName + "." ); collection.add ( attName ); } } }//end insertInputOutput void processInputsAndOutputs( boolean isTrain, boolean inputsDef, boolean outputsDef, Vector outputAttrNames, Vector inputAttrNames ){ //Afteer parsing the header, the inputs and the outputs are prepared. System.out.println (" >> Processing inputs and outputs"); outputInfered=false; if ( isTrain ){ if (!inputsDef && !outputsDef){ outputAttrNames.add( Attributes.getAttribute( Attributes.getNumAttributes()-1).getName() ); inputAttrNames = Attributes.getAttributesExcept(outputAttrNames); outputInfered=true; }else if (!inputsDef && outputsDef){ inputAttrNames = Attributes.getAttributesExcept(outputAttrNames); }else if (inputsDef && !outputsDef){ outputAttrNames = Attributes.getAttributesExcept(inputAttrNames); outputInfered=true; } Attributes.setOutputInputAttributes(inputAttrNames, outputAttrNames); } }//end of processInputsAndOutputs /** * Test if the output attribute has been infered. * @return True if the output attribute has been infered. False if not. */ public boolean isOutputInfered(){ return outputInfered; } /** * It returns the number of instances. * @return an int with the number of instances. */ public int getNumInstances() { if(instanceSet!=null) return instanceSet.length; else return 0; }//end numInstances /** * Gets the instance located at the cursor position. * @return the instance located at the cursor position. */ public Instance getInstance(int whichInstance) { if (whichInstance <0 || whichInstance>= instanceSet.length) return null; return instanceSet[whichInstance]; }//end getInstance /** * It returns all the instances of the class. * @return Instance[] with all the instances of the class. */ public Instance[] getInstances() { return instanceSet; }//end getInstances /** * Returns the value of an integer or a real input attribute of an instance * in the instanceSet. * @param whichInst is the position of the instance. * @param whichAttr is the position of the input attribute. * @return a String with the numeric value. * @throws ArrayIndexOutOfBoundsException If the index is out of the instance * set size. */ public double getInputNumericValue(int whichInst, int whichAttr) throws ArrayIndexOutOfBoundsException{ if (whichInst<0 || whichInst>= instanceSet.length) throw new ArrayIndexOutOfBoundsException("You are trying to access to "+whichInst+" instance and there are only "+instanceSet.length+"."); return instanceSet[whichInst].getInputRealValues(whichAttr); }//end getInputNumericValue /** * Returns the value of an integer or a real output attribute of an instance * in the instanceSet. * @param whichInst is the position of the instance. * @param whichAttr is the position of the output attribute. * @return a String with the numeric value. * @throws ArrayIndexOutOfBoundsException If the index is out of the instance * set size. */ public double getOutputNumericValue(int whichInst, int whichAttr) throws ArrayIndexOutOfBoundsException{ if (whichInst<0 || whichInst>= instanceSet.length) throw new ArrayIndexOutOfBoundsException("You are trying to access to "+whichInst+" instance and there are only "+instanceSet.length+"."); return instanceSet[whichInst].getOutputRealValues(whichAttr); }//end getOutputNumericValue /** * Returns the value of a nominal input attribute of an instance in the * instanceSet. * @param whichInst is the position of the instance. * @param whichAttr is the position of the input attribute. * @return a String with the nominal value. * @throws ArrayIndexOutOfBoundsException If the index is out of the instance * set size. */ public String getInputNominalValue(int whichInst, int whichAttr) throws ArrayIndexOutOfBoundsException{ if (whichInst<0 || whichInst>= instanceSet.length) throw new ArrayIndexOutOfBoundsException("You are trying to access to "+whichInst+" instance and there are only "+instanceSet.length+"."); return instanceSet[whichInst].getInputNominalValues(whichAttr); }//end getInputNominalValue /** * Returns the value of a nominal output attribute of an instance in the * instanceSet. * @param whichInst is the position of the instance. * @param whichAttr is the position of the output attribute. * @return a String with the nominal value. * @throws ArrayIndexOutOfBoundsException If the index is out of the instance * set size. */ public String getOutputNominalValue(int whichInst, int whichAttr) throws ArrayIndexOutOfBoundsException{ if (whichInst<0 || whichInst>= instanceSet.length) throw new ArrayIndexOutOfBoundsException("You are trying to access to "+whichInst+" instance and there are only "+instanceSet.length+"."); return instanceSet[whichInst].getOutputNominalValues(whichAttr); }//end getOutputNumericValue /** * It does remove the instance i from the instanceSet. * @param instNum is the instance removed from the instanceSet. */ public void removeInstance(int instNum){ if (instNum<0 || instNum>=instanceSet.length) return; Instance[] aux = new Instance[instanceSet.length - 1]; int add = 0; for (int i=0; i<instanceSet.length; i++){ if (instNum == i) add=1; else{ aux[i-add] = instanceSet[i]; } } //Copying the auxiliar to the instanceSet variable instanceSet = aux; aux = null; //avoiding memory leaks (not necessary in this case) }//end removeInstance /** * It does remove an attribute. To remove an attribute, the train and the * test sets have to be passed to mantain the coherence of the system. * Otherwise, only the attribute of the train set would be removed, leaving * inconsistent the instances of the test set, because of having one extra * attribute inexistent anymore. * * @param tSet is the test set. * @param inputAtt is a boolean that is true when the attribute that is * wanted to be removed is an input attribute. * @param whichAtt is a integer that indicate the position of the attriubte * to be deleted. * @return a boolean indicating if the attribute has been deleted */ public boolean removeAttribute(InstanceSet tSet, boolean inputAtt, int whichAtt){ Attribute attToDel=null; //Getting a reference to the attribute to del if (inputAtt){ if ( storeAttributesAsNonStatic && attributes != null ) attToDel = (Attribute)attributes.getInputAttribute(whichAtt); else attToDel = (Attribute)Attributes.getInputAttribute(whichAtt); } else{ if ( storeAttributesAsNonStatic && attributes != null ) attToDel = (Attribute)attributes.getOutputAttribute(whichAtt); else attToDel = (Attribute)Attributes.getOutputAttribute(whichAtt); } if ( storeAttributesAsNonStatic && attributes != null ){ System.out.println ("Removing the attribute"); if (!attributes.removeAttribute(inputAtt,whichAtt) || (tSet != null && ! tSet.attributes.removeAttribute(inputAtt,whichAtt)) ) return false; } else{ if (!Attributes.removeAttribute(inputAtt,whichAtt)) return false; } for (int i=0; i<instanceSet.length; i++){ if ( storeAttributesAsNonStatic && attributes != null ){ instanceSet[i].removeAttribute(attributes, attToDel, inputAtt, whichAtt); } else{ instanceSet[i].removeAttribute(attToDel, inputAtt, whichAtt); } } if (tSet != null) for (int i=0; i<tSet.instanceSet.length; i++){ if ( storeAttributesAsNonStatic && attributes != null ) tSet.instanceSet[i].removeAttribute(attributes,attToDel, inputAtt, whichAtt); else tSet.instanceSet[i].removeAttribute(attToDel, inputAtt, whichAtt); } return true; }//end removeAttribute /** * It returns the header. * @return a String with the header of the file. */ public String getHeader() { return header; }//end getHeader public void setHeader(String copia) { header = new String(copia); }//end getHeader public String getAttHeader() { return this.attHeader; }//end getHeader public void setAttHeader(String copia) { attHeader = new String(copia); }//end getHeader /** * It does return a new header (not necessary the same header as the * input file one). It only includes the valid attributes, those ones * defined in @inputs and @outputs (or taken as that role following the * keel format specification). * @return a String with the new header */ public String getNewHeader(){ String line = ""; Attribute []attrs = null; //Getting the relation name and the attributes if ( storeAttributesAsNonStatic && attributes != null ) { line = "@relation "+attributes.getRelationName()+"\n"; attrs = attributes.getInputAttributes(); } else{ line = "@relation "+Attributes.getRelationName()+"\n"; attrs = Attributes.getInputAttributes(); } for (int i=0; i<attrs.length; i++){ line += attrs[i].toString()+"\n"; } //Gettin all the outputs attributes if ( storeAttributesAsNonStatic && attributes != null ){ attrs = attributes.getOutputAttributes(); line += attrs[0].toString()+"\n"; //Getting @inputs and @outputs line += attributes.getInputHeader()+"\n"; line += attributes.getOutputHeader()+"\n"; } else{ attrs = Attributes.getOutputAttributes(); line += attrs[0].toString()+"\n"; //Getting @inputs and @outputs line += Attributes.getInputHeader()+"\n"; line += Attributes.getOutputHeader()+"\n"; } return line; }//end getNewHeader /** * It does return the original header definiton but * without @input and @output in there */ public String getOriginalHeaderWithoutInOut(){ String line = ""; Attribute []attrs = null; //Getting the relation name and the attributes if ( storeAttributesAsNonStatic && attributes != null ){ line = "@relation "+attributes.getRelationName()+"\n"; attrs = attributes.getAttributes(); } else{ line = "@relation "+Attributes.getRelationName()+"\n"; attrs = Attributes.getAttributes(); } for (int i=0; i<attrs.length; i++){ line += attrs[i].toString()+"\n"; } return line; }//end getOriginalHeaderWithoutInOut /** * It prints the dataset to the specified PrintWriter * @param out is the PrintWriter where to print */ public void print (PrintWriter out){ for (int i=0; i<instanceSet.length; i++){ out.println ("> Instance "+i+":"); if ( storeAttributesAsNonStatic && attributes != null ) instanceSet[i].print(attributes, out); else instanceSet[i].print(out); } }//end print /** * It prints the dataset to the specified PrintWriter. * The order of the attributes is the same as in the * original file * @param out is the PrintWriter where to print * @param printInOut indicates if the @inputs (1), @outputs(2), * both of them (3) or any (0) has to be printed */ public void printAsOriginal (PrintWriter out, int printInOut){ /*Printing the header as the original one*/ out.println( header ); if ( storeAttributesAsNonStatic && attributes != null ){ if(printInOut==1 || printInOut==3) out.println( attributes.getInputHeader() ); if(printInOut==2 || printInOut==3) out.println( attributes.getOutputHeader() ); } else{ if(printInOut==1 || printInOut==3) out.println( Attributes.getInputHeader() ); if(printInOut==2 || printInOut==3) out.println( Attributes.getOutputHeader() ); } out.print("@data"); for (int i=0; i<instanceSet.length;i++){ out.println(); if ( storeAttributesAsNonStatic && attributes != null ) instanceSet[i].printAsOriginal( attributes, out ); else instanceSet[i].printAsOriginal( out ); } }//end printAsOriginal public void print (){ System.out.println ("------------- ATTRIBUTES --------------"); if ( storeAttributesAsNonStatic && attributes != null ){ attributes.print(); } else{ Attributes.print(); } System.out.println ("-------------- INSTANCES --------------"); for (int i=0; i<instanceSet.length; i++){ System.out.print ("\n> Instance "+i+":"); if ( storeAttributesAsNonStatic && attributes != null ){ instanceSet[i].print( attributes ); } else instanceSet[i].print(); } }//end print /** * Remove all instances from this InstanceSet */ public void clearInstances(){ instanceSet = null; } /** * It adds the passed instance at the end of the present InstanceSet * @param inst the instance to be added */ public void addInstance(Instance inst){ int i = 0; Instance nVector[]; if(instanceSet!=null){ nVector = new Instance[instanceSet.length+1]; for(i=0;i<instanceSet.length;i++){ nVector[i] = instanceSet[i]; } }else nVector = new Instance[1]; nVector[i] = inst; instanceSet = nVector; } /** * Clear the non-Static attributes. The static class Attributes is not modified. */ public void clearNonStaticAttributes(){ attributes = null; } /** * Appends the given attribute to the non-static list of the current InstanceSet * @param at The Attribute to be Appended */ public void addAttribute(Attribute at){ if(attributes==null) attributes = new InstanceAttributes(); attributes.addAttribute(at); } }//end of InstanceSet Class.