/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.Neural_Networks.NNEP_Common.data;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.StringTokenizer;
import net.sf.jclec.util.range.Closure;
import net.sf.jclec.util.range.Interval;
/**
* <p>
* @author Written by Amelia Zafra, Sebastian Ventura (University of Cordoba) 17/07/2007
* @version 0.1
* @since JDK1.5
* </p>
*/
public class KeelDataSet extends FileDataset{
/**
* <p>
* KeelDataSet implementation (keel dataset)
* </p>
*/
/////////////////////////////////////////////////////////////////
// --------------------------------------- Serialization constant
/////////////////////////////////////////////////////////////////
/** Generated by Eclipse */
private static final long serialVersionUID = 1L;
/////////////////////////////////////////////////////////////////////////
// ------------------------------------------------- Internal Variables
/////////////////////////////////////////////////////////////////////////
/** The keyword used to denote the relation name */
static String KEEL_RELATION = "@relation";
/** The keyword used to denote the attribute description */
static String KEEL_ATTRIBUTE = "@attribute";
/** The keyword used to denote the start of the arff data section */
static String KEEL_DATA = "@data";
/** The keyword used to denote the output attribute */
static String KEEL_OUTPUTS = "@outputs";
/** The keyword used to denote the input attribute */
static String KEEL_INPUTS = "@inputs";
/** Symbol which represents missed values */
protected String missedValue;
/** Symbol which represents commentted values */
protected String commentedValue;
/** Symbol which represents the separation between values */
protected String separationValue;
/** Buffer Instance */
protected String bufferInstance = new String();
/////////////////////////////////////////////////////////////////
// ------------------------------------------------ Constructor
/////////////////////////////////////////////////////////////////
/**
* <p>
* Constructor with the filename and the specification file
* </p>
* @param fileName Name of the dataset file
* @param specificationFile Specification file
*/
public KeelDataSet(String fileName, String ...specificationFile){
super(fileName);
missedValue = "?";
separationValue = ",";
commentedValue = "%";
}
/**
* <p>
* Constructor without arguments
* </p>
*/
public KeelDataSet( ){
super();
missedValue = "?";
separationValue = ",";
commentedValue = "%";
}
/////////////////////////////////////////////////////////////////
// ------------------------- Overwriting FileDataset methods
/////////////////////////////////////////////////////////////////
/**
* <p>
* Open dataset
* </p>
* @throws DatasetException If dataset can't be opened
*/
@Override
public void open(){
// Generate the specification from header of data source file
obtainMetadata(fileName);
// Initialize variables
cursorPosition = 0;
cursorInstance = new AbstractDataset.Instance();
// Intervals for non specified attributes
extractIntervalsFromData();
}
/**
* <p>
* Reset dataset
* </p>
* @throws DatasetException if a source access error occurs
*/
@Override
public void reset(){
try {
fileReader.close();
fileReader = new BufferedReader(new FileReader(new File(fileName)));
//Read until finding the sentence @DATA
String line = ((BufferedReader) fileReader).readLine();
while (!line.equalsIgnoreCase(KEEL_DATA)){
line = ((BufferedReader) fileReader).readLine();
}
bufferInstance = ((BufferedReader) fileReader).readLine();
while(bufferInstance.startsWith(commentedValue) || bufferInstance.equalsIgnoreCase("")){
bufferInstance = ((BufferedReader) fileReader).readLine();
}
cursorPosition = 0;
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (Exception e){
e.printStackTrace();
}
}
/**
* <p>
* Return the next instance
* </p>
* @return The next instance
* @throws DatasetException if a source access error occurs
*/
@Override
public boolean next() throws DatasetException {
if(bufferInstance != null){
try{
cursorPosition++;
//Get the attributes of this instance
StringTokenizer token = new StringTokenizer(bufferInstance, separationValue);
int numAttributes = 0;
//AbstractDataset.Instance instance = new AbstractDataset.Instance();
while(token.hasMoreTokens()){
IAttribute attribute = metadata.getAttribute(numAttributes);
String tok = token.nextToken().trim();
if(tok.equals("<null>"))
cursorInstance.setValue(numAttributes, Double.NaN);
else{
double value = attribute.parse(tok);
cursorInstance.setValue(numAttributes, value);
}
numAttributes++;
}
//cursorInstance = instance;
prepareNextInstance();
}catch(Exception e){ e.printStackTrace();}
return true;
}
else
return false;
}
/**
* <p>
* Returns cursor instance
* </p>
* @return Actual instance (if exists)
* @throws DatasetException if a source access error occurs
*/
@Override
public AbstractDataset.Instance read() throws DatasetException {
return cursorInstance;
}
/**
* <p>
* Close dataset
* </p>
* @throws DatasetException If dataset can't be closed
*/
@Override
public void close() throws DatasetException {
try {
fileReader.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
/////////////////////////////////////////////////////////////////
// ----------------------------------------------------- Methods
/////////////////////////////////////////////////////////////////
/**
* <p>
* Generate the dataset specification
* </p>
* @param file Name of data source file
*/
private void obtainMetadata(String file){
File f1 = new File(file);
metadata = new Metadata();
try {
fileReader = new BufferedReader(new FileReader(f1));
//Read until finding the sentence @DATA
String line = ((BufferedReader) fileReader).readLine();
int indexAttribute = 0;
line = line.replace("real[","real [");
line = line.replace("integer[","integer [");
line = line.replace("{"," {");
StringTokenizer elementLine = new StringTokenizer(line);
String element = elementLine.nextToken();
while (!element.equalsIgnoreCase(KEEL_DATA)){
if(element.equalsIgnoreCase(KEEL_ATTRIBUTE)){
//The next attribute
indexAttribute++;
String name = elementLine.nextToken();
String type = elementLine.nextToken();
if(type.equalsIgnoreCase("REAL") || type.equalsIgnoreCase("INTEGER")){
addAttributeToSpecification(type, line, name);
}
else
addAttributeToSpecification("STRING", line, name);
}
if(element.equalsIgnoreCase(KEEL_RELATION)){
setName(elementLine.nextToken());
}
//Next line of the file
line = ((BufferedReader) fileReader).readLine();
while(line.startsWith(commentedValue) || line.equalsIgnoreCase(""))
line = ((BufferedReader) fileReader).readLine();
line = line.replace("real[","real [");
line = line.replace("integer[","integer [");
line = line.replace("{"," {");
elementLine = new StringTokenizer(line);
element = elementLine.nextToken();
}
bufferInstance = ((BufferedReader) fileReader).readLine();
while(bufferInstance.startsWith(commentedValue) || bufferInstance.equalsIgnoreCase("")){
bufferInstance = ((BufferedReader) fileReader).readLine();
}
} catch (FileNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (Exception e){
e.printStackTrace();
}
}
/**
* <p>
* Store the next instance in bufferInstance
* </p>
*/
private void prepareNextInstance(){
try
{
//Get the next instance
String lineInstance = ((BufferedReader) fileReader).readLine();
while(lineInstance.startsWith(commentedValue) || lineInstance.equalsIgnoreCase("")){
lineInstance = ((BufferedReader) fileReader).readLine();
}
bufferInstance = lineInstance;
}catch(Exception e){
bufferInstance = null;
}
}
/**
* <p>
* Add new attribute to the dataset specification
* </p>
* @param type Attribute type
* @param interval Intervals value
* @param name Attribute name
*/
private void addAttributeToSpecification(String type, String interval, String name){
// If the attribute is numerical
if(type.equalsIgnoreCase("REAL")){
RealNumericalAttribute attribute = new RealNumericalAttribute();
attribute.setName(name);
// If an interval is specified
if(interval.contains("[") && interval.contains("]")){
// Obtain the intervals
int minIndex = interval.indexOf("[");
int maxIndex = interval.indexOf("]");
interval = interval.substring(minIndex+1, maxIndex);
if(minIndex < maxIndex){
StringTokenizer tkInterval = new StringTokenizer(interval, ",");
Interval intervals = new Interval();
intervals.setClosure(Closure.ClosedClosed);
intervals.setLeft(Double.valueOf((String) tkInterval.nextElement()));
intervals.setRight(Double.valueOf((String) tkInterval.nextToken()));
attribute.addInterval(intervals);
//Add new attribute to the specification
metadata.addAttribute(attribute);
}
}
// If the interval is not specified it must be extracted from data
else {
Interval intervals = new Interval();
intervals.setClosure(Closure.ClosedClosed);
intervals.setLeft(Double.MIN_VALUE);
intervals.setRight(Double.MAX_VALUE);
attribute.addInterval(intervals);
//Add new attribute to the specification
metadata.addAttribute(attribute);
}
}
else if(type.equalsIgnoreCase("INTEGER")){
IntegerNumericalAttribute attribute = new IntegerNumericalAttribute();
attribute.setName(name);
// If an interval is specified
if(interval.contains("[") && interval.contains("]")){
// Obtain the intervals
int minIndex = interval.indexOf("[");
int maxIndex = interval.indexOf("]");
interval = interval.substring(minIndex+1, maxIndex);
if(minIndex < maxIndex){
StringTokenizer tkInterval = new StringTokenizer(interval, ",");
net.sf.jclec.util.intset.Interval intervals = new net.sf.jclec.util.intset.Interval();
intervals.setClosure(net.sf.jclec.util.intset.Closure.ClosedClosed);
intervals.setLeft(Integer.valueOf((String) tkInterval.nextElement()));
intervals.setRight(Integer.valueOf((String) tkInterval.nextToken().trim()));
attribute.addInterval(intervals);
//Add new attribute to the specification
metadata.addAttribute(attribute);
}
}
// If the interval is not specified it must be extracted from data
else {
net.sf.jclec.util.intset.Interval intervals = new net.sf.jclec.util.intset.Interval();
intervals.setClosure(net.sf.jclec.util.intset.Closure.ClosedClosed);
intervals.setLeft(Integer.MIN_VALUE);
intervals.setRight(Integer.MAX_VALUE);
attribute.addInterval(intervals);
//Add new attribute to the specification
metadata.addAttribute(attribute);
}
}
else
{
//Obtain the categorical values
int minIndex = interval.indexOf("{");
int maxIndex = interval.indexOf("}");
interval = interval.substring(minIndex+1, maxIndex);
if(minIndex < maxIndex){
CategoricalAttribute attribute = new CategoricalAttribute();
attribute.setName(name);
StringTokenizer categories = new StringTokenizer(interval, ",");
while(categories.hasMoreTokens())
attribute.addValue(categories.nextToken().trim());
//Add new attribute to the specification
metadata.addAttribute(attribute);
}
}
}
/**
* <p>
* Extract the interval of a Real or Integer Attribute directly from data
* </p>
* @return String Interval of the attribute
*/
private void extractIntervalsFromData(){
try {
double[] min = new double[metadata.numberOfAttributes()];
double[] max = new double[metadata.numberOfAttributes()];
boolean[] nonSpecified = new boolean[metadata.numberOfAttributes()];
for(int i=0; i<metadata.numberOfAttributes(); i++){
min[i] = Double.MAX_VALUE;
max[i] = Double.MIN_VALUE;
if(metadata.getAttribute(i).getType() == AttributeType.DoubleNumerical){
RealNumericalAttribute attribute = (RealNumericalAttribute) metadata.getAttribute(i);
nonSpecified[i] = attribute.intervalValues().getLeft() == Double.MIN_VALUE && attribute.intervalValues().getRight() == Double.MAX_VALUE;
}
else if(metadata.getAttribute(i).getType() == AttributeType.IntegerNumerical){
IntegerNumericalAttribute attribute = (IntegerNumericalAttribute) metadata.getAttribute(i);
nonSpecified[i] = attribute.intervalValues().getLeft() == Integer.MIN_VALUE && attribute.intervalValues().getRight() == Integer.MAX_VALUE;
}
}
while(this.next()){
IDataset.IInstance instancia = this.read();
// Extract interval for non specified attributes
for(int i=0; i<metadata.numberOfAttributes(); i++){
if(nonSpecified[i]){
double value = instancia.getValue(i);
if(value < min[i])
min[i] = value;
if(value > max[i])
max[i] = value;
}
}
}
for(int i=0; i<metadata.numberOfAttributes(); i++){
if(nonSpecified[i]){
if(metadata.getAttribute(i).getType() == AttributeType.DoubleNumerical){
RealNumericalAttribute attribute = (RealNumericalAttribute) metadata.getAttribute(i);
attribute.intervalValues().setLeft(min[i]);
attribute.intervalValues().setRight(max[i]);
}
else if(metadata.getAttribute(i).getType() == AttributeType.IntegerNumerical){
IntegerNumericalAttribute attribute = (IntegerNumericalAttribute) metadata.getAttribute(i);
attribute.intervalValues().setLeft((int)min[i]);
attribute.intervalValues().setRight((int)max[i]);
}
}
}
this.reset();
}catch (DatasetException e) {
e.printStackTrace();
}
}
}