/***********************************************************************

	This file is part of KEEL-software, the Data Mining tool for regression,
	classification, clustering, pattern mining and so on.

	Copyright (C) 2004-2010

	F. Herrera (herrera@decsai.ugr.es)
	L. Sánchez (luciano@uniovi.es)
	J. Alcalá-Fdez (jalcala@decsai.ugr.es)
	S. García (sglopez@ujaen.es)
	A. Fernández (alberto.fernandez@ujaen.es)
	J. Luengo (julianlm@decsai.ugr.es)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see http://www.gnu.org/licenses/

**********************************************************************/

/**
 * <p>
 * @author Written by Julián Luengo Martín 06/03/2006
 * @version 0.5
 * @since JDK 1.5
 * </p>
 */
package keel.Algorithms.Preprocess.Missing_Values.EventCovering;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;

import keel.Dataset.Attribute;
import keel.Dataset.Attributes;
import keel.Dataset.Instance;
import keel.Dataset.InstanceSet;
import keel.Algorithms.Preprocess.Missing_Values.EventCovering.Stat.*;

/*
class IPComp implements Comparator {
    public int compare(Object obj1, Object obj2) {
        double i1 = ((InstanceP) obj1).Px;
        double i2 = ((InstanceP) obj2).Px;
        if (i1 < i2)
            return -1;
        if (i1 == i2)
            return 0;
        return 1;
    }
}
*/

/**
 * <p>
 * Based on the work of Wong et al., a mixed-mode probability model is approximated
 * by a discrete one. First, the continuous components are discretized using a minimum
 * loss of information criterion. Treating a mixed-mode feature n-tuple as a
 * discrete-valued one, the authors propose a new statistical approach for the synthesis
 * of knowledge based on cluster analysis. Its main advantage is that it requires
 * neither scale normalization nor ordering of discrete values. By synthesis
 * of the data into statistical knowledge, they refer to the following processes:
 * 1) synthesize and detect from the data inherent patterns which indicate statistical
 * interdependency;
 * 2) group the given data into inherent clusters based on the detected interdependencies;
 * and 3) interpret the underlying patterns for each cluster identified.
 * </p>
 * The method of synthesis is based on the authors' event-covering approach.
 * With the developed inference method, we are able to estimate the missing values
 * in the data. This method assumes the data is DISCRETIZED (but won't throw any
 * error with continuous data).
 */
public class EventCovering {

    double[] mean = null;
    double[] std_dev = null;
    double tempData = 0;
    String[][] X = null; //matrix of transformed data

    int ndatos = 0;
    int nentradas = 0;
    int tipo = 0;
    int direccion = 0;
    int nvariables = 0;
    int nsalidas = 0;
    int totalMissing = 0;

    //METHOD PARAMETERS
    double T = 0.07;
    int min_change_num = 0;
    double Cfactor = 1;

    InstanceSet IS, IStest;
    String input_train_name = new String();
    String input_test_name = new String();
    String output_train_name = new String();
    String output_test_name = new String();
    String temp = new String();
    String data_out = new String("");
    StatFunc chi;

    /**
     * Creates a new instance of EventCovering
     * @param fileParam The path to the configuration file with all the parameters in KEEL format
     */
    public EventCovering(String fileParam) {
        config_read(fileParam);
        IS = new InstanceSet();
        IStest = new InstanceSet();
    }

    //Writes the data matrix X to disk, in KEEL format
    private void write_results(String output) {
        try {
            FileWriter file_write = new FileWriter(output);
            file_write.write(IS.getHeader());
            //now, print the normalized data
            file_write.write("@data\n");
            for (int i = 0; i < ndatos; i++) {
                file_write.write(X[i][0]);
                for (int j = 1; j < nvariables; j++) {
                    file_write.write("," + X[i][j]);
                }
                file_write.write("\n");
            }
            file_write.close();
        } catch (IOException e) {
            System.out.println("IO exception = " + e);
            System.exit(-1);
        }
    }

    //Reads the parameter file and parses the data into strings
    private void config_read(String fileParam) {
        File inputFile = new File(fileParam);

        if (inputFile == null || !inputFile.exists()) {
            System.out.println("parameter file " + fileParam + " doesn't exist!");
            System.exit(-1);
        }
        //begin the configuration read from file
        try {
            FileReader file_reader = new FileReader(inputFile);
            BufferedReader buf_reader = new BufferedReader(file_reader);

            String line;
            do {
                line = buf_reader.readLine();
            } while (line.length() == 0); //skip empty lines, which would make processing fail

            String out[] = line.split("algorithm = ");
            //alg_name = new String(out[1]); //catch the algorithm name

            //input & output filenames
            do {
                line = buf_reader.readLine();
            } while (line.length() == 0);
            out = line.split("inputData = ");
            out = out[1].split("\\s\"");
            input_train_name = new String(out[0].substring(1, out[0].length() - 1));
            input_test_name = new String(out[1].substring(0, out[1].length() - 1));
            if (input_test_name.charAt(input_test_name.length() - 1) == '"')
                input_test_name = input_test_name.substring(0, input_test_name.length() - 1);

            do {
                line = buf_reader.readLine();
            } while (line.length() == 0);
            out = line.split("outputData = ");
            out = out[1].split("\\s\"");
            output_train_name = new String(out[0].substring(1, out[0].length() - 1));
            output_test_name = new String(out[1].substring(0, out[1].length() - 1));
            if (output_test_name.charAt(output_test_name.length() - 1) == '"')
                output_test_name = output_test_name.substring(0, output_test_name.length() - 1);

            //parameters
            do {
                line = buf_reader.readLine();
            } while (line.length() == 0);
            out = line.split("T = ");
            T = (new Double(out[1])).doubleValue(); //parse the string into a double

            do {
                line = buf_reader.readLine();
            } while (line.length() == 0);
            out = line.split("minChangeNum = ");
            min_change_num = (new Integer(out[1])).intValue(); //parse the string into an integer

            do {
                line = buf_reader.readLine();
            } while (line.length() == 0);
            out = line.split("Cfactor = ");
            Cfactor = (new Double(out[1])).doubleValue(); //parse the string into a double

            file_reader.close();
        } catch (IOException e) {
            System.out.println("IO exception = " + e);
            System.exit(-1);
        }
    }
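    /*
     * For reference, config_read() expects a KEEL parameter file shaped like
     * the following sketch (all file names here are hypothetical):
     *
     *     algorithm = EventCovering
     *     inputData = "train.dat" "test.dat"
     *     outputData = "train.out.dat" "test.out.dat"
     *     T = 0.07
     *     minChangeNum = 0
     *     Cfactor = 1.0
     */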
    /**
     * <p>
     * Computes the Hamming distance between 2 instances
     * </p>
     * @param i1 First instance
     * @param i2 Second instance
     * @return The Hamming distance
     */
    protected double dist(Instance i1, Instance i2) {
        double d = 0;
        double[] a;
        double[] b;

        /*
        //Euclidean distance
        a = i1.getAllInputValues();
        b = i2.getAllInputValues();
        for (int i = 0; i < nentradas; i++) {
            d += (a[i] - b[i]) * (a[i] - b[i]);
        }
        a = i1.getAllOutputValues();
        b = i2.getAllOutputValues();
        for (int i = 0; i < nsalidas; i++) {
            d += (a[i] - b[i]) * (a[i] - b[i]);
        }
        d = Math.sqrt(d);
        */

        //Hamming distance
        a = i1.getAllInputValues();
        b = i2.getAllInputValues();
        for (int i = 0; i < nentradas; i++) {
            if (a[i] != b[i])
                d++;
        }
        a = i1.getAllOutputValues();
        b = i2.getAllOutputValues();
        for (int i = 0; i < nsalidas; i++) {
            if (a[i] != b[i])
                d++;
        }
        return d;
    }

    /**
     * <p>
     * Estimates the mutual information between each pair of attributes in the data set
     * </p>
     * @return The mutual information for all possible pairs of attributes
     */
    protected double[][] computeMutualInformation() {
        double[][] I;
        Instance inst;
        FreqListPair[][] frec;
        ValuesFreq F;
        double u, v;
        double f_uv, f_u, f_v;
        double[] ent;
        double[] sal;
        double[] all;
        HashSet lu = new HashSet();
        HashSet lv = new HashSet();
        boolean found;
        String[] valores;
        String e1, e2;

        frec = new FreqListPair[nvariables][nvariables];
        for (int i = 0; i < nvariables; i++)
            for (int j = 0; j < nvariables; j++)
                frec[i][j] = new FreqListPair();

        //matrix of summed mutual information
        I = new double[nvariables][nvariables];
        all = new double[nvariables];
        for (int k = 0; k < ndatos; k++) {
            inst = IS.getInstance(k);
            ent = inst.getAllInputValues();
            sal = inst.getAllOutputValues();
            for (int m = 0; m < nentradas; m++) {
                if (!inst.getInputMissingValues(m))
                    all[m] = ent[m];
                else
                    all[m] = Double.MIN_VALUE;
            }
            for (int m = 0; m < nsalidas; m++)
                if (!inst.getOutputMissingValues(m))
                    all[m + nentradas] = sal[m];
                else
                    all[m + nentradas] = Double.MIN_VALUE;

            for (int i = 0; i < nvariables; i++) {
                for (int j = i + 1; j < nvariables; j++) {
                    u = all[i];
                    v = all[j];
                    frec[i][j].AddElement(String.valueOf(u), String.valueOf(v)); //store both elements
                }
            }
        }
        for (int i = 0; i < nvariables; i++) {
            for (int j = i + 1; j < nvariables; j++) {
                frec[i][j].reset();
                I[i][j] = 0;
                while (!frec[i][j].outOfBounds()) {
                    F = frec[i][j].getCurrent();
                    e1 = F.getValue1();
                    e2 = F.getValue2();
                    f_u = (double) frec[i][j].elem1SumFreq(e1) / ndatos;
                    f_v = (double) frec[i][j].elem2SumFreq(e2) / ndatos;
                    f_uv = (double) F.getFreq() / ndatos;
                    I[i][j] += f_uv * Math.log(f_uv / (f_u * f_v)) / Math.log(2);
                    frec[i][j].iterate();
                }
            }
        }
        return I;
    }
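    /*
     * Illustrative sketch (not part of the original KEEL code and not called
     * by it): the per-pair quantity accumulated by computeMutualInformation()
     * above is the empirical mutual information, in bits,
     *
     *     I(U;V) = sum over (u,v) of f(u,v) * log2( f(u,v) / (f(u) * f(v)) ).
     *
     * The helper below estimates it for two aligned arrays of discrete values,
     * assuming the values themselves contain no '|' character.
     */
    private static double mutualInformationSketch(String[] u, String[] v) {
        HashMap fu = new HashMap();
        HashMap fv = new HashMap();
        HashMap fuv = new HashMap();
        int n = u.length;
        for (int i = 0; i < n; i++) {
            countSketch(fu, u[i]);
            countSketch(fv, v[i]);
            countSketch(fuv, u[i] + "|" + v[i]); //joint event
        }
        double mi = 0;
        Iterator it = fuv.keySet().iterator();
        while (it.hasNext()) {
            String key = (String) it.next();
            String[] parts = key.split("\\|");
            double puv = ((Integer) fuv.get(key)).intValue() / (double) n;
            double pu = ((Integer) fu.get(parts[0])).intValue() / (double) n;
            double pv = ((Integer) fv.get(parts[1])).intValue() / (double) n;
            mi += puv * Math.log(puv / (pu * pv)) / Math.log(2);
        }
        return mi;
    }

    //frequency-count helper for the sketch above
    private static void countSketch(HashMap m, String key) {
        Integer c = (Integer) m.get(key);
        m.put(key, new Integer(c == null ? 1 : c.intValue() + 1));
    }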
    /**
     * <p>
     * Computes the dependence tree, i.e. a maximum-weight spanning tree over the
     * pairwise mutual information, built by greedy (Kruskal-like) edge selection
     * </p>
     * @param I The paired mutual information of this data set
     * @return The edges of the dependence tree, as a Vector of Pair objects
     */
    protected Vector computeTree(double[][] I) {
        double[] nodo;
        int ik = -1, jk = -1;
        int k, m;
        double max;

        nodo = new double[nvariables];
        Vector tree = new Vector();
        Pair par;

        for (int i = 0; i < nvariables; i++)
            nodo[i] = i;
        k = 1;
        while (k < nvariables) {
            //search for the maximum I
            //since I[i][j] is always non-negative, use a negative initial MAX value
            //if we don't want to consider zero-information values, use max = 0 as initial value
            max = -1;
            for (int i = 0; i < nvariables - 1; i++) {
                for (int j = i + 1; j < nvariables; j++) {
                    if (I[i][j] > max) {
                        ik = i;
                        jk = j;
                        max = I[i][j];
                    }
                }
            }
            if (nodo[ik] == nodo[jk]) { //both endpoints already connected: the edge would close a cycle
                I[ik][jk] = -1;
            } else {
                par = new Pair(ik, jk);
                tree.addElement(par);
                I[ik][jk] = -1;
                //merge the component of jk into the component of ik
                double oldComponent = nodo[jk];
                m = 0;
                while (m < nvariables) {
                    if (nodo[m] == oldComponent) {
                        nodo[m] = nodo[ik];
                    }
                    m++;
                }
                k++;
            }
        }
        return tree;
    }

    /**
     * <p>
     * Computes the joint probabilities using the second-order probabilities.
     * </p>
     * @param tree The dependence tree of this data set
     * @return An array of probabilities, one per instance (in the same order)
     */
    protected double[] computePx(Vector tree) {
        double[] Px;
        Instance inst, e;
        double a, b;
        double x1, x2;
        int count, total;
        Pair p;

        Px = new double[ndatos];
        for (int i = 0; i < ndatos; i++) {
            inst = IS.getInstance(i);
            if (!inst.existsAnyMissingValue()) {
                a = (inst.getAllInputValues())[0];
                count = 0;
                for (int j = 0; j < ndatos; j++) {
                    e = IS.getInstance(j);
                    if ((e.getAllInputValues())[0] == a)
                        count++;
                }
                Px[i] = (double) count / ndatos;
                for (int j = 0; j < tree.size(); j++) {
                    p = (Pair) tree.elementAt(j);
                    if (p.e1 < nentradas)
                        a = (inst.getAllInputValues())[p.e1];
                    else
                        a = (inst.getAllOutputValues())[p.e1 - nentradas];
                    if (p.e2 < nentradas)
                        b = (inst.getAllInputValues())[p.e2];
                    else
                        b = (inst.getAllOutputValues())[p.e2 - nentradas];
                    count = 0;
                    total = 0;
                    for (int k = 0; k < ndatos; k++) {
                        e = IS.getInstance(k);
                        if (p.e1 < nentradas)
                            x1 = (e.getAllInputValues())[p.e1];
                        else
                            x1 = (e.getAllOutputValues())[p.e1 - nentradas];
                        if (p.e2 < nentradas)
                            x2 = (e.getAllInputValues())[p.e2];
                        else
                            x2 = (e.getAllOutputValues())[p.e2 - nentradas];
                        if (x1 == a) {
                            total++;
                            if (x2 == b)
                                count++;
                        }
                    }
                    Px[i] *= (double) count / total;
                }
            } else {
                Px[i] = -1; //instance with missing data, does not count for cluster making!
                totalMissing++;
            }
        }
        return Px;
    }
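    /*
     * Note (added for clarity): computePx() above evaluates the dependence-tree
     * factorization of the joint probability,
     *
     *     P(x) = P(x_r) * product over tree edges (i,j) of P(x_j | x_i),
     *
     * rooted at the first attribute, with every marginal and conditional
     * estimated by counting occurrences in the data set. Instances with
     * missing values receive Px = -1 so that cluster initiation skips them.
     */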
    /**
     * <p>
     * Initializes the set of clusters using information from the data set
     * </p>
     * @param Px The second-order probability estimation
     * @return An initial set of clusters
     */
    protected Vector clusterInitation(double[] Px) {
        int k, t;
        double muMean;
        double Dst;
        double P_;
        double d, dmax, dtop, p;
        int max, tmax, choosenV;
        int alreadyTaken;
        boolean Dfound, found;
        Vector L = new Vector();
        Vector Dist = new Vector();
        Vector Ps = new Vector();
        Vector Index = new Vector();
        Instance x;
        Vector Clusters = new Vector();
        Cluster cluster;

        k = 0;
        t = 0;
        muMean = 0;
        max = 0;
        for (int i = 0; i < ndatos; i++) {
            if (Px[i] >= 0) {
                muMean += Px[i];
                if (Px[i] > Px[max])
                    max = i;
            }
        }
        muMean = muMean / ndatos;

        //threshold for cluster size
        tmax = 0;
        for (int j = 0; j < nvariables; j++) {
            Attribute a = Attributes.getAttribute(j);
            if (a.getType() != Attribute.NOMINAL) {
                if (a.getMaxAttribute() - a.getMinAttribute() > tmax)
                    tmax = (int) (a.getMaxAttribute() - a.getMinAttribute());
            } else {
                if (a.getNumNominalValues() > tmax)
                    tmax = a.getNumNominalValues();
            }
        }
        //T = T * tmax / 8;

        //dummy cluster
        Cluster C0 = new Cluster();
        /*
        Cluster C1 = new Cluster();
        C1.addInstance(IS.getInstance(max));
        Clusters.addElement(C1);
        */
        alreadyTaken = 0;
        while (alreadyTaken < ndatos - totalMissing) {
            if (ndatos - alreadyTaken > T)
                P_ = muMean;
            else
                P_ = 0;
            //list in L all x with P(x) > P_
            for (int i = 0; i < ndatos; i++) {
                if (Px[i] > P_) {
                    L.addElement(IS.getInstance(i));
                    Ps.addElement(new Double(Px[i]));
                    Index.addElement(new Integer(i));
                }
            }
            //compute D for each x
            Dist.clear();
            for (int i = 0; i < L.size(); i++) {
                x = (Instance) L.elementAt(i);
                Dist.addElement(new Double(D(x, L)));
            }
            //get D*
            dtop = Double.MAX_VALUE;
            do {
                dmax = 0;
                for (int i = 0; i < Dist.size(); i++) {
                    d = ((Double) Dist.elementAt(i)).doubleValue();
                    if (dmax < d && d < dtop)
                        dmax = d;
                }
                //avoid isolated values by requiring that at least one 'x'
                //lies within distance D* - 1
                Dfound = false;
                for (int i = 0; i < Dist.size() && !Dfound; i++) {
                    d = ((Double) Dist.elementAt(i)).doubleValue();
                    if ((int) (dmax - 1) <= d)
                        Dfound = true;
                }
                if (!Dfound)
                    dtop = dmax;
            } while (!Dfound && dtop > 1);
            Dst = dmax; //D* found

            do {
                dmax = 0;
                //locate the x with maximum P(x)
                for (int i = 0; i < L.size(); i++) {
                    p = ((Double) Ps.elementAt(i)).doubleValue();
                    if (p > dmax) {
                        max = i;
                        dmax = p;
                    }
                }
                x = (Instance) L.elementAt(max);
                found = false;
                Vector cv = new Vector();
                for (int i = 0; i < Clusters.size(); i++) {
                    cluster = (Cluster) Clusters.elementAt(i);
                    d = D(x, cluster.C);
                    if (d < Dst) {
                        cv.addElement(new Integer(i));
                    }
                }
                if (cv.size() == 1) {
                    cluster = (Cluster) Clusters.elementAt(((Integer) cv.firstElement()).intValue());
                    cluster.C.addElement(x);
                } else {
                    if (cv.size() > 1) {
                        found = false;
                        for (int i = 0; i < cv.size() && !found; i++) {
                            if (((Integer) cv.elementAt(i)).intValue() < k) {
                                C0.C.addElement(x);
                                found = true;
                            }
                        }
                        //merge all clusters
                        if (!found) {
                            cluster = (Cluster) Clusters.elementAt(((Integer) cv.firstElement()).intValue());
                            for (int i = 1; i < cv.size(); i++) {
                                choosenV = ((Integer) cv.elementAt(i)).intValue();
                                cluster.C.addAll(((Cluster) Clusters.elementAt(((Integer) cv.elementAt(i)).intValue())).C);
                                Clusters.removeElementAt(((Integer) cv.elementAt(i)).intValue());
                                t--;
                                //shift left the remaining cluster indices
                                for (int j = i + 1; j < cv.size(); j++) {
                                    if (((Integer) cv.elementAt(j)).intValue() > choosenV) {
                                        cv.set(j, new Integer(((Integer) cv.elementAt(j)).intValue() - 1));
                                    }
                                }
                            }
                        }
                    } else {
                        //x will form a new cluster by itself
                        cluster = new Cluster();
                        cluster.addInstance(x);
                        Clusters.addElement(cluster);
                        t++;
                    }
                }
                alreadyTaken++;
                L.removeElementAt(max);
                Ps.removeElementAt(max);
                Px[((Integer) Index.elementAt(max)).intValue()] = -1; //so it can't be chosen again
                Index.removeElementAt(max);
            } while (L.size() > 0);
            k = t;
            muMean = 0;
            max = 0;
            for (int i = 0; i < ndatos; i++) {
                if (Px[i] >= 0) {
                    muMean += Px[i];
                    if (Px[i] > Px[max])
                        max = i;
                }
            }
            muMean = muMean / ndatos;
        }
        for (int i = 0; i < t; i++) {
            cluster = (Cluster) Clusters.elementAt(i);
            if (cluster.C.size() < T)
                C0.C.addAll(cluster.C);
        }
        Clusters.add(0, C0);
        //assign an identifier to each cluster
        for (int i = 0; i < Clusters.size(); i++)
            ((Cluster) Clusters.elementAt(i)).setNumber(i);
        return Clusters;
    }
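    /*
     * Note (added for clarity): the refinement below selects "covered" events
     * with a chi-square test of independence. For each value x_k of attribute
     * k, the statistic
     *
     *     D = sum over clusters of (observed - expected)^2 / expected
     *
     * is compared against the chi-square critical value at the 0.05 level with
     * (number of clusters - 1) degrees of freedom; only values whose D exceeds
     * it are kept in Eck (and, symmetrically, clusters are kept in Ekc).
     */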
    /**
     * <p>
     * This method refines the initial clusters obtained by clusterInitation()
     * </p>
     * @param Clusters The set of clusters to be refined
     * @return A refined set of clusters
     */
    protected Vector refineClusters(Vector Clusters) {
        FreqList[] obs;
        double[] values;
        double[] input;
        double[] output;
        double[] sum_Pcond = new double[nvariables];
        Instance inst;
        Cluster cluster;
        ValueFreq val;
        double confident = 0.05;
        double NS_denom;
        int totalFreqs;
        int nearestCluster, isAt, index;
        double exp, observed, D, I, H, tmp, minNS;
        Vector[] Eck = new Vector[nvariables];
        Vector[] Ekc = new Vector[nvariables];
        FreqListPair atr_clust = new FreqListPair();
        FreqListPair[] acj_xk = new FreqListPair[nvariables];
        double[] R = new double[nvariables];
        Vector nextGenClusters;
        boolean uncertain;
        int number_of_change, prev_changes;
        Vector foundIndex = new Vector();

        chi = new StatFunc();
        for (int i = 0; i < nvariables; i++)
            acj_xk[i] = new FreqListPair();
        obs = new FreqList[nvariables];
        for (int i = 0; i < nvariables; i++) {
            obs[i] = new FreqList();
            Eck[i] = new Vector();
            Ekc[i] = new Vector();
        }
        //build the frequency distribution
        for (int i = 0; i < ndatos; i++) {
            inst = IS.getInstance(i);
            values = inst.getAllInputValues();
            for (int j = 0; j < nentradas; j++) {
                obs[j].AddElement(String.valueOf(values[j]));
            }
            values = inst.getAllOutputValues();
            for (int j = 0; j < nsalidas; j++) {
                obs[j + nentradas].AddElement(String.valueOf(values[j]));
            }
        }
        D = 0;
        //*********************************************************************
        //***************************BEGIN refinement**************************
        //*********************************************************************
        number_of_change = 0;
        do {
            //**************************** REVIEW BEGIN ****************************
            //compute Eck
            totalFreqs = 0;
            for (int k = 0; k < nvariables; k++) {
                obs[k].reset();
                Eck[k].clear();
                while (!obs[k].outOfBounds()) {
                    D = 0;
                    foundIndex.clear();
                    for (int j = 0; j < Clusters.size(); j++) {
                        cluster = (Cluster) Clusters.elementAt(j);
                        exp = obs[k].getCurrent().getFreq() * cluster.C.size();
                        exp = (double) exp / ndatos;
                        observed = cluster.getObserved(obs[k].getCurrent().getValue(), k);
                        if (observed > 0) {
                            foundIndex.addElement(new Integer(cluster.getNumber()));
                        }
                        D = D + (double) (observed - exp) * (observed - exp) / exp;
                    }
                    if (D > StatFunc.chiSquarePercentage(confident, Clusters.size() - 1)) {
                        Eck[k].addElement(obs[k].getCurrent());
                        for (int j = 0; j < foundIndex.size(); j++) {
                            index = ((Integer) foundIndex.elementAt(j)).intValue();
                            atr_clust.AddElement(String.valueOf(index), String.valueOf(obs[k].getCurrent().getValue()));
                            acj_xk[k].AddElement(String.valueOf(index), String.valueOf(obs[k].getCurrent().getValue()));
                            totalFreqs++;
                        }
                    }
                    obs[k].iterate();
                }
            }
            //check whether any attribute values were selected for Eck;
            //if not, finish
            prev_changes = number_of_change;
            number_of_change = 0;
            if (totalFreqs != 0) {
                //compute Ekc
                for (int k = 0; k < nvariables; k++) {
                    Ekc[k].clear();
                    for (int j = 0; j < Clusters.size(); j++) {
                        D = 0;
                        cluster = (Cluster) Clusters.elementAt(j);
                        obs[k].reset();
                        while (!obs[k].outOfBounds()) {
                            exp = obs[k].getCurrent().getFreq() * cluster.C.size();
                            exp = (double) exp / ndatos;
                            observed = cluster.getObserved(obs[k].getCurrent().getValue(), k);
                            D = D + (double) (observed - exp) * (observed - exp) / exp;
                            obs[k].iterate();
                        }
                        if (D > StatFunc.chiSquarePercentage(confident, Clusters.size() - 1))
                            Ekc[k].addElement(Clusters.elementAt(j));
                    }
                }
                //**************************** REVIEW END ****************************
                //now compute the interdependency redundancy measure
                //between Xck and Ck
                for (int k = 0; k < nvariables; k++) {
                    I = 0;
                    H = 0;
                    //compute the expected mutual information and entropy
                    for (int u = 0; u < Eck[k].size(); u++) {
                        for (int s = 0; s < Ekc[k].size(); s++) {
                            cluster = (Cluster) Ekc[k].elementAt(s);
                            val = (ValueFreq) Eck[k].elementAt(u);
                            tmp = (double) atr_clust.getPairFreq(String.valueOf(cluster.getNumber()), String.valueOf(val.getValue())) / totalFreqs;
                            if (tmp > 0) {
                                H -= (double) tmp * Math.log(tmp) / Math.log(2);
                                tmp = (double) tmp * Math.log(tmp / ((double) val.getFreq() * cluster.C.size() / (totalFreqs * totalFreqs))) / Math.log(2);
                                I += tmp;
                            }
                        }
                    }
                    if (I != 0 && H != 0)
                        R[k] = (double) I / H;
                    else
                        R[k] = 0;
                }
                NS_denom = 0;
                for (int k = 0; k < nvariables; k++)
                    NS_denom += R[k];
                NS_denom *= nvariables;

                nextGenClusters = (Vector) Clusters.clone();
                for (int i = 0; i < Clusters.size(); i++) {
                    cluster = (Cluster) Clusters.elementAt(i);
                    for (int j = 0; j < cluster.C.size(); j++) {
                        inst = (Instance) cluster.C.elementAt(j);
                        minNS = Double.MAX_VALUE;
                        nearestCluster = 0; //the dummy cluster C0
                        for (int u = 1; u < Clusters.size(); u++) {
                            tmp = NS(inst, u, cluster.C.size(), R, acj_xk, Ekc, NS_denom);
                            if (i != 0 && u == i && tmp != -1)
                                tmp = tmp / Cfactor;
                            if (tmp != -1 && tmp < minNS) {
                                nearestCluster = u;
                                minNS = tmp;
                            }
                        }
                        if (nearestCluster != i) {
                            //move the element to the destination cluster
                            isAt = ((Cluster) nextGenClusters.elementAt(i)).C.indexOf(inst);
                            ((Cluster) nextGenClusters.elementAt(i)).C.removeElementAt(isAt);
                            ((Cluster) nextGenClusters.elementAt(nearestCluster)).addInstance(inst);
                            number_of_change++;
                        }
                    }
                }
                Clusters.clear();
                Clusters = nextGenClusters;
            }
        } while (number_of_change > 0 && Math.abs(number_of_change - prev_changes) >= min_change_num);
        //*********************************************************************
        //****************************END refinement***************************
        //*********************************************************************
        return Clusters;
    }
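    /*
     * Note (added for clarity): NS() below scores how well an instance fits a
     * cluster through a normalized surprisal,
     *
     *     NS(x, c_j) = ( sum_k R_k * -log2( P(x_k|c_j) / sum over covering clusters c of P(x_k|c) ) )
     *                  / ( n * sum_k R_k ),
     *
     * where R_k is the interdependency redundancy of attribute k computed in
     * refineClusters(). A return value of -1 flags an undefined score.
     */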
    /**
     * Computes the normalized surprisal NS of an instance with respect to a cluster
     * @param inst The instance to score
     * @param numCluster The index of the candidate cluster
     * @param sizeCluster The size of the cluster the instance currently belongs to
     * @param R The interdependency redundancy of each attribute
     * @param acj_xk The frequencies of each (cluster, attribute value) pair
     * @param Ekc The clusters covered by each attribute
     * @param NS_denom The normalization denominator
     * @return The NS value, or -1 if it is undefined
     */
    protected double NS(Instance inst, int numCluster, int sizeCluster, double[] R, FreqListPair[] acj_xk, Vector[] Ekc, double NS_denom) {
        double prob;
        double temp;
        double xk;
        double sum_Pcond;
        double mutualI;
        double NSvalue;
        Cluster cluster;
        double[] input;
        double[] output;

        input = inst.getAllInputValues();
        output = inst.getAllOutputValues();
        mutualI = 0;
        for (int k = 0; k < nvariables; k++) {
            if (k < nentradas)
                xk = input[k];
            else
                xk = output[k - nentradas];
            sum_Pcond = 0;
            for (int i = 0; i < Ekc[k].size(); i++) {
                cluster = (Cluster) Ekc[k].elementAt(i);
                sum_Pcond += (double) acj_xk[k].sumPairFreq(String.valueOf(cluster.getNumber()), String.valueOf(xk)) / sizeCluster;
            }
            temp = 0;
            if (sum_Pcond > 0 && sum_Pcond > T) {
                prob = (double) acj_xk[k].sumPairFreq(String.valueOf(numCluster), String.valueOf(xk)) / sizeCluster;
                if (prob > 0) {
                    temp = (double) prob / sum_Pcond;
                    temp = -Math.log(temp) / Math.log(2);
                    temp *= R[k];
                }
            }
            mutualI += temp;
        }
        if (mutualI != 0)
            NSvalue = (double) mutualI / NS_denom;
        else
            NSvalue = -1;
        return NSvalue;
    }

    /**
     * Computes the distance from an instance to its nearest neighbour in a set of instances
     * @param x The instance
     * @param S The set of instances
     * @return The smallest Hamming distance from x to an element of S other than x itself
     */
    protected double D(Instance x, Vector S) {
        double dmin;
        double d;

        dmin = Double.MAX_VALUE;
        for (int i = 0; i < S.size(); i++) {
            if (x != (Instance) S.elementAt(i)) {
                d = dist(x, (Instance) S.elementAt(i));
                if (d < dmin)
                    dmin = d;
            }
        }
        return dmin;
    }
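    /*
     * Usage sketch (illustrative; "config.txt" is a hypothetical parameter
     * file following the layout shown above config_read()):
     *
     *     EventCovering ec = new EventCovering("config.txt");
     *     ec.process(); //imputes the missing values and writes the output files
     */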
    /**
     * <p>
     * Processes the training and test files provided in the parameter file given to the constructor.
     * </p>
     */
    public void process() {
        int in = 0;
        int out = 0;
        int debug = 0;
        String[] row = null;
        boolean valuesRemaining = false;
        FreqList sameValue = null;
        double[] outputs = null;
        double[] outputsCandidate = null;
        double[] inputs = null;
        double[] inputsCandidate = null;
        boolean[] inputsMissing = null;
        boolean[] taken = null;
        Vector instancesSelected = new Vector();
        boolean same = true;
        boolean valueFound = false;
        ValueFreq valueTimes;
        double minD = 0;
        double dist;
        VAList candidatesList = null;
        valueAssociations va;
        Instance missing = null;
        Instance i1, i2;
        Vector Clusters = null;
        Cluster c;
        int selectedCluster = 0;
        int centroid = 0;
        Vector tree;
        double[][] I;
        double[] Px;

        try {
            //load in memory a dataset that contains a classification problem
            IS.readSet(input_train_name, true);
            ndatos = IS.getNumInstances();
            nvariables = Attributes.getNumAttributes();
            nentradas = Attributes.getInputNumAttributes();
            nsalidas = Attributes.getOutputNumAttributes();

            X = new String[ndatos][nvariables]; //matrix with the transformed data
            totalMissing = 0;

            //create clusters from the instances without missing data
            I = computeMutualInformation();
            tree = computeTree(I);
            Px = computePx(tree);
            if (totalMissing != ndatos && totalMissing != 0) {
                Clusters = clusterInitation(Px);
                int acum = 0;
                for (int i = 0; i < Clusters.size(); i++) {
                    c = (Cluster) Clusters.elementAt(i);
                    acum += c.C.size();
                }
                Clusters = refineClusters(Clusters);
            } else {
                Cluster C0 = new Cluster();
                Clusters = new Vector();
                for (int i = 0; i < ndatos; i++) {
                    Instance inst = IS.getInstance(i);
                    C0.C.addElement(inst);
                }
                Clusters.addElement(C0);
            }

            //process the current data set
            for (int i = 0; i < ndatos; i++) {
                Instance inst = IS.getInstance(i);
                in = 0;
                out = 0;
                for (int j = 0; j < nvariables; j++) {
                    Attribute a = Attributes.getAttribute(j);
                    direccion = a.getDirectionAttribute();
                    tipo = a.getType();
                    if (direccion == Attribute.INPUT) {
                        if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in)) {
                            X[i][j] = new String(String.valueOf(inst.getInputRealValues(in)));
                        } else {
                            if (!inst.getInputMissingValues(in))
                                X[i][j] = inst.getInputNominalValues(in);
                            else {
                                //missing data: find the cluster this instance fits best
                                minD = Double.MAX_VALUE;
                                for (int u = 0; u < Clusters.size(); u++) {
                                    c = (Cluster) Clusters.elementAt(u);
                                    dist = D(inst, c.C);
                                    if (dist < minD) {
                                        selectedCluster = u;
                                        minD = dist;
                                    }
                                }
                                //now, find the nearest element of the cluster
                                c = (Cluster) Clusters.elementAt(selectedCluster);
                                minD = Double.MAX_VALUE;
                                dist = 0;
                                for (int l = 0; l < c.C.size(); l++) {
                                    i2 = (Instance) c.C.elementAt(l);
                                    dist = dist(inst, i2);
                                    if (i2.getInputMissingValues(in))
                                        dist += nvariables;
                                    if (dist < minD) {
                                        minD = dist;
                                        centroid = l;
                                    }
                                }
                                //use the nearest instance as reference
                                i1 = (Instance) c.C.elementAt(centroid);
                                if (i1.getInputMissingValues(in))
                                    X[i][j] = "<null>";
                                else {
                                    if (tipo != Attribute.NOMINAL) {
                                        X[i][j] = new String(String.valueOf(i1.getInputRealValues(in)));
                                    } else {
                                        X[i][j] = i1.getInputNominalValues(in);
                                    }
                                }
                            }
                        }
                        in++;
                    } else {
                        if (direccion == Attribute.OUTPUT) {
                            if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) {
                                X[i][j] = new String(String.valueOf(inst.getOutputRealValues(out)));
                            } else {
                                if (!inst.getOutputMissingValues(out))
                                    X[i][j] = inst.getOutputNominalValues(out);
                                else {
                                    //missing data: find the cluster this instance fits best
                                    minD = Double.MAX_VALUE;
                                    for (int u = 0; u < Clusters.size(); u++) {
                                        c = (Cluster) Clusters.elementAt(u);
                                        dist = D(inst, c.C);
                                        if (dist < minD) {
                                            selectedCluster = u;
                                            minD = dist;
                                        }
                                    }
                                    //now, find the nearest element of the cluster
                                    c = (Cluster) Clusters.elementAt(selectedCluster);
                                    minD = Double.MAX_VALUE;
                                    dist = 0;
                                    for (int l = 0; l < c.C.size(); l++) {
                                        i2 = (Instance) c.C.elementAt(l);
                                        dist = dist(inst, i2);
                                        if (i2.getOutputMissingValues(out))
                                            dist += nvariables;
                                        if (dist < minD) {
                                            minD = dist;
                                            centroid = l;
                                        }
                                    }
                                    //use the nearest instance as reference
                                    i1 = (Instance) c.C.elementAt(centroid);
                                    if (i1.getOutputMissingValues(out))
                                        X[i][j] = "<null>";
                                    else {
                                        if (tipo != Attribute.NOMINAL) {
                                            X[i][j] = new String(String.valueOf(i1.getOutputRealValues(out)));
                                        } else {
                                            X[i][j] = i1.getOutputNominalValues(out);
                                        }
                                    }
                                }
                            }
                            out++;
                        }
                    }
                }
            }
        } catch (Exception e) {
            System.out.println("Dataset exception = " + e);
            e.printStackTrace();
            System.exit(-1);
        }
        write_results(output_train_name);
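        /*
         * Note (added for clarity): each missing value above is imputed in two
         * steps. First the instance is assigned to the cluster whose nearest
         * member is closest in Hamming distance; then the value is copied from
         * the nearest member of that cluster that has the attribute present
         * (members also missing it are penalized by nvariables).
         */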
        /***************************************************************************************/
        //does an associated test file exist?
        if (input_train_name.compareTo(input_test_name) != 0) {
            try {
                //load in memory the test data set
                IStest.readSet(input_test_name, false);
                ndatos = IStest.getNumInstances();
                nvariables = Attributes.getNumAttributes();
                nentradas = Attributes.getInputNumAttributes();
                nsalidas = Attributes.getOutputNumAttributes();

                X = new String[ndatos][nvariables]; //matrix with the transformed data
                totalMissing = 0;

                //the clusters built from the training data are reused here
                /*
                I = computeMutualInformation();
                tree = computeTree(I);
                Px = computePx(tree);
                if (totalMissing != ndatos && totalMissing != 0) {
                    Clusters = clusterInitation(Px);
                    int acum = 0;
                    for (int i = 0; i < Clusters.size(); i++) {
                        c = (Cluster) Clusters.elementAt(i);
                        acum += c.C.size();
                    }
                    Clusters = refineClusters(Clusters);
                } else {
                    Cluster C0 = new Cluster();
                    Clusters = new Vector();
                    for (int i = 0; i < ndatos; i++) {
                        Instance inst = IS.getInstance(i);
                        C0.C.addElement(inst);
                    }
                    Clusters.addElement(C0);
                }
                */

                //process the current data set
                for (int i = 0; i < ndatos; i++) {
                    Instance inst = IStest.getInstance(i);
                    in = 0;
                    out = 0;
                    for (int j = 0; j < nvariables; j++) {
                        Attribute a = Attributes.getAttribute(j);
                        direccion = a.getDirectionAttribute();
                        tipo = a.getType();
                        if (direccion == Attribute.INPUT) {
                            if (tipo != Attribute.NOMINAL && !inst.getInputMissingValues(in)) {
                                X[i][j] = new String(String.valueOf(inst.getInputRealValues(in)));
                            } else {
                                if (!inst.getInputMissingValues(in))
                                    X[i][j] = inst.getInputNominalValues(in);
                                else {
                                    //missing data: find the cluster this instance fits best
                                    minD = Double.MAX_VALUE;
                                    for (int u = 0; u < Clusters.size(); u++) {
                                        c = (Cluster) Clusters.elementAt(u);
                                        dist = D(inst, c.C);
                                        if (dist < minD) {
                                            selectedCluster = u;
                                            minD = dist;
                                        }
                                    }
                                    //now, find the nearest element of the cluster
                                    c = (Cluster) Clusters.elementAt(selectedCluster);
                                    minD = Double.MAX_VALUE;
                                    dist = 0;
                                    for (int l = 0; l < c.C.size(); l++) {
                                        i2 = (Instance) c.C.elementAt(l);
                                        dist = dist(inst, i2);
                                        if (i2.getInputMissingValues(in))
                                            dist += nvariables;
                                        if (dist < minD) {
                                            minD = dist;
                                            centroid = l;
                                        }
                                    }
                                    //use the nearest instance as reference
                                    i1 = (Instance) c.C.elementAt(centroid);
                                    if (i1.getInputMissingValues(in))
                                        X[i][j] = "<null>";
                                    else {
                                        if (tipo != Attribute.NOMINAL) {
                                            X[i][j] = new String(String.valueOf(i1.getInputRealValues(in)));
                                        } else {
                                            X[i][j] = i1.getInputNominalValues(in);
                                        }
                                    }
                                }
                            }
                            in++;
                        } else {
                            if (direccion == Attribute.OUTPUT) {
                                if (tipo != Attribute.NOMINAL && !inst.getOutputMissingValues(out)) {
                                    X[i][j] = new String(String.valueOf(inst.getOutputRealValues(out)));
                                } else {
                                    if (!inst.getOutputMissingValues(out))
                                        X[i][j] = inst.getOutputNominalValues(out);
                                    else {
                                        //missing data: find the cluster this instance fits best
                                        minD = Double.MAX_VALUE;
                                        for (int u = 0; u < Clusters.size(); u++) {
                                            c = (Cluster) Clusters.elementAt(u);
                                            dist = D(inst, c.C);
                                            if (dist < minD) {
                                                selectedCluster = u;
                                                minD = dist;
                                            }
                                        }
                                        //now, find the nearest element of the cluster
                                        c = (Cluster) Clusters.elementAt(selectedCluster);
                                        minD = Double.MAX_VALUE;
                                        dist = 0;
                                        for (int l = 0; l < c.C.size(); l++) {
                                            i2 = (Instance) c.C.elementAt(l);
                                            dist = dist(inst, i2);
                                            if (i2.getOutputMissingValues(out))
                                                dist += nvariables;
                                            if (dist < minD) {
                                                minD = dist;
                                                centroid = l;
                                            }
                                        }
                                        //use the nearest instance as reference
                                        i1 = (Instance) c.C.elementAt(centroid);
                                        if (i1.getOutputMissingValues(out))
                                            X[i][j] = "<null>";
                                        else {
                                            if (tipo != Attribute.NOMINAL) {
                                                X[i][j] = new String(String.valueOf(i1.getOutputRealValues(out)));
                                            } else {
                                                X[i][j] = i1.getOutputNominalValues(out);
                                            }
                                        }
                                    }
                                }
                                out++;
                            }
                        }
                    }
                }
            } catch (Exception e) {
                System.out.println("Dataset exception = " + e);
                e.printStackTrace();
                System.exit(-1);
            }
            write_results(output_test_name);
        }
    }
}