/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. S�nchez (luciano@uniovi.es)
J. Alcal�-Fdez (jalcala@decsai.ugr.es)
S. Garc�a (sglopez@ujaen.es)
A. Fern�ndez (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
/*
* PropertyListToKeel.java
*/
package keel.Algorithms.Preprocess.Converter;
import java.io.*;
import org.jdom.*;
import org.jdom.input.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* <p>
* <b> PropertyListToKeel </b>
* </p>
*
* Clase extendida de la clase Importer. Esta clase permite convertir
* un fichero de datos con formato property list (con sintaxis xml) a
* formato de datos Keel.
*
* @author Teresa Prieto López (UCO)
* @version 1.0
*/
public class PropertyListToKeel extends Importer {
//Variable auxiliar para almacenar el texto de todos los descendientes de un nodo.
private String lineAux = new String();//Variable almacena el elemento o etiqueta principal que forma el documento xml.
private Element root;
/*
* Metodo utilizado para convertir los datos almacenados dentro del fichero
* con formato property list indicado mediante la variable pathnameInput a
* formato keel en el fichero indicado por la ruta pathnameOutput
*
* @param pathnameInput ruta del fichero con formato property list.
* @param pathnameOutput ruta con los datos en formato keel.
*
* @throws Exception */
public void Start(String pathnameInput, String pathnameOutput) throws Exception {
Pattern p;
Matcher m;
int numElements = 0;
int j = 0;
int i = 0;
int k = 0;
int cont = 0;
int type;
int actualValueInt;
double min;
double max;
double actualValue;
String nameAttribute = new String();
String nameAttributeInitial = new String();
String value = new String();
String nameChildren = new String();
List<Element> firstInstance;
Element children;
Element instance;
String vowel[] = {"a", "e", "i", "o", "u", "A", "E", "I", "O", "U"};
String vowel_accent[] = {"�", "�", "�", "�", "�", "�", "�", "�", "�", "�"};
try {
SAXBuilder builder = new SAXBuilder(false);
Document doc = builder.build(new File(pathnameInput));
//construyo el arbol en memoria desde el fichero
// que se lo pasaré por parametro.
root = doc.getRootElement();
FindParent(root, "dict");
//todos los hijos que tengan
List instances = root.getChildren();
numElements = instances.size();
//Buscamos un hijo con nombre dict para a partir de él saber el número de atributos
if (numElements > 0) {
i = 0;
children = (Element) instances.get(i);
nameChildren = children.getName();
while (!nameChildren.equalsIgnoreCase("dict")) {
i++;
children = (Element) instances.get(i);
nameChildren = children.getName();
}
firstInstance = ((Element) instances.get(i)).getChildren();
} else {
System.out.println("No hay instancias");
return;
}
for (i = 0; i < firstInstance.size(); i++) {
children = (Element) firstInstance.get(i);
nameChildren = children.getName();
if (nameChildren.equalsIgnoreCase("key")) {
numAttributes++;
}
}
//Reservamos memoria para almacenar la definición de los atributos y de los datos
attribute = new keel.Dataset.Attribute[numAttributes];
data = new Vector[numAttributes];
types = new Vector[numAttributes];
for (i = 0; i < numAttributes; i++) {
attribute[i] = new keel.Dataset.Attribute();
data[i] = new Vector();
types[i] = new Vector();
}
Iterator it = instances.iterator();
i = 0;
k = 0;
while (it.hasNext()) {
instance = (Element) it.next();
if (instance.getName().equalsIgnoreCase("dict")) {
List element = instance.getChildren();
numElements = element.size();
j = 0;
i = 0;
while (j < numElements) {
children = (Element) element.get(j);
if (children.getName().equalsIgnoreCase("key")) {
nameAttribute = children.getText();
p = Pattern.compile("^\\s+");
m = p.matcher(nameAttribute);
nameAttribute = m.replaceAll("");
p = Pattern.compile("\\s+$");
m = p.matcher(nameAttribute);
nameAttribute = m.replaceAll("");
nameAttribute = nameAttribute.replace("'", "");
nameAttribute = nameAttribute.replace("\"", "");
nameAttribute = nameAttribute.replace("\r", " ");
nameAttribute = nameAttribute.replace("\n", " ");
nameAttribute = nameAttribute.replace(" ", "");
nameAttribute = nameAttribute.replace("<", "<");
nameAttribute = nameAttribute.replace(">", ">");
nameAttribute = nameAttribute.replace(""", "\"");
nameAttribute = nameAttribute.replace("", "-");
nameAttribute = nameAttribute.replace("&", "&");
nameAttribute = nameAttribute.replace("<", "<");
nameAttribute = nameAttribute.replace(">", ">");
for (cont = 0; cont < vowel.length; cont++) {
nameAttribute = nameAttribute.replace("&" + vowel[cont] + "acute;", vowel_accent[cont]);
}
p = Pattern.compile("\\s+");
m = p.matcher(nameAttribute);
nameAttribute = m.replaceAll(" ");
if (nameAttribute.contains(" ")) {
StringTokenizer tokenUcfirts = new StringTokenizer(nameAttribute, " ");
String lineUcfirts = "";
if (tokenUcfirts.hasMoreTokens()) {
lineUcfirts = tokenUcfirts.nextToken();
}
while (tokenUcfirts.hasMoreTokens()) {
lineUcfirts = lineUcfirts.concat(UcFirst(tokenUcfirts.nextToken()));
}
nameAttribute = lineUcfirts;
}
if (nameAttribute.equals("") || nameAttribute.equals("?") || nameAttribute.equals("<null>")) {
nameAttribute = "ATTRIBUTE_" + (i + 1) + "";
}
if (k > 0) {
nameAttributeInitial = attribute[i].getName();
if (!nameAttributeInitial.equalsIgnoreCase(nameAttribute)) {
System.out.println("Los nombres de los atributos no coinciden en todas las instancias");
return;
}
}//end if
attribute[i].setName(nameAttribute);
}//end if
//El contador 'j' se usa para recorrer los hijos de "dict"
j++;
children = (Element) element.get(j);
if (children.getName().equalsIgnoreCase("array") || children.getName().equalsIgnoreCase("dict")) {
value = ListChildrenText(children, 0);
} else {
value = "";
value = children.getText();
}
p = Pattern.compile("^\\s+");
m = p.matcher(value);
value = m.replaceAll("");
p = Pattern.compile("\\s+$");
m = p.matcher(value);
value = m.replaceAll("");
value = value.replace("\r", " ");
value = value.replace("\n", " ");
value = value.replace(" ", "");
value = value.replace("<", "<");
value = value.replace(">", ">");
value = value.replace(""", "\"");
value = value.replace("", "-");
value = value.replace("&", "&");
value = value.replace("<", "<");
value = value.replace(">", ">");
for (cont = 0; cont < vowel.length; cont++) {
value = value.replace("&" + vowel[cont] + "acute;", vowel_accent[cont]);
}
if (value.equals("") || value.equals("<null>") || value == null) {
value = "?";
}
data[i].addElement(value);
j++;
//El valor de 'i' recorre los atributos
i++;
}//end while(j<numElements)
//El contador 'k' recorre las intancias
k++;
}//end if
}//end while
for (i = 0; i < data[0].size(); i++) {
for (j = 0; j < numAttributes; j++) {
value = (String) data[j].elementAt(i);
types[j].addElement(DataType(value));
}
}
for (i = 0; i < numAttributes; i++) {
if (types[i].contains(NOMINAL)) {
attribute[i].setType(NOMINAL);
} else {
if (types[i].contains(REAL)) {
attribute[i].setType(REAL);
} else {
if (types[i].contains(INTEGER)) {
attribute[i].setType(INTEGER);
} else {
attribute[i].setType(-1);
}
}
}
}
for (i = 0; i < data[0].size(); i++) {
for (j = 0; j < numAttributes; j++) {
value = (String) data[j].elementAt(i);
type = attribute[j].getType();
if (type == NOMINAL) {
p = Pattern.compile("[^A-ZÑa-zñ0-9_-]+");
m = p.matcher(value);
if (m.find() && !value.startsWith("'") && !value.endsWith("'") && !value.equals("?")) {
value = "'" + value + "'";
data[j].set(i, value);
}
if (!(attribute[j].isNominalValue(value)) && !value.equals("?")) {
attribute[j].addNominalValue(value);
}
}
if (type == INTEGER) {
if (!value.equals("?")) {
actualValueInt = Integer.valueOf(value);
data[j].set(i, actualValueInt);
if ((attribute[j].getFixedBounds()) == false) {
attribute[j].setBounds(actualValueInt, actualValueInt);
} else {
min = attribute[j].getMinAttribute();
max = attribute[j].getMaxAttribute();
if (actualValueInt < min) {
attribute[j].setBounds(actualValueInt, max);
}
if (actualValueInt > max) {
attribute[j].setBounds(min, actualValueInt);
}
}
}
}
if (type == REAL) {
if (!value.equals("?")) {
actualValue = Double.valueOf(value);
data[j].set(i, actualValue);
if ((attribute[j].getFixedBounds()) == false) {
attribute[j].setBounds(actualValue, actualValue);
} else {
min = attribute[j].getMinAttribute();
max = attribute[j].getMaxAttribute();
if (actualValue < min) {
attribute[j].setBounds(actualValue, max);
}
if (actualValue > max) {
attribute[j].setBounds(min, actualValue);
}
}
}
}
}//end while
}//end while
} catch (Exception e) {
System.out.println(e);
System.exit(1);
}
File f = new File(pathnameInput);
nameRelation = f.getName();
p = Pattern.compile("\\.[A-Za-z]+");
m = p.matcher(nameRelation);
nameRelation = m.replaceAll("");
p = Pattern.compile("\\s+");
m = p.matcher(nameRelation);
nameRelation = m.replaceAll("");
super.Save(pathnameOutput);
}//end Start()
/*
* Método encargado de recorrer todo el árbol xml para encontrar
* el nodo padre del nodo o etiqueta cuyo nombre coincida con el valor del parámetro childrenName.
* El nodo padre de dicha etiqueta será asignado a la variable miembro
* root.
*
* @param Element current. Elemento o nodo xml actual.
* @param String childrenName. Variable String que indica el
* nombre de la etiqueta a buscar.
*
*/
public void FindParent(Element current, String childrenName) {
if (current.getName().equalsIgnoreCase(childrenName)) {
this.root = current.getParentElement();
return;
} else {
List children = current.getChildren();
Iterator iterator = children.iterator();
while (iterator.hasNext()) {
Element child = (Element) iterator.next();
FindParent(child, childrenName);
}
}
} //end FindParent()
/*
* Método recursivo que devuelve el texto que contiene todos los descendientes
* de un nodo o etiqueta de un elemento xml.
*
* @param Element current que indica que nodo o etiqueta xml actual.
* @param int cont Variable que se utiliza como contador de descendientes.
*
* @return String . Devuelve el valor de la variable auxiliar lineAux
* que almacena el texto de todos los descendientes de un nodo separado
* cada uno por un espacio en blanco.
*
* @throws Exception
*/
public String ListChildrenText(Element current, int cont) {
if (cont == 0) {
lineAux = "";
}
if ((current.getChildren()).size() == 0) {
lineAux = lineAux.concat(current.getText() + " ");
}
List children = current.getChildren();
Iterator iterator = children.iterator();
while (iterator.hasNext()) {
Element child = (Element) iterator.next();
ListChildrenText(child, cont++);
}
return lineAux;
} //end listChildrenText()
}//end PropertyListToKeel