/* * To change this template, choose Tools | Templates * and open the template in the editor. */ package br.uff.ic.oceano.ostra.tools.datamining.util; import br.uff.ic.oceano.core.exception.ServiceException; import br.uff.ic.oceano.ostra.model.DataBaseSnapshot; import static br.uff.ic.oceano.ostra.controle.Constantes.*; import br.uff.ic.oceano.ostra.discretizer.Discretizer; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import java.util.logging.Level; import java.util.logging.Logger; /** * * @author DanCastellani */ public class DatabaseToArffService { private static final String RELATION = "@RELATION "; private static final String ATTRIBUTE = "@ATTRIBUTE "; private static final String DATA = "@DATA "; private static final String REAL = " REAL"; private static final String NUMERIC = " NUMERIC"; public static String dataBaseToARFF(DataBaseSnapshot dbs, List<Discretizer> discretizers) throws ServiceException { if (dbs.getInstancesSize() == 0) { return null; } Map<String, Discretizer> discretizersMap = initializeDiscretizers(dbs, discretizers); final List<String> instanciasFormatadas = formataDados(dbs.getInstances()); final String dataBaseName = "\"Oceano " + dbs.getFormatedSnapshotTime() + "\""; final String header = getHeader(dbs.getAttributes(), instanciasFormatadas, dataBaseName, discretizersMap); final String data = getData(instanciasFormatadas); final String strARFF = header + data; // System.out.println("********************************* ARFF TO RETURN ************************************"); // System.out.println(strARFF); // System.out.println("*************************************************************************************"); return strARFF; } private static String getHeader(List<String> attributeNames, List<String> instanciasFormatadas, String dataBaseName, Map<String, Discretizer> discretizersMap) { List<StringTokenizer> valores = new ArrayList<StringTokenizer>(instanciasFormatadas.size()); List<Set<String>> atributos = new ArrayList<Set<String>>(instanciasFormatadas.size()); // inicializaListaDeValores(instanciasFormatadas, valores); // inicializaConjuntoDeValores(valores, atributos); // preencheValoresDeAtributos(valores, atributos); final StringBuffer header = new StringBuffer(); header.append(RELATION).append(dataBaseName).append("\n%\n"); preencheCabecalhoComAtributos(attributeNames, atributos, header, discretizersMap); header.append("%\n"); header.append("% Instancias: ").append(instanciasFormatadas.size()).append("\n"); header.append("%\n"); return header.toString(); } private static String getData(List<String> instancias) { final StringBuffer data = new StringBuffer(); data.append(DATA); data.append("\n%\n"); for (String linha : instancias) { data.append(linha).append("\n"); } return data.toString(); } private static void inicializaConjuntoDeValores(List<StringTokenizer> valores, List<Set<String>> atributos) { //inicializa os conjuntos de valores de cada atributo for (int i = 0; i < valores.get(0).countTokens(); i++) { atributos.add(new HashSet<String>()); } } private static void inicializaListaDeValores(List<String> instancias, List<StringTokenizer> valores) { //inicializa a lista de valores e atributos for (String instancia : instancias) { StringTokenizer st = new StringTokenizer(instancia, ARFF_VALUE_SEPARATOR); valores.add(st); } } private static void preencheCabecalhoComAtributos(List<String> names, List<Set<String>> atributos, final StringBuffer header, Map<String, Discretizer> discretizersMap) { //constroi o string de cabeƧalho for (int numeroDoAtributo = 0; numeroDoAtributo < atributos.size(); numeroDoAtributo++) { //valores deste atributo final StringBuffer attributeValues = new StringBuffer(); for (String valor : atributos.get(numeroDoAtributo)) { attributeValues.append(valor).append(ARFF_VALUE_SEPARATOR); } final int stringSize = attributeValues.toString().length(); String attributeName = names.get(numeroDoAtributo); if (attributeName.contains(" ")) { attributeName = ARFF_STRING_DELIMITER + attributeName + ARFF_STRING_DELIMITER; } String attributeDeclarationValue = null; //Verify and Apply discretizer // System.out.println("Construindo cabeƧalho do atributo: " + attributeName); if (discretizersMap.containsKey(attributeName.replace("\"", ""))) { Discretizer discretizer = discretizersMap.get(attributeName.replace("\"", "")); attributeDeclarationValue = " {" + discretizer.getHeaderDeclaration(attributeValues.toString().substring(0, stringSize - 1)) + "}"; } else if (attributeName.startsWith(PREFIX_ATTRIBUTE_NUMBER)) { attributeDeclarationValue = NUMERIC; } else if (names.get(numeroDoAtributo).startsWith(PREFIX_DELTA_METRIC_AVARAGE)) { attributeDeclarationValue = REAL; } else if (names.get(numeroDoAtributo).startsWith(PREFIX_DELTA_METRIC_STANDARD_DEVIATON)) { attributeDeclarationValue = REAL; } else { attributeDeclarationValue = " {" + attributeValues.toString().substring(0, stringSize - 1) + "}"; } // System.out.println("attributeDeclarationValue = " + attributeDeclarationValue); header.append(ATTRIBUTE).append(attributeName).append(attributeDeclarationValue).append("\n"); } } private static void preencheValoresDeAtributos(List<StringTokenizer> valores, List<Set<String>> atributos) { //preenche os valores de cada atributo for (StringTokenizer stringTokenizer : valores) { int posicaoAtributo = 0; while (stringTokenizer.hasMoreTokens()) { final String valor = stringTokenizer.nextToken(); atributos.get(posicaoAtributo).add(valor); posicaoAtributo++; } } } private static String preparaString(String s) { final String trimS = s.trim(); if (trimS.contains(" ")) { return "\"" + trimS + "\""; } else { return trimS; } } private static List<String> formataDados(List<String> instancias) { List<String> instanciasFormatadas = new ArrayList<String>(instancias.size()); StringTokenizer st; for (String linha : instancias) { st = new StringTokenizer(linha, ATTRIBUTE_SEPARATOR); StringBuilder sb = new StringBuilder(); while (st.hasMoreTokens()) { final String s = st.nextToken(); if (s.trim().equals("null")) { sb.append(ATTRIBUTE_NOT_KNOWN_SYMBOL); } else { sb.append(preparaString(s)); } sb.append(ARFF_VALUE_SEPARATOR); } final String linhaFormada = sb.toString(); instanciasFormatadas.add(linhaFormada.substring(0, linhaFormada.length() - 1)); } return instanciasFormatadas; } /** * Initializes the discretizer's list validating it and updating the attribute's names when necessary. * @param discretizers * @throws ServiceException */ private static Map<String, Discretizer> initializeDiscretizers(DataBaseSnapshot dbs, List<Discretizer> discretizers) throws ServiceException { Map<String, Discretizer> discretizersMap = new HashMap<String, Discretizer>(); if (discretizers == null) { return discretizersMap; } for (Discretizer discretizer : discretizers) { //validate if (!dbs.getAttributes().contains(discretizer.getAttributeTarget())) { final String msg = "Attribute target not known " + discretizer.getAttributeTarget() + " of " + discretizer.getClass().getCanonicalName(); Logger.getLogger(DatabaseToArffService.class.getName()).log(Level.WARNING, msg); } //insert into map discretizersMap.put(discretizer.getAttributeTarget(), discretizer); } return discretizersMap; } }