/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2011 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.gui.tools.dialogs.wizards.dataimport.csv;

import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.Set;

import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MDInteger;
import com.rapidminer.operator.ports.metadata.SetRelation;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.Tools;
import com.rapidminer.tools.math.container.Range;

/**
 * Collects per-column statistics from rows of string values that are fed in
 * one at a time via {@link #update(String[])}.
 * <p>
 * For every column the evaluator tracks whether all non-missing values seen so
 * far could be parsed as numbers (and as integers), the minimum and maximum
 * parsed value, the number of missing (null or empty) values and the set of
 * distinct values. {@link #finish(boolean)} derives a value type per column
 * from these statistics and {@link #getMetaData()} turns them into an
 * {@link ExampleSetMetaData}.
 *
 * @author Tobias Malbrecht
 */
public abstract class DataEvaluator {

	/** Format used to parse every non-missing value as a number. */
	private final NumberFormat numberFormat;

	/** Number of rows passed to {@link #update(String[])} since the last {@link #start()}. */
	private int rowCount = 0;

	/** Number of columns currently tracked; all per-column arrays have this length. */
	private int columnCount = 0;

	private String[] columnNames = null;

	private boolean[] canParseDouble = null;

	private boolean[] canParseInteger = null;

	private double[] minValues = null;

	private double[] maxValues = null;

	private int[] numberOfMissings = null;

	/** Distinct non-missing values per column, in order of first occurrence. */
	private final ArrayList<LinkedHashSet<String>> valueSets = new ArrayList<LinkedHashSet<String>>();

	private int[] valueTypes = null;

	/** Value passed to the last {@link #finish(boolean)} call; see {@link #isGuess()}. */
	private boolean complete = false;

	// TODO add date guessing and missing evaluation
	// TODO add different formats for each string array value

	/**
	 * Creates an evaluator and resets it via {@link #start()}.
	 *
	 * @param numberFormat the format used to parse values as numbers
	 */
	public DataEvaluator(NumberFormat numberFormat) {
		this.numberFormat = numberFormat;
		start();
	}

	/**
	 * Resets all collected statistics so that the evaluator tracks zero rows
	 * and zero columns again.
	 */
	public void start() {
		rowCount = 0;
		columnCount = 0;
		columnNames = null;
		canParseDouble = new boolean[columnCount];
		canParseInteger = new boolean[columnCount];
		minValues = new double[columnCount];
		maxValues = new double[columnCount];
		numberOfMissings = new int[columnCount];
		valueTypes = null;
		valueSets.clear();
		complete = false;
	}

	/**
	 * Sets the column names. If more names are given than columns are
	 * currently tracked, the tracked column count grows accordingly.
	 * <p>
	 * The array is copied, so {@link #finish(boolean)} filling in generic
	 * names never writes into the caller's array.
	 *
	 * @param columnNames the column names; entries may be null or empty
	 */
	public void setColumnNames(String[] columnNames) {
		this.columnNames = columnNames.clone();
		if (columnCount < columnNames.length) {
			extendToLength(columnNames.length);
		}
	}

	/**
	 * Sets the value types. If more types are given than columns are
	 * currently tracked, the tracked column count grows accordingly.
	 * <p>
	 * Note that {@link #finish(boolean)} recomputes the value types from the
	 * collected statistics and replaces whatever was set here.
	 *
	 * @param valueTypes the value type of each column
	 */
	public void setValueTypes(int[] valueTypes) {
		this.valueTypes = valueTypes.clone();
		if (columnCount < valueTypes.length) {
			extendToLength(valueTypes.length);
		}
	}

	/**
	 * Feeds one row into the evaluator and updates the statistics of every
	 * column that the row provides a value for. Rows longer than the current
	 * column count extend it; columns beyond the end of a shorter row are
	 * left untouched.
	 *
	 * @param values the values of one row; null or empty entries count as missing
	 */
	public void update(String[] values) {
		if (columnCount < values.length) {
			extendToLength(values.length);
		}
		for (int i = 0; i < values.length; i++) {
			String raw = values[i];
			if (raw == null || raw.isEmpty()) {
				numberOfMissings[i]++;
				continue;
			}
			valueSets.get(i).add(raw);

			// TODO add date handling
			// once a column failed to parse it is never tried again
			if (!canParseDouble[i]) {
				continue;
			}
			try {
				// NOTE(review): NumberFormat.parse(String) accepts a numeric prefix,
				// so a value such as "12abc" is still counted as numeric here.
				Number number = numberFormat.parse(raw);
				double value = number.doubleValue();
				if (value < minValues[i]) {
					minValues[i] = value;
				}
				if (value > maxValues[i]) {
					maxValues[i] = value;
				}
				// Compare the parsed value itself (not its rounded form) with its
				// int conversion; comparing Math.round(value) let values such as
				// 1.4 pass as integers because both sides evaluated to 1.
				if (canParseInteger[i] && !Tools.isEqual(value, number.intValue())) {
					canParseInteger[i] = false;
				}
			} catch (ParseException e) {
				// not a number in the given format: the column is neither real nor integer
				canParseDouble[i] = false;
				canParseInteger[i] = false;
			}
		}
		rowCount++;
	}

	/**
	 * Grows all per-column structures to the given length, keeping the
	 * statistics of the columns tracked so far and initializing the new ones.
	 * Callers only invoke this with a length greater than the current column
	 * count.
	 */
	private void extendToLength(int length) {
		boolean[] newCanParseDouble = new boolean[length];
		boolean[] newCanParseInteger = new boolean[length];
		double[] newMinValues = new double[length];
		double[] newMaxValues = new double[length];
		int[] newNumberOfMissings = new int[length];

		// keep what has been collected for the existing columns
		System.arraycopy(canParseDouble, 0, newCanParseDouble, 0, columnCount);
		System.arraycopy(canParseInteger, 0, newCanParseInteger, 0, columnCount);
		System.arraycopy(minValues, 0, newMinValues, 0, columnCount);
		System.arraycopy(maxValues, 0, newMaxValues, 0, columnCount);
		System.arraycopy(numberOfMissings, 0, newNumberOfMissings, 0, columnCount);

		// new columns start optimistic (parseable) with an empty value range
		for (int i = columnCount; i < length; i++) {
			newCanParseDouble[i] = true;
			newCanParseInteger[i] = true;
			newMinValues[i] = Double.MAX_VALUE;
			// Double.MIN_VALUE is the smallest POSITIVE double and would be a
			// wrong maximum for columns holding only negative values or zero
			newMaxValues[i] = -Double.MAX_VALUE;
			newNumberOfMissings[i] = 0;
		}

		canParseDouble = newCanParseDouble;
		canParseInteger = newCanParseInteger;
		minValues = newMinValues;
		maxValues = newMaxValues;
		numberOfMissings = newNumberOfMissings;

		while (valueSets.size() < length) {
			valueSets.add(new LinkedHashSet<String>());
		}
		columnCount = length;
	}

	/**
	 * Finalizes the evaluation: makes sure every tracked column has a name
	 * (using {@link #getGenericColumnName(int)} for null or empty names) and
	 * derives the value type of every column from the collected statistics.
	 *
	 * @param complete stored and reported, negated, by {@link #isGuess()}
	 */
	public void finish(boolean complete) {
		this.complete = complete;

		if (columnNames == null) {
			columnNames = new String[columnCount];
		} else if (columnCount > columnNames.length) {
			// pad to the full column count so that every column can be named below
			String[] paddedNames = new String[columnCount];
			System.arraycopy(columnNames, 0, paddedNames, 0, columnNames.length);
			columnNames = paddedNames;
		}
		for (int i = 0; i < columnNames.length; i++) {
			if (columnNames[i] == null || columnNames[i].isEmpty()) {
				columnNames[i] = getGenericColumnName(i);
			}
		}

		valueTypes = new int[columnCount];
		for (int i = 0; i < columnCount; i++) {
			if (canParseInteger[i]) {
				valueTypes[i] = Ontology.INTEGER;
			} else if (canParseDouble[i]) {
				valueTypes[i] = Ontology.REAL;
			} else if (valueSets.get(i).size() <= 2) {
				valueTypes[i] = Ontology.BINOMINAL;
			} else {
				valueTypes[i] = Ontology.NOMINAL;
			}
		}
	}

	/** Returns the internal column name array (null until names were set or {@link #finish(boolean)} ran). */
	protected String[] getColumnNames() {
		return columnNames;
	}

	/** Returns the number of columns currently tracked. */
	protected int getColumnCount() {
		return columnCount;
	}

	/** Returns the number of rows fed in since the last {@link #start()}. */
	protected int getRowCount() {
		return rowCount;
	}

	/** Returns the internal value type array (null until types were set or {@link #finish(boolean)} ran). */
	protected int[] getValueTypes() {
		return valueTypes;
	}

	/** Returns the internal array holding the number of missing values per column. */
	protected int[] getNumberOfMissings() {
		return numberOfMissings;
	}

	/** Returns the distinct non-missing values seen in the given column. */
	protected Set<String> getValueSet(int column) {
		return valueSets.get(column);
	}

	/** Returns true unless the last {@link #finish(boolean)} call was given {@code true}. */
	protected boolean isGuess() {
		return !complete;
	}

	/**
	 * Builds meta data from the collected statistics: one attribute per
	 * column with its name, value type, number of missing values and either
	 * its value range (numerical types) or its value set (all other types).
	 * While {@link #isGuess()} holds, ranges and value sets are marked as
	 * supersets and the missing and example counts as increased by an unknown
	 * amount.
	 * <p>
	 * Reads the column names and value types, so these must have been
	 * established before, e.g. by {@link #finish(boolean)}.
	 *
	 * @return the meta data describing the evaluated columns
	 */
	public ExampleSetMetaData getMetaData() {
		ExampleSetMetaData metaData = new ExampleSetMetaData();
		boolean guess = isGuess();
		SetRelation relation = guess ? SetRelation.SUPERSET : SetRelation.EQUAL;

		for (int i = 0; i < getColumnCount(); i++) {
			int valueType = getValueTypes()[i];
			AttributeMetaData amd = new AttributeMetaData(getColumnNames()[i], valueType);

			MDInteger missings = new MDInteger(getNumberOfMissings()[i]);
			if (guess) {
				missings.increaseByUnknownAmount();
			}

			if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(valueType, Ontology.NUMERICAL)) {
				amd.setValueRange(new Range(minValues[i], maxValues[i]), relation);
			} else {
				amd.setValueSet(getValueSet(i), relation);
			}
			amd.setNumberOfMissingValues(missings);
			metaData.addAttribute(amd);
		}

		metaData.setNumberOfExamples(new MDInteger(getRowCount()));
		if (guess) {
			metaData.getNumberOfExamples().increaseByUnknownAmount();
			metaData.attributesAreSuperset();
		}
		return metaData;
	}

	/**
	 * Returns the name to use for a column that has no (or an empty) name.
	 *
	 * @param column the zero-based column index
	 * @return the generated column name
	 */
	public abstract String getGenericColumnName(int column);
}