/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2011 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.tools.att;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.RapidMinerLineReader;
import com.rapidminer.tools.LoggingHandler;
import com.rapidminer.tools.Ontology;

/**
 * This class can be used as a simple attribute data source creation factory for many types of
 * table like data. It scans a column separated text file, guesses a value type for each column
 * and creates one {@link AttributeDataSource} per column.
 *
 * @author Ingo Mierswa
 */
public class AttributeDataSourceCreator {

	/** The attribute data sources created by the most recent call of <code>loadData</code>. */
	private final List<AttributeDataSource> sources = new ArrayList<AttributeDataSource>();

	public AttributeDataSourceCreator() {}

	/**
	 * Returns the attribute data sources created by the last call of <code>loadData</code>. The
	 * returned list is the internal list of this object and is cleared by the next call of
	 * <code>loadData</code>.
	 */
	public List<AttributeDataSource> getAttributeDataSources() {
		return sources;
	}

	/**
	 * Reads the given file line by line, guesses the value type of each column and replaces the
	 * current attribute data sources by one new source per column. If the file does not contain
	 * any line, the list of attribute data sources is left empty.
	 *
	 * @param file the file to read; its name is also used for building default column names
	 * @param commentChars passed to the {@link RapidMinerLineReader}
	 * @param columnSeparators passed to the {@link RapidMinerLineReader}
	 * @param decimalPointCharacter this character is replaced by '.' before a value is parsed as a number
	 * @param useQuotes passed to the {@link RapidMinerLineReader}
	 * @param quoteChar passed to the {@link RapidMinerLineReader}
	 * @param escapeChar passed to the {@link RapidMinerLineReader}
	 * @param trimLines passed to the {@link RapidMinerLineReader}
	 * @param firstLineAsNames if true, the columns of the first line are used as column names and
	 *        are not used for guessing the value types
	 * @param maxCounter if negative, all lines are read; otherwise reading stops as soon as the
	 *        (zero based) line counter exceeds this value
	 * @param skipErrorLines if true, lines with an unexpected number of columns are skipped (and a
	 *        warning is logged); if false, such a line causes an {@link IOException}. The flag is
	 *        also passed to the {@link RapidMinerLineReader}
	 * @param encoding the character encoding used for reading the file
	 * @param logging used for warnings about skipped lines, may be null
	 * @throws IOException if the file cannot be read or if a line has an unexpected number of
	 *         columns and <code>skipErrorLines</code> is false
	 */
	public void loadData(File file, char[] commentChars, String columnSeparators, char decimalPointCharacter, boolean useQuotes, char quoteChar, char escapeChar, boolean trimLines, boolean firstLineAsNames, int maxCounter, boolean skipErrorLines, Charset encoding, LoggingHandler logging) throws IOException {
		this.sources.clear();

		String[] columnNames = null;
		int maxColumns = -1;
		int[] valueTypes = null;
		boolean[] onlyMissing = null;

		RapidMinerLineReader lineReader = new RapidMinerLineReader(columnSeparators, commentChars, useQuotes, quoteChar, escapeChar, trimLines, skipErrorLines);
		BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding));
		try {
			int counter = 0;
			boolean first = true;
			while ((maxCounter < 0) || (counter <= maxCounter)) {
				String[] columns = lineReader.readLine(in, -1);
				if (columns == null) {
					// break loop if last line was read
					break;
				}

				if (first) {
					// the first line defines the expected number of columns
					maxColumns = columns.length;
					valueTypes = new int[maxColumns];
					onlyMissing = new boolean[maxColumns];
					for (int i = 0; i < maxColumns; i++) {
						valueTypes[i] = Ontology.INTEGER;
						onlyMissing[i] = true;
					}
					if (firstLineAsNames) {
						columnNames = columns;
					} else {
						guessValueTypes(columns, valueTypes, onlyMissing, decimalPointCharacter);
					}
					first = false;
				} else if (columns.length != maxColumns) {
					String message = "Number of columns in line " + counter + " was unexpected, was: " + columns.length + ", expected: " + maxColumns;
					if (!skipErrorLines) {
						throw new IOException(message);
					}
					if (logging != null) {
						logging.logWarning(message + ". Skipping line...");
					}
				} else {
					guessValueTypes(columns, valueTypes, onlyMissing, decimalPointCharacter);
				}
				counter++;
			}
		} finally {
			// also release the file handle if reading was aborted by an exception
			in.close();
		}

		if (valueTypes == null) {
			// no line was read at all, hence there are no columns to describe
			return;
		}

		// set the value type of all columns with missing values only to nominal
		for (int i = 0; i < valueTypes.length; i++) {
			if (onlyMissing[i]) {
				valueTypes[i] = Ontology.NOMINAL;
			}
		}

		columnNames = completeColumnNames(columnNames, maxColumns, file.getName());

		for (int i = 0; i < maxColumns; i++) {
			this.sources.add(new AttributeDataSource(AttributeFactory.createAttribute(columnNames[i], valueTypes[i]), file, i, "attribute"));
		}
	}

	/**
	 * Returns an array with at least <code>numberOfColumns</code> names. Names which are not
	 * provided by <code>names</code> (which may be null) are created from the default name and
	 * the one based column index, e.g. <code>data.csv (3)</code>.
	 */
	private static String[] completeColumnNames(String[] names, int numberOfColumns, String defaultName) {
		int known = (names == null) ? 0 : names.length;
		if (known >= numberOfColumns) {
			return names;
		}
		String[] completed = new String[numberOfColumns];
		if (known > 0) {
			System.arraycopy(names, 0, completed, 0, known);
		}
		for (int i = known; i < numberOfColumns; i++) {
			completed[i] = defaultName + " (" + (i + 1) + ")";
		}
		return completed;
	}

	/**
	 * Updates the guessed value types with the values of a single data row. Values which are
	 * null, empty or equal to "?" are regarded as missing and are ignored. For all other values
	 * the corresponding entry of <code>onlyMissing</code> is set to false. A value which cannot be
	 * parsed as a number sets the type of its column to {@link Ontology#NOMINAL}; a number
	 * which changes when cast to int turns a column of type {@link Ontology#INTEGER} into
	 * {@link Ontology#REAL}. Columns of any other type are not changed by numbers.
	 *
	 * @param data the values of the row; must provide at least <code>valueTypes.length</code> entries
	 * @param valueTypes the value types guessed so far, updated in place
	 * @param onlyMissing flags indicating the columns without any non-missing value so far;
	 *        updated in place
	 * @param decimalPointCharacter this character is replaced by '.' before a value is parsed
	 */
	public static void guessValueTypes(String[] data, int[] valueTypes, boolean[] onlyMissing, char decimalPointCharacter) {
		for (int c = 0; c < valueTypes.length; c++) {
			String value = data[c];
			if ((value == null) || (value.length() == 0) || value.equals("?")) {
				// missing value, does not tell anything about the type
				continue;
			}
			onlyMissing[c] = false;
			try {
				double d = Double.parseDouble(value.replace(decimalPointCharacter, '.'));
				if ((valueTypes[c] == Ontology.INTEGER) && ((int) d != d)) {
					valueTypes[c] = Ontology.REAL;
				}
			} catch (NumberFormatException e) {
				// not a number: the column can only be nominal
				valueTypes[c] = Ontology.NOMINAL;
			}
		}
	}
}