/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.example.table; import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; import java.util.HashMap; import java.util.Map; import java.util.StringTokenizer; import com.rapidminer.example.Attribute; import com.rapidminer.tools.att.AttributeSet; /** * Reads the data rows in sparse format. The format is specified in the class * comment of {@link com.rapidminer.operator.io.SparseFormatExampleSource}. * {@link Attribute}s may be passed to the reader in its constructor. If they * are ommitted, they are generated on the fly. In either case, indices are * assigned to the attributes. If an {@link AbstractExampleTable} is generated using * instances of this class, the constructor of {@link AbstractExampleTable} will * reassign these indexes. * * @author Simon Fischer, Ingo Mierswa * @version $Id: SparseFormatDataRowReader.java,v 2.20 2006/03/27 13:21:58 * ingomierswa Exp $ */ public class SparseFormatDataRowReader extends AbstractDataRowReader { /** Names of the formats. */ public static final String[] FORMAT_NAMES = { "xy", "yx", "prefix", "separate_file", "no_label" }; /** Label succeeds attributes. */ public static final int FORMAT_XY = 0; /** Label preceeds attributes. */ public static final int FORMAT_YX = 1; /** Label has a prefix specified in the prefix map. */ public static final int FORMAT_PREFIX = 2; /** Label is in separate file. */ public static final int FORMAT_SEPARATE_FILE = 3; /** Label is missing. */ public static final int FORMAT_NO_LABEL = 4; /** Reader for the labels. */ private BufferedReader inAttributes, inLabels; /** The attribute set with regular and special attributes. */ private AttributeSet attributeSet = null; /** Remember if an end of file has occured. */ private boolean eof; /** Remember if a line has already been read. */ private boolean lineRead; /** The maximum number of attributes to read. */ private int maxNumber; /** Number of lines already read. */ private int linesRead; /** The DataRow that will be returned in the next call to {@link #next()} */ private DataRow currentDataRow; /** * One out of FORMAT_XY, FORMAT_YX, FORMAT_PREFIX, FORMAT_SEPARATE_FILE, and * FORMAT_NO_LABEL. */ private int format; /** * The dimension of the examples, i.e. the total number of regular and * special attributes. */ private int dimension; /** Maps prefixes to special attribute names, e.g. "l:" to "label". */ private Map<String, String> prefixMap = new HashMap<String, String>(); /** * Creates a new data row reader for sparse format. The attributes indices * must not be set. If they are, they are reassigned new values when this * constructor is called! * * @param factory * Factory used to create {@link DataRow} instances. * @param format * One Out of FORMAT_XY, FORMAT_YX, FORMAT_PREFIX, and * FORMAT_SEPARATE_FILE. * @param prefixMap * Maps prefixes to special attribute names (e.g. "l" * to "label"). * @param attributeSet * Set of regular and special attributes. * @param attributeReader * Reader for the data * @param labelReader * Reader for the labels. Only necessary if format is * FORMAT_SEPARATE_FILE. * @param sampleSize * sample size, may be -1 for no limit. */ public SparseFormatDataRowReader(DataRowFactory factory, int format, Map<String, String> prefixMap, AttributeSet attributeSet, Reader attributeReader, Reader labelReader, int sampleSize) { super(factory); this.format = format; this.prefixMap = prefixMap; this.attributeSet = attributeSet; if (attributeSet == null) { throw new IllegalArgumentException("AttributeSet must not be null."); } this.dimension = attributeSet.getAllAttributes().size(); this.maxNumber = sampleSize; this.inAttributes = new BufferedReader(attributeReader); if (format == FORMAT_SEPARATE_FILE) { if (labelReader == null) throw new IllegalArgumentException("labelReader must not be null if format is 'separate_file'!"); this.inLabels = new BufferedReader(labelReader); } if (format != FORMAT_NO_LABEL) { if (attributeSet.getSpecialAttribute("label") == null) { throw new IllegalArgumentException("If format is not no_label, label attribute must be defined."); } } } /** Checks if futher examples exist. Returns false if one of the files end. */ public boolean hasNext() { if ((maxNumber > -1) && (linesRead >= maxNumber)) return false; if (lineRead) return !eof; try { eof = !readLine(); if (eof) { inAttributes.close(); if (inLabels != null) inLabels.close(); } } catch (IOException e) { throw new RuntimeException(e.getMessage(), e); } lineRead = true; return (!eof); } private boolean readLine() throws IOException { String attributeLine = null; do { attributeLine = inAttributes.readLine(); if (attributeLine == null) return false; } while (attributeLine.startsWith("#") || (attributeLine.length() == 0)); this.currentDataRow = getFactory().create(dimension); StringTokenizer tokenizer = new StringTokenizer(attributeLine); String labelString = null; if (format == FORMAT_YX) { labelString = tokenizer.nextToken(); } else if (format == FORMAT_SEPARATE_FILE) { do { labelString = inLabels.readLine(); if (labelString == null) return false; } while (labelString.startsWith("#") || (labelString.length() == 0)); } while (tokenizer.hasMoreTokens()) { String attributeToken = tokenizer.nextToken(); int colonIndex = attributeToken.indexOf(':'); if ((format == FORMAT_XY) && (colonIndex == -1)) { if (labelString != null) { throw new IOException("Malformed line in examplefile: " + attributeToken); } else { labelString = attributeToken; } } else { String pos = attributeToken.substring(0, colonIndex);// references // the // attribute String value = attributeToken.substring(colonIndex + 1); // the // attribute // value Attribute attribute = null; // the referenced attribute try { int index = Integer.parseInt(pos) - 1; if ((index < 0) || (index >= attributeSet.getNumberOfRegularAttributes())) throw new IOException("Attribute index out of range: '" + (index + 1) + "'! Index must be between 1 and dimension " + attributeSet.getNumberOfRegularAttributes() + "!"); attribute = attributeSet.getAttribute(index); } catch (NumberFormatException e) { String specialAttributeName = prefixMap.get(pos); if (specialAttributeName == null) { attribute = attributeSet.getSpecialAttribute(pos); if (attribute == null) throw new IOException("Illegal attribute index: '" + pos + "' (legal values are integers and defined prefixes for special attributes (Parameter prefix_map of SparseFormatExampleSource))!"); } else { attribute = attributeSet.getSpecialAttribute(specialAttributeName); } if (attribute == null) throw new IOException("Unknown special attribute: " + specialAttributeName); } if (attribute != null) { if (attribute.isNominal()) { currentDataRow.set(attribute, attribute.getMapping().mapString(value)); } else { try { currentDataRow.set(attribute, Double.parseDouble(value)); } catch (NumberFormatException e) { throw new IOException("Attribute is not numerical: '" + value + "'!"); } } } } } if (labelString != null) { Attribute label = attributeSet.getSpecialAttribute("label"); if (label.isNominal()) { currentDataRow.set(label, label.getMapping().mapString(labelString)); } else { try { currentDataRow.set(label, Double.parseDouble(labelString)); } catch (NumberFormatException e) { throw new IOException("Label is not numerical: '" + labelString + "'."); } } } currentDataRow.trim(); return true; } /** Returns the next Example. */ public DataRow next() { if (eof == true) return null; if (!lineRead) if (!hasNext()) return null; linesRead++; lineRead = false; return currentDataRow; } }