/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.example.table; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; import com.rapidminer.example.Attribute; import com.rapidminer.tools.LogService; import com.rapidminer.tools.RandomGenerator; import com.rapidminer.tools.Tools; import com.rapidminer.tools.att.AttributeDataSource; /** * <p> * FileDataRowReader implements a DataRowReader that reads DataRows from a file. * This is the main data reader for many file formats (including csv) and is * used by the ExampleSource operator and the attribute editor. * </p> * * <p> * This class supports the reading of data from multiple source files. Each * attribute (including special attributes like labels, weights, ...) might be * read from another file. Please note that only the minimum number of lines of * all files will be read, i.e. if one of the data source files has less lines * than the others, only this number of data rows will be read. * </p> * * <p> * The split points can be defined with regular expressions (please refer to the * Java API). Quoting is possible but not suggested since the runtime is higher. * The user should ensure that the split characters are not included in the data * columns. Please refer to {@link RapidMinerLineReader} for further information. * </p> * * <p> * Unknown attribute values can be marked with empty strings or "?". * </p> * * @author Ingo Mierswa * @version $Id: FileDataRowReader.java,v 2.21 2006/03/27 13:21:58 ingomierswa * Exp $ */ public class FileDataRowReader extends AbstractDataRowReader { private static final int FILE_NR = 0; private static final int COLUMN_NR = 1; /** The file readers. */ private BufferedReader[] fileReader; /** The attribute descriptions. */ private Attribute[] attributes; /** Remember if an end of file has occured. */ private boolean eof; /** Remember if a line has already been read. */ private boolean lineRead; /** The sample ratio. */ private double sampleRatio = 1.0d; /** The maximum number of examples to read (sampling). */ private int maxNumber = -1; /** The number of lines read so far (i.e. the number of examples). */ private int linesRead = 0; /** * This array hold the current data. The first dimension is used for * distinguishing different sources and the second for data read from the * corresponding source. */ private String[][] currentData; /** * This array holds the information how many columns each data source should * provide. Otherwise an IOException will be thrown. This information is * only used for checks and error improvement. */ private int[] expectedNumberOfColumns; /** This reader maps lines read from a file to RapidMiner columns. */ private RapidMinerLineReader rapidMinerLineReader; /** The random generator used for sampling. */ private RandomGenerator random; /** * Array of size [number of attributes][2]. For each attribute i the value * of dataSourceIndex[i][FILE_NR] is used as an index to {@link #fileReader} * and the value of dataSourceIndex[i][TOKEN_NR] specifies the index of the * column to use for attribute i. */ private int[][] dataSourceIndex; /** * Constructs a new FileDataRowReader. * * @param factory * Factory used to create data rows. * @param attributeDataSources * List of {@link AttributeDataSource}s. * @param sampleRatio * the ratio of examples which will be read. Only used if * sampleSize is -1. * @param sampleSize * Limit sample to the first sampleSize lines read from files. -1 * for no limit, then the sampleRatio will be used. * @param separatorsRegExpr * a regular expression describing the separator characters for * the columns of each line * @param commentChars * defines which characters are used to comment the rest of a * line * @param useQuotes * indicates if quotes should be used and parsed. Slows down * reading and should be avoided if possible * @param random * the random generator used for sampling */ public FileDataRowReader(DataRowFactory factory, List<AttributeDataSource> attributeDataSources, double sampleRatio, int sampleSize, String separatorsRegExpr, char[] commentChars, boolean useQuotes, boolean trimLines, Charset encoding, RandomGenerator random) throws IOException { super(factory); this.sampleRatio = sampleRatio; this.maxNumber = sampleSize; this.attributes = new Attribute[attributeDataSources.size()]; this.dataSourceIndex = new int[attributeDataSources.size()][2]; this.rapidMinerLineReader = new RapidMinerLineReader(separatorsRegExpr, commentChars, useQuotes, trimLines); this.random = random; initReader(factory, attributeDataSources, sampleSize, separatorsRegExpr, useQuotes, encoding); } /** Skips the next line, if present. */ public void skipLine() { try { readLine(); } catch (Exception e) { LogService.getGlobal().log("Problem during skipping of line: " + e.getMessage(), LogService.WARNING); } } /** Read the complete data. */ private void initReader(DataRowFactory factory, List<AttributeDataSource> attributeDataSources, int sampleSize, String separatorsRegExpr, boolean useQuotes, Charset encoding) throws IOException { // map all files used to indices List<BufferedReader> readerList = new LinkedList<BufferedReader>(); Map<File, Integer> fileMap = new HashMap<File, Integer>(); Iterator<AttributeDataSource> i = attributeDataSources.iterator(); int attribute = 0; int greatestFileIndex = -1; List<AtomicInteger> columnCounters = new ArrayList<AtomicInteger>(); while (i.hasNext()) { AttributeDataSource ads = i.next(); attributes[attribute] = ads.getAttribute(); File file = ads.getFile(); Integer fileIndex = fileMap.get(file); // new file found? -> create reader and map to index number if (fileIndex == null) { fileIndex = Integer.valueOf(++greatestFileIndex); fileMap.put(file, fileIndex); readerList.add(Tools.getReader(file, encoding)); columnCounters.add(new AtomicInteger(1)); } else { AtomicInteger counter = columnCounters.get(fileIndex.intValue()); counter.incrementAndGet(); } dataSourceIndex[attribute][FILE_NR] = fileIndex.intValue(); dataSourceIndex[attribute][COLUMN_NR] = ads.getColumn(); attribute++; } this.fileReader = new BufferedReader[readerList.size()]; readerList.toArray(this.fileReader); currentData = new String[this.fileReader.length][]; // create counters expectedNumberOfColumns = new int[columnCounters.size()]; Iterator<AtomicInteger> j = columnCounters.iterator(); int k = 0; while (j.hasNext()) expectedNumberOfColumns[k++] = j.next().intValue(); } /** * Reads a line of data from all file readers. Returns true if the line was * readable, i.e. the end of the source files was not yet reached. */ private boolean readLine() throws IOException { boolean eofReached = false; boolean ok = false; while (!ok) { for (int i = 0; i < fileReader.length; i++) { currentData[i] = rapidMinerLineReader.readLine(fileReader[i], expectedNumberOfColumns[i]); if (currentData[i] == null) { eofReached = true; break; } } if ((eofReached) || (maxNumber != -1) || (sampleRatio == 1.0d) || (random.nextDouble() < sampleRatio)) ok = true; } if (eofReached) { for (int i = 0; i < fileReader.length; i++) { fileReader[i].close(); } return false; } else { return true; } } /** * Checks if another line exists and reads. The next line is only read once * even if this method is invoked more than once. */ public boolean hasNext() { if ((maxNumber > -1) && (linesRead >= maxNumber)) return false; if (lineRead) return !eof; try { eof = !readLine(); } catch (IOException e) { LogService.getGlobal().log(e.getMessage(), LogService.ERROR); return false; } lineRead = true; return (!eof); } /** Returns the next Example. */ public DataRow next() { if (eof == true) return null; if (!lineRead) if (!hasNext()) return null; String[] data = new String[attributes.length]; for (int i = 0; i < attributes.length; i++) { if (dataSourceIndex[i][1] == -1) { data[i] = null; } else { data[i] = currentData[dataSourceIndex[i][0]][dataSourceIndex[i][1]]; } } DataRow dataRow = getFactory().create(data, attributes); linesRead++; lineRead = false; return dataRow; } }