// FileDataRowReader.java example
//
// Explorer
// rapidminer-studio-master
// - doc
//   - doc
// - src
/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 * 
 * Complete list of developers available at our web site:
 * 
 * http://rapidminer.com
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.example.table;

import com.rapidminer.example.Attribute;
import com.rapidminer.tools.LogService;
import com.rapidminer.tools.RandomGenerator;
import com.rapidminer.tools.Tools;
import com.rapidminer.tools.att.AttributeDataSource;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;


/**
 * <p>
 * FileDataRowReader implements a DataRowReader that reads DataRows from a file. This is the main
 * data reader for many file formats (including csv) and is used by the ExampleSource operator and
 * the attribute editor.
 * </p>
 * 
 * <p>
 * This class supports the reading of data from multiple source files. Each attribute (including
 * special attributes like labels, weights, ...) might be read from another file. Please note that
 * only the minimum number of lines of all files will be read, i.e. if one of the data source files
 * has fewer lines than the others, only this number of data rows will be read.
 * </p>
 * 
 * <p>
 * The split points can be defined with regular expressions (please refer to the Java API). Quoting
 * is possible but not suggested since the runtime is higher. The user should ensure that the split
 * characters are not included in the data columns. Please refer to {@link RapidMinerLineReader} for
 * further information.
 * </p>
 * 
 * <p>
 * Unknown attribute values can be marked with empty strings or "?".
 * </p>
 * 
 * <p>
 * The underlying file readers are closed as soon as iteration ends: when the end of one of the
 * source files is reached, when the sample size limit is reached, or when reading fails with an
 * {@link IOException}.
 * </p>
 * 
 * @author Ingo Mierswa
 */
public class FileDataRowReader extends AbstractDataRowReader {

	/** Index into a {@link #dataSourceIndex} entry holding the index of the file reader. */
	private static final int FILE_NR = 0;

	/** Index into a {@link #dataSourceIndex} entry holding the column index within the file. */
	private static final int COLUMN_NR = 1;

	/** The file readers, one per distinct source file. */
	private BufferedReader[] fileReader;

	/** The attribute descriptions. */
	private final Attribute[] attributes;

	/** Remember if an end of file has occurred (or reading was aborted after an error). */
	private boolean eof;

	/** Remember if a line has already been read. */
	private boolean lineRead;

	/** Remember if the file readers have already been closed, so that they are closed only once. */
	private boolean readersClosed;

	/** The sample ratio. */
	private final double sampleRatio;

	/** The maximum number of examples to read (sampling). */
	private final int maxNumber;

	/** The number of lines read so far (i.e. the number of examples). */
	private int linesRead = 0;

	/**
	 * This array holds the current data. The first dimension is used for distinguishing different
	 * sources and the second for data read from the corresponding source.
	 */
	private String[][] currentData;

	/**
	 * This array holds, for each data source, the number of attributes that are read from it. The
	 * value is handed to the {@link RapidMinerLineReader} as the expected number of columns and is
	 * only used for checks and error improvement.
	 */
	private int[] expectedNumberOfColumns;

	/** This reader maps lines read from a file to RapidMiner columns. */
	private final RapidMinerLineReader rapidMinerLineReader;

	/** The random generator used for sampling. */
	private final RandomGenerator random;

	/**
	 * Array of size [number of attributes][2]. For each attribute i the value of
	 * dataSourceIndex[i][FILE_NR] is used as an index to {@link #fileReader} and the value of
	 * dataSourceIndex[i][COLUMN_NR] specifies the index of the column to use for attribute i. A
	 * column index of -1 means that the attribute has no source column and is read as
	 * <code>null</code>.
	 */
	private final int[][] dataSourceIndex;

	/**
	 * Constructs a new FileDataRowReader and opens a reader for every distinct source file.
	 * 
	 * @param factory
	 *            Factory used to create data rows.
	 * @param attributeDataSources
	 *            List of {@link AttributeDataSource}s.
	 * @param sampleRatio
	 *            the ratio of examples which will be read. Only used if sampleSize is -1.
	 * @param sampleSize
	 *            Limit sample to the first sampleSize lines read from files. -1 for no limit, then
	 *            the sampleRatio will be used.
	 * @param separatorsRegExpr
	 *            a regular expression describing the separator characters for the columns of each
	 *            line
	 * @param commentChars
	 *            defines which characters are used to comment the rest of a line
	 * @param useQuotes
	 *            indicates if quotes should be used and parsed. Slows down reading and should be
	 *            avoided if possible
	 * @param quoteChar
	 *            the quote character; passed on to the {@link RapidMinerLineReader}
	 * @param escapeChar
	 *            the escape character; passed on to the {@link RapidMinerLineReader}
	 * @param trimLines
	 *            passed on to the {@link RapidMinerLineReader}
	 * @param skipErrorLines
	 *            passed on to the {@link RapidMinerLineReader}
	 * @param encoding
	 *            the encoding used for opening the source files
	 * @param random
	 *            the random generator used for sampling
	 * @throws IOException
	 *             if one of the source files cannot be opened. Readers which were already opened
	 *             are closed before the exception is propagated.
	 */
	public FileDataRowReader(DataRowFactory factory, List<AttributeDataSource> attributeDataSources, double sampleRatio,
			int sampleSize, String separatorsRegExpr, char[] commentChars, boolean useQuotes, char quoteChar,
			char escapeChar, boolean trimLines, boolean skipErrorLines, Charset encoding, RandomGenerator random)
			throws IOException {
		super(factory);
		this.sampleRatio = sampleRatio;
		this.maxNumber = sampleSize;
		this.attributes = new Attribute[attributeDataSources.size()];
		this.dataSourceIndex = new int[attributeDataSources.size()][2];
		this.rapidMinerLineReader = new RapidMinerLineReader(separatorsRegExpr, commentChars, useQuotes, quoteChar,
				escapeChar, trimLines, skipErrorLines);
		this.random = random;
		initReader(attributeDataSources, encoding);
	}

	/**
	 * Opens one reader per distinct source file and fills {@link #attributes},
	 * {@link #dataSourceIndex}, {@link #fileReader}, {@link #currentData} and
	 * {@link #expectedNumberOfColumns}. No data is read here.
	 */
	private void initReader(List<AttributeDataSource> attributeDataSources, Charset encoding) throws IOException {
		// map all files used to indices
		List<BufferedReader> readerList = new LinkedList<BufferedReader>();
		Map<File, Integer> fileMap = new HashMap<File, Integer>();
		List<AtomicInteger> columnCounters = new ArrayList<AtomicInteger>();

		boolean initialized = false;
		try {
			int attribute = 0;
			Iterator<AttributeDataSource> sources = attributeDataSources.iterator();
			while (sources.hasNext()) {
				AttributeDataSource ads = sources.next();
				attributes[attribute] = ads.getAttribute();
				File file = ads.getFile();
				Integer fileIndex = fileMap.get(file);
				if (fileIndex == null) {
					// new file found -> create reader and map to the next free index number
					fileIndex = Integer.valueOf(readerList.size());
					fileMap.put(file, fileIndex);
					readerList.add(Tools.getReader(file, encoding));
					columnCounters.add(new AtomicInteger(1));
				} else {
					columnCounters.get(fileIndex.intValue()).incrementAndGet();
				}
				dataSourceIndex[attribute][FILE_NR] = fileIndex.intValue();
				dataSourceIndex[attribute][COLUMN_NR] = ads.getColumn();
				attribute++;
			}
			initialized = true;
		} finally {
			if (!initialized) {
				// do not leak the readers opened so far if a later source fails
				closeQuietly(readerList);
			}
		}

		this.fileReader = readerList.toArray(new BufferedReader[readerList.size()]);
		this.currentData = new String[this.fileReader.length][];

		// create counters
		this.expectedNumberOfColumns = new int[columnCounters.size()];
		for (int k = 0; k < expectedNumberOfColumns.length; k++) {
			expectedNumberOfColumns[k] = columnCounters.get(k).intValue();
		}
	}

	/**
	 * Skips the next line, if present. Does nothing if the end of the data was already reached or
	 * the readers have been closed. Problems during skipping are logged and not propagated.
	 */
	public void skipLine() {
		if (eof || readersClosed) {
			return;
		}
		try {
			if (!readLine()) {
				// the readers were closed by readLine(); remember this so that no further read
				// is attempted and next() does not touch the incomplete current data
				eof = true;
				lineRead = true;
			}
		} catch (Exception e) {
			LogService.getRoot().log(Level.WARNING,
					"com.rapidminer.example.table.FileDataRowReader.problem_during_skipping_of_line", e.getMessage());
		}
	}

	/**
	 * Reads a line of data from all file readers. Returns true if the line was readable, i.e. the
	 * end of the source files was not yet reached. If no sample size limit is set and the sample
	 * ratio is not 1, lines are randomly dropped according to the sample ratio. All readers are
	 * closed when the end of one of the files is reached.
	 */
	private boolean readLine() throws IOException {
		while (true) {
			boolean eofReached = false;
			for (int i = 0; i < fileReader.length; i++) {
				currentData[i] = rapidMinerLineReader.readLine(fileReader[i], expectedNumberOfColumns[i]);
				if (currentData[i] == null) {
					eofReached = true;
					break;
				}
			}
			if (eofReached) {
				closeReaders();
				return false;
			}
			// the random generator is only consulted if sampling by ratio is actually active
			if ((maxNumber != -1) || (sampleRatio == 1.0d) || (random.nextDouble() < sampleRatio)) {
				return true;
			}
		}
	}

	/**
	 * Closes all file readers. Every reader is closed even if closing one of them fails; the first
	 * failure is rethrown afterwards. Subsequent invocations do nothing.
	 */
	private void closeReaders() throws IOException {
		if (readersClosed) {
			return;
		}
		readersClosed = true;
		IOException firstFailure = null;
		for (int i = 0; i < fileReader.length; i++) {
			try {
				fileReader[i].close();
			} catch (IOException e) {
				if (firstFailure == null) {
					firstFailure = e;
				}
			}
		}
		if (firstFailure != null) {
			throw firstFailure;
		}
	}

	/** Closes all file readers and logs a failure instead of propagating it. */
	private void closeReadersQuietly() {
		try {
			closeReaders();
		} catch (IOException e) {
			LogService.getRoot().log(Level.WARNING, "Could not close file reader: " + e.getMessage());
		}
	}

	/** Closes the given readers and logs failures instead of propagating them. */
	private static void closeQuietly(List<BufferedReader> readers) {
		for (BufferedReader reader : readers) {
			try {
				reader.close();
			} catch (IOException e) {
				LogService.getRoot().log(Level.WARNING, "Could not close file reader: " + e.getMessage());
			}
		}
	}

	/**
	 * Checks if another line exists and reads. The next line is only read once even if this method
	 * is invoked more than once. If reading fails with an {@link IOException}, the problem is
	 * logged, the readers are closed and the reader behaves as if the end of the data was reached.
	 */
	@Override
	public boolean hasNext() {
		if ((maxNumber > -1) && (linesRead >= maxNumber)) {
			// sample size limit reached: nothing more will be read, so release the files
			closeReadersQuietly();
			return false;
		}

		if (lineRead) {
			return !eof;
		}

		try {
			eof = !readLine();
		} catch (IOException e) {
			LogService.getRoot().severe(e.getMessage());
			// the sources may be out of sync after a failure, hence stop reading for good
			closeReadersQuietly();
			eof = true;
			lineRead = true;
			return false;
		}
		lineRead = true;

		return !eof;
	}

	/**
	 * Returns the next Example, or <code>null</code> if no further line is available.
	 */
	@Override
	public DataRow next() {
		if (eof) {
			return null;
		}
		if (!lineRead && !hasNext()) {
			return null;
		}

		String[] data = new String[attributes.length];
		for (int i = 0; i < attributes.length; i++) {
			int column = dataSourceIndex[i][COLUMN_NR];
			if (column == -1) {
				data[i] = null;
			} else {
				data[i] = currentData[dataSourceIndex[i][FILE_NR]][column];
			}
		}

		DataRow dataRow = getFactory().create(data, attributes);
		linesRead++;
		lineRead = false;
		return dataRow;
	}
}