/*
* RapidMiner
*
* Copyright (C) 2001-2011 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.gui.tools.dialogs.wizards.dataimport.csv;
import java.io.File;
import java.io.IOException;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.jfree.util.Log;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DataRowReader;
import com.rapidminer.example.table.DoubleArrayDataRow;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.tools.CSVParseException;
import com.rapidminer.tools.LineParser;
import com.rapidminer.tools.Ontology;
/**
* A helper class for reading CSV files
*
* @author Tobias Malbrecht
*/
public class CSVFileReader {
private static final int MAX_LINES = 2000;
private final File file;
private final boolean useFirstRowAsColumnNames;
private final LineParser parser;
private final NumberFormat numberFormat;
private boolean eofReached = false;
private int rowCount = -1;
private final DataEvaluator dataEvaluator;
public CSVFileReader(final File file, boolean useFirstRowAsColumnNames, LineParser parser, NumberFormat numberFormat) {
this.file = file;
this.useFirstRowAsColumnNames = useFirstRowAsColumnNames;
this.parser = parser;
this.numberFormat = numberFormat;
this.dataEvaluator = new DataEvaluator(numberFormat) {
@Override
public String getGenericColumnName(int column) {
return file.getName() + "_" + (column + 1);
}
};
}
public LinkedList<String[]> readData(int maxLines) throws IOException {
String line = null;
eofReached = false;
boolean first = true;
LineReader reader = new LineReader(file);
LinkedList<String[]> valueLines = new LinkedList<String[]>();
dataEvaluator.start();
do {
line = reader.readLine();
if (line != null) {
String[] valueLine = parser.parse(line);
if (valueLine != null) {
if (first) {
first = false;
if (useFirstRowAsColumnNames) {
dataEvaluator.setColumnNames(valueLine);
continue;
}
}
dataEvaluator.update(valueLine);
valueLines.add(valueLine);
}
rowCount++;
} else {
eofReached = true;
break;
}
} while (rowCount < maxLines);
reader.close();
dataEvaluator.finish(eofReached);
return valueLines;
}
private void guessMetaData() throws IOException {
readData(MAX_LINES);
}
public MetaData getMetaData() throws IOException {
guessMetaData();
return dataEvaluator.getMetaData();
}
public ExampleSet createExampleSet() throws IOException {
guessMetaData();
ExampleSetMetaData metaData = dataEvaluator.getMetaData();
ArrayList<Attribute> attributes = new ArrayList<Attribute>(metaData.getAllAttributes().size());
for (AttributeMetaData amd : metaData.getAllAttributes()) {
attributes.add(AttributeFactory.createAttribute(amd.getName(), amd.getValueType()));
}
ExampleTable table = new MemoryExampleTable(attributes, getDataRowReader(attributes));
return table.createExampleSet();
}
public Iterator<String[]> getDataReader() throws IOException {
Iterator<String[]> iterator = new Iterator<String[]>() {
private String line = null;
private boolean first = useFirstRowAsColumnNames;
private LineReader reader = new LineReader(file);
@Override
public boolean hasNext() {
try {
if (first) {
do {
line = reader.readLine();
if (line == null) {
return false;
}
} while (parser.parse(line) == null);
first = false;
}
do {
line = reader.readLine();
if (line == null) {
reader.close();
return false;
}
} while (parser.parse(line) == null);
return true;
} catch (IOException e) {
return false;
}
}
@Override
public String[] next() {
try {
return parser.parse(line);
} catch (CSVParseException e) {
throw new IllegalArgumentException(e);
}
}
@Override
public void remove() {
throw new UnsupportedOperationException("Can not remove data rows from reader.");
}
};
return iterator;
}
public DataRowReader getDataRowReader(final List<Attribute> attributeList) throws IOException {
DataRowReader dataRowReader = new DataRowReader() {
private Iterator<String[]> iterator = getDataReader();
private int columnCount = attributeList.size();
private Attribute[] attributes = new Attribute[columnCount];
{
attributes = attributeList.toArray(attributes);
}
@Override
public boolean hasNext() {
return iterator.hasNext();
}
@Override
public DataRow next() {
String[] valueLine = iterator.next();
double[] values = new double[columnCount];
for (int i = 0; i < columnCount; i++) {
values[i] = Double.NaN;
}
for (int i = 0; i < valueLine.length; i++) {
if (i >= valueLine.length) {
Log.warn("Metadata was not correctly specified.");
continue;
}
if (valueLine[i] == null || valueLine[i].isEmpty()) {
continue;
}
if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attributes[i].getValueType(), Ontology.NUMERICAL)) {
try {
values[i] = numberFormat.parse(valueLine[i]).doubleValue();
} catch (ParseException e) {
System.err.println("cannot parse");
}
continue;
}
if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attributes[i].getValueType(), Ontology.NOMINAL)) {
values[i] = attributes[i].getMapping().mapString(valueLine[i]);
continue;
}
}
return new DoubleArrayDataRow(values);
}
@Override
public void remove() {
throw new UnsupportedOperationException("Can not remove data rows from reader.");
}
};
return dataRowReader;
}
}