/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.studio.io.data.internal.file.csv;
import java.nio.file.Path;
import java.text.NumberFormat;
import java.util.Collections;
import java.util.Map;
import com.rapidminer.core.io.data.ColumnMetaData.ColumnType;
import com.rapidminer.core.io.data.DataSetException;
import com.rapidminer.core.io.data.DataSetMetaData;
import com.rapidminer.core.io.data.source.DataSource;
import com.rapidminer.core.io.data.source.DataSourceConfiguration;
import com.rapidminer.core.io.data.source.FileDataSource;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.nio.model.CSVResultSetConfiguration;
import com.rapidminer.operator.nio.model.DataResultSet;
import com.rapidminer.studio.io.data.DefaultDataSetMetaData;
import com.rapidminer.studio.io.data.internal.ResultSetAdapter;
import com.rapidminer.studio.io.data.internal.ResultSetAdapterUtils;
import com.rapidminer.tools.StrictDecimalFormat;
/**
* A {@link DataSource} implementation for CSV files.
*
* @author Gisa Schaefer
* @since 7.0.0
*/
public class CSVDataSource extends FileDataSource {

	/** Meta data of this data source; starts out empty and is replaced by {@link #createMetaData()}. */
	private DataSetMetaData metaData = new DefaultDataSetMetaData(Collections.<String> emptyList(),
			Collections.<ColumnType> emptyList());

	/** The internal CSV import configuration, exposed via {@link #getResultSetConfiguration()}. */
	private CSVResultSetConfiguration configuration = new CSVResultSetConfiguration();

	/** The cached data set shared by {@link #getData()} and {@link #getPreview(int)}. */
	private CSVResultSetAdapter dataSet;

	/**
	 * The configuration that was current when {@link #dataSet} was created, used to detect
	 * configuration changes. {@code null} as long as no data set has been created.
	 */
	private DataSourceConfiguration dataSetConstructionConfiguration = null;

	/**
	 * Sets the location of this data source and stores the path as CSV file of the internal
	 * {@link CSVResultSetConfiguration}.
	 *
	 * @param newLocation
	 *            the path of the CSV file
	 */
	@Override
	public void setLocation(Path newLocation) {
		super.setLocation(newLocation);
		configuration.setCsvFile(newLocation.toString());
	}

	/**
	 * @return the {@link CSVResultSetConfiguration} for this data source. Changes to the
	 *         configuration will affect the import process as it stores the internal import
	 *         configuration
	 */
	public CSVResultSetConfiguration getResultSetConfiguration() {
		return configuration;
	}

	/**
	 * Returns the cached data set with its maximum end row reset to
	 * {@link ResultSetAdapter#NO_END_ROW}.
	 *
	 * @return the cached data set, the same instance that {@link #getPreview(int)} returns
	 * @throws DataSetException
	 *             if the cached data set has to be (re)created and that fails
	 */
	@Override
	public CSVResultSetAdapter getData() throws DataSetException {
		CSVResultSetAdapter wholeData = getCachedDataSet();
		wholeData.setMaximumEndRow(ResultSetAdapter.NO_END_ROW);
		return wholeData;
	}

	/**
	 * Checks if the cached data set can be reused and stores a new one if not. A new data set is
	 * created if none is cached yet or if the parameters of the current configuration differ from
	 * the ones stored when the cached data set was created.
	 *
	 * @return the cached data set
	 * @throws DataSetException
	 *             if closing the old data set or creating the new one fails; an
	 *             {@link OperatorException} raised during creation is attached as the cause
	 */
	private CSVResultSetAdapter getCachedDataSet() throws DataSetException {
		boolean configurationChanged = dataSetConstructionConfiguration != null
				&& !dataSetConstructionConfiguration.getParameters().equals(getConfiguration().getParameters());
		if (dataSet == null || configurationChanged) {
			// close old data set in case a new one is created
			if (dataSet != null) {
				dataSet.close();
				dataSet = null;
			}
			try {
				dataSet = new CSVResultSetAdapter(this, getResultSetConfiguration().makeDataResultSet(null),
						getDataStartRow(), ResultSetAdapter.NO_END_ROW);
				// remember the configuration the data set was built with to detect later changes
				dataSetConstructionConfiguration = getConfiguration();
			} catch (OperatorException e) {
				// keep the OperatorException itself as cause so that its stack trace is not lost
				// (same wrapping as in createMetaData())
				throw new DataSetException(e.getMessage(), e);
			}
		}
		return dataSet;
	}

	/**
	 * Returns the cached data set with its maximum end row set such that at most
	 * {@code maxPreviewRows} rows are covered, counted from the data start row.
	 *
	 * @param maxPreviewRows
	 *            the maximum number of rows of the preview
	 * @return the cached data set, the same instance that {@link #getData()} returns
	 * @throws DataSetException
	 *             if the cached data set has to be (re)created and that fails
	 */
	@Override
	public CSVResultSetAdapter getPreview(int maxPreviewRows) throws DataSetException {
		// choose endRow such that there are maxPreviewRow rows in total
		int endRow = getDataStartRow() + maxPreviewRows - 1;
		CSVResultSetAdapter previewData = getCachedDataSet();
		previewData.setMaximumEndRow(endRow);
		return previewData;
	}

	/**
	 * @return the row where the data starts, not counting the header row
	 */
	private int getDataStartRow() {
		int startRow = configuration.getStartingRow();
		// the header row is not a data row, so skip it if the data would start there
		if (configuration.hasHeaderRow() && configuration.getHeaderRow() == startRow) {
			startRow++;
		}
		return startRow;
	}

	/**
	 * @return the meta data currently stored for this data source; empty until
	 *         {@link #createMetaData()} has replaced it
	 */
	@Override
	public DataSetMetaData getMetadata() {
		return metaData;
	}

	/**
	 * @return the number format associated to the configuration
	 */
	NumberFormat getNumberFormat() {
		return new StrictDecimalFormat(getResultSetConfiguration().getDecimalCharacter());
	}

	/**
	 * Creates a new {@link DataSetMetaData} instance with the results of the
	 * {@link CSVFormatSpecificationWizardStep} and assigns it to the {@link #metaData} field of
	 * this {@link CSVDataSource}.
	 * <p>
	 * The method also checks if the header row and the starting row exist and throws an exception
	 * otherwise.
	 *
	 * @throws DataSetException
	 *             in case the starting row or header row do not exist or the specified CSV file
	 *             could not be read because of IO issues
	 */
	public void createMetaData() throws DataSetException {
		// create a new CSV ResultSet configuration which reads the whole selected file
		// we cannot call getData() here as it might already skip the first lines
		CSVResultSetConfiguration source = getResultSetConfiguration();
		try (CSVResultSetConfiguration wholeFileConfiguration = new CSVResultSetConfiguration()) {
			wholeFileConfiguration.setCsvFile(getLocation().toFile().toString());
			wholeFileConfiguration.setSkipComments(source.isSkipComments());
			wholeFileConfiguration.setCommentCharacters(source.getCommentCharacters());
			wholeFileConfiguration.setDecimalCharacter(source.getDecimalCharacter());
			wholeFileConfiguration.setEncoding(source.getEncoding());
			wholeFileConfiguration.setEscapeCharacter(source.getEscapeCharacter());
			wholeFileConfiguration.setQuoteCharacter(source.getQuoteCharacter());
			wholeFileConfiguration.setUseQuotes(source.isUseQuotes());
			wholeFileConfiguration.setColumnSeparators(source.getColumnSeparators());
			wholeFileConfiguration.setHasHeaderRow(source.hasHeaderRow());
			wholeFileConfiguration.setHeaderRow(source.getHeaderRow());

			int headerRowIndex = wholeFileConfiguration.hasHeaderRow() ? wholeFileConfiguration.getHeaderRow()
					: ResultSetAdapter.NO_HEADER_ROW;

			// the data start row is taken from the data source's own configuration
			try (DataResultSet wholeFileResultSet = wholeFileConfiguration.makeDataResultSet(null)) {
				this.metaData = ResultSetAdapterUtils.createMetaData(wholeFileResultSet, getNumberFormat(),
						getDataStartRow(), headerRowIndex);
			} catch (OperatorException e) {
				throw new DataSetException(e.getMessage(), e);
			}
		}
	}

	/**
	 * Creates a {@link DataSourceConfiguration} with version {@code "0"} whose parameters are the
	 * parameter map obtained from the internal {@link CSVResultSetConfiguration} at the time of
	 * this call.
	 *
	 * @return a new configuration object describing this data source
	 */
	@Override
	public DataSourceConfiguration getConfiguration() {
		final Map<String, String> storedConfiguration = configuration.getParameterMap();
		return new DataSourceConfiguration() {

			@Override
			public String getVersion() {
				return "0";
			}

			@Override
			public Map<String, String> getParameters() {
				return storedConfiguration;
			}
		};
	}

	/**
	 * This implementation does nothing: the given configuration is ignored.
	 *
	 * @param configuration
	 *            the configuration, not used
	 */
	@Override
	public void configure(DataSourceConfiguration configuration) throws DataSetException {}

	/**
	 * Closes the internal {@link CSVResultSetConfiguration} and, if present, the cached data set.
	 *
	 * @throws DataSetException
	 *             if closing the cached data set fails
	 */
	@Override
	public void close() throws DataSetException {
		configuration.close();
		if (dataSet != null) {
			dataSet.close();
			dataSet = null;
		}
	}
}