/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio.model;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.rapidminer.gui.tools.dialogs.wizards.dataimport.csv.LineReader;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.nio.model.ParsingError.ErrorCode;
import com.rapidminer.tools.CSVParseException;
import com.rapidminer.tools.LineParser;
import com.rapidminer.tools.LogService;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.ProgressListener;
import com.rapidminer.tools.WebServiceTools;
/**
 * A {@link DataResultSet} that reads its rows line by line from the CSV source named by a
 * {@link CSVResultSetConfiguration}. The source is first interpreted as a URL and, if it is not a
 * valid URL, as a path in the local file system.
 * <p>
 * Every cell is exposed as a string: {@link #getNativeValueType(int)} always returns
 * {@link ValueType#STRING}, and {@link #getNumber(int)} / {@link #getDate(int)} always throw a
 * {@link ParseException}. Lines that cannot be parsed are recorded in {@link #getErrors()} and
 * skipped.
 *
 * @author Simon Fischer
 *
 */
public class CSVResultSet implements DataResultSet {

    /**
     * specifies how many rows should be read to guess the column separator, 1 headline + 10 further
     * rows
     */
    private static final int LINES_FOR_GUESSING = 11;

    /** Number of parse warnings that are logged before further warnings are suppressed. */
    private static final int MAX_LOG_COUNT = 100;

    /** Total number of progress steps reported to the operator. */
    private static final int PROGRESS_TOTAL = 100;

    /** The operator progress is refreshed once every this many lines. */
    private static final int PROGRESS_UPDATE_INTERVAL = 1000;

    private final CSVResultSetConfiguration configuration;
    private final Operator operator;
    private final List<ParsingError> errors = new LinkedList<>();

    private LineReader reader;
    private LineParser parser;

    /** The row that has been read ahead, or {@code null} if the input is exhausted. */
    private String[] next;

    /** The row the cell accessors operate on; set by {@link #next(ProgressListener)}. */
    private String[] current;
    private int currentRow;

    private String[] columnNames;
    private int[] valueTypes;
    private int numColumns = 0;

    private int logCount = 0;

    /** Reader positions per progress step; {@code 0} means that no progress can be computed. */
    private long multiplier;
    private long lineCounter = 0;

    /**
     * The column separators that {@link CSVResultSet#guessColumnSplitter(File)} chooses from, each
     * with its literal string and a pattern matching that string.
     */
    public enum ColumnSplitter {

        SEMI_COLON(";", Pattern.compile(";")), COMMA(",", Pattern.compile(",")), TAB("\t", Pattern.compile("\t")), TILDE("~",
                Pattern.compile("~")), PIPE("|", Pattern.compile("\\|"));

        private final Pattern pattern;
        private final String seperator;

        ColumnSplitter(String seperator, Pattern pattern) {
            this.seperator = seperator;
            this.pattern = pattern;
        }

        /** @return the pattern that matches one occurrence of this separator */
        public Pattern getPattern() {
            return pattern;
        }

        /** @return the separator as a plain string */
        public String getString() {
            return seperator;
        }
    }

    /**
     * Counts the matches of the given pattern in the given text.
     *
     * @param seperatorPattern
     *            the pattern to look for
     * @param content
     *            the text to search, may be {@code null}
     * @return the number of matches, {@code 0} if {@code content} is {@code null}
     */
    private static int getSeperatorCount(Pattern seperatorPattern, String content) {
        if (content == null) {
            return 0;
        }
        Matcher matcher = seperatorPattern.matcher(content);
        int count = 0;
        while (matcher.find()) {
            count++;
        }
        return count;
    }

    /**
     * Creates the result set and immediately opens the configured source, reading ahead its first
     * row.
     *
     * @param configuration
     *            describes the source, its encoding and how lines are parsed
     * @param operator
     *            the operator used for error reporting, logging and progress; may be {@code null}
     * @throws OperatorException
     *             if the source cannot be opened or read
     */
    public CSVResultSet(CSVResultSetConfiguration configuration, Operator operator) throws OperatorException {
        this.configuration = configuration;
        this.operator = operator;
        open();
    }

    /**
     * (Re-)opens the source: clears the recorded errors, closes a previously opened reader, skips a
     * leading UTF-8 byte order mark, reads ahead the first row and derives the default column names
     * ({@code att1}, {@code att2}, ...) and value types (all nominal) from it.
     */
    private void open() throws OperatorException {
        getErrors().clear();
        close();
        InputStream in = openStream();
        logCount = 0;

        // if encoding is UTF-8, we will have to check whether the stream starts with a BOM. If not
        // restart stream
        if (configuration.getEncoding().name().equals("UTF-8")) {
            try {
                // EF BB BF is the UTF-8 byte order mark; the checks stop at the first mismatch
                boolean startsWithBom = in.read() == 0xEF && in.read() == 0xBB && in.read() == 0xBF;
                if (!startsWithBom) {
                    in.close();
                    in = openStream();
                }
            } catch (IOException e) {
                closeQuietly(in);
                throw new UserError(operator, e, 321, configuration.getCsvFile(), e.toString());
            }
        }

        reader = new LineReader(in, configuration.getEncoding());
        parser = new LineParser(configuration);

        // start from a clean slate so that a previous open() cannot leak stale progress state
        multiplier = 0;
        lineCounter = 0;
        try {
            if (operator != null) {
                long size = reader.getSize();
                if (size > 0L) {
                    multiplier = size / PROGRESS_TOTAL;
                    operator.getProgress().setCheckForStop(false);
                    operator.getProgress().setTotal(PROGRESS_TOTAL);
                }
            }
        } catch (IOException e) {
            // ignore and assume indeterminate progress
        }

        try {
            readNext();
        } catch (IOException e) {
            closeQuietly(in);
            throw new UserError(operator, e, 321, configuration.getCsvFile(), e.toString());
        }

        if (next == null) {
            errors.add(new ParsingError(1, -1, ErrorCode.FILE_SYNTAX_ERROR, "No valid line found."));
            columnNames = new String[0];
            valueTypes = new int[0];
        } else {
            numColumns = next.length;
            columnNames = new String[next.length];
            for (int i = 0; i < next.length; i++) {
                columnNames[i] = "att" + (i + 1);
            }
            valueTypes = new int[next.length];
            Arrays.fill(valueTypes, Ontology.NOMINAL);
            currentRow = -1;
        }
    }

    /**
     * Closes the given stream on an error path. A failure to close is deliberately dropped because
     * the caller is about to report the original, more relevant exception.
     */
    private static void closeQuietly(InputStream in) {
        try {
            in.close();
        } catch (IOException ignored) {
            // nothing sensible can be done here; the primary error is reported by the caller
        }
    }

    /**
     * Guesses the column separator of the csv file by counting which {@link ColumnSplitter} appears
     * the most in the first rows.
     *
     * @param csvFile
     *            the csv file
     * @return the most frequent column separator
     */
    public static String guessColumnSeperator(File csvFile) {
        return guessColumnSplitter(csvFile).getString();
    }

    /**
     * Guesses the column separator of the csv file by counting which {@link ColumnSplitter} appears
     * the most in the first rows.
     *
     * @param csvFile
     *            the path to the file to analyze
     * @return the most frequent column separator
     */
    public static String guessColumnSeperator(String csvFile) {
        return guessColumnSplitter(csvFile).getString();
    }

    /**
     * Guesses the column splitter of the csv file by counting which {@link ColumnSplitter} appears
     * the most in the first rows.
     *
     * @param csvFile
     *            the path to the file to analyze
     * @return the most frequent {@link ColumnSplitter}
     */
    public static ColumnSplitter guessColumnSplitter(String csvFile) {
        return guessColumnSplitter(new File(csvFile));
    }

    /**
     * Guesses the column splitter of the csv file by counting which {@link ColumnSplitter} appears
     * the most in the first {@value #LINES_FOR_GUESSING} rows. The file is read as UTF-8. On a
     * tie the splitter declared first in {@link ColumnSplitter} wins.
     *
     * @param csvFile
     *            the file to analyze
     * @return the most frequent {@link ColumnSplitter}; {@link ColumnSplitter#SEMI_COLON} if no
     *         splitter occurs at all or the file cannot be read
     */
    public static ColumnSplitter guessColumnSplitter(File csvFile) {
        try (LineReader tempReader = new LineReader(csvFile, StandardCharsets.UTF_8)) {
            HashMap<ColumnSplitter, Integer> splitterValues = new HashMap<>();
            for (ColumnSplitter splitter : ColumnSplitter.values()) {
                splitterValues.put(splitter, 0);
            }

            for (int lineCount = 0; lineCount < LINES_FOR_GUESSING; lineCount++) {
                String line = tempReader.readLine();
                if (line == null) {
                    // end of file: further reads cannot contribute any separators
                    break;
                }
                for (ColumnSplitter splitter : ColumnSplitter.values()) {
                    int occurrences = getSeperatorCount(splitter.getPattern(), line);
                    splitterValues.put(splitter, splitterValues.get(splitter) + occurrences);
                }
            }

            int maxValue = 0;
            ColumnSplitter guessedSplitter = ColumnSplitter.SEMI_COLON;
            for (ColumnSplitter splitter : ColumnSplitter.values()) {
                int occurrences = splitterValues.get(splitter);
                if (occurrences > maxValue) {
                    maxValue = occurrences;
                    guessedSplitter = splitter;
                }
            }
            return guessedSplitter;
        } catch (IOException e) {
            return ColumnSplitter.SEMI_COLON;
        }
    }

    /**
     * Opens the configured source. The configured name is tried as a URL first; if it is not a
     * well-formed URL it is opened as a local file.
     *
     * @return a fresh stream positioned at the start of the source
     * @throws UserError
     *             if no source is configured or the source cannot be opened
     */
    protected InputStream openStream() throws UserError {
        String csvFile = configuration.getCsvFile();
        if (csvFile == null) {
            throw new UserError(this.operator, "file_consumer.no_file_defined");
        }

        URL url;
        try {
            url = new URL(csvFile);
        } catch (MalformedURLException e) {
            // URL did not work? Try as file...
            try {
                return new FileInputStream(csvFile);
            } catch (FileNotFoundException e1) {
                // cause first, so that the file name stays the first message argument
                throw new UserError(operator, e1, 301, csvFile);
            }
        }

        try {
            return WebServiceTools.openStreamFromURL(url);
        } catch (IOException e) {
            throw new UserError(operator, e, 301, csvFile);
        }
    }

    /**
     * Reads ahead until the parser yields a row and stores it in {@link #next}; sets {@link #next}
     * to {@code null} when the input is exhausted. Lines for which the parser returns {@code null}
     * are skipped. Lines that fail to parse are recorded as {@link ParsingError}s, logged (up to
     * {@value #MAX_LOG_COUNT} times) and skipped as well.
     */
    private void readNext() throws IOException {
        while (true) {
            String line = reader.readLine();
            if (line == null) {
                next = null;
                return;
            }
            try {
                next = parser.parse(line);
                updateProgress();
                if (next != null) { // no comment read
                    return;
                }
            } catch (CSVParseException e) {
                getErrors().add(new ParsingError(currentRow, -1, ErrorCode.FILE_SYNTAX_ERROR, line, e));
                String warning = "Could not parse line " + currentRow + " in input: " + e.toString();
                if (logCount < MAX_LOG_COUNT) {
                    logWarning(warning);
                } else if (logCount == MAX_LOG_COUNT) {
                    logWarning("Maximum number of warnings exceeded. Will display no further warnings.");
                }
                logCount++;
                // NOTE(review): this value is replaced by the next loop iteration, so the
                // unparseable line is never handed out as a row -- confirm that this is intended.
                next = new String[] { line };
            }
        }
    }

    /**
     * Counts the line just read and, once every {@value #PROGRESS_UPDATE_INTERVAL} lines, converts
     * the reader position into operator progress. Does nothing without an operator.
     */
    private void updateProgress() throws IOException {
        if (operator == null || ++lineCounter % PROGRESS_UPDATE_INTERVAL != 0) {
            return;
        }
        long position = reader.getPosition();
        // multiplier is 0 if the size is unknown or below PROGRESS_TOTAL; dividing would fail
        if (position <= 0 || multiplier <= 0) {
            return;
        }
        // the integer division in open() rounds the multiplier down, so cap at the total
        int currentProgress = (int) Math.min(PROGRESS_TOTAL, position / multiplier);
        if (currentProgress != operator.getProgress().getCompleted()) {
            try {
                operator.getProgress().setCompleted(currentProgress);
            } catch (ProcessStoppedException e) {
                // Will not happen, because check for stop is deactivated.
            }
        }
    }

    /** Logs a warning via the operator if there is one, otherwise via the root logger. */
    private void logWarning(String message) {
        if (operator != null) {
            operator.logWarning(message);
        } else {
            LogService.getRoot().warning(message);
        }
    }

    /** @return {@code true} if a row has been read ahead and can be fetched with {@code next} */
    @Override
    public boolean hasNext() {
        return next != null;
    }

    /**
     * Makes the row that was read ahead the current row and reads ahead the following one.
     *
     * @param listener
     *            not used by this implementation
     * @throws OperatorException
     *             if reading from the source fails
     */
    @Override
    public void next(ProgressListener listener) throws OperatorException {
        current = next;
        currentRow++;
        try {
            readNext();
        } catch (IOException e) {
            throw new UserError(operator, e, 321, configuration.getCsvFile(), e.toString());
        }
    }

    /** @return the number of cells of the first row that was read, {@code 0} if there was none */
    @Override
    public int getNumberOfColumns() {
        return numColumns;
    }

    /** @return the generated column names {@code att1}, {@code att2}, ... */
    @Override
    public String[] getColumnNames() {
        return columnNames;
    }

    /**
     * @return {@code true} if the current row has no cell at the given index or that cell is
     *         {@code null} or empty
     */
    @Override
    public boolean isMissing(int columnIndex) {
        String value = cellOrNull(columnIndex);
        return value == null || value.isEmpty();
    }

    /**
     * Always fails, since this result set only provides strings.
     *
     * @throws ParseException
     *             always; carries the raw cell value, or {@code null} if the current row has no
     *             cell at the given index
     */
    @Override
    public Number getNumber(int columnIndex) throws ParseException {
        throw new ParseException(new ParsingError(currentRow, columnIndex, ParsingError.ErrorCode.UNPARSEABLE_REAL,
                cellOrNull(columnIndex)));
    }

    /**
     * @return the raw cell value of the current row, or {@code null} if the row has no cell at the
     *         given index
     */
    @Override
    public String getString(int columnIndex) throws ParseException {
        return cellOrNull(columnIndex);
    }

    /**
     * Always fails, since this result set only provides strings.
     *
     * @throws ParseException
     *             always; carries the raw cell value, or {@code null} if the current row has no
     *             cell at the given index
     */
    @Override
    public Date getDate(int columnIndex) throws ParseException {
        throw new ParseException(new ParsingError(currentRow, columnIndex, ParsingError.ErrorCode.UNPARSEABLE_DATE,
                cellOrNull(columnIndex)));
    }

    /** Returns the cell of the current row, or {@code null} if the row is too short. */
    private String cellOrNull(int columnIndex) {
        return columnIndex < current.length ? current[columnIndex] : null;
    }

    /** @return always {@link ValueType#STRING} */
    @Override
    public ValueType getNativeValueType(int columnIndex) throws ParseException {
        return ValueType.STRING;
    }

    /**
     * Closes the underlying reader, if one is open. The reader reference is dropped even if closing
     * fails, so calling this method again is a no-op.
     *
     * @throws OperatorException
     *             if closing the reader fails
     */
    @Override
    public void close() throws OperatorException {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            // same argument order as the other read errors: cause first, then code and arguments
            throw new UserError(operator, e, 321, configuration.getCsvFile(), e.toString());
        } finally {
            reader = null;
        }
    }

    /**
     * Re-opens the source so that iteration starts again at the first row.
     *
     * @param listener
     *            not used by this implementation
     */
    @Override
    public void reset(ProgressListener listener) throws OperatorException {
        open();
    }

    /** @return the value types of the columns; all are {@link Ontology#NOMINAL} */
    @Override
    public int[] getValueTypes() {
        return valueTypes;
    }

    /** @return the index of the current row; {@code -1} after opening a source with rows */
    @Override
    public int getCurrentRow() {
        return currentRow;
    }

    /** @return the live list of errors recorded since the source was last opened */
    public List<ParsingError> getErrors() {
        return errors;
    }
}