/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.util;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Arrays;
import java.util.List;
import org.apache.metamodel.csv.CsvConfiguration;
import org.apache.metamodel.csv.CsvDataContext;
import org.apache.metamodel.data.DataSet;
import org.apache.metamodel.data.Row;
import org.apache.metamodel.schema.Table;
import org.apache.metamodel.schema.naming.ColumnNamingStrategy;
import org.apache.metamodel.schema.naming.CustomColumnNamingStrategy;
import org.apache.metamodel.util.FileHelper;
import org.apache.metamodel.util.FileResource;
import org.apache.metamodel.util.InMemoryResource;
import org.apache.metamodel.util.Resource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
/**
* Provides an auto-detection mechanism for the properties of a
* {@link CsvConfiguration}.
*/
public class CsvConfigurationDetection {

    private static final Logger logger = LoggerFactory.getLogger(CsvConfigurationDetection.class);

    /**
     * Amount of bytes to read for autodetection of encoding, separator and
     * quotes
     */
    private static final int SAMPLE_BUFFER_SIZE = 128 * 1024;

    private final Resource _resource;

    /**
     * Column names recorded by the most recent detection run; {@code null}
     * until a detection has stored them.
     */
    private List<String> _columnNames;

    /**
     * Creates a detector for a file; the file is wrapped in a
     * {@link FileResource}.
     *
     * @param file
     *            the file to sample
     */
    public CsvConfigurationDetection(final File file) {
        _resource = new FileResource(file);
    }

    /**
     * Creates a detector for an arbitrary resource.
     *
     * @param resource
     *            the resource to sample
     */
    public CsvConfigurationDetection(final Resource resource) {
        _resource = resource;
    }

    /**
     * Reads up to {@link #SAMPLE_BUFFER_SIZE} bytes from the start of the
     * resource.
     *
     * @return the sampled bytes, trimmed to the number of bytes actually
     *         read; an empty array if the resource is empty or an
     *         {@link IOException} occurs while reading
     */
    protected byte[] getSampleBuffer() {
        final byte[] bytes = new byte[SAMPLE_BUFFER_SIZE];
        final InputStream inputStream = _resource.read();
        try {
            // A single read() call may deliver fewer bytes than requested even
            // when more are available, so keep reading until the buffer is
            // full or the end of the stream is reached.
            int total = 0;
            while (total < SAMPLE_BUFFER_SIZE) {
                final int read = inputStream.read(bytes, total, SAMPLE_BUFFER_SIZE - total);
                if (read == -1) {
                    break;
                }
                total += read;
            }
            if (total == SAMPLE_BUFFER_SIZE) {
                return bytes;
            }
            // also covers the empty resource case (total == 0)
            return Arrays.copyOf(bytes, total);
        } catch (final IOException e) {
            logger.error("IOException occurred while reading sample buffer", e);
            return new byte[0];
        } finally {
            FileHelper.safeClose(inputStream);
        }
    }

    /**
     * Auto-detects the file encoding of a file
     *
     * @return the name of the suggested encoding
     */
    public String suggestEncoding() {
        return suggestEncoding(getSampleBuffer());
    }

    /**
     * Suggests an encoding for the given sample bytes using a
     * {@link CharsetDetector}.
     *
     * @param bytes
     *            the sample to inspect
     * @return the name of the best matching charset, or
     *         {@link FileHelper#DEFAULT_ENCODING} if the detector reports no
     *         match
     */
    protected String suggestEncoding(final byte[] bytes) {
        final CharsetDetector cd = new CharsetDetector();
        cd.setText(bytes);
        final CharsetMatch charsetMatch = cd.detect();
        if (charsetMatch == null) {
            // detect() yields null when no charset matches the input
            logger.warn("No CharsetMatch found, defaulting to {}", FileHelper.DEFAULT_ENCODING);
            return FileHelper.DEFAULT_ENCODING;
        }
        final String charSet = charsetMatch.getName();
        final int confidence = charsetMatch.getConfidence();
        logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
        return charSet;
    }

    /**
     * Auto-detect the {@link CsvConfiguration} of a CSV style data file,
     * providing the encoding externally.
     *
     * @param encoding
     *            the encoding used to decode the sample
     * @param columnNames
     *            custom column names to apply, or {@code null}/empty to use
     *            none
     * @return the suggested configuration
     * @throws IllegalStateException
     *             if an error occurs during auto-detection
     */
    public CsvConfiguration suggestCsvConfiguration(final String encoding, final List<String> columnNames)
            throws IllegalStateException {
        final byte[] sample = getSampleBuffer();
        return suggestCsvConfiguration(sample, encoding, columnNames);
    }

    /**
     * Auto-detects the {@link CsvConfiguration} of a CSV style data file.
     *
     * @return the suggested configuration
     * @throws IllegalStateException
     *             if an error occurs during auto-detection
     */
    public CsvConfiguration suggestCsvConfiguration() throws IllegalStateException {
        return suggestCsvConfiguration(null);
    }

    /**
     * Auto-detects the {@link CsvConfiguration} of a CSV style data file,
     * including its encoding.
     *
     * @param columnNames
     *            custom column names to apply, or {@code null}/empty to use
     *            none
     * @return the suggested configuration
     * @throws IllegalStateException
     *             if an error occurs during auto-detection
     */
    public CsvConfiguration suggestCsvConfiguration(final List<String> columnNames) throws IllegalStateException {
        final byte[] sample = getSampleBuffer();
        final String encoding = suggestEncoding(sample);
        return suggestCsvConfiguration(sample, encoding, columnNames);
    }

    /**
     * Performs the actual detection on a sample: counts candidate separator,
     * quote and escape characters, picks the most frequent ones and finally
     * parses the sample to find out whether values span multiple lines.
     */
    private CsvConfiguration suggestCsvConfiguration(final byte[] sample, final String encoding,
            final List<String> columnNames) throws IllegalStateException {
        final char[] sampleChars = readSampleBuffer(sample, encoding);

        if (indexOf('\n', sampleChars) == -1 && indexOf('\r', sampleChars) == -1) {
            throw new IllegalStateException("No newline in first " + sampleChars.length + " chars");
        }

        int newlines = 0;
        int tabs = 0;
        int commas = 0;
        int semicolons = 0;
        int pipes = 0;
        int singleQuotes = 0;
        int doubleQuotes = 0;
        int backslashes = 0;

        for (final char c : sampleChars) {
            switch (c) {
            case '\n':
                newlines++;
                break;
            case '\t':
                tabs++;
                break;
            case ',':
                commas++;
                break;
            case ';':
                semicolons++;
                break;
            case '\'':
                singleQuotes++;
                break;
            case '|':
                pipes++;
                break;
            case '"':
                doubleQuotes++;
                break;
            case '\\':
                backslashes++;
                break;
            default:
                break;
            }
        }

        final char separatorChar = chooseSeparator(newlines, commas, semicolons, tabs, pipes);
        final char quoteChar = chooseQuote(newlines, singleQuotes, doubleQuotes);
        // any backslash in the sample is taken as a sign of backslash escaping
        final char escapeChar = backslashes > 0 ? '\\' : CsvConfiguration.NOT_A_CHAR;

        final ColumnNamingStrategy columnNamingStategy;
        if (columnNames != null && columnNames.size() > 0) {
            columnNamingStategy = new CustomColumnNamingStrategy(columnNames);
            _columnNames = columnNames;
        } else {
            columnNamingStategy = null;
        }

        // detect if multi line values occur
        final boolean multiline;
        final CsvConfiguration multiLineConfiguration =
                new CsvConfiguration(CsvConfiguration.DEFAULT_COLUMN_NAME_LINE, columnNamingStategy, encoding,
                        separatorChar, quoteChar, escapeChar, false, true);
        try {
            final CsvDataContext testDataContext =
                    new CsvDataContext(new InMemoryResource("foo.txt", sample, System.currentTimeMillis()),
                            multiLineConfiguration);
            final Table table = testDataContext.getDefaultSchema().getTable(0);
            if (_columnNames == null) {
                _columnNames = Arrays.asList(table.getColumnNames());
            }
            try (DataSet dataSet = testDataContext.query().from(table).select(table.getColumns()).execute()) {
                multiline = containsMultilineValue(dataSet);
            }
        } catch (final Exception e) {
            logger.warn("Failed to detect multiline property of CsvConfiguration, defaulting to 'true'", e);
            return multiLineConfiguration;
        }

        return new CsvConfiguration(CsvConfiguration.DEFAULT_COLUMN_NAME_LINE, columnNamingStategy, encoding,
                separatorChar, quoteChar, escapeChar, false, multiline);
    }

    /**
     * Picks the separator with the highest count. Falls back to a comma when
     * no candidate occurs or the best candidate occurs less often than the
     * newline. Ties are resolved in the order comma, semicolon, tab, pipe.
     */
    private static char chooseSeparator(final int newlines, final int commas, final int semicolons, final int tabs,
            final int pipes) {
        final int detectedSeparator = Math.max(tabs, Math.max(commas, Math.max(semicolons, pipes)));
        if (detectedSeparator == 0 || detectedSeparator < newlines) {
            return ',';
        }
        if (detectedSeparator == commas) {
            return ',';
        }
        if (detectedSeparator == semicolons) {
            return ';';
        }
        if (detectedSeparator == tabs) {
            return '\t';
        }
        if (detectedSeparator == pipes) {
            return '|';
        }
        return ',';
    }

    /**
     * Picks the quote character with the highest count. Falls back to the
     * double quote when no quote occurs or the best candidate occurs less
     * often than the newline. On a tie the single quote wins.
     */
    private static char chooseQuote(final int newlines, final int singleQuotes, final int doubleQuotes) {
        final int detectedQuote = Math.max(singleQuotes, doubleQuotes);
        if (detectedQuote == 0 || detectedQuote < newlines) {
            return '"';
        }
        if (detectedQuote == singleQuotes) {
            return '\'';
        }
        return '"';
    }

    /**
     * Scans the data set for a String value containing a line feed and stops
     * at the first one found.
     */
    private static boolean containsMultilineValue(final DataSet dataSet) {
        while (dataSet.next()) {
            final Row row = dataSet.getRow();
            for (final Object value : row.getValues()) {
                if (value instanceof String && ((String) value).indexOf('\n') != -1) {
                    // found a multi line value, no need to read further rows
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Finds the first index of a char in an array.
     *
     * @return the index of the first occurrence, or -1 if the char is absent
     */
    private int indexOf(final char c, final char[] sampleChars) {
        for (int i = 0; i < sampleChars.length; i++) {
            if (c == sampleChars[i]) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Decodes the sample bytes into chars using the given charset.
     *
     * @param bytes
     *            the sample bytes
     * @param charSet
     *            the name of the charset used for decoding
     * @return the decoded chars, trimmed to the number of chars actually
     *         decoded
     * @throws IllegalStateException
     *             if decoding fails, e.g. because the charset name is not
     *             supported; the original exception is attached as the cause
     */
    protected char[] readSampleBuffer(final byte[] bytes, final String charSet) throws IllegalStateException {
        // a decoded sample can never hold more chars than there are bytes
        final char[] buffer = new char[bytes.length];
        try (Reader reader = new InputStreamReader(new ByteArrayInputStream(bytes), charSet)) {
            // read a sample of the file to auto-detect quotes and separators
            int total = 0;
            while (total < buffer.length) {
                final int read = reader.read(buffer, total, buffer.length - total);
                if (read == -1) {
                    break;
                }
                total += read;
            }
            return total == buffer.length ? buffer : Arrays.copyOf(buffer, total);
        } catch (final Exception e) {
            logger.warn("Error reading from file: {}", e.getMessage(), e);
            throw new IllegalStateException("Error reading from file: " + e.getMessage(), e);
        }
    }

    /**
     * Gets the column names recorded during detection: the custom names
     * passed in, or else the names read from the sample.
     *
     * @return the column names, or {@code null} if none have been recorded
     */
    public List<String> getColumnNames() {
        return _columnNames;
    }
}