/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.util; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.util.Arrays; import java.util.List; import org.apache.metamodel.csv.CsvConfiguration; import org.apache.metamodel.csv.CsvDataContext; import org.apache.metamodel.data.DataSet; import org.apache.metamodel.data.Row; import org.apache.metamodel.schema.Table; import org.apache.metamodel.schema.naming.ColumnNamingStrategy; import org.apache.metamodel.schema.naming.CustomColumnNamingStrategy; import org.apache.metamodel.util.FileHelper; import org.apache.metamodel.util.FileResource; import org.apache.metamodel.util.InMemoryResource; import org.apache.metamodel.util.Resource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; /** * Provides an auto-detection mechanism for the properties of a * {@link CsvConfiguration}. */ public class CsvConfigurationDetection { private static final Logger logger = LoggerFactory.getLogger(CsvConfigurationDetection.class); /** * Amount of bytes to read for autodetection of encoding, separator and * quotes */ private static final int SAMPLE_BUFFER_SIZE = 128 * 1024; private final Resource _resource; private List<String> _columnNames; public CsvConfigurationDetection(final File file) { _resource = new FileResource(file); } public CsvConfigurationDetection(final Resource resource) { _resource = resource; } protected byte[] getSampleBuffer() { byte[] bytes = new byte[SAMPLE_BUFFER_SIZE]; final InputStream inputStream = _resource.read(); try { final int bufferSize = inputStream.read(bytes, 0, SAMPLE_BUFFER_SIZE); if (bufferSize != -1 && bufferSize != SAMPLE_BUFFER_SIZE) { bytes = Arrays.copyOf(bytes, bufferSize); } return bytes; } catch (final IOException e) { logger.error("IOException occurred while reading sample buffer", e); return new byte[0]; } finally { FileHelper.safeClose(inputStream); } } /** * Auto-detects the file encoding of a file * * @return */ public String suggestEncoding() { return suggestEncoding(getSampleBuffer()); } protected String suggestEncoding(final byte[] bytes) { final CharsetDetector cd = new CharsetDetector(); cd.setText(bytes); final CharsetMatch charsetMatch = cd.detect(); final String charSet = charsetMatch.getName(); final int confidence = charsetMatch.getConfidence(); logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence); return charSet; } /** * Auto-detect the {@link CsvConfiguration} of a CSV style data file, * providing the encoding externally. * * @param encoding * @return * @throws IllegalStateException * if an error occurs during auto-detection */ public CsvConfiguration suggestCsvConfiguration(final String encoding, final List<String> columnNames) throws IllegalStateException { final byte[] sample = getSampleBuffer(); return suggestCsvConfiguration(sample, encoding, columnNames); } /** * Auto-detects the {@link CsvConfiguration} of a CSV style data file. * * @return * @throws IllegalStateException * if an error occurs during auto-detection */ public CsvConfiguration suggestCsvConfiguration() throws IllegalStateException { return suggestCsvConfiguration(null); } public CsvConfiguration suggestCsvConfiguration(final List<String> columnNames) throws IllegalStateException { final byte[] sample = getSampleBuffer(); final String encoding = suggestEncoding(sample); return suggestCsvConfiguration(sample, encoding, columnNames); } private CsvConfiguration suggestCsvConfiguration(final byte[] sample, final String encoding, final List<String> columnNames) throws IllegalStateException { final char[] sampleChars = readSampleBuffer(sample, encoding); if (indexOf('\n', sampleChars) == -1 && indexOf('\r', sampleChars) == -1) { throw new IllegalStateException("No newline in first " + sampleChars.length + " chars"); } int newlines = 0; int tabs = 0; int commas = 0; int semicolons = 0; int pipes = 0; int singleQuotes = 0; int doubleQuotes = 0; int backslashes = 0; for (int i = 0; i < sampleChars.length; i++) { final char c = sampleChars[i]; if (c == '\n') { newlines++; } else if (c == '\t') { tabs++; } else if (c == ',') { commas++; } else if (c == ';') { semicolons++; } else if (c == '\'') { singleQuotes++; } else if (c == '|') { pipes++; } else if (c == '"') { doubleQuotes++; } else if (c == '\\') { backslashes++; } } final char separatorChar; final char quoteChar; final char escapeChar; final int detectedSeparator = Math.max(tabs, Math.max(commas, Math.max(semicolons, pipes))); if (detectedSeparator == 0 || detectedSeparator < newlines) { separatorChar = ','; } else { // set the separator if (detectedSeparator == commas) { separatorChar = ','; } else if (detectedSeparator == semicolons) { separatorChar = ';'; } else if (detectedSeparator == tabs) { separatorChar = '\t'; } else if (detectedSeparator == pipes) { separatorChar = '|'; } else { separatorChar = ','; } } if (backslashes > 0) { escapeChar = '\\'; } else { escapeChar = CsvConfiguration.NOT_A_CHAR; } final int detectedQuote = Math.max(singleQuotes, doubleQuotes); if (detectedQuote == 0 || detectedQuote < newlines) { quoteChar = '"'; } else { // set the quote if (detectedQuote == singleQuotes) { quoteChar = '\''; } else if (detectedQuote == doubleQuotes) { quoteChar = '"'; } else { quoteChar = '"'; } } final ColumnNamingStrategy columnNamingStategy; if (columnNames != null && columnNames.size() > 0) { columnNamingStategy = new CustomColumnNamingStrategy(columnNames); _columnNames = columnNames; } else { columnNamingStategy = null; } // detect if multi line values occur boolean multiline = false; final CsvConfiguration multiLineConfiguration = new CsvConfiguration(CsvConfiguration.DEFAULT_COLUMN_NAME_LINE, columnNamingStategy, encoding, separatorChar, quoteChar, escapeChar, false, true); try { final CsvDataContext testDataContext = new CsvDataContext(new InMemoryResource("foo.txt", sample, System.currentTimeMillis()), multiLineConfiguration); final Table table = testDataContext.getDefaultSchema().getTable(0); if (_columnNames == null) { _columnNames = Arrays.asList(testDataContext.getDefaultSchema().getTable(0).getColumnNames()); } try (DataSet dataSet = testDataContext.query().from(table).select(table.getColumns()).execute()) { while (dataSet.next()) { final Row row = dataSet.getRow(); final Object[] values = row.getValues(); for (final Object value : values) { if (value != null && value instanceof String) { if (((String) value).indexOf('\n') != -1) { // found a multi line value multiline = true; break; } } } } } } catch (final Exception e) { logger.warn("Failed to detect multiline property of CsvConfiguration, defaulting to 'true'", e); return multiLineConfiguration; } return new CsvConfiguration(CsvConfiguration.DEFAULT_COLUMN_NAME_LINE, columnNamingStategy, encoding, separatorChar, quoteChar, escapeChar, false, multiline); } private int indexOf(final char c, final char[] sampleChars) { for (int i = 0; i < sampleChars.length; i++) { if (c == sampleChars[i]) { return i; } } return -1; } protected char[] readSampleBuffer(final byte[] bytes, final String charSet) throws IllegalStateException { char[] buffer = new char[bytes.length]; Reader reader = null; try { reader = new InputStreamReader(new ByteArrayInputStream(bytes), charSet); // read a sample of the file to auto-detect quotes and separators final int bufferSize = reader.read(buffer); if (bufferSize != -1) { buffer = Arrays.copyOf(buffer, bufferSize); } } catch (final Exception e) { if (logger.isWarnEnabled()) { logger.warn("Error reading from file: " + e.getMessage(), e); } throw new IllegalStateException("Error reading from file: " + e.getMessage()); } finally { if (reader != null) { try { reader.close(); } catch (final IOException ioe) { logger.debug("Could not close reader", ioe); } } } return buffer; } public List<String> getColumnNames() { return _columnNames; } }