/* * ARX: Powerful Data Anonymization * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.deidentifier.arx.io; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; import java.util.Iterator; import org.apache.poi.ss.formula.functions.T; import org.deidentifier.arx.DataType; import com.univocity.parsers.csv.CsvFormat; import com.univocity.parsers.csv.CsvParser; import com.univocity.parsers.csv.CsvParserSettings; /** * This class implements a reader for CSV encoded information. * * @author Fabian Prasser * @author Florian Kohlmayer */ public class CSVDataInput { /** * Static helper class for lazy initialization of a read * * @author Fabian Prasser * @author Florian Kohlmayer */ private static class LazyFileReader extends Reader { /** Reader */ private InputStreamReader reader = null; /** File */ private final File file; /** Charset */ private final Charset charset; /** * Creates a new instance * * @param file */ public LazyFileReader(File file, Charset charset) { this.file = file; this.charset = charset; } @Override public void close() throws IOException { if (reader != null) { reader.close(); } } @Override public int read(char[] cbuf, int off, int len) throws IOException { reader = reader != null ? reader : new InputStreamReader(new FileInputStream(file), charset); return reader.read(cbuf, off, len); } } /** A reader. */ private final Reader reader; /** Settings. */ private final CsvParserSettings settings; /** Cleanisng enabled */ private final boolean cleansing; /** The data type for each column */ private final DataType<T>[] datatypes; /** * Instantiate. * * @param file the file * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final File file, final Charset charset) throws IOException { this(file, charset, CSVSyntax.DEFAULT_DELIMITER); } /** * Instantiate. * * @param file the file * @param delimiter the delimiter * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final File file, final Charset charset, final char delimiter) throws IOException { this(file, charset, delimiter, CSVSyntax.DEFAULT_QUOTE); } /** * Instantiate. * * @param file the file * @param delimiter the delimiter * @param quote the quote * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final File file, final Charset charset, final char delimiter, final char quote) throws IOException { this(file, charset, delimiter, quote, CSVSyntax.DEFAULT_ESCAPE); } /** * Instantiate. * * @param file the file * @param delimiter the delimiter * @param quote the quote * @param escape the escape * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final File file, final Charset charset, final char delimiter, final char quote, final char escape) throws IOException { this(file, charset, delimiter, quote, escape, CSVSyntax.DEFAULT_LINEBREAK); } /** * Instantiate. * * @param file the file * @param delimiter the delimiter * @param quote the quote * @param escape the escape * @param linebreak the linebreak * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final File file, final Charset charset, final char delimiter, final char quote, final char escape, final char[] linebreak) throws IOException { this(new LazyFileReader(file, charset), delimiter, quote, escape, linebreak, null); } /** * Instantiate. * * @param file the file * @param config the config * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final File file, final Charset charset, final CSVSyntax config) throws IOException { this(file, charset, config, null); } /** * Instatiate. * * @param file * @param config * @param datatype * @throws IOException */ public CSVDataInput(final File file, final Charset charset, final CSVSyntax config, final DataType<T>[] datatype) throws IOException { this(new LazyFileReader(file, charset), config.getDelimiter(), config.getQuote(), config.getEscape(), config.getLinebreak(), datatype); } /** * Instantiate. * * @param stream the stream * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final InputStream stream, final Charset charset) throws IOException { this(stream, charset, CSVSyntax.DEFAULT_DELIMITER); } /** * Instantiate. * * @param stream the stream * @param delimiter the delimiter * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final InputStream stream, final Charset charset, final char delimiter) throws IOException { this(stream, charset, delimiter, CSVSyntax.DEFAULT_QUOTE); } /** * Instantiate. * * @param stream the stream * @param delimiter the delimiter * @param quote the quote * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final InputStream stream, final Charset charset, final char delimiter, final char quote) throws IOException { this(stream, charset, delimiter, quote, CSVSyntax.DEFAULT_ESCAPE); } /** * Instantiate. * * @param stream the stream * @param delimiter the delimiter * @param quote the quote * @param escape the escape * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final InputStream stream, final Charset charset, final char delimiter, final char quote, final char escape) throws IOException { this(stream, charset, delimiter, quote, escape, CSVSyntax.DEFAULT_LINEBREAK); } /** * Instantiate. * * @param stream the stream * @param delimiter the delimiter * @param quote the quote * @param escape the escape * @param linebreak the linebreak * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final InputStream stream, final Charset charset, final char delimiter, final char quote, final char escape, final char[] linebreak) throws IOException { this(new InputStreamReader(stream, charset), delimiter, quote, escape, linebreak, null); } /** * Instantiate. * * @param stream the stream * @param config the config * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final InputStream stream, final Charset charset, final CSVSyntax config) throws IOException { this(stream, charset, config, null); } /** * Instantiate. * * @param stream * @param config * @param datatypes * @throws IOException */ public CSVDataInput(final InputStream stream, final Charset charset, final CSVSyntax config, final DataType<T>[] datatypes) throws IOException { this(new InputStreamReader(stream, charset), config.getDelimiter(), config.getQuote(), config.getEscape(), config.getLinebreak(), datatypes); } /** * Instantiate. * * @param reader the reader * @param delimiter the delimiter * @param quote the quote * @param escape the escape * @param linebreak the linebreak * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final Reader reader, final char delimiter, final char quote, final char escape, final char[] linebreak, final DataType<T>[] datatypes) throws IOException { this.reader = reader; this.datatypes = datatypes; if (datatypes != null) { cleansing = true; } else { cleansing = false; } settings = createSettings(delimiter, quote, escape, linebreak); } /** * Instantiate. * * @param filename the filename * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final String filename, final Charset charset) throws IOException { this(filename, charset, CSVSyntax.DEFAULT_DELIMITER); } /** * Instantiate. * * @param filename the filename * @param delimiter the delimiter * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final String filename, final Charset charset, final char delimiter) throws IOException { this(filename, charset, delimiter, CSVSyntax.DEFAULT_QUOTE); } /** * Instantiate. * * @param filename the filename * @param delimiter the delimiter * @param quote the quote * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final String filename, final Charset charset, final char delimiter, final char quote) throws IOException { this(filename, charset, delimiter, quote, CSVSyntax.DEFAULT_ESCAPE); } /** * Instantiate. * * @param filename the filename * @param delimiter the delimiter * @param quote the quote * @param escape the escape * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final String filename, final Charset charset, final char delimiter, final char quote, final char escape) throws IOException { this(filename, charset, delimiter, quote, escape, CSVSyntax.DEFAULT_LINEBREAK); } /** * Instantiate. * * @param filename the filename * @param delimiter the delimiter * @param quote the quote * @param escape the escape * @param linebreak the linebreak * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final String filename, final Charset charset, final char delimiter, final char quote, final char escape, final char[] linebreak) throws IOException { this(new File(filename), charset, delimiter, quote, escape, linebreak); } /** * Instantiate. * * @param filename the filename * @param config the config * @throws IOException Signals that an I/O exception has occurred. */ public CSVDataInput(final String filename, final Charset charset, final CSVSyntax config) throws IOException { this(filename, charset, config, null); } /** * Instantiate. * * @param filename * @param config * @param datatypes * @throws IOException */ public CSVDataInput(final String filename, final Charset charset, final CSVSyntax config, final DataType<T>[] datatypes) throws IOException { this(new LazyFileReader(new File(filename), charset), config.getDelimiter(), config.getQuote(), config.getEscape(), config.getLinebreak(), datatypes); } /** * Closes the reader. * * @throws IOException Signals that an I/O exception has occurred. */ public void close() throws IOException { reader.close(); } /** * Returns an iterator. * * @return the iterator */ public Iterator<String[]> iterator() { return new Iterator<String[]>() { // Next tuple CsvParser parser = null; String[] next = null; @Override public boolean hasNext() { initParser(); return next != null; } @Override public String[] next() { initParser(); String[] result = next; next = parser.parseNext(); // Replace each non matching value with the special NULL string if (cleansing) { if (result.length != datatypes.length) { throw new IllegalArgumentException("More columns available in CSV file than data types specified!"); } for (int i = 0; i < result.length; i++) { if (!datatypes[i].isValid(result[i])) { result[i] = DataType.NULL_VALUE; } } } return result; } @Override public void remove() { throw new UnsupportedOperationException("Not implemented"); } /** Initializes the parser*/ private void initParser() { if (parser == null) { parser = new CsvParser(settings); parser.beginParsing(reader); next = parser.parseNext(); } } }; } /** * Creates the CsvParserSettings. * * @param delimiter the delimiter * @param quote the quote * @param escape the escape * @param linebreak the linebreak * @return the csv parser settings */ private CsvParserSettings createSettings(final char delimiter, final char quote, final char escape, final char[] linebreak) { CsvFormat format = new CsvFormat(); format.setDelimiter(delimiter); format.setQuote(quote); format.setQuoteEscape(escape); format.setLineSeparator(linebreak); format.setNormalizedNewline(CSVSyntax.getNormalizedLinebreak(linebreak)); format.setComment('\0'); CsvParserSettings settings = new CsvParserSettings(); settings.setEmptyValue(""); settings.setNullValue(""); settings.setFormat(format); return settings; } }