/*
* ARX: Powerful Data Anonymization
* Copyright 2014 - 2015 Karol Babioch, Fabian Prasser, Florian Kohlmayer
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deidentifier.arx.io;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import org.apache.commons.io.input.CountingInputStream;
import org.deidentifier.arx.DataType;
/**
* Import adapter for CSV files
*
* This adapter can import data from a CSV file. The CSV file itself is
* described by an appropriate {@link ImportConfigurationCSV} object. Internally
* this class makes use of {@link CSVDataInput} to read the CSV file on a line
* by line basis. A counting input stream ({@link CountingInputStream}) is used in
* order for {@link #getProgress()} to be able to return the percentage of data
* that has already been processed.
*
* @author Karol Babioch
* @author Fabian Prasser
*/
public class ImportAdapterCSV extends ImportAdapter {

    /** The configuration describing the CSV file being used. */
    private ImportConfigurationCSV config;

    /** The size of the CSV file in bytes, as reported by {@link File#length()}. */
    private long bytesTotal;

    /**
     * Counting input stream wrapped around the CSV file.
     *
     * This is used within {@link #getProgress()} to know how many bytes have
     * already been consumed. It is closed once the last row has been read or
     * when construction fails.
     */
    private CountingInputStream cin;

    /**
     * CSV parser reading from {@link #cin}.
     *
     * @see CSVDataInput
     */
    private CSVDataInput in;

    /**
     * Actual iterator used to go through the data within the CSV file.
     *
     * @see CSVDataInput#iterator()
     */
    private Iterator<String[]> it;

    /**
     * Contains the row most recently fetched from {@link #it}, or
     * <code>null</code> once the file has been exhausted.
     *
     * This row cannot simply be returned as is, but needs to be processed
     * further, e.g. to return only the selected columns.
     */
    private String[] row;

    /**
     * Indicates whether the first row has already been returned.
     *
     * The first row returned by {@link #next()} contains the names of the
     * columns. Depending upon {@link ImportConfigurationCSV#getContainsHeader()}
     * and whether a name has been assigned explicitly, each name is either
     * the value from the file itself, the value defined by the user, or a
     * default value.
     */
    private boolean headerReturned = false;

    /**
     * Creates a new instance of this object with the given configuration.
     *
     * Opens the file, reads the first line (and checks that at least one data
     * line follows a header line) and builds the header. If anything goes
     * wrong during this set-up, the file stream is closed before the
     * exception is propagated.
     *
     * @param config The configuration describing the CSV file
     * @throws IOException In case the file cannot be read or doesn't contain
     *             actual data
     */
    protected ImportAdapterCSV(ImportConfigurationCSV config) throws IOException {

        super(config);
        this.config = config;

        File file = new File(config.getFileLocation());
        this.bytesTotal = file.length();

        /* Used to keep track of progress */
        cin = new CountingInputStream(new FileInputStream(file));

        boolean initialized = false;
        try {

            /* Get CSV iterator */
            in = new CSVDataInput(cin, config.getCharset(), config.getDelimiter(), config.getQuote(), config.getEscape(), config.getLinebreak());
            it = in.iterator();

            /* Check whether there is actual data within the CSV file */
            if (!it.hasNext()) {
                throw new IOException("CSV file contains no data");
            }
            row = it.next();
            if (config.getContainsHeader() && !it.hasNext()) {
                throw new IOException("CSV contains nothing but header");
            }

            /* Create header */
            header = createHeader();
            initialized = true;

        } finally {
            /* Don't leak the file handle when construction fails */
            if (!initialized) {
                closeStream();
            }
        }
    }

    /**
     * Returns the percentage of data that has already been read.
     *
     * This divides the number of bytes that have already been read from the
     * file by the total number of bytes and converts the result into a
     * percentage. The result is capped at 100.
     *
     * @return A value between 0 and 100; 0 if the stream has not been opened
     *         or the file size is reported as zero
     */
    @Override
    public int getProgress() {

        /* No stream yet, or no meaningful total: avoid division by zero */
        if (cin == null || bytesTotal <= 0) {
            return 0;
        }

        long bytesRead = cin.getByteCount();
        int percent = (int) ((double) bytesRead / (double) bytesTotal * 100d);
        return Math.min(100, percent);
    }

    /**
     * Indicates whether there is another element to return.
     *
     * This returns true as long as a row has been prefetched into
     * {@link #row}, either by the constructor or by the previous call to
     * {@link #next()}.
     *
     * @return true if {@link #row} holds a row that has not been returned yet
     */
    @Override
    public boolean hasNext() {
        return row != null;
    }

    /**
     * Returns the next row.
     *
     * The first call returns the header, i.e. the names of the columns;
     * {@link #headerReturned} keeps track of that. Every following call
     * returns one data row containing exactly the values of the columns
     * selected for import, in the order of the configured columns. Values
     * that are not valid for the data type of their column are replaced by
     * {@link DataType#NULL_VALUE} if cleansing is enabled for the column.
     *
     * @return The header on first call, the next data row afterwards
     * @throws NoSuchElementException If there are no more rows
     * @throws IllegalArgumentException If a value doesn't match the data type
     *             of its column and cleansing is disabled, or if the row has
     *             fewer fields than required by the selected columns
     */
    @Override
    public String[] next() {

        /* Check whether header was already returned */
        if (!headerReturned) {
            headerReturned = true;
            return header;
        }

        /* Honor the iterator contract instead of failing on a null row */
        if (row == null) {
            throw new NoSuchElementException("No more rows in CSV file");
        }

        /* Create regular row */
        String[] result = new String[indexes.length];
        try {
            for (int i = 0; i < indexes.length; i++) {
                String value = row[indexes[i]];
                if (!dataTypes[i].isValid(value)) {
                    if (config.getColumns().get(i).isCleansing()) {
                        value = DataType.NULL_VALUE;
                    } else {
                        throw new IllegalArgumentException("Data value does not match data type");
                    }
                }
                result[i] = value;
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            throw new IllegalArgumentException("Inconsistent length of header and records", e);
        }

        /* Fetch the next row, which will be used in the next iteration */
        advance();

        /* Return resulting row */
        return result;
    }

    /**
     * Not supported by this adapter.
     *
     * @throws UnsupportedOperationException Always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Creates the header row.
     *
     * This returns a string array with the names of the columns that will be
     * returned later on by iterating over this object. For each column the
     * name is chosen as follows: an explicitly assigned alias wins; otherwise,
     * if {@link ImportConfigurationCSV#getContainsHeader()} is set and the
     * header line has a non-empty value at the column's index, that value is
     * used; otherwise a name following the pattern "Column #x" is made up,
     * where x is the index of the column. The chosen name is also stored as
     * the alias of the column.
     *
     * As a side effect this initializes the index and data type arrays and,
     * if the file has a header line, advances {@link #row} to the first data
     * row.
     *
     * @return The names of the columns, in the order of the configured columns
     */
    private String[] createHeader() {

        /* Preparation work */
        if (config.getContainsHeader()) {
            this.config.prepare(row);
        }
        this.indexes = getIndexesToImport();
        this.dataTypes = getColumnDatatypes();

        /* Initialization */
        List<ImportColumn> columns = config.getColumns();
        String[] names = new String[columns.size()];

        /* Create header */
        for (int i = 0; i < names.length; i++) {

            ImportColumn column = columns.get(i);
            String name = column.getAliasName();

            /* Only derive a name if none has been assigned explicitly */
            if (name == null) {
                int index = ((ImportColumnCSV) column).getIndex();
                if (config.getContainsHeader() && index < row.length && !row[index].equals("")) {
                    /* Assign name from the CSV file itself */
                    name = row[index];
                } else {
                    /* Nothing defined in header (or empty), build name manually */
                    name = "Column #" + index;
                }
            }

            names[i] = name;
            column.setAliasName(name);
        }

        /* Skip the header line in preparation for the first data row */
        if (config.getContainsHeader()) {
            advance();
        }

        /* Return header */
        return names;
    }

    /**
     * Returns an array with the indexes of the columns that should be imported.
     *
     * This iterates over the columns of the configuration and collects the
     * index of each of them, preserving their order.
     *
     * @return Array containing indexes of columns that should be imported
     */
    protected int[] getIndexesToImport() {

        List<ImportColumn> columns = config.getColumns();
        int[] result = new int[columns.size()];
        int position = 0;
        for (ImportColumn column : columns) {
            result[position++] = ((ImportColumnCSV) column).getIndex();
        }
        return result;
    }

    /**
     * Prefetches the next row from the underlying iterator into {@link #row}.
     *
     * When the iterator is exhausted, {@link #row} is set to
     * <code>null</code> and the file stream is closed.
     */
    private void advance() {
        if (it.hasNext()) {
            row = it.next();
        } else {
            row = null;
            closeStream();
        }
    }

    /**
     * Closes the file stream, if it has been opened.
     */
    private void closeStream() {
        if (cin == null) {
            return;
        }
        try {
            cin.close();
        } catch (IOException ignored) {
            /*
             * Best effort: this is only called after all data has been read
             * or while another exception is already being propagated.
             */
        }
    }
}