/*
* ARX: Powerful Data Anonymization
* Copyright 2014 - 2015 Karol Babioch, Fabian Prasser, Florian Kohlmayer
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deidentifier.arx.io;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.deidentifier.arx.DataType;
import org.deidentifier.arx.io.ImportConfigurationExcel.ExcelFileTypes;
/**
* Import adapter for Excel files
*
* This adapter can import data from Excel files. It handles both XLS and XLSX
* files. The file type itself is defined by {@link ImportConfigurationExcel}.
* The files are accessed using Apache POI.
*
* @author Karol Babioch
* @author Fabian Prasser
* @see <a href="https://poi.apache.org/">Aapache POI</a>
*/
public class ImportAdapterExcel extends ImportAdapter {
/** The configuration describing the Excel file. */
private ImportConfigurationExcel config;
/** Actual iterator used to go through data. */
private Iterator<Row> iterator;
/**
* Contains the last row as returned by the iterator.
*
* @note This row cannot be simply returned, but needs to be further
* processed, e.g. to return only selected columns.
*/
private Row row;
/**
* Indicates whether the first row has already been returned
*
* The first row contains the name of the columns and always needs to be
* returned first in order to guarantee that the framework will pick up the
* names correctly.
*/
private boolean headerReturned = false;
/** Number of rows within the specified sheet. */
private int totalRows;
/** Current row {@link lastRow} is referencing. */
private int currentRow = 0;
/** TODO */
private FileInputStream input;
/**
* Creates a new instance of this object with given configuration
*
* Depending upon the file type it either uses HSSF or XSSF to access the
* file. In both cases {@link #iterator} will be assigned a reference to
* an iterator, which can then be used to access the actual data on a row by
* row basis.
*
* @param config
* {@link #config}
*
* @throws IOException
* In case file doesn't contain actual data
*/
protected ImportAdapterExcel(ImportConfigurationExcel config) throws IOException {
super(config);
this.config = config;
/* Get row iterator */
input = new FileInputStream(config.getFileLocation());
Workbook workbook = null;
if (config.getExcelFileType() == ExcelFileTypes.XLS) {
workbook = new HSSFWorkbook(input);
} else if (config.getExcelFileType() == ExcelFileTypes.XLSX) {
workbook = new XSSFWorkbook(input);
} else {
input.close();
throw new IllegalArgumentException("File type not supported");
}
workbook.setMissingCellPolicy(Row.CREATE_NULL_AS_BLANK);
Sheet sheet = workbook.getSheetAt(config.getSheetIndex());
iterator = sheet.iterator();
/* Get total number of rows */
totalRows = sheet.getPhysicalNumberOfRows();
/* Check whether there is actual data within the file */
if (iterator.hasNext()) {
row = iterator.next();
if (config.getContainsHeader()) {
if (!iterator.hasNext()) {
throw new IOException("File contains nothing but header");
}
}
} else {
throw new IOException("File contains no data");
}
// Create header
header = createHeader();
}
/**
* Returns the percentage of data that has already been returned
*
* The basis for this calculation is the row currently being accessed.
*
* @return
* @see {@link #currentRow}
* @see {@link #totalRows}
*/
@Override
public int getProgress() {
return (int) (((double) currentRow / (double) totalRows) * 100d);
}
/**
* Indicates whether there is another element to return
*
* This returns true when the file contains another line, which could be
* accessed by {@link #iterator}.
*
* @return
* @note {@link #row} effectively works as buffer and will always be set
* up by the previous iteration, so once there is no data, it will be
* assigned <code>null</code>, which is checked for here.
*/
@Override
public boolean hasNext() {
return row != null;
}
/**
* Returns the next row
*
* The returned element is sorted as defined by {@link ImportColumn#index} and contains as many elements as there are columns selected to import
* from {@link #indexes}. The first row will always contain the names of the
* columns. {@link #headerReturned} is used to keep track of that.
*
* @return
*/
@Override
public String[] next() {
/* Check whether header was already returned */
if (!headerReturned) {
headerReturned = true;
return header;
}
/* Create regular row */
String[] result = new String[indexes.length];
for (int i = 0; i < indexes.length; i++) {
row.getCell(indexes[i]).setCellType(Cell.CELL_TYPE_STRING);
result[i] = IOUtil.trim(row.getCell(indexes[i]).getStringCellValue());
if (!dataTypes[i].isValid(result[i])) {
if (config.columns.get(i).isCleansing()) {
result[i] = DataType.NULL_VALUE;
} else {
throw new IllegalArgumentException("Data value does not match data type");
}
}
}
/* Fetches the next row, which will be used in next iteration */
if (iterator.hasNext()) {
row = iterator.next();
currentRow++;
} else {
row = null;
try {
input.close();
} catch (Exception e) {
/* Die silently */
}
}
/* Return resulting row */
return result;
}
/**
* Dummy.
*/
@Override
public void remove() {
throw new UnsupportedOperationException();
}
/**
* Creates the header row
*
* This returns a string array with the names of the columns that will be
* returned later on by iterating over this object. Depending upon the
* configuration {@link ImportConfigurationExcel#getContainsHeader()} and
* whether or not names have been assigned explicitly either the appropriate
* values will be returned, or names will be made up on the fly following
* the pattern "Column #x", where x is incremented for each column.
*
* @return
*/
private String[] createHeader() {
/* Preparation work */
if (config.getContainsHeader()) {
config.prepare(row);
}
indexes = getIndexesToImport();
dataTypes = getColumnDatatypes();
/* Initialization */
String[] header = new String[config.getColumns().size()];
List<ImportColumn> columns = config.getColumns();
/* Create header */
for (int i = 0, len = columns.size(); i < len; i++) {
ImportColumn column = columns.get(i);
row.getCell(((ImportColumnExcel) column).getIndex()).setCellType(Cell.CELL_TYPE_STRING);
String name = IOUtil.trim(row.getCell(((ImportColumnExcel) column).getIndex()).getStringCellValue());
if (config.getContainsHeader() && !name.equals("")) {
/* Assign name of file itself */
header[i] = name;
} else {
/* Nothing defined in header (or empty), build name manually */
header[i] = "Column #" + ((ImportColumnExcel) column).getIndex();
}
if (column.getAliasName() != null) {
/* Name has been assigned explicitly */
header[i] = column.getAliasName();
}
column.setAliasName(header[i]);
}
/* Fetch next row in preparation for next iteration */
if (config.getContainsHeader()) {
if (iterator.hasNext()) {
row = iterator.next();
currentRow++;
} else {
row = null;
}
}
/* Return header */
return header;
}
/**
* Returns an array with indexes of columns that should be imported
*
* Only columns listed within {@link #column} will be imported. This
* iterates over the list of columns and returns an array with indexes of
* columns that should be imported.
*
* @return Array containing indexes of columns that should be imported
*/
protected int[] getIndexesToImport() {
/* Get indexes to import from */
ArrayList<Integer> indexes = new ArrayList<Integer>();
for (ImportColumn column : config.getColumns()) {
indexes.add(((ImportColumnExcel) column).getIndex());
}
int[] result = new int[indexes.size()];
for (int i = 0; i < result.length; i++) {
result[i] = indexes.get(i);
}
return result;
}
}