ImportAdapterExcel.java example

Explorer
ARX-master
- src
/*
 * ARX: Powerful Data Anonymization
 * Copyright 2014 - 2015 Karol Babioch, Fabian Prasser, Florian Kohlmayer
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.deidentifier.arx.io;

import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.deidentifier.arx.DataType;
import org.deidentifier.arx.io.ImportConfigurationExcel.ExcelFileTypes;

/**
 * Import adapter for Excel files
 * 
 * This adapter can import data from Excel files. It handles both XLS and XLSX
 * files. The file type itself is defined by {@link ImportConfigurationExcel}.
 * The files are accessed using Apache POI.
 *
 * @author Karol Babioch
 * @author Fabian Prasser
 * @see <a href="https://poi.apache.org/">Aapache POI</a>
 */
public class ImportAdapterExcel extends ImportAdapter {

    /** The configuration describing the Excel file. */
    private ImportConfigurationExcel config;

    /** Actual iterator used to go through data. */
    private Iterator<Row>            iterator;

    /**
     * Contains the last row as returned by the iterator.
     *
     * @note This row cannot be simply returned, but needs to be further
     *       processed, e.g. to return only selected columns.
     */
    private Row                      row;

    /**
     * Indicates whether the first row has already been returned
     * 
     * The first row contains the name of the columns and always needs to be
     * returned first in order to guarantee that the framework will pick up the
     * names correctly.
     */
    private boolean                  headerReturned = false;

    /** Number of rows within the specified sheet. */
    private int                      totalRows;

    /** Current row {@link lastRow} is referencing. */
    private int                      currentRow     = 0;

    /** TODO */
    private FileInputStream          input;

    /**
     * Creates a new instance of this object with given configuration
     * 
     * Depending upon the file type it either uses HSSF or XSSF to access the
     * file. In both cases {@link #iterator} will be assigned a reference to
     * an iterator, which can then be used to access the actual data on a row by
     * row basis.
     * 
     * @param config
     *            {@link #config}
     * 
     * @throws IOException
     *             In case file doesn't contain actual data
     */
    protected ImportAdapterExcel(ImportConfigurationExcel config) throws IOException {

        super(config);
        this.config = config;

        /* Get row iterator */
        input = new FileInputStream(config.getFileLocation());
        Workbook workbook = null;

        if (config.getExcelFileType() == ExcelFileTypes.XLS) {
            workbook = new HSSFWorkbook(input);
        } else if (config.getExcelFileType() == ExcelFileTypes.XLSX) {
            workbook = new XSSFWorkbook(input);
        } else {
            input.close();
            throw new IllegalArgumentException("File type not supported");
        }

        workbook.setMissingCellPolicy(Row.CREATE_NULL_AS_BLANK);
        Sheet sheet = workbook.getSheetAt(config.getSheetIndex());
        iterator = sheet.iterator();

        /* Get total number of rows */
        totalRows = sheet.getPhysicalNumberOfRows();

        /* Check whether there is actual data within the file */
        if (iterator.hasNext()) {

            row = iterator.next();
            if (config.getContainsHeader()) {
                if (!iterator.hasNext()) {
                    throw new IOException("File contains nothing but header");
                }
            }
        } else {
            throw new IOException("File contains no data");
        }

        // Create header
        header = createHeader();
    }

    /**
     * Returns the percentage of data that has already been returned
     * 
     * The basis for this calculation is the row currently being accessed.
     *
     * @return
     * @see {@link #currentRow}
     * @see {@link #totalRows}
     */
    @Override
    public int getProgress() {
        return (int) (((double) currentRow / (double) totalRows) * 100d);
    }

    /**
     * Indicates whether there is another element to return
     * 
     * This returns true when the file contains another line, which could be
     * accessed by {@link #iterator}.
     *
     * @return
     * @note {@link #row} effectively works as buffer and will always be set
     *       up by the previous iteration, so once there is no data, it will be
     *       assigned <code>null</code>, which is checked for here.
     */
    @Override
    public boolean hasNext() {
        return row != null;
    }

    /**
     * Returns the next row
     * 
     * The returned element is sorted as defined by {@link ImportColumn#index} and contains as many elements as there are columns selected to import
     * from {@link #indexes}. The first row will always contain the names of the
     * columns. {@link #headerReturned} is used to keep track of that.
     *
     * @return
     */
    @Override
    public String[] next() {

        /* Check whether header was already returned */
        if (!headerReturned) {
            headerReturned = true;
            return header;
        }

        /* Create regular row */
        String[] result = new String[indexes.length];
        for (int i = 0; i < indexes.length; i++) {

            row.getCell(indexes[i]).setCellType(Cell.CELL_TYPE_STRING);
            result[i] = IOUtil.trim(row.getCell(indexes[i]).getStringCellValue());

            if (!dataTypes[i].isValid(result[i])) {
                if (config.columns.get(i).isCleansing()) {
                    result[i] = DataType.NULL_VALUE;
                } else {
                    throw new IllegalArgumentException("Data value does not match data type");
                }
            }
        }

        /* Fetches the next row, which will be used in next iteration */
        if (iterator.hasNext()) {
            row = iterator.next();
            currentRow++;
        } else {
            row = null;
            try {
                input.close();
            } catch (Exception e) {
                /* Die silently */
            }
        }

        /* Return resulting row */
        return result;
    }

    /**
     * Dummy.
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Creates the header row
     * 
     * This returns a string array with the names of the columns that will be
     * returned later on by iterating over this object. Depending upon the
     * configuration {@link ImportConfigurationExcel#getContainsHeader()} and
     * whether or not names have been assigned explicitly either the appropriate
     * values will be returned, or names will be made up on the fly following
     * the pattern "Column #x", where x is incremented for each column.
     *
     * @return
     */
    private String[] createHeader() {

        /* Preparation work */
        if (config.getContainsHeader()) {
            config.prepare(row);
        }
        indexes = getIndexesToImport();
        dataTypes = getColumnDatatypes();

        /* Initialization */
        String[] header = new String[config.getColumns().size()];
        List<ImportColumn> columns = config.getColumns();

        /* Create header */
        for (int i = 0, len = columns.size(); i < len; i++) {

            ImportColumn column = columns.get(i);

            row.getCell(((ImportColumnExcel) column).getIndex()).setCellType(Cell.CELL_TYPE_STRING);
            String name = IOUtil.trim(row.getCell(((ImportColumnExcel) column).getIndex()).getStringCellValue());

            if (config.getContainsHeader() && !name.equals("")) {
                /* Assign name of file itself */
                header[i] = name;
            } else {
                /* Nothing defined in header (or empty), build name manually */
                header[i] = "Column #" + ((ImportColumnExcel) column).getIndex();
            }

            if (column.getAliasName() != null) {
                /* Name has been assigned explicitly */
                header[i] = column.getAliasName();
            }

            column.setAliasName(header[i]);
        }

        /* Fetch next row in preparation for next iteration */
        if (config.getContainsHeader()) {

            if (iterator.hasNext()) {
                row = iterator.next();
                currentRow++;
            } else {
                row = null;
            }
        }

        /* Return header */
        return header;
    }

    /**
     * Returns an array with indexes of columns that should be imported
     * 
     * Only columns listed within {@link #column} will be imported. This
     * iterates over the list of columns and returns an array with indexes of
     * columns that should be imported.
     * 
     * @return Array containing indexes of columns that should be imported
     */
    protected int[] getIndexesToImport() {

        /* Get indexes to import from */
        ArrayList<Integer> indexes = new ArrayList<Integer>();
        for (ImportColumn column : config.getColumns()) {
            indexes.add(((ImportColumnExcel) column).getIndex());
        }

        int[] result = new int[indexes.size()];
        for (int i = 0; i < result.length; i++) {
            result[i] = indexes.get(i);
        }
        return result;
    }
}