ImportAdapterCSV.java example

Explorer
ARX-master
- src
/*
 * ARX: Powerful Data Anonymization
 * Copyright 2014 - 2015 Karol Babioch, Fabian Prasser, Florian Kohlmayer
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.deidentifier.arx.io;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

import org.apache.commons.io.input.CountingInputStream;
import org.deidentifier.arx.DataType;

/**
 * Import adapter for CSV files
 * 
 * This adapter can import data from a CSV file. The CSV file itself is
 * described by an appropriate {@link ImportConfigurationCSV} object. Internally
 * this class makes use of {@link CSVDataInput} to read the CSV file on a line
 * by line basis. A counting input stream ({@link CountingInputStream}) is used in
 * order for {@link #getProgress()} to be able to return the percentage of data
 * that has already been processed.
 * 
 * @author Karol Babioch
 * @author Fabian Prasser
 */
public class ImportAdapterCSV extends ImportAdapter {

    /** The configuration describing the CSV file being used. */
    private final ImportConfigurationCSV config;

    /** The size of the CSV file in bytes, as reported by {@link File#length()}. */
    private final long                   bytesTotal;

    /**
     * Counting input stream
     * 
     * This is used within {@link #getProgress()} to be able to know how many
     * bytes have already been processed.
     */
    private CountingInputStream          cin;

    /**
     * Reader used to parse the CSV file.
     *
     * @see CSVDataInput
     */
    private CSVDataInput                 in;

    /**
     * Actual iterator used to go through data within CSV file.
     *
     * @see CSVDataInput#iterator()
     */
    private Iterator<String[]>           it;

    /**
     * Contains the last row as returned by {@link CSVDataInput#iterator()}, or
     * <code>null</code> once the file has been exhausted.
     *
     * This row cannot be simply returned, but needs to be further processed,
     * e.g. to return only selected columns.
     */
    private String[]                     row;

    /**
     * Indicates whether the first row has already been returned
     * 
     * The first row contains the name of the columns. Depending upon
     * {@link ImportConfigurationCSV#getContainsHeader()} and whether the name
     * of the column has been assigned explicitly, this is either the value of
     * the file itself, the value defined by the user, or a default value.
     */
    private boolean                      headerReturned = false;

    /**
     * Creates a new instance of this object with given configuration.
     *
     * The file is opened immediately and the first row is read in order to
     * build the header. If anything goes wrong during this initialization the
     * underlying file stream is closed again before the exception propagates.
     *
     * @param config {@link #config}
     * @throws IOException In case the file cannot be read, contains no data,
     *             or contains nothing but a header
     */
    protected ImportAdapterCSV(ImportConfigurationCSV config) throws IOException {

        super(config);
        this.config = config;

        File file = new File(config.getFileLocation());
        this.bytesTotal = file.length();

        /* Used to keep track of progress */
        cin = new CountingInputStream(new FileInputStream(file));

        boolean initialized = false;
        try {

            /* Get CSV iterator */
            in = new CSVDataInput(cin, config.getCharset(), config.getDelimiter(), config.getQuote(), config.getEscape(), config.getLinebreak());
            it = in.iterator();

            /* Check whether there is actual data within the CSV file */
            if (!it.hasNext()) {
                throw new IOException("CSV file contains no data");
            }
            row = it.next();
            if (config.getContainsHeader() && !it.hasNext()) {
                throw new IOException("CSV contains nothing but header");
            }

            // Create header
            header = createHeader();
            initialized = true;

        } finally {
            /* Do not leak the file handle if the adapter could not be set up */
            if (!initialized) {
                try {
                    cin.close();
                } catch (IOException ignored) {
                    /* The exception that aborted initialization is the one worth reporting */
                }
            }
        }
    }

    /**
     * Returns the percentage of data that has already been returned
     * 
     * This divides the amount of bytes that have already been read by the
     * amount of total bytes and casts the result into a percentage. If the
     * total size is not known (reported as zero), no meaningful percentage
     * can be computed and 0 is returned.
     *
     * @return Progress as a value between 0 and 100 (inclusive)
     */
    @Override
    public int getProgress() {

        /* Check whether stream has been opened already and its size is known */
        if (cin == null || bytesTotal <= 0) {
            return 0;
        }

        long bytesRead = cin.getByteCount();
        int percent = (int) ((double) bytesRead / (double) bytesTotal * 100d);

        /* Keep the result a valid percentage, e.g. if the file grew while reading */
        return Math.max(0, Math.min(100, percent));
    }

    /**
     * Indicates whether there is another element to return
     * 
     * This returns true when the CSV file has another line, which would be
     * assigned to {@link #row} during the last iteration of {@link #next()}.
     *
     * @return <code>true</code> if {@link #row} holds a pending row
     */
    @Override
    public boolean hasNext() {
        return row != null;
    }

    /**
     * Returns the next row
     * 
     * The returned element contains one value per imported column, taken from
     * the positions listed in {@link #indexes}. The first row will always
     * contain the names of the columns. {@link #headerReturned} is used to
     * keep track of that. Values that do not match the data type of their
     * column are replaced by {@link DataType#NULL_VALUE} if cleansing is
     * enabled for the column.
     *
     * @return The header on the first call, the next data row afterwards
     * @throws NoSuchElementException If there are no more rows
     * @throws IllegalArgumentException If a value does not match the data type
     *             of its column and cleansing is disabled, or if the record
     *             is shorter than required by the selected columns
     */
    @Override
    public String[] next() {

        /* Check whether header was already returned */
        if (!headerReturned) {
            headerReturned = true;
            return header;
        }

        /* Honour the iterator contract instead of failing with a NullPointerException */
        if (row == null) {
            throw new NoSuchElementException("No more rows in CSV file");
        }

        /* Create regular row */
        List<ImportColumn> columns = config.getColumns();
        String[] result;
        try {
            result = new String[indexes.length];
            for (int i = 0; i < indexes.length; i++) {
                String value = row[indexes[i]];
                if (!dataTypes[i].isValid(value)) {
                    if (columns.get(i).isCleansing()) {
                        value = DataType.NULL_VALUE;
                    } else {
                        throw new IllegalArgumentException("Data value does not match data type");
                    }
                }
                result[i] = value;
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            /* Record has fewer fields than the selected column indexes require */
            throw new IllegalArgumentException("Inconsistent length of header and records", e);
        }

        /* Fetches the next row, which will be used in next iteration */
        row = it.hasNext() ? it.next() : null;

        /* Return resulting row */
        return result;
    }

    /**
     * Not supported by this adapter.
     *
     * @throws UnsupportedOperationException Always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Creates the header row
     * 
     * This returns a string array with the names of the columns that will be
     * returned later on by iterating over this object. Depending upon the
     * configuration {@link ImportConfigurationCSV#getContainsHeader()} and
     * whether or not names have been assigned explicitly either the appropriate
     * values will be returned, or names will be made up on the fly following
     * the pattern "Column #x", where x is the index of the column within the
     * CSV file.
     * 
     * As a side effect this initializes {@link #indexes} and
     * {@link #dataTypes}, stores the resulting name as alias of each column
     * and, if the file contains a header, advances {@link #row} to the first
     * data row.
     *
     * @return The names of the imported columns
     */
    private String[] createHeader() {

        final boolean containsHeader = config.getContainsHeader();

        /* Preparation work */
        if (containsHeader) {
            this.config.prepare(row);
        }
        this.indexes = getIndexesToImport();
        this.dataTypes = getColumnDatatypes();

        /* Initialization */
        List<ImportColumn> columns = config.getColumns();
        String[] names = new String[columns.size()];

        /* Create header */
        for (int i = 0, len = columns.size(); i < len; i++) {

            ImportColumn column = columns.get(i);
            int index = ((ImportColumnCSV) column).getIndex();

            if (column.getAliasName() != null) {
                /* Name has been assigned explicitly */
                names[i] = column.getAliasName();
            } else if (containsHeader && !row[index].isEmpty()) {
                /* Assign name of CSV file itself */
                names[i] = row[index];
            } else {
                /* Nothing defined in header (or empty), build name manually */
                names[i] = "Column #" + index;
            }
            column.setAliasName(names[i]);
        }

        /* Fetch next row in preparation for next iteration */
        if (containsHeader) {
            row = it.hasNext() ? it.next() : null;
        }

        /* Return header */
        return names;
    }

    /**
     * Returns an array with indexes of columns that should be imported
     * 
     * Only columns listed within {@link ImportConfigurationCSV#getColumns()}
     * will be imported. This iterates over the list of columns and returns an
     * array with the index of each of them, in the order of the list.
     * 
     * @return Array containing indexes of columns that should be imported
     */
    protected int[] getIndexesToImport() {

        List<ImportColumn> columns = config.getColumns();
        int[] result = new int[columns.size()];

        /* Get indexes to import from */
        int position = 0;
        for (ImportColumn column : columns) {
            result[position++] = ((ImportColumnCSV) column).getIndex();
        }

        return result;
    }
}