CreateCsvFileAnalyzer.java example

Explorer
DataCleaner-master
/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.extension.output;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

import javax.inject.Inject;
import javax.inject.Named;

import org.apache.metamodel.csv.CsvConfiguration;
import org.apache.metamodel.csv.CsvDataContext;
import org.apache.metamodel.csv.CsvWriter;
import org.apache.metamodel.data.DataSet;
import org.apache.metamodel.data.Row;
import org.apache.metamodel.schema.Table;
import org.apache.metamodel.util.FileHelper;
import org.apache.metamodel.util.FileResource;
import org.apache.metamodel.util.Resource;
import org.datacleaner.api.Alias;
import org.datacleaner.api.Categorized;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.FileProperty;
import org.datacleaner.api.FileProperty.FileAccessMode;
import org.datacleaner.api.HasDistributionAdvice;
import org.datacleaner.api.HasLabelAdvice;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.Provided;
import org.datacleaner.api.Validate;
import org.datacleaner.beans.writers.WriteDataResult;
import org.datacleaner.beans.writers.WriteDataResultImpl;
import org.datacleaner.components.categories.WriteSuperCategory;
import org.datacleaner.connection.CsvDatastore;
import org.datacleaner.connection.Datastore;
import org.datacleaner.descriptors.FilterDescriptor;
import org.datacleaner.descriptors.TransformerDescriptor;
import org.datacleaner.job.builder.AnalysisJobBuilder;
import org.datacleaner.output.OutputWriter;
import org.datacleaner.output.csv.CsvOutputWriterFactory;
import org.datacleaner.user.UserPreferences;
import org.datacleaner.util.sort.SortMergeWriter;

import com.google.common.base.Strings;

@Named("Create CSV file")
@Alias("Write to CSV file")
@Description("Write data to a CSV file. CSV file writing is extremely fast and the file format is commonly "
        + "used in many tools. But CSV files do not preserve data types.")
@Categorized(superCategory = WriteSuperCategory.class)
public class CreateCsvFileAnalyzer extends AbstractOutputWriterAnalyzer
        implements HasLabelAdvice, HasDistributionAdvice {

    public static final String PROPERTY_FILE = "File";
    public static final String PROPERTY_OVERWRITE_FILE_IF_EXISTS = "Overwrite file if exists";
    public static final String PROPERTY_COLUMN_TO_BE_SORTED_ON = "Column to be sorted on";
    public static final String PROPERTY_INCLUDE_HEADER = "Include header";

    @Inject
    @Configured(value = PROPERTY_FILE, order = 1)
    @FileProperty(accessMode = FileAccessMode.SAVE, extension = { "csv", "tsv", "txt", "dat" })
    Resource file;

    @Inject
    @Configured(order = 2, required = false)
    char separatorChar = ',';

    @Inject
    @Configured(order = 3, required = false)
    Character quoteChar = '"';

    @Inject
    @Configured(order = 4, required = false)
    Character escapeChar = '\\';

    @Inject
    @Configured(order = 5, required = false, value = PROPERTY_INCLUDE_HEADER)
    boolean includeHeader = true;

    @Inject
    @Description("Character-set encoding to write data with")
    @Configured(order = 6, required = false)
    String encoding = FileHelper.DEFAULT_ENCODING;

    @Inject
    @Description("An optional column to sort all records with. Note that sorting can add substantial "
            + "performance penalties to the overall operation.")
    @Configured(order = 7, required = false, value = PROPERTY_COLUMN_TO_BE_SORTED_ON)
    InputColumn<?> columnToBeSortedOn;

    @Inject
    @Configured(value = PROPERTY_OVERWRITE_FILE_IF_EXISTS)
    boolean overwriteFileIfExists;

    @Inject
    @Provided
    UserPreferences userPreferences;

    private Resource _targetResource;
    private int _indexOfColumnToBeSortedOn = -1;
    private boolean _isColumnToBeSortedOnPresentInInput = true;

    @Initialize
    public void initTempFile() throws Exception {
        if (_targetResource == null) {
            if (columnToBeSortedOn != null) {
                _targetResource = new FileResource(File.createTempFile("csv_file_analyzer", ".csv"));
            } else {
                _targetResource = file;
            }
        }
    }

    @Override
    public String getSuggestedLabel() {
        if (file == null) {
            return null;
        }
        return file.getName();
    }

    @Validate
    public void validate() {
        if (!overwriteFileIfExists && file.isExists()) {
            throw new IllegalStateException(
                    "The file already exists. Please configure the job to overwrite the existing file.");
        }
    }

    @Override
    public void configureForFilterOutcome(final AnalysisJobBuilder ajb, final FilterDescriptor<?, ?> descriptor,
            final String categoryName) {
        final String dsName = ajb.getDatastore().getName();
        final File saveDatastoreDirectory = userPreferences.getSaveDatastoreDirectory();
        final String displayName = descriptor.getDisplayName();
        file = new FileResource(
                new File(saveDatastoreDirectory, dsName + "-" + displayName + "-" + categoryName + ".csv"));
    }

    @Override
    public void configureForTransformedData(final AnalysisJobBuilder ajb, final TransformerDescriptor<?> descriptor) {
        final String dsName = ajb.getDatastore().getName();
        final File saveDatastoreDirectory = userPreferences.getSaveDatastoreDirectory();
        final String displayName = descriptor.getDisplayName();
        file = new FileResource(new File(saveDatastoreDirectory, dsName + "-" + displayName + ".csv"));
    }

    @Override
    public OutputWriter createOutputWriter() {
        final List<String> headers = new ArrayList<>();
        for (int i = 0; i < columns.length; i++) {
            final String columnName = getColumnHeader(i);
            headers.add(columnName);
            if (columnToBeSortedOn != null) {
                if (columns[i].equals(columnToBeSortedOn)) {
                    _indexOfColumnToBeSortedOn = i;
                }
            }
        }

        if (columnToBeSortedOn != null) {
            if (_indexOfColumnToBeSortedOn == -1) {
                _isColumnToBeSortedOnPresentInInput = false;
                _indexOfColumnToBeSortedOn = columns.length;
                headers.add(columnToBeSortedOn.getName());
                final InputColumn<?>[] newColumns = new InputColumn<?>[columns.length + 1];
                for (int i = 0; i < columns.length; i++) {
                    newColumns[i] = columns[i];
                }
                newColumns[columns.length] = columnToBeSortedOn;
                columns = newColumns;
            }
        }

        if (_targetResource == null) {
            try {
                initTempFile();
            } catch (final Exception e) {
                throw new RuntimeException(e);
            }
        }

        return CsvOutputWriterFactory
                .getWriter(_targetResource, headers.toArray(new String[0]), getSafeEncoding(), separatorChar,
                        getSafeQuoteChar(), getSafeEscapeChar(), includeHeader, columns);
    }

    private String getSafeEncoding() {
        if (Strings.isNullOrEmpty(encoding)) {
            return FileHelper.DEFAULT_ENCODING;
        }
        return encoding;
    }

    private char getSafeQuoteChar() {
        if (quoteChar == null) {
            return CsvConfiguration.NOT_A_CHAR;
        }
        return quoteChar;
    }

    private char getSafeEscapeChar() {
        if (escapeChar == null) {
            return CsvConfiguration.NOT_A_CHAR;
        }
        return escapeChar;
    }

    private String getColumnHeader(final int index) {
        if (fields == null) {
            return columns[index].getName();
        }
        return fields[index];
    }

    @Override
    protected WriteDataResult getResultInternal(final int rowCount) {
        final CsvConfiguration csvConfiguration =
                new CsvConfiguration(CsvConfiguration.DEFAULT_COLUMN_NAME_LINE, getSafeEncoding(), separatorChar,
                        getSafeQuoteChar(), getSafeEscapeChar(), false, true);

        if (columnToBeSortedOn != null) {

            final CsvDataContext tempDataContext = new CsvDataContext(_targetResource, csvConfiguration);
            final Table table = tempDataContext.getDefaultSchema().getTable(0);

            final Comparator<? super Row> comparator =
                    SortHelper.createComparator(columnToBeSortedOn, _indexOfColumnToBeSortedOn);

            final CsvWriter csvWriter = new CsvWriter(csvConfiguration);
            final SortMergeWriter<Row, Writer> sortMergeWriter = new SortMergeWriter<Row, Writer>(comparator) {

                @Override
                protected void writeHeader(final Writer writer) throws IOException {
                    final List<String> headers = new ArrayList<>(Arrays.asList(table.getColumnNames()));
                    if (!_isColumnToBeSortedOnPresentInInput) {
                        headers.remove(columnToBeSortedOn.getName());
                    }

                    final String[] columnNames = headers.toArray(new String[0]);
                    final String line = csvWriter.buildLine(columnNames);
                    writer.write(line);
                }

                @Override
                protected void writeRow(final Writer writer, final Row row, final int count) throws IOException {
                    for (int i = 0; i < count; i++) {
                        final List<Object> valuesList = new ArrayList<>(Arrays.asList(row.getValues()));
                        if (!_isColumnToBeSortedOnPresentInInput) {
                            valuesList.remove(_indexOfColumnToBeSortedOn);
                        }

                        final Object[] values = valuesList.toArray(new Object[0]);

                        final String[] stringValues = new String[values.length];
                        for (int j = 0; j < stringValues.length; j++) {
                            final Object obj = values[j];
                            if (obj != null) {
                                stringValues[j] = obj.toString();
                            }
                        }
                        final String line = csvWriter.buildLine(stringValues);
                        writer.write(line);
                    }
                }

                @Override
                protected Writer createWriter(final Resource resource) {
                    final OutputStream outputStream = resource.write();
                    return FileHelper.getWriter(outputStream, getSafeEncoding());
                }
            };

            // read from the temp file and sort it into the final file
            try (DataSet dataSet = tempDataContext.query().from(table).selectAll().execute()) {
                while (dataSet.next()) {
                    final Row row = dataSet.getRow();
                    sortMergeWriter.append(row);
                }
            }

            sortMergeWriter.write(file);
        }

        final Datastore datastore = new CsvDatastore(file.getName(), file, csvConfiguration);
        return new WriteDataResultImpl(rowCount, datastore, null, null);
    }

    public void setFile(final File file) {
        this.file = new FileResource(file);
    }

    public void setFile(final Resource resource) {
        this.file = resource;
    }

    @Override
    public boolean isDistributable() {
        return columnToBeSortedOn == null;
    }
}