/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.extension.output; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.List; import javax.inject.Named; import org.apache.commons.io.FilenameUtils; import org.apache.metamodel.DataContext; import org.apache.metamodel.UpdateableDataContext; import org.apache.metamodel.csv.CsvConfiguration; import org.apache.metamodel.csv.CsvDataContext; import org.apache.metamodel.data.DataSet; import org.apache.metamodel.data.Row; import org.apache.metamodel.drop.DropTable; import org.apache.metamodel.schema.Table; import org.apache.metamodel.util.FileHelper; import org.apache.metamodel.util.FileResource; import org.apache.metamodel.util.Resource; import org.datacleaner.api.Alias; import org.datacleaner.api.Categorized; import org.datacleaner.api.Configured; import org.datacleaner.api.Description; import org.datacleaner.api.Distributed; import org.datacleaner.api.FileProperty; import org.datacleaner.api.FileProperty.FileAccessMode; import org.datacleaner.api.HasLabelAdvice; import org.datacleaner.api.Initialize; import org.datacleaner.api.InputColumn; import org.datacleaner.api.Validate; import org.datacleaner.beans.writers.WriteDataResult; import org.datacleaner.beans.writers.WriteDataResultImpl; import org.datacleaner.components.categories.WriteSuperCategory; import org.datacleaner.connection.Datastore; import org.datacleaner.connection.DatastoreConnection; import org.datacleaner.connection.ExcelDatastore; import org.datacleaner.connection.UpdateableDatastoreConnection; import org.datacleaner.descriptors.FilterDescriptor; import org.datacleaner.descriptors.TransformerDescriptor; import org.datacleaner.job.builder.AnalysisJobBuilder; import org.datacleaner.output.OutputWriter; import org.datacleaner.output.csv.CsvOutputWriterFactory; import org.datacleaner.output.excel.ExcelOutputWriterFactory; import org.datacleaner.util.sort.SortMergeWriter; @Named("Create Excel spreadsheet") @Alias("Write to Excel spreadsheet") @Description( "Write data to an Excel spreadsheet, useful for manually editing and inspecting the data in Microsoft Excel.") @Categorized(superCategory = WriteSuperCategory.class) @Distributed(false) public class CreateExcelSpreadsheetAnalyzer extends AbstractOutputWriterAnalyzer implements HasLabelAdvice { public static final String PROPERTY_FILE = "File"; public static final String PROPERTY_SHEET_NAME = "Sheet name"; public static final String PROPERTY_OVERWRITE_SHEET_IF_EXISTS = "Overwrite sheet if exists"; private static final String[] excelExtension = { "xlsx", "xls" }; // excel breaks when sheet names are longer than 31 chars - see // https://github.com/datacleaner/DataCleaner/issues/864 private static final int SHEET_NAME_MAX_LENGTH = 31; private static final char[] SHEET_NAME_ILLEGAL_CHARS = new char[] { '.', ':' }; private final Character separatorChar = ','; private final Character quoteChar = '"'; private final Character escapeChar = '\\'; private final boolean includeHeader = true; @Configured(PROPERTY_FILE) @FileProperty(accessMode = FileAccessMode.SAVE, extension = { "xls", "xlsx" }) File file = new File("DataCleaner-staging.xlsx"); @Configured(PROPERTY_SHEET_NAME) String sheetName; @Configured(PROPERTY_OVERWRITE_SHEET_IF_EXISTS) boolean overwriteSheetIfExists; @Configured(order = 1, required = false) InputColumn<?> columnToBeSortedOn; private File _targetFile; private int indexOfColumnToBeSortedOn = -1; private boolean isColumnToBeSortedOnPresentInInput = true; @Initialize public void initTempFile() throws Exception { if (_targetFile == null) { if (columnToBeSortedOn != null) { _targetFile = File.createTempFile("csv_file_analyzer", ".csv"); } else { _targetFile = file; } } } @Override public String getSuggestedLabel() { if (file == null || sheetName == null) { return null; } return file.getName() + " - " + sheetName; } @Validate public void validate() { sheetName = sheetName.trim(); if (sheetName.length() > SHEET_NAME_MAX_LENGTH) { throw new IllegalStateException("Sheet name must be maximum " + SHEET_NAME_MAX_LENGTH + " characters long"); } for (final char c : SHEET_NAME_ILLEGAL_CHARS) { if (sheetName.indexOf(c) != -1) { throw new IllegalStateException("Sheet name cannot contain '" + c + "'"); } } if (!overwriteSheetIfExists && file.exists()) { final Datastore datastore = new ExcelDatastore(file.getName(), new FileResource(file), file.getAbsolutePath()); try (DatastoreConnection connection = datastore.openConnection()) { final String[] tableNames = connection.getDataContext().getDefaultSchema().getTableNames(); for (int i = 0; i < tableNames.length; i++) { if (tableNames[i].equals(sheetName)) { throw new IllegalStateException( "The sheet '" + sheetName + "' already exists. Please select another sheet name."); } } } } if (!FilenameUtils.isExtension(file.getName(), excelExtension)) { throw new IllegalStateException("Please add the '.xlsx' or '.xls' extension to the filename"); } } @Override public void configureForFilterOutcome(final AnalysisJobBuilder ajb, final FilterDescriptor<?, ?> descriptor, final String categoryName) { final String dsName = ajb.getDatastore().getName(); sheetName = fixSheetName(dsName + "-" + descriptor.getDisplayName() + "-" + categoryName); } @Override public void configureForTransformedData(final AnalysisJobBuilder ajb, final TransformerDescriptor<?> descriptor) { final String dsName = ajb.getDatastore().getName(); sheetName = fixSheetName(dsName + "-" + descriptor.getDisplayName()); } private String fixSheetName(String sheet) { for (final char c : SHEET_NAME_ILLEGAL_CHARS) { while (sheet.indexOf(c) != -1) { sheet = sheet.replace(c, '-'); } } if (sheet.length() > SHEET_NAME_MAX_LENGTH) { sheet = sheet.substring(0, SHEET_NAME_MAX_LENGTH); } return sheet; } @Override public OutputWriter createOutputWriter() { if (file.exists()) { final ExcelDatastore datastore = new ExcelDatastore(file.getName(), new FileResource(file), file.getAbsolutePath()); try (UpdateableDatastoreConnection connection = datastore.openConnection()) { final DataContext dataContext = connection.getDataContext(); final String[] tableNames = dataContext.getDefaultSchema().getTableNames(); for (int i = 0; i < tableNames.length; i++) { if (tableNames[i].equals(sheetName)) { if (overwriteSheetIfExists) { final Table tableSheet = dataContext.getTableByQualifiedLabel(sheetName); final UpdateableDataContext updateableDataContext = connection.getUpdateableDataContext(); updateableDataContext.executeUpdate(new DropTable(tableSheet)); } } } } } // If the user wants the file sorted after a column we create a // temporary file and return a CSV writer in order // to make a MergeSort on it, otherwise we return a normal Excel writer if (columnToBeSortedOn != null) { return createTemporaryCsvWriter(); } else { return ExcelOutputWriterFactory.getWriter(file.getPath(), sheetName, fields, columns); } } private OutputWriter createTemporaryCsvWriter() { final List<String> headers = new ArrayList<>(); for (int i = 0; i < columns.length; i++) { final String columnName = getColumnHeader(i); headers.add(columnName); if (columnToBeSortedOn != null) { if (columns[i].getName().equals(columnToBeSortedOn.getName())) { indexOfColumnToBeSortedOn = i; } } } if (indexOfColumnToBeSortedOn == -1) { this.isColumnToBeSortedOnPresentInInput = false; indexOfColumnToBeSortedOn = columns.length; headers.add(columnToBeSortedOn.getName()); final InputColumn<?>[] newColumns = new InputColumn<?>[columns.length + 1]; for (int i = 0; i < columns.length; i++) { newColumns[i] = columns[i]; } newColumns[columns.length] = columnToBeSortedOn; columns = newColumns; } if (_targetFile == null) { try { initTempFile(); } catch (final Exception e) { throw new RuntimeException(e); } } return CsvOutputWriterFactory .getWriter(_targetFile.getPath(), headers.toArray(new String[0]), separatorChar, quoteChar, escapeChar, includeHeader, columns); } private String getColumnHeader(final int index) { if (fields == null) { return columns[index].getName(); } return fields[index]; } @Override protected WriteDataResult getResultInternal(final int rowCount) { if (columnToBeSortedOn != null) { // Sorts the file using merge sort in the temporary file and then // write into the final file mergeSortFile(); } final FileResource resource = new FileResource(file); final Datastore datastore = new ExcelDatastore(file.getName(), resource, file.getAbsolutePath()); return new WriteDataResultImpl(rowCount, datastore, null, sheetName); } private void mergeSortFile() { final CsvConfiguration csvConfiguration = new CsvConfiguration(CsvConfiguration.DEFAULT_COLUMN_NAME_LINE, FileHelper.DEFAULT_ENCODING, separatorChar, quoteChar, escapeChar, false, true); final CsvDataContext tempDataContext = new CsvDataContext(_targetFile, csvConfiguration); final Table table = tempDataContext.getDefaultSchema().getTable(0); final Comparator<? super Row> comparator = SortHelper.createComparator(columnToBeSortedOn, indexOfColumnToBeSortedOn); final SortMergeWriter<Row, ExcelDataContextWriter> sortMergeWriter = new SortMergeWriter<Row, ExcelDataContextWriter>(comparator) { @Override protected ExcelDataContextWriter createWriter(final Resource resource) { assert resource instanceof FileResource; final FileResource fileResource = (FileResource) resource; return new ExcelDataContextWriter(fileResource.getFile(), sheetName); } @Override protected void writeHeader(final ExcelDataContextWriter writer) throws IOException { final List<String> headers = new ArrayList<>(Arrays.asList(table.getColumnNames())); if (!isColumnToBeSortedOnPresentInInput) { headers.remove(columnToBeSortedOn.getName()); } writer.createTable(headers); } @Override protected void writeRow(final ExcelDataContextWriter writer, final Row row, final int count) throws IOException { for (int i = 0; i < count; i++) { final List<Object> valuesList = new ArrayList<>(Arrays.asList(row.getValues())); if (!isColumnToBeSortedOnPresentInInput) { valuesList.remove(indexOfColumnToBeSortedOn); } final Object[] values = valuesList.toArray(new Object[0]); writer.insertValues(values); } } }; // read from the temp file and sort it into the final file try (DataSet dataSet = tempDataContext.query().from(table).selectAll().execute()) { while (dataSet.next()) { final Row row = dataSet.getRow(); sortMergeWriter.append(row); } } sortMergeWriter.write(file); } public void setFile(final File file) { this.file = file; } public void setSheetName(final String sheetName) { this.sheetName = sheetName; } }