/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.beans.uniqueness;

import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import javax.inject.Named;

import org.apache.metamodel.csv.CsvConfiguration;
import org.apache.metamodel.csv.CsvDataContext;
import org.apache.metamodel.csv.CsvWriter;
import org.apache.metamodel.data.DataSet;
import org.apache.metamodel.util.FileHelper;
import org.apache.metamodel.util.Resource;
import org.apache.metamodel.util.ToStringComparator;
import org.datacleaner.api.Analyzer;
import org.datacleaner.api.Concurrent;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.util.WriteBuffer;
import org.datacleaner.util.sort.SortMergeWriter;

@Named("Unique key check")
@Description("Check your keys (or other fields) for uniqueness")
@Concurrent(true)
public class UniqueKeyCheckAnalyzer implements Analyzer<UniqueKeyCheckAnalyzerResult> {

    private static final CsvConfiguration CSV_CONFIGURATION = new CsvConfiguration();

    @Configured
    InputColumn<?> column;

    @Configured
    @Description("How many values to buffer before writing them to disk. For high-volume data, "
            + "consider increasing the buffer to minimize the number of open disk handles.")
    int _bufferSize = 20000;

    private WriteBuffer _writeBuffer;
    private SortMergeWriter<String, Writer> _sorter;
    private AtomicInteger _rowCount;
    private AtomicInteger _nullCount;

    public UniqueKeyCheckAnalyzer() {
    }

    public UniqueKeyCheckAnalyzer(final int bufferSize) {
        _bufferSize = bufferSize;
    }

    @Initialize
    public void init() {
        _rowCount = new AtomicInteger();
        _nullCount = new AtomicInteger();

        // Sort-merge writer that buffers values in memory, spills sorted runs to
        // disk and merges them. Only values occurring more than once end up in
        // the resulting CSV file.
        _sorter = new SortMergeWriter<String, Writer>(_bufferSize, ToStringComparator.getComparator()) {

            private final CsvWriter csvWriter = new CsvWriter(CSV_CONFIGURATION);

            @Override
            protected void writeHeader(final Writer writer) throws IOException {
                final String line = csvWriter.buildLine(new String[] { "text", "count" });
                writer.write(line);
            }

            @Override
            protected void writeRow(final Writer writer, final String row, final int count) throws IOException {
                if (count > 1) {
                    // only non-unique values are emitted
                    final String line = csvWriter.buildLine(new String[] { row, "" + count });
                    writer.write(line);
                    writer.write('\n');
                }
            }

            @Override
            protected Writer createWriter(final Resource file) {
                return FileHelper.getWriter(file.write(), FileHelper.DEFAULT_ENCODING);
            }
        };

        _writeBuffer = new WriteBuffer(_bufferSize, rows -> {
            for (final Object[] objects : rows) {
                final String string = (String) objects[0];
                _sorter.append(string);
            }
        });
    }

    @Override
    public void run(final InputRow row, final int distinctCount) {
        final Object value = row.getValue(column);
        _rowCount.addAndGet(distinctCount);
        if (value == null) {
            // nulls are counted separately and never written to the buffer
            _nullCount.addAndGet(distinctCount);
        } else {
            final String str = value.toString();
            for (int i = 0; i < distinctCount; i++) {
                _writeBuffer.addToBuffer(new Object[] { str });
            }
        }
    }

    @Override
    public UniqueKeyCheckAnalyzerResult getResult() {
        // flush any remaining buffered values and write the merged, sorted
        // counts of the non-unique values to a temporary CSV file
        _writeBuffer.flushBuffer();

        File file;
        try {
            file = File.createTempFile("UniqueKeyCheckAnalyzer", ".txt");
        } catch (final Exception e) {
            final File tempDir = FileHelper.getTempDir();
            file = new File(tempDir, "UniqueKeyCheckAnalyzer-" + System.currentTimeMillis() + ".txt");
        }
        _sorter.write(file);

        final AtomicInteger nonUniques = new AtomicInteger();
        final Map<String, Integer> samples = new LinkedHashMap<>();

        // read the non-unique values back, sampling at most the first 1000
        final CsvDataContext dataContext = new CsvDataContext(file, CSV_CONFIGURATION);
        try (DataSet dataSet = dataContext.query().from(dataContext.getDefaultSchema().getTable(0))
                .select("text", "count").execute()) {
            int i = 0;
            while (dataSet.next()) {
                final String text = (String) dataSet.getRow().getValue(0);
                final String countStr = (String) dataSet.getRow().getValue(1);
                final int count = Integer.parseInt(countStr);
                if (i < 1000) {
                    // only build up to 1000 records in the sample
                    samples.put(text, count);
                }
                nonUniques.addAndGet(count);
                i++;
            }
        }

        final int nonUniqueCount = nonUniques.get();
        final int rowCount = _rowCount.get();
        final int nullCount = _nullCount.get();
        final int uniqueCount = rowCount - nullCount - nonUniqueCount;

        return new UniqueKeyCheckAnalyzerResult(rowCount, uniqueCount, nonUniqueCount, nullCount, samples);
    }
}
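
// Worked example (illustrative only; the values A, B, C are hypothetical input
// data, not part of the analyzer): for a column containing [A, B, B, C, null],
// run(...) yields rowCount = 5 and nullCount = 1, and the four non-null values
// are appended to the sort-merge writer. In getResult(), only B occurs more
// than once, so the temp CSV holds a single data row (text = B, count = 2).
// Hence nonUniqueCount = 2 and uniqueCount = rowCount - nullCount -
// nonUniqueCount = 5 - 1 - 2 = 2 (the values A and C).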