/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.datacleaner.beans.uniqueness;

import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import javax.inject.Named;

import org.apache.metamodel.csv.CsvConfiguration;
import org.apache.metamodel.csv.CsvDataContext;
import org.apache.metamodel.csv.CsvWriter;
import org.apache.metamodel.data.DataSet;
import org.apache.metamodel.util.FileHelper;
import org.apache.metamodel.util.Resource;
import org.apache.metamodel.util.ToStringComparator;
import org.datacleaner.api.Analyzer;
import org.datacleaner.api.Concurrent;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.util.WriteBuffer;
import org.datacleaner.util.sort.SortMergeWriter;

@Named("Unique key check")
@Description("Check your keys (or other fields) for uniqueness")
@Concurrent(true)
public class UniqueKeyCheckAnalyzer implements Analyzer<UniqueKeyCheckAnalyzerResult> {

    private static final CsvConfiguration CSV_CONFIGURATION = new CsvConfiguration();

    @Configured
    InputColumn<?> column;

    @Configured
    @Description("How many values to buffer before writing them to disk. For high-volume data, "
            + "consider increasing the buffer to minimize the number of open disk handles.")
    int _bufferSize = 20000;

    private WriteBuffer _writeBuffer;
    private SortMergeWriter<String, Writer> _sorter;
    private AtomicInteger _rowCount;
    private AtomicInteger _nullCount;

    public UniqueKeyCheckAnalyzer() {
    }

    public UniqueKeyCheckAnalyzer(final int bufferSize) {
        _bufferSize = bufferSize;
    }

    @Initialize
    public void init() {
        _rowCount = new AtomicInteger();
        _nullCount = new AtomicInteger();

        // Sort-merge writer that buffers values in memory, spills sorted runs to
        // disk and merges them. Only values occurring more than once end up in
        // the resulting CSV file.
        _sorter = new SortMergeWriter<String, Writer>(_bufferSize, ToStringComparator.getComparator()) {

            private final CsvWriter csvWriter = new CsvWriter(CSV_CONFIGURATION);

            @Override
            protected void writeHeader(final Writer writer) throws IOException {
                final String line = csvWriter.buildLine(new String[] { "text", "count" });
                writer.write(line);
            }

            @Override
            protected void writeRow(final Writer writer, final String row, final int count) throws IOException {
                if (count > 1) {
                    // only non-unique values are emitted
                    final String line = csvWriter.buildLine(new String[] { row, "" + count });
                    writer.write(line);
                    writer.write('\n');
                }
            }

            @Override
            protected Writer createWriter(final Resource file) {
                return FileHelper.getWriter(file.write(), FileHelper.DEFAULT_ENCODING);
            }
        };

        _writeBuffer = new WriteBuffer(_bufferSize, rows -> {
            for (final Object[] objects : rows) {
                final String string = (String) objects[0];
                _sorter.append(string);
            }
        });
    }

    @Override
    public void run(final InputRow row, final int distinctCount) {
        final Object value = row.getValue(column);
        _rowCount.addAndGet(distinctCount);
        if (value == null) {
            // nulls are counted separately and never written to the buffer
            _nullCount.addAndGet(distinctCount);
        } else {
            final String str = value.toString();
            for (int i = 0; i < distinctCount; i++) {
                _writeBuffer.addToBuffer(new Object[] { str });
            }
        }
    }

    @Override
    public UniqueKeyCheckAnalyzerResult getResult() {
        // flush any remaining buffered values and write the merged, sorted
        // counts of the non-unique values to a temporary CSV file
        _writeBuffer.flushBuffer();

        File file;
        try {
            file = File.createTempFile("UniqueKeyCheckAnalyzer", ".txt");
        } catch (final Exception e) {
            final File tempDir = FileHelper.getTempDir();
            file = new File(tempDir, "UniqueKeyCheckAnalyzer-" + System.currentTimeMillis() + ".txt");
        }
        _sorter.write(file);

        final AtomicInteger nonUniques = new AtomicInteger();
        final Map<String, Integer> samples = new LinkedHashMap<>();

        // read the non-unique values back, sampling at most the first 1000
        final CsvDataContext dataContext = new CsvDataContext(file, CSV_CONFIGURATION);
        try (DataSet dataSet = dataContext.query().from(dataContext.getDefaultSchema().getTable(0))
                .select("text", "count").execute()) {
            int i = 0;
            while (dataSet.next()) {
                final String text = (String) dataSet.getRow().getValue(0);
                final String countStr = (String) dataSet.getRow().getValue(1);
                final int count = Integer.parseInt(countStr);
                if (i < 1000) {
                    // only build up to 1000 records in the sample
                    samples.put(text, count);
                }
                nonUniques.addAndGet(count);
                i++;
            }
        }

        final int nonUniqueCount = nonUniques.get();
        final int rowCount = _rowCount.get();
        final int nullCount = _nullCount.get();
        final int uniqueCount = rowCount - nullCount - nonUniqueCount;

        return new UniqueKeyCheckAnalyzerResult(rowCount, uniqueCount, nonUniqueCount, nullCount, samples);
    }
}
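
// Worked example (illustrative only; the values A, B, C are hypothetical input
// data, not part of the analyzer): for a column containing [A, B, B, C, null],
// run(...) yields rowCount = 5 and nullCount = 1, and the four non-null values
// are appended to the sort-merge writer. In getResult(), only B occurs more
// than once, so the temp CSV holds a single data row (text = B, count = 2).
// Hence nonUniqueCount = 2 and uniqueCount = rowCount - nullCount -
// nonUniqueCount = 5 - 1 - 2 = 2 (the values A and C).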