/**
* AnalyzerBeans
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.eobjects.analyzer.beans.uniqueness;
import java.io.File;
import java.io.IOException;
import java.io.Writer;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.metamodel.csv.CsvConfiguration;
import org.apache.metamodel.csv.CsvDataContext;
import org.apache.metamodel.csv.CsvWriter;
import org.apache.metamodel.data.DataSet;
import org.apache.metamodel.util.Action;
import org.apache.metamodel.util.FileHelper;
import org.apache.metamodel.util.ToStringComparator;
import org.eobjects.analyzer.beans.api.Analyzer;
import org.eobjects.analyzer.beans.api.AnalyzerBean;
import org.eobjects.analyzer.beans.api.Concurrent;
import org.eobjects.analyzer.beans.api.Configured;
import org.eobjects.analyzer.beans.api.Description;
import org.eobjects.analyzer.beans.api.Initialize;
import org.eobjects.analyzer.data.InputColumn;
import org.eobjects.analyzer.data.InputRow;
import org.eobjects.analyzer.util.WriteBuffer;
import org.eobjects.analyzer.util.sort.SortMergeWriter;
/**
 * Analyzer that checks the values of a single column for uniqueness.
 *
 * <p>
 * Every non-null value is converted to a string and handed, through a
 * {@link WriteBuffer}, to a {@link SortMergeWriter}. When the result is
 * requested the sorter writes a CSV file containing only the values that
 * occurred more than once, together with their counts; that file is read back
 * to compute the totals and a bounded sample of non-unique values.
 * </p>
 */
@AnalyzerBean("Unique key check")
@Description("Check your keys (or other fields) for uniqueness")
@Concurrent(true)
public class UniqueKeyCheckAnalyzer implements Analyzer<UniqueKeyCheckAnalyzerResult> {

    /** Buffer size used by the no-arg constructor. */
    private static final int BUFFER_SIZE = 20000;

    /** Maximum number of non-unique values kept as samples in the result. */
    private static final int MAX_SAMPLES = 1000;

    /** CSV settings shared by the writer and the reader of the intermediate file. */
    private static final CsvConfiguration CSV_CONFIGURATION = new CsvConfiguration();

    /** The column whose values are checked. */
    @Configured
    InputColumn<?> column;

    private final int _bufferSize;
    private WriteBuffer _writeBuffer;
    private SortMergeWriter<String, Writer> _sorter;
    private AtomicInteger _rowCount;
    private AtomicInteger _nullCount;

    /**
     * Creates an analyzer using the default buffer size.
     */
    public UniqueKeyCheckAnalyzer() {
        this(BUFFER_SIZE);
    }

    /**
     * Creates an analyzer with a specific buffer size.
     *
     * @param bufferSize
     *            the size passed to both the {@link WriteBuffer} and the
     *            {@link SortMergeWriter} created in {@link #init()}
     */
    public UniqueKeyCheckAnalyzer(int bufferSize) {
        _bufferSize = bufferSize;
    }

    /**
     * Resets the counters and creates the sorter and write buffer.
     */
    @Initialize
    public void init() {
        _rowCount = new AtomicInteger();
        _nullCount = new AtomicInteger();
        _sorter = new SortMergeWriter<String, Writer>(_bufferSize, ToStringComparator.getComparator()) {

            private final CsvWriter csvWriter = new CsvWriter(CSV_CONFIGURATION);

            @Override
            protected void writeHeader(Writer writer) throws IOException {
                writeLine(writer, csvWriter.buildLine(new String[] { "text", "count" }));
            }

            @Override
            protected void writeRow(Writer writer, String row, int count) throws IOException {
                // only values that occur more than once are of interest
                if (count > 1) {
                    writeLine(writer, csvWriter.buildLine(new String[] { row, Integer.toString(count) }));
                }
            }

            @Override
            protected Writer createWriter(File file) {
                return FileHelper.getBufferedWriter(file);
            }
        };
        _writeBuffer = new WriteBuffer(_bufferSize, new Action<Iterable<Object[]>>() {
            @Override
            public void run(Iterable<Object[]> rows) throws Exception {
                // each buffered row is a single-element array holding the
                // string value (see run(InputRow, int))
                for (Object[] objects : rows) {
                    final String string = (String) objects[0];
                    _sorter.append(string);
                }
            }
        });
    }

    /**
     * Registers the value of the configured column for a row.
     *
     * @param row
     *            the row to read the value from
     * @param distinctCount
     *            the number of occurrences this row counts for; the row count
     *            is increased by this amount and a non-null value is buffered
     *            this many times
     */
    @Override
    public void run(InputRow row, int distinctCount) {
        final Object value = row.getValue(column);

        _rowCount.addAndGet(distinctCount);
        if (value == null) {
            _nullCount.addAndGet(distinctCount);
        } else {
            final String str = value.toString();
            for (int i = 0; i < distinctCount; i++) {
                _writeBuffer.addToBuffer(new Object[] { str });
            }
        }
    }

    /**
     * Flushes pending values, lets the sorter write the non-unique values to a
     * temporary CSV file and reads that file back to build the result. The
     * temporary file is removed again before returning.
     *
     * @return the result holding row count, unique count, non-unique count,
     *         null count and up to {@value #MAX_SAMPLES} sample values with
     *         their counts
     */
    @Override
    public UniqueKeyCheckAnalyzerResult getResult() {
        _writeBuffer.flushBuffer();

        final File file = createResultFile();
        final Map<String, Integer> samples = new LinkedHashMap<String, Integer>();
        int nonUniqueCount = 0;

        try {
            _sorter.write(file);

            final CsvDataContext dataContext = new CsvDataContext(file, CSV_CONFIGURATION);
            try (final DataSet dataSet = dataContext.query().from(dataContext.getDefaultSchema().getTable(0))
                    .select("text", "count").execute()) {
                int i = 0;
                while (dataSet.next()) {
                    final String text = (String) dataSet.getRow().getValue(0);
                    final String countStr = (String) dataSet.getRow().getValue(1);
                    final int count = Integer.parseInt(countStr);
                    if (i < MAX_SAMPLES) {
                        // only build up to MAX_SAMPLES records in the sample
                        samples.put(text, count);
                    }
                    nonUniqueCount += count;
                    i++;
                }
            }
        } finally {
            // the file is only an intermediate artifact; do not leave it
            // behind in the temp directory
            if (file.exists() && !file.delete()) {
                file.deleteOnExit();
            }
        }

        final int rowCount = _rowCount.get();
        final int nullCount = _nullCount.get();
        final int uniqueCount = rowCount - nullCount - nonUniqueCount;

        return new UniqueKeyCheckAnalyzerResult(rowCount, uniqueCount, nonUniqueCount, nullCount, samples);
    }

    /**
     * Picks the file that the sorter writes its output to.
     *
     * @return a new temp file, or a timestamp-named file in
     *         {@link FileHelper#getTempDir()} if the temp file could not be
     *         created
     */
    private static File createResultFile() {
        try {
            return File.createTempFile("UniqueKeyCheckAnalyzer", ".txt");
        } catch (IOException | SecurityException e) {
            // deliberate best-effort fallback: use an explicitly named file
            final File tempDir = FileHelper.getTempDir();
            return new File(tempDir, "UniqueKeyCheckAnalyzer-" + System.currentTimeMillis() + ".txt");
        }
    }

    /**
     * Writes a CSV line followed by exactly one line terminator.
     *
     * NOTE(review): whether CsvWriter.buildLine already terminates the line is
     * not visible from this file, so the terminator is only added when
     * missing. This keeps header and data rows consistent either way.
     */
    private static void writeLine(Writer writer, String line) throws IOException {
        writer.write(line);
        if (!line.endsWith("\n")) {
            writer.write('\n');
        }
    }
}