/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.storage;
import java.io.Serializable;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
import org.datacleaner.api.InputRow;
import org.datacleaner.util.CollectionUtils2;
import com.google.common.cache.Cache;
/**
 * An abstract {@link RowAnnotationFactory} that supports an (optional)
 * threshold on the number of rows that are stored per annotation.
 *
 * Every call to {@link #annotate(InputRow, RowAnnotation)} increments the row
 * count of the annotation, but the row values and the row-to-annotation link
 * are only handed to the subclass while the annotation is below the
 * threshold.
 *
 * @deprecated this abstract implementation was found to be way too greedy and
 *             dirty, see issue #506
 */
@Deprecated
public abstract class AbstractRowAnnotationFactory implements RowAnnotationFactory, Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * Number of rows stored so far, per annotation. Only used to enforce the
     * threshold.
     */
    private final Map<RowAnnotationImpl, AtomicInteger> _rowCounts = new ConcurrentHashMap<>();

    /**
     * Maximum number of rows to store per annotation. The constructor never
     * leaves this null, but null checks are retained because instances
     * deserialized from other versions of this class might carry a null
     * value (NOTE(review): confirm whether such streams exist).
     */
    private final Integer _storedRowsThreshold;

    /**
     * Remembers the IDs of rows whose values were recently stored, to avoid
     * storing the same values over and over. The field is transient, so it is
     * null on deserialized instances.
     */
    private final transient Cache<Integer, Boolean> _cachedRows = CollectionUtils2.createCache(10000, 10 * 60);

    /**
     * Constructs the factory.
     *
     * @param storedRowsThreshold
     *            the maximum number of rows to store per annotation, or null
     *            for (practically) no limit, in which case
     *            {@link Integer#MAX_VALUE} is used
     */
    public AbstractRowAnnotationFactory(final Integer storedRowsThreshold) {
        if (storedRowsThreshold == null) {
            _storedRowsThreshold = Integer.MAX_VALUE;
        } else {
            _storedRowsThreshold = storedRowsThreshold;
        }
    }

    /**
     * Annotates a row. The annotation's row count is always incremented; the
     * row itself is only stored while the annotation has not yet reached the
     * stored rows threshold.
     *
     * @param row
     *            the row to annotate
     * @param annotation
     *            the annotation to apply, must be a {@link RowAnnotationImpl}
     */
    @Override
    public final void annotate(final InputRow row, final RowAnnotation annotation) {
        final RowAnnotationImpl ann = (RowAnnotationImpl) annotation;
        final AtomicInteger count = getCounter(ann);

        if (claimStorageSlot(count)) {
            // TODO: In clustered scenarios, there's a chance of row ID
            // collision
            final int rowId = (int) row.getId();
            if (needsRowValues(rowId)) {
                storeRowValues(rowId, row);
            }
            storeRowAnnotation(rowId, annotation);
        }

        ann.incrementRowCount(1);
    }

    /**
     * Atomically increments the counter if, and only if, it is still below
     * the threshold.
     *
     * The counter saturates at the threshold instead of being incremented
     * unconditionally. An unconditional increment would eventually overflow
     * to a negative value, after which the threshold check would pass again
     * and rows would be stored beyond the configured limit.
     *
     * @param count
     *            the counter of the annotation being applied
     * @return true if the current row should be stored, false if the
     *         threshold has been reached
     */
    private boolean claimStorageSlot(final AtomicInteger count) {
        if (_storedRowsThreshold == null) {
            // no threshold available, store everything
            return true;
        }
        final int threshold = _storedRowsThreshold.intValue();
        while (true) {
            final int current = count.get();
            if (current >= threshold) {
                return false;
            }
            if (count.compareAndSet(current, current + 1)) {
                return true;
            }
            // lost a race with another thread, retry with the fresh value
        }
    }

    /**
     * Determines whether the values of a row need to be handed to
     * {@link #storeRowValues(int, InputRow)}.
     *
     * @param rowId
     *            the ID of the row
     * @return false if the row is known to have been stored recently, true
     *         otherwise
     */
    private boolean needsRowValues(final int rowId) {
        if (_cachedRows == null) {
            // The transient cache is absent on deserialized instances. Without
            // it we cannot tell whether the values were stored before, so
            // store them rather than leave an annotation pointing at a row
            // without values.
            return true;
        }
        // only store row values when they were not present previously
        final Boolean previously = _cachedRows.asMap().putIfAbsent(rowId, true);
        return previously == null;
    }

    /**
     * Gets the stored rows counter of an annotation, creating it if it does
     * not exist yet.
     *
     * @param ann
     *            the annotation to get the counter for
     * @return the counter, never null
     */
    private AtomicInteger getCounter(final RowAnnotationImpl ann) {
        AtomicInteger count = _rowCounts.get(ann);
        if (count == null) {
            if (_rowCounts instanceof ConcurrentMap) {
                final AtomicInteger newCounter = new AtomicInteger();
                final ConcurrentMap<RowAnnotationImpl, AtomicInteger> concurrentMap =
                        (ConcurrentMap<RowAnnotationImpl, AtomicInteger>) _rowCounts;
                // another thread may have registered a counter in the
                // meantime, in which case that one wins
                count = concurrentMap.putIfAbsent(ann, newCounter);
                if (count == null) {
                    count = newCounter;
                }
            } else {
                // for backwards compatibility we also need to support
                // (deserialized) hash maps
                synchronized (_rowCounts) {
                    count = _rowCounts.get(ann);
                    if (count == null) {
                        count = new AtomicInteger();
                        _rowCounts.put(ann, count);
                    }
                }
            }
        }
        return count;
    }

    /**
     * Resets an annotation: its row count is reset, its stored rows counter
     * is discarded and the subclass is asked to remove the annotation from
     * its rows.
     *
     * @param annotation
     *            the annotation to reset, must be a {@link RowAnnotationImpl}
     */
    @Override
    public final void resetAnnotation(final RowAnnotation annotation) {
        final RowAnnotationImpl ann = (RowAnnotationImpl) annotation;
        ann.resetRowCount();
        _rowCounts.remove(annotation);
        resetRows(annotation);
    }

    /**
     * Creates a new, empty annotation.
     *
     * @return a new {@link RowAnnotationImpl}
     */
    @Override
    public final RowAnnotation createAnnotation() {
        return new RowAnnotationImpl();
    }

    /**
     * Removes the annotation from any rows that have been annotated with it.
     *
     * @param annotation
     *            the annotation to remove
     */
    protected abstract void resetRows(RowAnnotation annotation);

    /**
     * Gets the distinct count from a row that has been stored and retrieved
     * using the getRows(...) method.
     *
     * @param row
     *            the row to get the distinct count of
     * @return the distinct count of the row
     */
    protected abstract int getDistinctCount(InputRow row);

    /**
     * Stores the link between a row and an annotation.
     *
     * @param rowId
     *            the ID of the annotated row
     * @param annotation
     *            the annotation that was applied to the row
     */
    protected abstract void storeRowAnnotation(int rowId, RowAnnotation annotation);

    /**
     * Stores the values of a row. This method may be invoked more than once
     * for the same row ID, since previously stored rows are only remembered
     * in a cache.
     *
     * @param rowId
     *            the ID of the row
     * @param row
     *            the row holding the values to store
     */
    protected abstract void storeRowValues(int rowId, InputRow row);

    /**
     * Gets the maximum number of rows that are stored per annotation.
     *
     * @return the stored rows threshold
     */
    public final Integer getStoredRowsThreshold() {
        return _storedRowsThreshold;
    }
}