/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.storage; import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.concurrent.ConcurrentHashMap; import org.datacleaner.api.InputRow; /** * Successor of {@link InMemoryRowAnnotationFactory}, our implementation of * {@link RowAnnotationFactory} that is based on in-memory storage of sample * records. A new class was added to allow deserialization of old DataCleaner * results, yet this class fully replaces the old one functionally. */ public final class InMemoryRowAnnotationFactory2 extends AbstractRowAnnotationFactory2 implements RowAnnotationFactory, Serializable { private static final long serialVersionUID = 1L; /** * Kind of a magic number, but a way to ensure that the ratio between sample * sets and records is kept under control, at least in default scenarios. */ private static final int DEFAULT_SAMPLE_LIMIT = 500 * 500; private final ConcurrentHashMap<RowAnnotation, List<InputRow>> _storage; private final int _maxSampleRecords; private final int _maxSampleSets; public InMemoryRowAnnotationFactory2() { this(500); } public InMemoryRowAnnotationFactory2(final int maxSampleRecords) { this(Math.min(10, DEFAULT_SAMPLE_LIMIT / maxSampleRecords), 500); } /** * * @param maxSampleSets * the maximum number of sample record collections to keep * @param maxSampleRecords * the maximum number of records to keep in each collection */ public InMemoryRowAnnotationFactory2(final int maxSampleSets, final int maxSampleRecords) { _storage = new ConcurrentHashMap<>(); _maxSampleSets = Math.max(0, maxSampleSets); _maxSampleRecords = Math.max(0, maxSampleRecords); } private void addInputRowsToCollection(final Collection<InputRow> rowCollection, final Collection<InputRow> rows) { if (rowCollection == null) { return; } int size = rowCollection.size(); if (size >= _maxSampleRecords) { return; } for (final InputRow inputRow : rows) { synchronized (rowCollection) { rowCollection.add(inputRow); size++; if (size >= _maxSampleRecords) { return; } } } } private Collection<InputRow> getInputRowCollection(final int defaultSize, final RowAnnotation annotation) { List<InputRow> rowCollection = _storage.get(annotation); if (rowCollection == null) { if (_storage.size() >= _maxSampleSets) { return null; } rowCollection = new ArrayList<>(defaultSize); final List<InputRow> existingCollection = _storage.putIfAbsent(annotation, rowCollection); if (existingCollection != null) { rowCollection = existingCollection; } } return rowCollection; } @Override public void annotate(final InputRow row, final RowAnnotation annotation) { super.annotate(row, annotation); final Collection<InputRow> rowCollection = getInputRowCollection(10, annotation); if (rowCollection != null) { synchronized (rowCollection) { if (rowCollection.size() >= _maxSampleRecords) { return; } rowCollection.add(row); } } } @Override public void resetAnnotation(final RowAnnotation annotation) { super.resetAnnotation(annotation); _storage.remove(annotation); } @Override public List<InputRow> getSampleRows(final RowAnnotation annotation) { final List<InputRow> collection = _storage.get(annotation); if (collection == null) { return Collections.emptyList(); } return Collections.unmodifiableList(collection); } @Override public void transferAnnotations(final RowAnnotation from, final RowAnnotation to) { super.transferAnnotations(from, to); final Collection<InputRow> fromCollection = _storage.get(from); if (fromCollection == null || fromCollection.isEmpty()) { return; } final Collection<InputRow> toCollection = getInputRowCollection(fromCollection.size(), to); addInputRowsToCollection(toCollection, fromCollection); _storage.remove(from); } @Override public boolean hasSampleRows(final RowAnnotation annotation) { return _storage.containsKey(annotation); } }