/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Sep 3, 2008
*/
package com.bigdata.striterator;
import java.io.Serializable;
import java.util.Arrays;
import java.util.UUID;
import com.bigdata.bop.solutions.JVMDistinctBindingSetsOp;
import com.bigdata.btree.BTree;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.NOPTupleSerializer;
import com.bigdata.btree.keys.ASCIIKeyBuilderFactory;
import com.bigdata.btree.keys.KVO;
import com.bigdata.journal.IIndexManager;
import com.bigdata.journal.TemporaryStore;
import com.bigdata.relation.rule.IRule;
import com.bigdata.relation.rule.eval.ISolution;
import com.bigdata.util.BytesUtil;
/**
* A filter that imposes a DISTINCT constraint on the {@link ISolution}s
* generated by an {@link IRule}. The filter is optimized if only a single
* chunk is visited by the source iterator. Otherwise, the filter is implemented
* using a {@link BTree} backed by a {@link TemporaryStore}.
* <p>
* When more than one chunk is processed, {@link ISolution}s are transformed
* into unsigned byte[] keys. The {@link BTree} is tested for each such key. If
* the key is NOT found, then it is inserted into the {@link BTree} and the
* solution is passed by the filter. Otherwise the solution is rejected by the
* filter. The backing {@link BTree} is closed when the filter is finalized, but
* it will hold a hard reference to the {@link TemporaryStore} until then.
* Solutions are processed in chunks for efficient ordered reads and writes on
* the {@link BTree}.
*
* @todo A statistical distinct filter can be implemented using a bloom filter
* INSTEAD of a {@link BTree} but the bloom filter parameters MUST be
* chosen so as to make the possibility of a false positive sufficiently
* unlikely to satisfy the application criteria. However, such a filter
* will always have a non-zero chance of incorrectly rejecting a solution
* when that solution has NOT been seen by the filter. Since the bloom
* filter can under-generate, it could only be applied in very specialized
* circumstances, e.g., it might be OK for text search.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*
* @deprecated by {@link JVMDistinctBindingSetsOp}
*/
abstract public class DistinctFilter<E> implements IChunkConverter<E, E> {

    /**
     * Used to lazily obtain a {@link TemporaryStore}.
     * <p>
     * Note: NOT {@link Serializable}.
     */
    private final IIndexManager indexManager;

    /**
     * Lazily created if more than one chunk will be visited.
     * <p>
     * Note: NOT {@link Serializable}.
     */
    private BTree btree = null;

    /**
     * Create a filter that has not yet observed any element.
     *
     * @param indexManager
     *            Used to lazily obtain a {@link TemporaryStore}.
     *
     * @throws IllegalArgumentException
     *             if <i>indexManager</i> is <code>null</code>.
     */
    public DistinctFilter(final IIndexManager indexManager) {

        if (indexManager == null)
            throw new IllegalArgumentException();

        this.indexManager = indexManager;

    }

    /**
     * Read the next chunk from the source iterator and return only those
     * elements whose sort key (see {@link #getSortKey(Object)}) has not been
     * observed before, either earlier in the same chunk or in a chunk
     * processed by a previous invocation that recorded its keys in the
     * {@link BTree}.
     * <p>
     * The elements of the returned chunk are ordered by their sort keys, not
     * by their position in the source chunk. The returned array has the same
     * component type as the chunk delivered by the source iterator and is
     * dense (its length is the number of elements that passed the filter).
     * An empty source chunk yields an empty result and does not cause the
     * {@link BTree} to be created.
     * <p>
     * NOTE(review): the lazily created {@link BTree} field is not guarded by
     * any lock in this class; presumably a filter instance is driven by a
     * single thread. Confirm against the callers.
     *
     * @param src
     *            The source iterator.
     *
     * @return The distinct elements of the chunk which was read.
     *
     * @throws IllegalArgumentException
     *             if <i>src</i> is <code>null</code>.
     */
    public E[] convert(final IChunkedOrderedIterator<E> src) {

        if (src == null)
            throw new IllegalArgumentException();

        // read a chunk from the source iterator.
        final E[] chunk = src.nextChunk();

        // true iff there is nothing more available from the source itr.
        final boolean exhausted = !src.hasNext();

        final int n = chunk.length;

        if (n == 0) {
            /*
             * Nothing to filter. Handling this up front keeps the single
             * chunk fast path from indexing a[0] on an empty array and avoids
             * allocating a B+Tree for a chunk which contributes no keys.
             */
            return newChunk(chunk, 0);
        }

        // Pair each element with its sort key.
        @SuppressWarnings("unchecked") // generic array creation.
        final KVO<E>[] a = new KVO[n];

        for (int i = 0; i < n; i++) {

            final E e = chunk[i];

            a[i] = new KVO<E>(getSortKey(e), null/* val */, e);

        }

        // Put into sorted order by the generated sort keys.
        Arrays.sort(a);

        if (btree == null && exhausted) {

            /*
             * Special case when we have not yet created a BTree to hold the
             * distinct keys and the source iterator will not visit any more
             * elements. For this case we can just compare each element in
             * sorted order with the next element in sorted order and drop
             * duplicates.
             */
            return distinctWithinChunk(chunk, a);

        }

        /*
         * General case.
         */
        if (btree == null) {

            /*
             * Create the B+Tree on which we will write the distinct keys.
             */
            final IndexMetadata metadata = new IndexMetadata(UUID.randomUUID());

            /*
             * The key builder is dead simple since keys are byte[]s and values
             * are not used.
             */
            metadata.setTupleSerializer(new NOPTupleSerializer(
                    new ASCIIKeyBuilderFactory(0/* initialCapacity */)));

            // create the B+Tree.
            btree = BTree.create(indexManager.getTempStore(), metadata);

        }

        return distinctAgainstIndex(chunk, a);

    }

    /**
     * Drop duplicates from a sorted array of (key, element) pairs by comparing
     * each key with its predecessor. Nothing is written on the {@link BTree}.
     *
     * @param chunk
     *            The source chunk (used only for its component type).
     * @param a
     *            The non-empty array of pairs, sorted by key.
     *
     * @return A dense array of the elements having distinct keys.
     */
    private E[] distinctWithinChunk(final E[] chunk, final KVO<E>[] a) {

        final int n = a.length;

        // large enough if everything is distinct.
        final E[] tmp = newChunk(chunk, n);

        int j = 0;

        // always emit the first element.
        tmp[j++] = a[0].obj;

        for (int i = 1; i < n; i++) {

            if (!BytesUtil.bytesEqual(a[i - 1].key, a[i].key)) {

                tmp[j++] = a[i].obj;

            }

        }

        // make it dense (copyOf preserves the runtime array class).
        return j == n ? tmp : Arrays.copyOf(tmp, j);

    }

    /**
     * Test each key against the {@link BTree}. If the key is NOT found then it
     * is inserted and the paired element is added to the output chunk. If the
     * key is found then the element is skipped since it has already been
     * observed. Because <i>a</i> is sorted by key, the reads and writes on the
     * {@link BTree} are ordered.
     *
     * @param chunk
     *            The source chunk (used only for its component type).
     * @param a
     *            The array of pairs, sorted by key.
     *
     * @return A dense array of the elements whose keys were newly inserted.
     */
    private E[] distinctAgainstIndex(final E[] chunk, final KVO<E>[] a) {

        final int n = a.length;

        // large enough if everything is distinct.
        final E[] tmp = newChunk(chunk, n);

        int j = 0;

        for (int i = 0; i < n; i++) {

            if (!btree.contains(a[i].key)) {

                btree.insert(a[i].key, null/* val */);

                tmp[j++] = a[i].obj;

            }

        }

        // make it dense (copyOf preserves the runtime array class).
        return j == n ? tmp : Arrays.copyOf(tmp, j);

    }

    /**
     * Allocate an array having the same component type as the given template.
     *
     * @param template
     *            The array whose component type is used.
     * @param length
     *            The length of the new array.
     *
     * @return The new array.
     */
    @SuppressWarnings("unchecked") // component type is taken from a T[].
    private static <T> T[] newChunk(final T[] template, final int length) {

        return (T[]) java.lang.reflect.Array.newInstance(
                template.getClass().getComponentType(), length);

    }

    /**
     * Return an unsigned byte[] key that is a representation of the visited
     * element. Elements are judged for distinctness in terms of the generated
     * sort key.
     *
     * @param e
     *            The visited element.
     *
     * @return The unsigned byte[] key.
     */
    abstract protected byte[] getSortKey(E e);

}