package com.bigdata.rdf.spo;
import java.util.LinkedHashSet;
import java.util.NoSuchElementException;
import java.util.Set;
import com.bigdata.BigdataStatics;
import com.bigdata.btree.BTree;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.store.IRawTripleStore;
import com.bigdata.util.Bytes;
import cutthecrap.utils.striterators.ICloseableIterator;
/**
* Iterator using a {@link BTree} to filter out duplicate (s,p,o) tuples.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id: DistinctSPOIterator.java 3472 2010-08-31 16:21:47Z thompsonbry
* $
*
* @see SPORelation#distinctSPOIterator(ICloseableIterator)
*
* @deprecated By a simple distinct filter and a filter which strips off the
* context position from an SPOC. Stack them together and it does
* the same thing. (The fall back to the B+Tree might still be
* interesting if we do not have a persistent hash map to fall back
* on instead and we want streaming results. Otherwise use an
* external merge sort.)
*/
public class DistinctSPOIterator implements ICloseableIterator<ISPO> {
/**
* The backing relation, which is only used to obtain the {@link BTree}
* instance in {@link #overflowToBTree(Set)}.
*/
private SPORelation spoRelation;
/**
* The source iterator.
*/
private ICloseableIterator<ISPO> src;
/**
* Hash set is allocated when the first {@link ISPO} is visited and is used
* until the {@link #MAX_HASH_SET_CAPACITY} is reached, at which point the
* {@link #btreeSet} is allocated.
*/
private Set<ISPO> hashSet;
/**
* B+Tree is used once the {@link #MAX_HASH_SET_CAPACITY} is reached. The
* B+Tree is slowed than the {@link #hashSet}, but can spill onto the disk
* and is appropriate for very large distinct sets.
*/
private BTree btreeSet;
/**
* Buffer reused for each (s,p,o) key. The buffer is allocated to the exact
* size when the {@link #btreeSet} is allocated.
*/
private KeyBuilder keyBuilder;
/**
* The next element to be visited or <code>null</code> if we need to scan
* ahead.
*/
private ISPO next = null;
/**
* <code>true</code> iff the iterator has been proven to be exhausted.
*/
private boolean exhausted = false;
/**
* <code>true</code> iff the iterator has been {@link #close()}ed.
*/
private boolean closed = false;
/**
* The #of distinct {@link ISPO}s read from the {@link #src} iterator so
* far.
*/
private int ndistinct = 0;
/**
* The #of {@link ISPO}s read from the {@link #src} iterator.
*/
private int nscanned = 0;
/**
* After this many entries we create the {@link #btreeSet} which can spill
* out onto the disk.
*
* @todo configuration parameter (via the constructor). Low memory JVMs
* might want to use a smaller threshold, but the hash set is much
* faster (10x or better). Large memory JVMs might want to use an even
* larger threshold. [I've set this to MAX_VALUE since the BTree is a
* huge performance hit.]
*/
static final int MAX_HASH_SET_CAPACITY = Integer.MAX_VALUE;//100000;
/**
*
* @param src
* The source iterator.
*/
public DistinctSPOIterator(final SPORelation spoRelation,
final ICloseableIterator<ISPO> src) {
if (spoRelation == null)
throw new IllegalArgumentException();
if (src == null)
throw new IllegalArgumentException();
this.spoRelation = spoRelation;
this.src = src;
}
public void close() {
if (closed)
return;
closed = true;
/*
* Close the source iterator.
*/
src.close();
/*
* Close the btree. This will discard all of its buffers.
*/
if (btreeSet != null) {
btreeSet.close();
}
spoRelation = null;
src = null;
hashSet = null;
btreeSet = null;
keyBuilder = null;
}
/**
* Returns immediately if there is an element waiting. Otherwise, scans
* ahead until it finds an element which has not already been visited. It
* then add the element to the set of elements already seen and saves a
* reference to that element to be returned by {@link #next()}.
*/
public boolean hasNext() {
if (exhausted || closed)
return false;
if (next != null)
return true;
if (hashSet == null) {
/*
* Allocate hash set.
*
* Note: using a linked hash set for faster iterator if we have to
* convert to a B+Tree.
*
* Note: the initial capacity is the default since most access paths
* have low cardinality.
*
* @todo if the caller knows the range count (upper bound) then we
* could plan the hash set capacity more accurately.
*
* @todo defer hashSet creation until 2nd distinct SPO shows up to
* reduce the object allocation.
*/
hashSet = new LinkedHashSet<ISPO>();
} else if (btreeSet == null && ndistinct >= MAX_HASH_SET_CAPACITY) {
/*
* Open B+Tree. We will not put anything new into the hashSet,
* but we will continue to test first against the hashSet and
* then against the B+Tree. New distinct ISPOs are inserted into
* the B+Tree.
*/
if (BigdataStatics.debug)
System.err.println("Distinct SPO iterator overflow");
// allocate buffer.
keyBuilder = new KeyBuilder(3 * Bytes.SIZEOF_LONG);
// allocate B+Tree w/ bloom filter.
btreeSet = spoRelation.getSPOOnlyBTree(true/* bloomFilter */);
}
// scan for the next distinct ISPO from the src iterator.
return _hasNext();
}
/**
* Scan for the next distinct {@link ISPO} from the src iterator and set it
* on {@link #next}.
*
* @return <code>true</code> if another distinct {@link ISPO} was found.
*/
private boolean _hasNext() {
while (next == null && src.hasNext()) {
/*
* Read another ISPO from the iterator and strip off the context
* position.
*
* Note: distinct is enforced on (s,p,o). By stripping off the
* context and statement type information first, we ensure that
* (s,p,o) duplicates will be recognized as such.
*
* Note: this approach requires us to discard the statement type
* metadata.
*/
// read next from the source iterator.
ISPO tmp = src.next(); nscanned++;
// strip off the context (and statement type).
tmp = new SPO(tmp.s(), tmp.p(), tmp.o(), (IV) null/* c */);
if (btreeSet == null) {
// Insert into the hash set.
if (!hashSet.add(tmp)) {
// duplicate, keep scanning.
continue;
}
} else {
// First, test the hash set.
if (hashSet.contains(tmp)) {
// duplicate, keep scanning.
continue;
}
// Next, test the B+Tree.
final byte[] key = SPOKeyOrder.SPO.encodeKey(keyBuilder, tmp);
if (btreeSet.contains(key)) {
// duplicate, keep scanning.
continue;
}
// Finally, insert into the B+Tree.
btreeSet.insert(key, null);
}
// found a new distinct spo.
next = tmp;
ndistinct++;
} // while(...)
if (next == null) {
exhausted = true;
return false;
}
return true;
}
public ISPO next() {
if (!hasNext())
throw new NoSuchElementException();
assert next != null;
final ISPO tmp = next;
next = null;
return tmp;
}
public void remove() {
throw new UnsupportedOperationException();
}
}