package org.basex.util.ft;
import java.util.BitSet;
import java.util.NoSuchElementException;
import org.basex.query.QueryException;
import org.basex.query.ft.FTTokens;
import org.basex.util.list.TokenList;
/**
* Generalized search algorithm based on the Bitap string matching algorithm.
* The implementation is based on the one described in Wikipedia, but uses
* {@link BitSet} for fast bit operations. This version works with a set of
* needles, any one of which can be matched in the haystack.
*
* @author BaseX Team 2005-12, BSD License
* @author Dimitar Popov
* @see <a href="http://en.wikipedia.org/wiki/Bitap_algorithm"
* >http://en.wikipedia.org/wiki/Bitap_algorithm</a>
*/
public final class FTBitapSearch {
  /** Iterator over the set of elements being searched. */
  private final FTIterator haystack;
  /** Subset of elements being searched for. */
  private final FTTokens needles;
  /** Comparator used for comparing two elements for equality. */
  private final TokenComparator cmp;
  /**
   * Bit masks, one per non-empty needle, stored in the same order as
   * {@link #sorted}: {@code masks[i]} belongs to the needle with index
   * {@code sorted[i]}. Bit 0 is always set; bit {@code k} is set if the first
   * {@code k} elements of the needle equal the last {@code k} elements read
   * from {@link #haystack}.
   */
  private final BitSet[] masks;
  /**
   * Indexes of the non-empty needles in {@link #needles}, sorted by the length
   * of the needle (longest first). Only the first {@code masks.length} entries
   * are used.
   */
  private final int[] sorted;
  /** Is the method {@link #hasNext} already called? */
  private boolean next;
  /**
   * Number of elements consumed from {@link #haystack} so far; set to
   * {@code -1} once the haystack is exhausted without a further match.
   */
  private int pos;
  /** Index (in {@link #needles}) of the needle which is matched. */
  private int match;

  /**
   * Constructor.
   * @param h iterator over the set of elements being searched ("haystack")
   * @param n a list of "needles" (a needle is an array of elements being
   *   searched for); {@code null} or empty needles are ignored
   * @param c comparator for comparing two elements for equality
   */
  public FTBitapSearch(final FTIterator h, final FTTokens n,
      final TokenComparator c) {
    haystack = h;
    cmp = c;
    needles = n;

    sorted = new int[n.size()];
    // collect the indexes of all non-empty needles:
    int count = 0;
    for(int i = 0; i < sorted.length; i++) {
      final TokenList needle = n.get(i);
      if(needle != null && needle.size() > 0) sorted[count++] = i;
    }

    masks = new BitSet[count];
    // insertion sort of the needles by length (longest first):
    for(int i = 0; i < count; i++) {
      for(int j = i; j > 0
          && n.get(sorted[j]).size() > n.get(sorted[j - 1]).size(); j--) {
        final int t = sorted[j];
        sorted[j] = sorted[j - 1];
        sorted[j - 1] = t;
      }
      // initialize the bit mask for this slot; bit 0 stands for the empty
      // prefix, which always matches:
      masks[i] = new BitSet();
      masks[i].set(0);
    }
  }

  /**
   * Is there one more match?
   * Repeated calls without an intermediate {@link #next()} do not consume
   * further elements from the haystack.
   * @return {@code true} if yes
   * @throws QueryException if an error occurs during search
   */
  public boolean hasNext() throws QueryException {
    if(masks.length == 0) return false;
    // a previous call already determined the answer:
    if(next) return pos >= 0;

    // find next hit:
    next = true;
    while(haystack.hasNext()) {
      final byte[] current = haystack.nextToken();
      ++pos;
      // check each needle for a match; all masks must be updated for every
      // element, so the loop is not left early:
      boolean matched = false;
      for(int i = 0; i < masks.length; i++) {
        final int id = sorted[i];
        final TokenList n = needles.get(id);
        // masks are stored by sorted position, not by needle index: needle
        // indexes may exceed masks.length when empty needles were skipped
        final BitSet m = masks[i];
        // compare each element from the needle and set the corresponding bit;
        // iterate downwards so that bit k - 1 still holds the previous state:
        for(int k = n.size(); k >= 1; k--) {
          m.set(k, m.get(k - 1) && cmp.equal(current, n.get(k - 1)));
        }
        // if the last element of the needle's mask is true, then all elements
        // of the needle are matched; the first (longest) match is reported:
        if(m.get(n.size()) && !matched) {
          match = id;
          matched = true;
        }
      }
      if(matched) return true;
    }

    // nothing was found and the whole haystack was checked:
    pos = -1;
    return false;
  }

  /**
   * Position in the haystack of the next match.
   * @return start position of the match; first position is 0
   * @throws QueryException if an error occurs during search
   * @throws NoSuchElementException if there is no further match
   */
  public int next() throws QueryException {
    if(hasNext()) {
      next = false;
      // pos counts the consumed elements, so the match starts size() earlier:
      return pos - needles.get(match).size();
    }
    throw new NoSuchElementException();
  }

  /**
   * Token comparator.
   *
   * @author BaseX Team 2005-12, BSD License
   * @author Dimitar Popov
   */
  public interface TokenComparator {
    /**
     * Check if two tokens are equal.
     * @param t1 first token
     * @param t2 second token
     * @return {@code true} if the two are equal
     * @throws QueryException if an error occurs during comparison
     */
    boolean equal(final byte[] t1, final byte[] t2) throws QueryException;
  }
}