package org.basex.query.ft;
import static org.basex.util.Token.*;
import static org.basex.util.ft.FTFlag.*;
import org.basex.core.Prop;
import org.basex.query.QueryException;
import org.basex.util.Levenshtein;
import org.basex.util.ft.FTBitapSearch;
import org.basex.util.ft.FTBitapSearch.TokenComparator;
import org.basex.util.ft.FTIterator;
import org.basex.util.ft.FTLexer;
import org.basex.util.ft.FTOpt;
import org.basex.util.hash.TokenObjMap;
import org.basex.util.list.TokenList;
/**
* This class performs the full-text tokenization.
*
* @author BaseX Team 2005-12, BSD License
* @author Christian Gruen
*/
final class FTTokenizer {
  /** Wildcard object cache. */
  final TokenObjMap<FTWildcard> wcCache = new TokenObjMap<FTWildcard>();
  /** Levenshtein reference. */
  final Levenshtein ls = new Levenshtein();
  /** Calling expression. */
  final FTWords words;
  /** Full-text options. */
  final FTOpt opt;
  /** Levenshtein error. */
  final int lserr;

  /** Comparator applied to each input/query token pair. */
  private final TokenComparator comparator;
  /** Cached query tokens, keyed by the original query token. */
  private final TokenObjMap<FTTokens> tokenCache = new TokenObjMap<FTTokens>();

  /**
   * Constructor.
   * @param w full-text words
   * @param o full-text options
   * @param pr database properties
   */
  public FTTokenizer(final FTWords w, final FTOpt o, final Prop pr) {
    words = w;
    opt = o;
    lserr = pr.num(Prop.LSERROR);

    comparator = new TokenComparator() {
      @Override
      public boolean equal(final byte[] in, final byte[] qu)
          throws QueryException {
        // the wildcard object is built (and cached) up front so that an
        // invalid wildcard pattern raises its error on every code path:
        FTWildcard wild = null;
        if(opt.is(WC)) {
          wild = wcCache.get(qu);
          if(wild == null) {
            wild = new FTWildcard(qu, words.input);
            wcCache.add(qu, wild);
          }
        }
        // a query token that is a stop word matches any input token:
        if(opt.sw != null && opt.sw.id(qu) != 0) return true;
        // fuzzy search:
        if(opt.is(FZ)) return ls.similar(in, qu, lserr);
        // wild-card search:
        if(wild != null) return wild.match(in);
        // simple search:
        return eq(in, qu);
      }
    };
  }

  /**
   * Returns a new lexer, adopting the tokenizer options.
   * @param lex input lexer
   * @return lexer
   */
  FTLexer copy(final FTLexer lex) {
    // copy the relevant tokenizer options to the lexer options:
    final FTOpt fto = lex.ftOpt();
    fto.set(CS, opt.is(CS));
    fto.set(DC, opt.is(DC));
    fto.set(ST, opt.is(ST));
    fto.sd = opt.sd;
    fto.ln = opt.ln;
    fto.th = opt.th;
    return new FTLexer(fto).init(lex.text());
  }

  /**
   * Returns cached query tokens.
   * @param query query token
   * @return number of occurrences
   * @throws QueryException query exception
   */
  FTTokens cache(final byte[] query) throws QueryException {
    FTTokens qtok = tokenCache.get(query);
    if(qtok != null) return qtok;

    qtok = new FTTokens();
    tokenCache.add(query, qtok);

    // tokenize the query itself and cache the result:
    final FTIterator lexer = new FTLexer(opt).init(query);
    final TokenList main = new TokenList(1);
    while(lexer.hasNext()) main.add(lexer.nextToken());
    qtok.add(main);

    // if a thesaurus is attached, each extension term yields an
    // additional token list that extends the query:
    if(opt.th != null) {
      for(final byte[] term : opt.th.find(words.input, query)) {
        final TokenList extension = new TokenList(1);
        lexer.init(term);
        while(lexer.hasNext()) extension.add(lexer.nextToken());
        qtok.add(extension);
      }
    }
    return qtok;
  }

  /**
   * Checks if the first token contains the second full-text term.
   * @param query cached query tokens
   * @param input input text
   * @return number of occurrences
   * @throws QueryException query exception
   */
  int contains(final FTTokens query, final FTLexer input)
      throws QueryException {
    input.init();
    final FTBitapSearch search = new FTBitapSearch(input, query, comparator);

    // count all hits and register their positions with the calling expression:
    int hits = 0;
    while(search.hasNext()) {
      final int start = search.next();
      words.add(start, start + query.length() - 1);
      ++hits;
    }
    words.matches.sTokenNum++;
    words.first = false;
    return hits;
  }
}