package org.basex.util.ft;
import java.util.Map.Entry;
import java.util.TreeMap;
import org.basex.index.IndexToken;
import org.basex.io.serial.XMLSerializer;
import org.basex.query.ft.FTFilter;
import org.basex.util.Token;
import org.basex.util.list.StringList;
/**
 * Performs full-text lexing on tokens. Calls the tokenizers and stemmers that
 * match the supplied full-text options to achieve this.
 *
 * @author BaseX Team 2005-12, BSD License
 * @author Jens Erat
 */
public final class FTLexer extends FTIterator implements IndexToken {
  /** Tokenizer. */
  private final Tokenizer tok;
  /** Full-text options (can be {@code null}). */
  private final FTOpt fto;
  /** Iterator over result tokens: the tokenizer, optionally wrapped by a
   * stemmer. */
  private final FTIterator iter;
  /** Text to be tokenized. */
  private byte[] text = Token.EMPTY;
  /** The last span returned by {@link #next}. */
  private FTSpan curr;
  /** The last text returned by {@link #nextToken}; reset by {@link #next}. */
  private byte[] ctxt;

  /**
   * Constructor, using the default full-text options. Called by the
   * {@link XMLSerializer}, {@link FTFilter}, and the map visualizations.
   */
  public FTLexer() {
    this(null);
  }

  /**
   * Default constructor. Chooses a tokenizer for the language of the
   * specified options and, if the stemming flag is set, wraps it with a
   * stemmer.
   * @param opt full-text options (can be {@code null})
   */
  public FTLexer(final FTOpt opt) {
    fto = opt;

    // check if language option is provided; otherwise, use default language
    Language lang = opt != null ? opt.ln : null;
    if(lang == null) lang = Language.def();

    // use default tokenizer if specific tokenizer is not available
    Tokenizer tk = Tokenizer.IMPL.getFirst();
    for(final Tokenizer t : Tokenizer.IMPL) {
      if(t.supports(lang)) {
        tk = t;
        break;
      }
    }
    tok = tk.get(opt);

    // wrap original iterator; a local variable is used so that the field
    // can be final and is assigned exactly once
    FTIterator it = tok;
    if(opt != null && opt.is(FTFlag.ST)) {
      if(opt.sd == null) {
        // use default stemmer if specific stemmer is not available
        Stemmer st = Stemmer.IMPL.getFirst();
        for(final Stemmer stem : Stemmer.IMPL) {
          if(stem.supports(lang)) {
            st = stem;
            break;
          }
        }
        it = st.get(lang, it);
      } else {
        // a dictionary was supplied with the options: stem via dictionary
        it = new DictionaryStemmer(opt.sd, it);
      }
    }
    iter = it;
  }

  /**
   * Sets the special character flag.
   * Returns not only tokens, but also delimiters.
   * @return self reference
   */
  public FTLexer sc() {
    tok.special = true;
    return this;
  }

  /**
   * Initializes the iterator with the text that was assigned last
   * (initially, an empty token).
   */
  public void init() {
    init(text);
  }

  @Override
  public FTLexer init(final byte[] txt) {
    text = txt;
    iter.init(txt);
    return this;
  }

  @Override
  public boolean hasNext() {
    return iter.hasNext();
  }

  @Override
  public FTSpan next() {
    curr = iter.next();
    // invalidate the cached token text: otherwise, get() would keep on
    // returning the result of an earlier nextToken() call
    ctxt = null;
    return curr;
  }

  @Override
  public byte[] nextToken() {
    ctxt = iter.nextToken();
    return ctxt;
  }

  /**
   * Returns total number of tokens. Re-initializes the iterator with the
   * current text and consumes all of its tokens.
   * @return token count
   */
  public int count() {
    init();
    int c = 0;
    while(hasNext()) {
      nextToken();
      c++;
    }
    return c;
  }

  @Override
  public IndexType type() {
    return IndexType.FULLTEXT;
  }

  /**
   * Returns the original token. Inherited from {@link IndexToken};
   * use {@link #next} or {@link #nextToken} if not using this interface.
   * The result is the token that was parsed last, no matter if
   * {@link #next} or {@link #nextToken} was used to retrieve it.
   * @return current token
   * @throws NullPointerException if no token has been parsed yet
   */
  @Override
  public byte[] get() {
    return ctxt != null ? ctxt : curr.text;
  }

  /**
   * Returns the full-text options. Can be {@code null}.
   * @return full-text options
   */
  public FTOpt ftOpt() {
    return fto;
  }

  /**
   * Returns the text to be processed.
   * @return text
   */
  public byte[] text() {
    return text;
  }

  /**
   * Is paragraph? Does not have to be implemented by all tokenizers.
   * Returns false if not implemented.
   * @return boolean
   */
  public boolean paragraph() {
    return tok.paragraph();
  }

  /**
   * Calculates a position value, dependent on the specified unit. Does not
   * have to be implemented by all tokenizers. Returns 0 if not implemented.
   * @param w word position
   * @param u unit
   * @return new position
   */
  public int pos(final int w, final FTUnit u) {
    return tok.pos(w, u);
  }

  /**
   * Gets full-text info for the specified token; needed for visualizations.
   * See {@link Tokenizer#info} for more info.
   * @return int arrays or empty array if not implemented
   */
  public int[][] info() {
    return tok.info();
  }

  /**
   * Lists all languages for which tokenizers and stemmers are available.
   * Each entry consists of the language, followed by the first stemmer
   * found for it in parentheses.
   * @return supported languages
   */
  public static StringList languages() {
    final TreeMap<Language, Stemmer> langs = new TreeMap<Language, Stemmer>();
    for(final Stemmer stem : Stemmer.IMPL) {
      for(final Language l : stem.languages()) {
        // the first stemmer found for a language wins
        if(langs.containsKey(l)) continue;
        for(final Tokenizer t : Tokenizer.IMPL) {
          if(t.languages().contains(l)) {
            langs.put(l, stem);
            // one matching tokenizer suffices; no need to check the others
            break;
          }
        }
      }
    }
    final StringList sl = new StringList();
    for(final Entry<Language, Stemmer> l : langs.entrySet()) {
      sl.add(l.getKey() + " (" + l.getValue() + ')');
    }
    sl.sort(true, true);
    return sl;
  }
}