package org.basex.index.ft;
import static org.basex.core.Text.*;
import static org.basex.data.DataText.*;
import java.io.IOException;
import org.basex.core.BaseXException;
import org.basex.core.Prop;
import org.basex.data.Data;
import org.basex.index.IndexBuilder;
import org.basex.io.out.DataOutput;
import org.basex.util.Num;
import org.basex.util.Performance;
import org.basex.util.TokenBuilder;
import org.basex.util.ft.FTFlag;
import org.basex.util.ft.FTLexer;
import org.basex.util.ft.FTOpt;
import org.basex.util.ft.Language;
import org.basex.util.ft.Scoring;
import org.basex.util.ft.Stemmer;
import org.basex.util.ft.StopWords;
import org.basex.util.ft.Tokenizer;
import org.basex.util.list.IntList;
import org.basex.util.Util;
/**
 * This class contains common methods for full-text index builders.
 *
 * <p>The concrete token storage is implemented by the subclasses returned from
 * {@link #get(Data)}. This base class drives the tokenization of all text
 * nodes, the optional tf-idf scoring, and the merging of partial index
 * structures.</p>
 *
 * @author BaseX Team 2005-12, BSD License
 * @author Christian Gruen
 */
public abstract class FTBuilder extends IndexBuilder {
  /** Word parser. */
  private final FTLexer lex;
  /** Current lexer position. */
  int pos;
  /** Scoring mode; see {@link Prop#SCORING}.
   * 0: no scoring, 1: document-based units, 2: text-node-based units. */
  final int scm;
  /** Number of indexed tokens. */
  private long ntok;
  /** Document units (all document or text nodes in a document). */
  private final IntList unit = new IntList();
  /** Container for all frequencies. TF: freq(i, j). */
  private final IntList freq = new IntList();
  /** Container for maximal frequencies. TF: max(l, freq(l, j)). */
  private int[] maxfreq;
  /** Container for number of documents with token i. IDF: n(i). */
  private int[] ntoken;
  /** Maximum scoring value. */
  private int max;
  /** Minimum scoring value. */
  private int min;
  /** Current token. */
  private int token;
  /** Current frequency. */
  private int fc;

  /**
   * Returns a new full-text index builder.
   * @param d data reference
   * @return index builder
   * @throws IOException IOException
   */
  public static FTBuilder get(final Data d) throws IOException {
    // wildcard queries are only supported by the trie-based index variant
    return d.meta.wildcards ? new FTTrieBuilder(d) : new FTFuzzyBuilder(d);
  }

  /**
   * Constructor. Copies the full-text options from the database properties
   * and verifies that tokenizer and stemmer implementations are available
   * for the chosen language.
   * @param d data reference
   * @throws IOException IOException
   */
  FTBuilder(final Data d) throws IOException {
    super(d);
    final Prop prop = d.meta.prop;
    // evaluate the stemming flag once; it is needed twice below
    final boolean stem = prop.is(Prop.STEMMING);
    final FTOpt fto = new FTOpt();
    fto.set(FTFlag.DC, prop.is(Prop.DIACRITICS));
    fto.set(FTFlag.CS, prop.is(Prop.CASESENS));
    fto.set(FTFlag.ST, stem);
    fto.sw = new StopWords(d, prop.get(Prop.STOPWORDS));
    fto.ln = Language.get(prop);
    if(!Tokenizer.supportFor(fto.ln))
      throw new BaseXException(NO_TOKENIZER_X, fto.ln);
    if(stem && !Stemmer.supportFor(fto.ln))
      throw new BaseXException(NO_STEMMER_X, fto.ln);
    scm = d.meta.scoring;
    // sentinel values; updated while scores are computed in writeFTData
    max = -1;
    min = Integer.MAX_VALUE;
    lex = new FTLexer(fto);
  }

  /**
   * Extracts and indexes words from the specified data reference.
   * @throws IOException I/O Exception
   */
  final void index() throws IOException {
    // delete old index
    abort();

    final Performance perf = Util.debug ? new Performance() : null;
    Util.debug(det());

    for(pre = 0; pre < size; ++pre) {
      // NOTE(review): check() is inherited from IndexBuilder and not visible
      // here; presumably a periodic progress/interrupt check
      if((pre & 0xFFFF) == 0) check();

      final int k = data.kind(pre);
      if(k != Data.TEXT) {
        // document-based scoring: each document node opens a new scoring unit
        if(scm == 1 && k == Data.DOC) unit.add(pre);
        continue;
      }
      // text-node-based scoring: every text node is its own scoring unit
      if(scm == 2) unit.add(pre);

      pos = -1;
      final StopWords sw = lex.ftOpt().sw;
      lex.init(data.text(pre, true));
      while(lex.hasNext()) {
        final byte[] tok = lex.nextToken();
        ++pos;
        // skip too long and stopword tokens
        if(tok.length <= data.meta.maxlen &&
          (sw.size() == 0 || sw.id(tok) == 0)) {
          // check if main memory is exhausted
          if((ntok++ & 0xFFF) == 0 && scm == 0 && memFull()) {
            // currently no frequency support for tf/idf based scoring:
            // flush a partial index to disk and free main memory
            writeIndex(csize++);
            Performance.gc(2);
          }
          index(tok);
        }
      }
    }

    // calculate term frequencies
    if(scm > 0) {
      maxfreq = new int[unit.size() + 1];
      ntoken = new int[nrTokens()];
      token = 0;
      calcFreq();
    }

    // write tokens
    token = 0;
    write();

    // set meta data
    if(scm > 0) {
      data.meta.maxscore = max;
      data.meta.minscore = min;
    }
    data.meta.ftxtindex = true;
    Util.gc(perf);
  }

  /**
   * Calculates the tf-idf data for a single token.
   * @param vpre pre values for a token (Num-compressed; the leading 4 bytes
   *   hold the array size — see {@link #merge})
   */
  final void calcFreq(final byte[] vpre) {
    // skip the 4-byte size header of the compressed array
    int np = 4;
    int nl = Num.length(vpre, np);
    int p = Num.get(vpre, np);
    final int ns = Num.size(vpre);
    while(np < ns) {
      // locate the scoring unit of the current pre value;
      // sortedIndexOf yields -(insertion point) - 1 for missing values
      int u = unit.sortedIndexOf(p);
      if(u < 0) u = -u - 1;

      // count all consecutive pre values that belong to the same unit:
      // scm == 1 groups values before the next document boundary,
      // scm == 2 groups values equal to the current text node
      int fr = 0;
      do {
        ++fr;
        np += nl;
        if(np >= ns) break;
        p = Num.get(vpre, np);
        nl = Num.length(vpre, np);
      } while(scm == 1 && (u == unit.size() || p < unit.get(u)) ||
          scm == 2 && p == unit.get(u));

      freq.add(fr);
      if(maxfreq[u] < fr) maxfreq[u] = fr;
      // one more unit contains the current token
      ntoken[token]++;
    }
    // advance to the next token
    ++token;
  }

  /**
   * Writes the current index to disk.
   * @param cs current file pointer
   * @throws IOException I/O exception
   */
  protected abstract void writeIndex(final int cs) throws IOException;

  /**
   * Merges temporary indexes for the current token.
   * @param out full-text data
   * @param il array mapping (indexes of the lists containing the token)
   * @param v full-text list
   * @return written size
   * @throws IOException I/O exception
   */
  final int merge(final DataOutput out, final IntList il,
      final FTList[] v) throws IOException {
    int s = 0;
    final TokenBuilder tbp = new TokenBuilder();
    final TokenBuilder tbo = new TokenBuilder();
    // reserve 4 bytes for the size headers, filled in below
    tbp.add(new byte[4]);
    tbo.add(new byte[4]);
    // merge full-text data of all sorted lists with the same token
    for(int j = 0; j < il.size(); ++j) {
      final int m = il.get(j);
      for(final int p : v[m].prv) tbp.add(Num.num(p));
      for(final int p : v[m].pov) tbo.add(Num.num(p));
      s += v[m].size;
      v[m].next();
    }
    // write compressed pre and pos arrays; store final sizes in the headers
    final byte[] pr = tbp.finish();
    Num.size(pr, pr.length);
    final byte[] po = tbo.finish();
    Num.size(po, po.length);
    // write full-text data
    writeFTData(out, pr, po);
    return s;
  }

  /**
   * Writes full-text data for a single token to disk.<br/>
   * Format: {@code score? pre1 pos1 pre2 pos2 ... (0 score)? pre...}
   * @param out DataOutput for disk access
   * @param vpre compressed pre values
   * @param vpos compressed pos values
   * @throws IOException IOException
   */
  final void writeFTData(final DataOutput out, final byte[] vpre,
      final byte[] vpos) throws IOException {
    // np/pp: read offsets in the pre/pos arrays (after the 4-byte headers);
    // lp/lu: last seen pre value and scoring unit
    int np = 4, pp = 4, lp = -1, lu = -1;
    final int ns = Num.size(vpre);
    while(np < ns) {
      if(scm > 0) {
        final int p = Num.get(vpre, np);
        if(lp != p) {
          // new pre value: find document root
          int u = unit.sortedIndexOf(p);
          if(u < 0) u = -u - 1;
          if(lu != u) {
            // new unit: store scoring, separated from the preceding entries
            // by a 0 byte (omitted before the very first score)
            final int s = Scoring.tfIDF(freq.get(fc++),
                maxfreq[u], unit.size(), ntoken[token]);
            if(max < s) max = s;
            if(min > s) min = s;
            if(np != 4) out.write(0);
            out.writeNum(s);
            lu = u;
          }
          lp = p;
        }
      }
      // full-text data is stored here, with -scoreU, pre1, pos1, ...,
      // -scoreU, preU, posU: copy the compressed bytes of the current entry
      for(final int l = np + Num.length(vpre, np); np < l; ++np)
        out.write(vpre[np]);
      for(final int l = pp + Num.length(vpos, pp); pp < l; ++pp)
        out.write(vpos[pp]);
    }
    ++token;
  }

  /**
   * Checks if any unprocessed pre values are remaining.
   * @param lists lists
   * @return boolean
   */
  // redundant 'final' modifier removed: static methods cannot be overridden
  static boolean check(final FTList[] lists) {
    for(final FTList l : lists) if(l.tok.length > 0) return true;
    return false;
  }

  /**
   * Indexes a single token.
   * @param tok token to be indexed
   */
  abstract void index(final byte[] tok);

  /**
   * Returns the number of disjunct tokens.
   * @return number of tokens
   */
  abstract int nrTokens();

  /**
   * Evaluates the maximum frequencies for tfidf.
   */
  abstract void calcFreq();

  /**
   * Writes the index data to disk.
   * @throws IOException I/O exception
   */
  abstract void write() throws IOException;

  @Override
  public final void abort() {
    // drop all full-text index files and reset the index flag
    data.meta.drop(DATAFTX + ".*");
    data.meta.ftxtindex = false;
  }

  @Override
  protected final String det() {
    return INDEX_FULLTEXT_D;
  }
}