package org.basex.index.ft;
import static org.basex.core.Text.*;
import static org.basex.data.DataText.*;
import static org.basex.util.Token.*;
import static org.basex.util.ft.FTFlag.*;
import java.io.IOException;
import org.basex.core.Prop;
import org.basex.data.Data;
import org.basex.data.DataText;
import org.basex.index.IndexIterator;
import org.basex.index.IndexStats;
import org.basex.index.IndexToken;
import org.basex.io.random.DataAccess;
import org.basex.util.Levenshtein;
import org.basex.util.Num;
import org.basex.util.Performance;
import org.basex.util.TokenBuilder;
import org.basex.util.Util;
import org.basex.util.ft.FTLexer;
import org.basex.util.hash.TokenIntMap;
/**
* <p>This class provides access to a fuzzy full-text index structure
* stored on disk. Each token has an entry in sizes, saving its length and a
* pointer on ftdata, where to find the token and its ftdata.
* The three database index files start with the prefix
* {@link DataText#DATAFTX} and have the following format:</p>
*
* <ul>
* <li>File <b>x</b> contains an entry for each token length:<br/>
* Structure: {@code [l, p] ...}<br/>
* {@code l} is the length of a token [byte].<br/>
* {@code p} is the pointer of the first token with length {@code l} [int].
* </li>
* <li>File <b>y</b> contains the tokens and references:<br/>
* Structure: {@code [t0, t1, ... tl-1, z, s]}<br/>
* {@code t0, t1, ... tl-1} is the token [byte[l]]<br/>
* {@code z} is the pointer on the data entries of the token [long]<br/>
* {@code s} is the number of pre values, saved in data [int]
* </li>
* <li>File <b>z</b> contains the {@code pre/pos} references.
* The values are ordered, but not distinct:<br/>
* {@code pre1/pos1, pre2/pos2, pre3/pos3, ...} [{@link Num}]</li>
* </ul>
*
* @author BaseX Team 2005-12, BSD License
* @author Christian Gruen
* @author Sebastian Gath
*/
final class FTFuzzy extends FTIndex {
  /** Entry size: fixed-length tail of each token entry in file 'y'
   * (5-byte pointer on the ftdata plus 4-byte number of pre values). */
  private static final int ENTRY = 9;
  /** Token positions: {@code tp[l]} is the offset in file 'y' of the first
   * token with length {@code l}, or {@code -1} if no token of that length
   * exists; the last slot holds the length of file 'y' as end sentinel. */
  private final int[] tp;
  /** Levenshtein reference, used for fuzzy (approximate) token matching. */
  private final Levenshtein ls = new Levenshtein();
  /** Index storing each unique token length and pointer
   * on the first token with this length. */
  private final DataAccess inX;
  /** Index storing each token, its data size and pointer on the data. */
  private final DataAccess inY;
  /** Storing pre and pos values for each token. */
  private final DataAccess inZ;

  /**
   * Constructor, initializing the index structure.
   * Opens the three index files and builds the {@link #tp} token-length table
   * from file 'x'.
   * @param d data reference
   * @throws IOException I/O Exception
   */
  FTFuzzy(final Data d) throws IOException {
    super(d);
    // cache token length index
    inY = new DataAccess(d.meta.dbfile(DATAFTX + 'y'));
    inZ = new DataAccess(d.meta.dbfile(DATAFTX + 'z'));
    inX = new DataAccess(d.meta.dbfile(DATAFTX + 'x'));
    // one slot per possible token length, plus trailing sentinel slots;
    // -1 marks lengths for which no token exists
    tp = new int[d.meta.maxlen + 3];
    for(int i = 0; i < tp.length; ++i) tp[i] = -1;
    int is = inX.readNum();
    while(--is >= 0) {
      int p = inX.readNum();
      final int r;
      // legacy issue (7.0.2 -> 7.1)
      // pre-7.1 entries are detected via the magnitude of the first number:
      // the pointer is re-assembled from p's low byte plus three more bytes,
      // and the length is recovered as (p >> 8 | 0x40)
      // NOTE(review): exact legacy layout not visible here — confirm against
      // the 7.0.2 index writer before touching this branch
      if(p >= 4096) {
        r = p << 24 | (inX.read1() & 0xFF) << 16 |
            (inX.read1() & 0xFF) << 8 | inX.read1() & 0xFF;
        p = p >> 8 | 0x40;
      } else {
        r = inX.read4();
      }
      tp[p] = r;
    }
    // end sentinel: total length of file 'y' terminates the last length group
    tp[tp.length - 1] = (int) inY.length();
  }

  @Override
  public synchronized int count(final IndexToken ind) {
    // tokens longer than the indexed maximum cannot be looked up exactly
    if(ind.get().length > data.meta.maxlen) return Integer.MAX_VALUE;
    // estimate costs for queries which stretch over multiple index entries
    final FTLexer lex = (FTLexer) ind;
    if(lex.ftOpt().is(FZ)) return Math.max(1, data.meta.size / 10);
    final byte[] tok = lex.get();
    // return cached result of an earlier lookup, if available
    final int id = cache.id(tok);
    if(id > 0) return cache.size(id);
    int s = 0;
    long poi = 0;
    final long p = token(tok);
    if(p > -1) {
      s = size(p, tok.length);
      poi = pointer(p, tok.length);
    }
    // cache size and data pointer for a subsequent iter() call
    cache.add(tok, s, poi);
    return s;
  }

  @Override
  public synchronized IndexIterator iter(final IndexToken ind) {
    final byte[] tok = ind.get();
    // support fuzzy search
    if(((FTLexer) ind).ftOpt().is(FZ)) {
      int k = data.meta.prop.num(Prop.LSERROR);
      // default error bound: a quarter of the token length
      if(k == 0) k = tok.length >> 2;
      return fuzzy(tok, k, false);
    }
    // return cached or new result
    final int id = cache.id(tok);
    if(id == 0) {
      final int p = token(tok);
      return p > -1 ? iter(pointer(p, tok.length),
          size(p, tok.length), inZ, false) : FTIndexIterator.FTEMPTY;
    }
    return iter(cache.pointer(id), cache.size(id), inZ, false);
  }

  /**
   * Returns all index tokens starting with the specified prefix,
   * mapped to their number of occurrences.
   * NOTE(review): unlike the other public methods of this class, this one is
   * not {@code synchronized} — confirm whether that is intentional.
   * @param prefix token prefix
   * @return map of matching tokens and their sizes
   */
  @Override
  public TokenIntMap entries(final byte[] prefix) {
    final TokenIntMap tim = new TokenIntMap();
    // scan every token length that can contain the prefix
    for(int s = prefix.length; s < tp.length - 1; s++) {
      int p = tp[s];
      if(p == -1) continue;
      // right limit: start of the next non-empty length group
      int i = s + 1;
      int r;
      do r = tp[i++]; while(r == -1);
      inY.cursor(p);
      boolean f = false;
      while(p < r) {
        final byte[] tok = inY.readBytes(s);
        final long poi = inY.read5();
        final int size = inY.read4();
        cache.add(tok, size, poi);
        if(startsWith(tok, prefix)) {
          tim.add(tok, size);
          f = true;
        } else if(f) {
          // tokens are sorted: once matches stop, no further ones can follow
          break;
        }
        // advance to the next entry: s token bytes plus fixed-size tail
        p += s + ENTRY;
      }
    }
    return tim;
  }

  @Override
  public synchronized byte[] info() {
    final TokenBuilder tb = new TokenBuilder();
    tb.add(LI_STRUCTURE + FUZZY + NL);
    tb.addExt("- %: %" + NL, STEMMING, Util.flag(data.meta.stemming));
    tb.addExt("- %: %" + NL, CASE_SENSITIVITY, Util.flag(data.meta.casesens));
    tb.addExt("- %: %" + NL, DIACRITICS, Util.flag(data.meta.diacritics));
    if(data.meta.language != null)
      tb.addExt("- %: %" + NL, LANGUAGE, data.meta.language);
    // total size of the three index files
    final long l = inX.length() + inY.length() + inZ.length();
    tb.add(LI_SIZE + Performance.format(l, true) + NL);
    final IndexStats stats = new IndexStats(data);
    addOccs(stats);
    stats.print(tb);
    return tb.finish();
  }

  @Override
  public synchronized void close() throws IOException {
    inX.close();
    inY.close();
    inZ.close();
  }

  /**
   * Determines the pointer on a token.
   * @param tok token looking for
   * @return pointer on the token entry in file 'y',
   *   or {@code -1} if token was not found
   */
  private int token(final byte[] tok) {
    final int tl = tok.length;
    // left limit
    int l = tp[tl];
    if(l == -1) return -1;
    int i = 1;
    int r;
    // find right limit
    do r = tp[tl + i++]; while(r == -1);
    final int x = r;
    // binary search over fixed-width entries of this length group
    final int o = tl + ENTRY;
    while(l < r) {
      // midpoint, snapped down to an entry boundary (entries are o bytes wide)
      final int m = l + (r - l >> 1) / o * o;
      final int c = diff(inY.readBytes(m, tl), tok);
      if(c == 0) return m;
      if(c < 0) l = m + o;
      else r = m - o;
    }
    // accept entry if pointer is inside relevant tokens
    return r != x && l == r && eq(inY.readBytes(l, tl), tok) ? l : -1;
  }

  /**
   * Collects all tokens and their sizes found in the index structure.
   * @param stats statistics
   */
  private void addOccs(final IndexStats stats) {
    // i: current token length; skip empty length slots
    int i = 0;
    while(i < tp.length && tp[i] == -1) ++i;
    int p = tp[i];
    // j: next non-empty slot, i.e. the end of the current length group
    int j = i + 1;
    while(j < tp.length && tp[j] == -1) ++j;
    // walk all entries up to the end-of-file sentinel
    while(p < tp[tp.length - 1]) {
      if(stats.adding(size(p, i))) stats.add(inY.readBytes(p, i));
      p += i + ENTRY;
      if(p == tp[j]) {
        // group finished: advance to the next token length
        i = j;
        while(j + 1 < tp.length && tp[++j] == -1);
      }
    }
  }

  /**
   * Gets the pointer on ftdata for a token.
   * @param pt pointer on token
   * @param lt length of the token
   * @return pointer on ftdata [long]
   */
  private long pointer(final long pt, final int lt) {
    return inY.read5(pt + lt);
  }

  /**
   * Reads the size of ftdata from disk.
   * @param pt pointer on token
   * @param lt length of the token
   * @return size of the ftdata
   */
  private int size(final long pt, final int lt) {
    return inY.read4(pt + lt + 5);
  }

  /**
   * Performs a fuzzy search for the specified token,
   * allowing at most {@code k} errors.
   * @param tok token looking for
   * @param k number of errors allowed
   * @param f fast evaluation
   * @return iterator
   */
  private IndexIterator fuzzy(final byte[] tok, final int k, final boolean f) {
    FTIndexIterator it = FTIndexIterator.FTEMPTY;
    final int tl = tok.length;
    // only lengths within k of the token length can be within edit distance k
    final int e = Math.min(tp.length - 1, tl + k);
    int s = Math.max(1, tl - k) - 1;
    while(++s <= e) {
      int p = tp[s];
      if(p == -1) continue;
      // right limit: start of the next non-empty length group
      int i = s + 1;
      int r = -1;
      while(i < tp.length && r == -1) r = tp[i++];
      while(p < r) {
        if(ls.similar(inY.readBytes(p, s), tok, k)) {
          // token is within the error bound: merge its hits into the result
          it = FTIndexIterator.union(
              iter(pointer(p, s), size(p, s), inZ, f), it);
        }
        p += s + ENTRY;
      }
    }
    return it;
  }
}