package org.basex.index.ft;
import static org.basex.data.DataText.*;
import static org.basex.util.Token.*;
import java.io.IOException;
import org.basex.data.Data;
import org.basex.data.MetaData;
import org.basex.io.out.DataOutput;
import org.basex.io.random.DataAccess;
import org.basex.util.list.IntArrayList;
import org.basex.util.list.IntList;
import org.basex.util.list.TokenList;
/**
* <p>This class builds an index for text contents in a compressed trie:</p>
*
* <ol>
* <li> The tokens are indexed in a main memory tree structure.</li>
* <li> If main memory is full, data is written as sorted list to disk.</li>
* <li> The temporary index instances are merged and written to disk.</li>
* </ol>
*
* <p>The file structure is specified in the {@link FTTrie} class.</p>
*
* @author BaseX Team 2005-12, BSD License
* @author Sebastian Gath
* @author Christian Gruen
*/
final class FTTrieBuilder extends FTBuilder {
/** Trie index. */
private FTTrieArray index = new FTTrieArray(128);
/** Hash structure for temporarily saving the tokens. */
private FTTrieHash hash = new FTTrieHash();
/** Offset for joining subtrees. */
private int offset;
/**
* Constructor.
* @param d data reference
* @throws IOException IOException
*/
FTTrieBuilder(final Data d) throws IOException {
super(d);
}
@Override
public FTIndex build() throws IOException {
index();
return new FTTrie(data);
}
@Override
void index(final byte[] tok) {
hash.index(tok, pre, pos);
}
@Override
int nrTokens() {
return hash.size();
}
@Override
void calcFreq() {
hash.init();
while(hash.more()) calcFreq(hash.pre[hash.next()]);
}
@Override
public void write() throws IOException {
if(!merge) {
writeAll();
return;
}
// merges temporary index files
writeIndex(csize++);
final DataOutput outB = new DataOutput(data.meta.dbfile(DATAFTX + 'b'));
final DataOutput outT = new DataOutput(data.meta.dbfile(DATAFTX + 't'));
final IntList ind = new IntList();
// open all temporary sorted lists
final FTList[] v = new FTList[csize];
for(int b = 0; b < csize; ++b) v[b] = new FTTrieList(data, b);
final IntList il = new IntList();
while(check(v)) {
int min = 0;
il.reset();
il.add(min);
// find next token to write on disk
for(int i = 0; i < csize; ++i) {
if(min == i || v[i].tok.length == 0) continue;
final int d = diff(v[min].tok, v[i].tok);
if(d > 0 || v[min].tok.length == 0) {
min = i;
il.reset();
il.add(min);
} else if(d == 0 && v[i].tok.length > 0) {
il.add(i);
}
}
// collect each child of the root node
if(ind.size() == 0 || ind.get(ind.size() - 1) != v[min].tok[0]) {
ind.add(v[min].tok[0]);
}
// write token to disk
outT.writeToken(v[min].tok);
// merge and write data size
outT.write4(merge(outB, il, v));
// write pointer on full-text data
outT.write5(outB.size());
}
outT.writeToken(EMPTY);
outT.close();
outB.close();
// write trie index structure to disk, split in subtrees
writeSplitTrie(ind);
}
/**
* Writes the trie structure to disk.
* @throws IOException I/O exception
*/
private void writeAll() throws IOException {
if(scm == 0) hash.init();
else hash.initIter();
final DataOutput outB = new DataOutput(data.meta.dbfile(DATAFTX + 'b'));
while(hash.more()) {
final int p = hash.next();
final byte[] tok = hash.key();
final int ds = hash.sizes[p];
final long cpre = outB.size();
// write compressed pre and pos arrays
writeFTData(outB, hash.pre[p], hash.pos[p]);
index.insertSorted(tok, ds, cpre);
}
outB.close();
hash = null;
final TokenList tokens = index.tokens;
final IntArrayList next = index.next;
final DataOutput outA = new DataOutput(data.meta.dbfile(DATAFTX + 'a'));
final DataOutput outC = new DataOutput(data.meta.dbfile(DATAFTX + 'c'));
// write root node (token length and bytes)
outA.write1(1);
outA.write1(0);
// write next pointer
final int[] root = next.get(0);
final int js = root.length - 2;
for(int j = 1; j < js; ++j) {
// pointer
final int p = root[j];
outA.write4(p);
// first char of next node
outA.write1(tokens.get(next.get(p)[0])[0]);
}
// write root node
outA.write4(root[root.length - 2]); // data size
// root has no data
outA.write5(0);
// write offset to first node
outC.write4(0);
// all other nodes
writeSubTree(null, outA, outC, 0, (root.length - 3) * 5 + 11);
outC.write4(0);
outA.close();
outC.close();
}
/**
* Writes the trie structure to disk, split in subtrees to save memory.
* @param roots root nodes
* @throws IOException I/O exception
*/
private void writeSplitTrie(final IntList roots) throws IOException {
final MetaData md = data.meta;
final DataOutput outA = new DataOutput(md.dbfile(DATAFTX + 'a'));
final DataOutput outC = new DataOutput(md.dbfile(DATAFTX + 'c'));
final DataAccess outT = new DataAccess(md.dbfile(DATAFTX + 't'));
final int[] root = new int[roots.size()];
int rp = 0;
// write root node (token length and bytes)
outA.write1(1);
outA.write1(0);
// write next pointers
for(int j = 0; j < roots.size(); ++j) {
// dummy pointer
outA.write4(0);
// first char of next node
outA.write1(roots.get(j));
}
// data size
outA.write4(0);
// pointer on data - root has no data
outA.write5(0);
// write offset to first node
outC.write4(0);
int siz = (int) (2L + roots.size() * 5L + 9L);
while(true) {
final byte[] tok = outT.readToken();
if(tok.length == 0) break;
final int s = outT.read4();
final long off = outT.read5();
if(rp < roots.size() && tok[0] != roots.get(rp)) {
// write subtree to disk
siz = writeSubTree(root, outA, outC, rp, siz);
++rp;
index = new FTTrieArray(128);
}
index.insertSorted(tok, s, off);
}
// write subtree to disk
writeSubTree(root, outA, outC, rp, siz);
outT.close();
outA.close();
outC.write4(0);
outC.close();
final DataAccess tmp = new DataAccess(md.dbfile(DATAFTX + 'a'));
long c = 2;
for(final int r : root) {
tmp.write4(c, r);
c += 5;
}
tmp.close();
md.drop(DATAFTX + 't');
}
/**
* Writes subtree to disk.
* @param root Array with root offsets
* @param outA trie structure
* @param outC node sizes
* @param rp pointer on root offsets
* @param siz size
* @return new size
* @throws IOException I/O exception
*/
private int writeSubTree(final int[] root, final DataOutput outA,
final DataOutput outC, final int rp, final int siz) throws IOException {
// indexed full-text tokens
final TokenList tokens = index.tokens;
// trie index structure
final IntArrayList next = index.next;
if(root != null) root[rp] = next.get(0)[1] + offset;
int s = siz;
final int il = next.size();
// loop over all trie nodes
for(int i = 1; i < il; ++i) {
final int[] nxt = next.get(i);
// check if pointer on data needs 1 or 2 integers
final int lp = nxt[nxt.length - 1] >= 0 ? 0 : -1;
// write token
outA.write1(tokens.get(nxt[0]).length);
outA.writeBytes(tokens.get(nxt[0]));
// write next pointer
final int jl = nxt.length - 2 + lp;
for(int j = 1; j < jl; ++j) {
// pointer
outA.write4(nxt[j] + offset);
// first char of next node
outA.write1(tokens.get(next.get(nxt[j])[0])[0]);
}
// data size
outA.write4(nxt[jl]);
if(lp == 0 || nxt[jl] == 0 && nxt[jl + 1] == 0) {
// node has no data
outA.write5(nxt[jl + 1]);
} else {
// write pointer on data
final int n = nxt.length - 2;
outA.write5((long) nxt[n] << 16 + (-nxt[n + 1] & 0xFFFF));
}
// write node offset
outC.write4(s);
s += tokens.get(nxt[0]).length + (nxt.length - 3 + lp) * 5 + 10;
}
offset += next.size() - 1;
return s;
}
/**
* Writes the data as sorted list to disk.
* The data is stored in two files:
* <ul>
* <li>File <b>a</b>: {@code length|byte|,token|byte[length]|,
* size|int|, offset|long|, ...}</li>
* <li>File <b>b</b>: written via {@link #writeFTData}</li>
* </ul>
* @param cs current file
* @throws IOException I/O exception
*/
@Override
protected void writeIndex(final int cs) throws IOException {
final String f = DATAFTX + (merge ? cs : "");
final DataOutput outA = new DataOutput(data.meta.dbfile(f + 'a'));
final DataOutput outB = new DataOutput(data.meta.dbfile(f + 'b'));
if(scm == 0) hash.init();
else hash.initIter();
while(hash.more()) {
final int p = hash.next();
final byte[] t = hash.key();
final int s = hash.sizes[p];
// write compressed pre and pos arrays
writeFTData(outB, hash.pre[p], hash.pos[p]);
// write token length
outA.write1(t.length);
// write token
outA.writeBytes(t);
// write number of full-text data size
outA.write4(s);
// write pointer on full-text data
outA.write5(outB.size());
}
outA.write1(0);
outA.close();
outB.close();
hash = new FTTrieHash();
}
}