package org.basex.index.ft;

import static org.basex.data.DataText.*;
import static org.basex.util.Token.*;

import java.io.IOException;

import org.basex.data.Data;
import org.basex.io.out.DataOutput;
import org.basex.util.list.IntList;

/**
 * <p>This class builds an index for text contents, optimized for fuzzy search,
 * in an ordered table:</p>
 *
 * <ol>
 * <li> The tokens are indexed in a main memory tree structure.</li>
 * <li> If main memory is full, the index is written to disk.</li>
 * <li> The temporary index instances are merged.</li>
 * </ol>
 *
 * <p>The file format is described in the {@link FTFuzzy} class.</p>
 *
 * @author BaseX Team 2005-12, BSD License
 * @author Sebastian Gath
 * @author Christian Gruen
 */
final class FTFuzzyBuilder extends FTBuilder {
  /** Value trees. */
  private final FTIndexTrees tree;

  /**
   * Constructor.
   * @param d data reference
   * @throws IOException IOException
   */
  FTFuzzyBuilder(final Data d) throws IOException {
    super(d);
    tree = new FTIndexTrees(d.meta.maxlen);
  }

  /**
   * Runs the indexing process and returns a fuzzy index instance
   * for the data reference.
   * @return new {@link FTFuzzy} index
   * @throws IOException I/O exception
   */
  @Override
  public FTIndex build() throws IOException {
    index();
    return new FTFuzzy(data);
  }

  /**
   * Adds a single token to the value trees, together with the current
   * {@code pre}, {@code pos} and {@code csize} values of the builder.
   * @param tok token to be indexed
   */
  @Override
  void index(final byte[] tok) {
    tree.index(tok, pre, pos, csize);
  }

  /**
   * Sums up the sizes of all non-null index trees.
   * @return accumulated size of all trees
   */
  @Override
  int nrTokens() {
    int l = 0;
    for(final FTIndexTree t : tree.trees) {
      if(t != null) l += t.size();
    }
    return l;
  }

  /**
   * Iterates over all tree entries and passes the pre values of each entry
   * on to {@code calcFreq(...)}.
   */
  @Override
  void calcFreq() {
    tree.init();
    while(tree.more(0)) {
      final FTIndexTree t = tree.nextTree();
      t.next();
      calcFreq(t.nextPres());
    }
  }

  /**
   * Writes the current in-memory index to disk. If the {@code merge} flag
   * is set, all temporary index files are subsequently merged into the final
   * index files. The output streams are closed even if merging fails.
   * @throws IOException I/O exception
   */
  @Override
  public void write() throws IOException {
    writeIndex(csize++);
    if(!merge) return;

    // merges temporary index files; nested try/finally blocks guarantee that
    // every stream that was opened is closed again, also in case of an error
    final DataOutput outX = new DataOutput(data.meta.dbfile(DATAFTX + 'x'));
    try {
      final DataOutput outY = new DataOutput(data.meta.dbfile(DATAFTX + 'y'));
      try {
        final DataOutput outZ = new DataOutput(data.meta.dbfile(DATAFTX + 'z'));
        try {
          mergeLists(outX, outY, outZ);
        } finally {
          outZ.close();
        }
      } finally {
        outY.close();
      }
    } finally {
      outX.close();
    }
  }

  /**
   * Merges all temporary sorted lists into the specified outputs.
   * @param outX output for the token length index
   * @param outY output for the tokens and the pointers on the full-text data
   * @param outZ output for the merged full-text data
   * @throws IOException I/O exception
   */
  private void mergeLists(final DataOutput outX, final DataOutput outY,
      final DataOutput outZ) throws IOException {

    final IntList ind = new IntList();

    // open all temporary sorted lists
    final FTList[] v = new FTList[csize];
    for(int b = 0; b < csize; ++b) v[b] = new FTFuzzyList(data, b);

    // indexes of all lists that currently provide the chosen token
    final IntList il = new IntList();
    while(check(v)) {
      int min = 0;
      il.reset();
      il.add(min);
      // find next token to write on disk: lists with an empty token are
      // skipped; shorter tokens are preferred, tokens of equal length are
      // ordered by the result of diff()
      for(int i = 0; i < csize; ++i) {
        if(min == i || v[i].tok.length == 0) continue;
        final int l = v[i].tok.length - v[min].tok.length;
        final int d = diff(v[min].tok, v[i].tok);
        if(l < 0 || l == 0 && d > 0 || v[min].tok.length == 0) {
          // new candidate found: discard all lists collected so far
          min = i;
          il.reset();
          il.add(min);
        } else if(d == 0) {
          // same token in another list: remember it for merging
          il.add(i);
        }
      }

      // first token with a new length: store length and offset
      final int len = v[min].tok.length;
      if(ind.size() == 0 || ind.get(ind.size() - 2) < len) {
        ind.add(len);
        ind.add((int) outY.size());
      }
      // write token
      outY.writeBytes(v[min].tok);
      // pointer on full-text data
      outY.write5(outZ.size());
      // merge and write data size
      outY.write4(merge(outZ, il, v));
    }

    // if no token was written at all, fall back to length 0; this yields the
    // same terminating entry as writeIndex() produces for an empty tree
    final int is = ind.size();
    final int last = is == 0 ? 0 : ind.get(is - 2);
    writeInd(outX, ind, last + 1, (int) outY.size());
  }

  /**
   * Writes the token length index to disk. The number of length/offset pairs
   * is written first, followed by all pairs and a terminating pair
   * consisting of {@code ls} and {@code lp}.
   * @param outX output
   * @param il token length and offsets
   * @param ls last token length
   * @param lp last offset
   * @throws IOException I/O exception
   */
  private static void writeInd(final DataOutput outX, final IntList il,
      final int ls, final int lp) throws IOException {

    final int is = il.size();
    // entries are stored as pairs: number of pairs is half of the list size
    outX.writeNum(is >> 1);
    for(int i = 0; i < is; i += 2) {
      outX.writeNum(il.get(i));
      outX.write4(il.get(i + 1));
    }
    outX.writeNum(ls);
    outX.write4(lp);
  }

  /**
   * Writes the in-memory trees to index files. If the {@code merge} flag is
   * set, {@code cs} is added to the file names. The output streams are closed
   * even if writing fails; the trees are only reinitialized after a
   * successful write.
   * @param cs value passed on to the tree iterator (and, when merging,
   *   suffix of the file names)
   * @throws IOException I/O exception
   */
  @Override
  protected void writeIndex(final int cs) throws IOException {
    final String s = DATAFTX + (merge ? cs : "");
    final DataOutput outX = new DataOutput(data.meta.dbfile(s + 'x'));
    try {
      final DataOutput outY = new DataOutput(data.meta.dbfile(s + 'y'));
      try {
        final DataOutput outZ = new DataOutput(data.meta.dbfile(s + 'z'));
        try {
          writeTrees(cs, outX, outY, outZ);
        } finally {
          outZ.close();
        }
      } finally {
        outY.close();
      }
    } finally {
      outX.close();
    }
    tree.initFT();
  }

  /**
   * Writes all tree entries to the specified outputs.
   * @param cs value passed on to the tree iterator
   * @param outX output for the token length index
   * @param outY output for the tokens and the pointers on the full-text data
   * @param outZ output for the full-text data
   * @throws IOException I/O exception
   */
  private void writeTrees(final int cs, final DataOutput outX,
      final DataOutput outY, final DataOutput outZ) throws IOException {

    final IntList ind = new IntList();
    // current size of the full-text data output
    long dr = 0;
    // current size of the token output
    int tr = 0;
    // largest token length written so far
    int j = 0;
    tree.init();
    while(tree.more(cs)) {
      final FTIndexTree t = tree.nextTree();
      t.next();
      final byte[] key = t.nextTok();

      if(j < key.length) {
        j = key.length;
        // write index and pointer on first token
        ind.add(j);
        ind.add(tr);
      }
      for(int i = 0; i < j; ++i) outY.write1(key[i]);
      // write pointer on full-text data
      outY.write5(dr);
      // write full-text data size (number of pre values)
      outY.write4(t.nextNumPre());
      // write compressed pre and pos arrays
      writeFTData(outZ, t.nextPres(), t.nextPoss());
      dr = outZ.size();
      tr = (int) outY.size();
    }
    writeInd(outX, ind, ++j, tr);
  }
}