package querqy.lucene.contrib.rewrite;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;
import querqy.SimpleComparableCharSequence;
import querqy.model.BooleanQuery;
import querqy.model.DisjunctionMaxClause;
import querqy.model.DisjunctionMaxQuery;
import querqy.model.Term;
import querqy.model.Clause.Occur;
/**
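* Collects sequences of query terms that match (prefixes of) the keys of a
* {@link SynonymMap} FST. When a sequence completes a key, the dictionary outputs
* are turned into synonym clauses that are later added to the corresponding
* {@link DisjunctionMaxQuery}s via {@link #apply()}.
*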
* @author rene
*
*/
class Sequences {
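// the FST of the synonym dictionary that term sequences are matched against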
final FST<BytesRef> fst;
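// synonym clauses to add per DisjunctionMaxQuery, collected here and added in apply()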
Map<DisjunctionMaxQuery, Set<DisjunctionMaxClause>> addenda = new HashMap<>();
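// sequences that the terms at the current position can be appended to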
Set<Sequence> sequences = new LinkedHashSet<>();
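// sequences started or extended at the current position; they replace 'sequences' on nextPosition()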
Set<Sequence> addSequences = new LinkedHashSet<>();
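// the DisjunctionMaxQuery of the current term position; target of the clauses created in addOutput()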
private DisjunctionMaxQuery currentDmq = null;
ByteArrayDataInput bytesReader = new ByteArrayDataInput();
private final SynonymMap map;
BytesRef scratchBytes = new BytesRef();
public Sequences(SynonymMap map) {
this.fst = map.fst;
this.map = map;
}
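/**
* Adds all collected synonym clauses to their target queries and clears the
* collected additions.
*/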
public void apply() {
for (Map.Entry<DisjunctionMaxQuery, Set<DisjunctionMaxClause>> entry : addenda.entrySet()) {
DisjunctionMaxQuery dmq = entry.getKey();
for (DisjunctionMaxClause clause : entry.getValue()) {
dmq.addClause(clause);
}
}
addenda.clear();
}
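/**
* Advances to the next term position: sequences created or extended at the previous
* position become the sequences that the terms of this position can be appended to,
* and the given DisjunctionMaxQuery becomes the target for new synonym clauses.
*/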
public void nextPosition(DisjunctionMaxQuery dmq) {
currentDmq = dmq;
sequences = addSequences;
addSequences = new LinkedHashSet<>();
}
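// code point of the blank character, which separates the words of multi-word dictionary keys in the FST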
static final int cpBlank = Character.codePointAt(new char[] { ' ' }, 0);
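/**
* Tries to extend each open sequence by a blank and the given term, following the
* corresponding arcs in the FST. Sequences that reach a final state emit their
* dictionary outputs as synonym clauses.
*/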
public void appendToSequences(Term term) throws IOException {
FST.BytesReader fstReader = fst.getBytesReader();
FST.Arc<BytesRef> scratchArc = new FST.Arc<>();
boolean ok;
for (Sequence sequence : sequences) {
// try to append a blank after the sequence
ok = null != fst.findTargetArc(cpBlank, sequence.arc, scratchArc, fstReader);
if (ok) {
// pending contains sequence + ' ' now
BytesRef pendingOutput = fst.outputs.add(sequence.output, scratchArc.output);
CharSequence termValue = term.getValue();
// iterate over term chars and try to append them to the sequence
for (int pos = 0, len = termValue.length(); ok && pos < len;) {
int codePoint = Character.codePointAt(termValue, pos);
ok = null != fst.findTargetArc(codePoint, scratchArc, scratchArc, fstReader);
if (ok) {
// only accumulate the arc output and advance if the arc was actually found
pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
pos += Character.charCount(codePoint);
}
}
if (ok) {
// ok means that the complete term could be consumed --> append it to the sequence
List<Term> terms = new LinkedList<>(sequence.terms);
terms.add(term);
FST.Arc<BytesRef> arc = new FST.Arc<>();
Sequence newSequence = new Sequence(arc.copyFrom(scratchArc), terms, pendingOutput);
addSequences.add(newSequence);
// however, the sequence might not yet form a complete dictionary lookup key
// (the key might only be completed at one of the following term positions)
if (scratchArc.isFinal()) {
// the term completes a lookup key --> output the dictionary values
newSequence.addOutputs(addenda, map, bytesReader);
}
}
}
}
}
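/**
* Processes the term at the current position: first tries to append it to the open
* sequences, then tries to start a new sequence with this term as the first word of
* a dictionary key.
*/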
public void putTerm(Term term) throws IOException {
appendToSequences(term);
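// additionally, try to start a new sequence with this term as the first word of a dictionary key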
FST.Arc<BytesRef> scratchArc = new FST.Arc<>();
fst.getFirstArc(scratchArc);
BytesRef pendingOutput = fst.outputs.getNoOutput();
FST.BytesReader fstReader = fst.getBytesReader();
boolean ok = true;
CharSequence termValue = term.getValue();
for (int pos = 0, len = termValue.length(); ok && (pos < len);) {
int codePoint = Character.codePointAt(termValue, pos);
ok = null != fst.findTargetArc(codePoint, scratchArc, scratchArc, fstReader);
if (ok) {
// only accumulate the arc output and advance if the arc was actually found
pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
pos += Character.charCount(codePoint);
}
}
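// the complete term could be consumed --> keep it as a new single-term sequence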
if (ok) {
List<Term> terms = Collections.singletonList(term);
FST.Arc<BytesRef> arc = new FST.Arc<>();
addSequences.add(new Sequence(arc.copyFrom(scratchArc), terms, pendingOutput));
if (scratchArc.isFinal()) {
addOutput(fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput));
}
}
}
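/**
* Decodes an FST output: for each synonym entry, the word id is resolved against the
* SynonymMap. Multi-word synonyms become a BooleanQuery of single-term
* DisjunctionMaxQuerys, single-word synonyms become plain Terms; all of them are
* registered as additions to the current DisjunctionMaxQuery.
*/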
private void addOutput(BytesRef bytes) {
bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
final int code = bytesReader.readVInt();
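// the lowest bit of the code encodes the "keepOrig" flag, which is not used here: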
// final boolean keepOrig = (code & 0x1) == 0;
final int count = code >>> 1;
Set<DisjunctionMaxClause> adds = addenda.get(currentDmq);
if (adds == null) {
adds = new LinkedHashSet<>();
addenda.put(currentDmq, adds);
}
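// each output entry is a word id that points into the SynonymMap's word dictionary (map.words)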
for (int outputIDX = 0; outputIDX < count; outputIDX++) {
map.words.get(bytesReader.readVInt(), scratchBytes);
// scratchChars is not re-used across calls: the chars would have to be copied into the Terms anyway
char[] scratchChars = new char[scratchBytes.length];
// UTF8toUTF16 returns the number of chars written, which can be smaller than
// scratchBytes.length for multi-byte UTF-8 input
int charLen = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
BooleanQuery add = null;
int start = 0;
for (int i = 0; i < charLen; i++) {
if (scratchChars[i] == ' ' && (i > start)) {
if (add == null) {
add = new BooleanQuery(currentDmq, Occur.SHOULD, true);
}
DisjunctionMaxQuery newDmq = new DisjunctionMaxQuery(add, Occur.MUST, true);
newDmq.addClause(new Term(newDmq,
new SimpleComparableCharSequence(scratchChars, start, i - start), true));
add.addClause(newDmq);
start = i + 1;
}
}
if (add != null) {
if (start < charLen) {
DisjunctionMaxQuery newDmq = new DisjunctionMaxQuery(add, Occur.MUST, true);
newDmq.addClause(new Term(newDmq, new SimpleComparableCharSequence(scratchChars, start,
charLen - start), true));
add.addClause(newDmq);
}
adds.add(add);
} else {
adds.add(new Term(currentDmq, new SimpleComparableCharSequence(scratchChars, 0, charLen), true));
}
}
}
}