package querqy.lucene.contrib.rewrite;

import java.io.IOException;
import java.util.*;

import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;

import querqy.SimpleComparableCharSequence;
import querqy.model.BooleanQuery;
import querqy.model.DisjunctionMaxClause;
import querqy.model.DisjunctionMaxQuery;
import querqy.model.Term;
import querqy.model.Clause.Occur;

/**
 * Tracks partially matched synonym lookup keys ("sequences") across the term
 * positions of a query. Terms are matched character by character against the
 * {@link SynonymMap} FST; whenever a sequence completes a lookup key, the
 * corresponding synonym expansions are collected per {@link DisjunctionMaxQuery}
 * and attached to the query via {@link #apply()}.
 *
 * @author rene
 */
class Sequences {

    final FST<BytesRef> fst;

    Map<DisjunctionMaxQuery, Set<DisjunctionMaxClause>> addenda = new HashMap<>();

    Set<Sequence> sequences = new LinkedHashSet<>();
    Set<Sequence> addSequences = new LinkedHashSet<>();

    private DisjunctionMaxQuery currentDmq = null;

    ByteArrayDataInput bytesReader = new ByteArrayDataInput();

    private final SynonymMap map;

    BytesRef scratchBytes = new BytesRef();

    public Sequences(SynonymMap map) {
        this.fst = map.fst;
        this.map = map;
    }

    /**
     * Adds the collected synonym clauses to their target DisjunctionMaxQueries
     * and clears the addenda.
     */
    public void apply() {
        for (Map.Entry<DisjunctionMaxQuery, Set<DisjunctionMaxClause>> entry : addenda.entrySet()) {
            DisjunctionMaxQuery dmq = entry.getKey();
            for (DisjunctionMaxClause clause : entry.getValue()) {
                dmq.addClause(clause);
            }
        }
        addenda.clear();
    }

    /**
     * Moves on to the next term position: the sequences started or extended at
     * the previous position become the sequences to extend at this position.
     */
    public void nextPosition(DisjunctionMaxQuery dmq) {
        currentDmq = dmq;
        sequences = addSequences;
        addSequences = new LinkedHashSet<>();
    }

    // code point of the blank that separates the terms of a multi-term lookup key
    static final int cpBlank = Character.codePointAt(new char[] { ' ' }, 0);

    /**
     * Tries to extend each open sequence with the given term by first matching
     * a blank and then the term characters in the FST. Sequences that complete
     * a lookup key emit their synonym outputs into the addenda.
     */
    public void appendToSequences(Term term) throws IOException {

        FST.BytesReader fstReader = fst.getBytesReader();
        FST.Arc<BytesRef> scratchArc = new FST.Arc<>();

        boolean ok;

        for (Sequence sequence : sequences) {

            // try to append a blank after the sequence
            ok = null != fst.findTargetArc(cpBlank, sequence.arc, scratchArc, fstReader);

            if (ok) {

                // pending contains sequence + ' ' now
                BytesRef pendingOutput = fst.outputs.add(sequence.output, scratchArc.output);

                CharSequence termValue = term.getValue();

                // iterate over term chars and try to append them to the sequence
                for (int pos = 0, len = termValue.length(); ok && pos < len;) {
                    int codePoint = Character.codePointAt(termValue, pos);
                    ok = null != fst.findTargetArc(codePoint, scratchArc, scratchArc, fstReader);
                    pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
                    pos += Character.charCount(codePoint);
                }

                if (ok) {
                    // ok means that we could consume the complete term char buffer, thus,
                    // append it to the sequence
                    List<Term> terms = new LinkedList<>(sequence.terms);
                    terms.add(term);

                    FST.Arc<BytesRef> arc = new FST.Arc<>();
                    Sequence newSequence = new Sequence(arc.copyFrom(scratchArc), terms, pendingOutput);
                    addSequences.add(newSequence);

                    // however, it might not have consumed the complete dictionary lookup key
                    // (it might complete at the next term position)
                    if (scratchArc.isFinal()) {
                        // the term completes the lookup key --> output the dictionary values
                        newSequence.addOutputs(addenda, map, bytesReader);
                    }
                }
            }
        }
    }

    /**
     * Processes a term at the current position: extends the open sequences and
     * starts a new single-term sequence. If the term alone matches a complete
     * lookup key, its outputs are added for the current DisjunctionMaxQuery.
     */
    public void putTerm(Term term) throws IOException {

        appendToSequences(term);

        FST.Arc<BytesRef> scratchArc = new FST.Arc<>();
        fst.getFirstArc(scratchArc);
        BytesRef pendingOutput = fst.outputs.getNoOutput();

        FST.BytesReader fstReader = fst.getBytesReader();

        boolean ok = true;

        CharSequence termValue = term.getValue();

        for (int pos = 0, len = termValue.length(); ok && (pos < len);) {
            int codePoint = Character.codePointAt(termValue, pos);
            ok = null != fst.findTargetArc(codePoint, scratchArc, scratchArc, fstReader);
            pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
            pos += Character.charCount(codePoint);
        }

        if (ok) {

            List<Term> terms = Collections.singletonList(term);
            FST.Arc<BytesRef> arc = new FST.Arc<>();
            addSequences.add(new Sequence(arc.copyFrom(scratchArc), terms, pendingOutput));

            if (scratchArc.isFinal()) {
                addOutput(fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput));
            }
        }
    }

    /**
     * Decodes the synonym outputs of a completed lookup key and queues them as
     * clauses for the current DisjunctionMaxQuery: a multi-term synonym becomes
     * a BooleanQuery of single-term DisjunctionMaxQueries, a single-term
     * synonym becomes a plain Term clause.
     */
    private void addOutput(BytesRef bytes) {

        bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

        final int code = bytesReader.readVInt();
        // final boolean keepOrig = (code & 0x1) == 0;
        final int count = code >>> 1;

        Set<DisjunctionMaxClause> adds = addenda.get(currentDmq);
        if (adds == null) {
            adds = new LinkedHashSet<>();
            addenda.put(currentDmq, adds);
        }

        for (int outputIDX = 0; outputIDX < count; outputIDX++) {

            map.words.get(bytesReader.readVInt(), scratchBytes);

            // not re-using scratchChars globally -> would have to copy to Terms anyway
            char[] scratchChars = new char[scratchBytes.length];
            UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);

            BooleanQuery add = null;

            int start = 0;
            for (int i = 0; i < scratchChars.length; i++) {
                if (scratchChars[i] == ' ' && (i > start)) {
                    if (add == null) {
                        add = new BooleanQuery(currentDmq, Occur.SHOULD, true);
                    }
                    DisjunctionMaxQuery newDmq = new DisjunctionMaxQuery(add, Occur.MUST, true);
                    newDmq.addClause(new Term(newDmq,
                            new SimpleComparableCharSequence(scratchChars, start, i - start), true));
                    add.addClause(newDmq);
                    start = i + 1;
                }
            }

            if (add != null) {
                if (start < scratchChars.length) {
                    DisjunctionMaxQuery newDmq = new DisjunctionMaxQuery(add, Occur.MUST, true);
                    newDmq.addClause(new Term(newDmq,
                            new SimpleComparableCharSequence(scratchChars, start, scratchChars.length - start), true));
                    add.addClause(newDmq);
                }
                adds.add(add);
            } else {
                adds.add(new Term(currentDmq,
                        new SimpleComparableCharSequence(scratchChars, 0, scratchChars.length), true));
            }
        }
    }
}
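/*
 * Usage sketch (not part of the original source; a hypothetical driver only):
 * a synonym rewriter is expected to walk the query position by position,
 * announcing each DisjunctionMaxQuery via nextPosition(), feeding its terms
 * via putTerm(), and finally calling apply() to attach the collected synonym
 * clauses. The SynonymMap must encode multi-term lookup keys with a blank as
 * the separator, as assumed by appendToSequences() above; getClauses() and the
 * variable names below are assumptions about the caller, not part of this class.
 *
 *   Sequences sequences = new Sequences(synonymMap);
 *   for (DisjunctionMaxQuery dmq : dmqsInPositionOrder) {
 *       sequences.nextPosition(dmq);
 *       for (DisjunctionMaxClause clause : dmq.getClauses()) {
 *           if (clause instanceof Term) {
 *               sequences.putTerm((Term) clause);
 *           }
 *       }
 *   }
 *   sequences.apply(); // add the synonym clauses collected in 'addenda'
 */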