package ivory.sqe.retrieval;

import ivory.core.RetrievalEnvironment;
import ivory.core.data.index.Posting;
import ivory.core.data.index.PostingsList;
import ivory.core.data.index.PostingsReader;
import ivory.core.data.index.ProximityPostingsReaderOrderedWindow;
import ivory.smrf.model.GlobalEvidence;
import ivory.smrf.model.GlobalTermEvidence;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import com.google.common.base.Preconditions;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;

public class PostingsReaderWrapper {
  // Default score for potentials with no postings.
  protected static final float DEFAULT_SCORE = 0.0f;

  protected final Posting curPosting = new Posting();
  protected RetrievalEnvironment env;
  protected PostingsReader postingsReader = null;
  protected GlobalTermEvidence gte;
  protected GlobalEvidence ge;
  // Whether or not we're at the end of the postings list.
  protected boolean endOfList = true;
  protected int lastScoredDocno = 0, iterStart = 0, iterStep = 1;
  protected String operator, termOrPhrase;
  protected String[] terms;
  protected JsonArray values;
  protected List<PostingsReaderWrapper> children;
  protected boolean isOOV = false;
  protected float[] weights;

  private final int numDocs;
  private final float avgDocLen;

  public PostingsReaderWrapper(JsonObject query, RetrievalEnvironment env, GlobalEvidence ge) {
    this.operator = query.entrySet().iterator().next().getKey();
    this.values = query.getAsJsonArray(operator);

    if (operator.equals("#weight") || operator.equals("#combweight")) {
      iterStart = 1;
      iterStep = 2;
      weights = new float[values.size() / 2];
      // In a #weight or #combweight structure, even-numbered indices correspond to
      // weights and odd-numbered indices correspond to terms/phrases.
      for (int i = 0; i < values.size(); i = i + iterStep) {
        weights[i / 2] = (float) values.get(i).getAsDouble();
      }
    }

    this.env = Preconditions.checkNotNull(env);
    this.numDocs = (int) env.getDocumentCount();
    // Cast before dividing so the average document length keeps its fractional part.
    this.avgDocLen = (float) env.getCollectionSize() / numDocs;

    // Read first posting.
    endOfList = false;

    // This is a non-leaf node, so create its children.
    children = new ArrayList<PostingsReaderWrapper>();
    for (int i = iterStart; i < values.size(); i = i + iterStep) {
      if (!values.get(i).isJsonPrimitive()) {
        // Child is a JSON object (non-leaf): recurse with the non-leaf constructor.
        children.add(new PostingsReaderWrapper(values.get(i).getAsJsonObject(), env, ge));
      } else {
        // Child is a string (leaf): use the leaf constructor.
        children.add(new PostingsReaderWrapper(values.get(i).getAsString(), env, ge));
      }
    }
    lastScoredDocno = 0;
  }

  public PostingsReaderWrapper(String termOrPhrase, RetrievalEnvironment env, GlobalEvidence ge) {
    this.env = Preconditions.checkNotNull(env);
    this.numDocs = (int) env.getDocumentCount();
    this.avgDocLen = (float) env.getCollectionSize() / numDocs;

    // Read first posting.
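    // A leaf wraps either a single term or a whitespace-delimited phrase. Phrases are
    // scored with an ordered-window proximity reader (window size 2) over the tokens'
    // postings lists; if any token is out of vocabulary, the whole node is marked OOV.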
    endOfList = false;

    // This is a leaf node (single term or phrase), so create its postings list.
    this.termOrPhrase = termOrPhrase;
    terms = termOrPhrase.split("\\s+");
    if (terms.length > 1) {
      operator = "phrase";
      List<PostingsReader> prs = new ArrayList<PostingsReader>();
      for (String term : terms) {
        PostingsList pl = env.getPostingsList(term);
        // If any of the tokens is OOV, then the phrase is considered OOV.
        if (pl == null) {
          isOOV = true;
          endOfList = true;
          return;
        }
        prs.add(pl.getPostingsReader());
      }
      postingsReader = new ProximityPostingsReaderOrderedWindow(prs.toArray(new PostingsReader[0]), 2);
      postingsReader.nextPosting(curPosting);
      gte = new GlobalTermEvidence(env.getDefaultDf(), env.getDefaultCf());
      this.ge = ge;
      lastScoredDocno = 0;
    } else {
      operator = "term";
      PostingsList pl = env.getPostingsList(termOrPhrase);
      if (pl == null) {
        isOOV = true;
        endOfList = true;
      } else {
        postingsReader = pl.getPostingsReader();
        gte = new GlobalTermEvidence(pl.getDf(), pl.getCf());
        this.ge = ge;
        lastScoredDocno = 0;
      }
    }
  }

  public NodeWeight computeScore(int curDocno) {
    NodeWeight score;
    if (isOOV) {
      int docLen = env.getDocumentLength(curDocno);
      score = new TfDfWeight(0, 0, docLen, numDocs, avgDocLen);
    } else if (!isLeaf()) {
      score = runOperator(curDocno);
      lastScoredDocno = curDocno;
    } else {
      // Leaf node. Advance the postings reader. Invariant: curPosting always points
      // to the next posting that has not yet been scored.
      while (!endOfList && postingsReader.getDocno() < curDocno) {
        if (!postingsReader.nextPosting(curPosting)) {
          endOfList = true;
        }
      }

      // Compute the term frequency if the postings list contains this docno; otherwise tf = 0.
      int tf = 0;
      if (curDocno == postingsReader.getDocno()) {
        tf = postingsReader.getTf();
      }

      int docLen = env.getDocumentLength(curDocno);
      score = new TfDfWeight(tf, gte.getDf(), docLen, numDocs, avgDocLen);
      lastScoredDocno = curDocno;
    }
    return score;
  }

  private NodeWeight runOperator(int curDocno) {
    // This is not a leaf node: compute the children's scores and combine them
    // according to the operator.
    NodeWeight[] scores = new NodeWeight[children.size()];
    for (int i = 0; i < children.size(); i++) {
      scores[i] = children.get(i).computeScore(curDocno);
    }
    int docLen = env.getDocumentLength(curDocno);

    NodeWeight resultScore;
    if (operator.equals("#combine")) {
      // Sum the BM25 scores of the children.
      float score = 0f;
      for (int i = 0; i < scores.length; i++) {
        score += scores[i].getScore();
      }
      resultScore = new FloatWeight(score);
    } else if (operator.equals("#weight")) {
      if (scores.length == 0) {
        resultScore = new FloatWeight();
      } else {
        if (scores[0] instanceof TfDfWeight) {
          resultScore = new TfDfWeight(0, 0, docLen, numDocs, avgDocLen);
        } else {
          resultScore = new FloatWeight();
        }
        // (tf, df) = sum_i { weight_i * (tf_i, df_i) }
        for (int i = 0; i < scores.length; i++) {
          resultScore.add(scores[i].multiply(weights[i]));
        }
      }
    } else if (operator.equals("#combweight")) {
      // Weighted sum of the children's BM25 scores.
      float score = 0f;
      for (int i = 0; i < scores.length; i++) {
        score += scores[i].getScore() * weights[i];
      }
      resultScore = new FloatWeight(score);
    } else {
      throw new RuntimeException("Unknown operator: " + operator);
    }
    return resultScore;
  }

  /**
   * @param docno current smallest candidate docno
   * @return next smallest docno from the postings lists of the leaf nodes
   */
  public int getNextCandidate(int docno) {
    if (isOOV) {
      return docno;
    } else if (!isLeaf()) {
      // Not a leaf node: take the smallest unscored candidate over all children.
      for (int i = 0; i < children.size(); i++) {
        int nextDocno = children.get(i).getNextCandidate(docno);
        if (nextDocno != lastScoredDocno && nextDocno < docno) {
          docno = nextDocno;
        }
      }
      return docno;
    } else {
      // Leaf node.
      if (endOfList) {
        return Integer.MAX_VALUE;
      }
      int nextDocno = findNextDocnoWithPositiveTF(operator);
      if (nextDocno == Integer.MAX_VALUE) {
        endOfList = true;
      }
      return nextDocno;
    }
  }

  private int findNextDocnoWithPositiveTF(String operator) {
    // Skip postings with zero term frequency and the posting that was already scored.
    boolean hasMore = true;
    while (hasMore && (postingsReader.getTf() == 0 || postingsReader.getDocno() == lastScoredDocno)) {
      hasMore = postingsReader.nextPosting(curPosting);
    }
    if (hasMore) {
      return postingsReader.getDocno();
    } else {
      return Integer.MAX_VALUE;
    }
  }

  public void reset() {
    endOfList = false;
    lastScoredDocno = -1;
  }

  public void setNextCandidate(int docno) {
    // Advance the postings reader. Invariant: curPosting always points to the next
    // posting that has not yet been scored.
    while (!endOfList && postingsReader.getDocno() < docno) {
      if (!postingsReader.nextPosting(curPosting)) {
        endOfList = true;
      }
    }
  }

  @Override
  public String toString() {
    if (isOOV) {
      return "OOV";
    } else if (!isLeaf()) {
      // Not a leaf node.
      return operator + "::" + values.toString();
    } else {
      return operator + "::" + Arrays.asList(terms).toString();
    }
  }

  private boolean isLeaf() {
    return postingsReader != null;
  }
}
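
// Illustrative driver sketch (not part of Ivory): it only shows the document-at-a-time
// loop that PostingsReaderWrapper is written for, using the methods defined above. The
// class and method names below are hypothetical, and obtaining the RetrievalEnvironment,
// the GlobalEvidence, and the structured-query JsonObject (e.g. {"#combine": ["a", "b"]})
// is assumed to happen elsewhere.
class PostingsReaderWrapperDriverSketch {
  static void scoreAllCandidates(JsonObject query, RetrievalEnvironment env, GlobalEvidence ge) {
    PostingsReaderWrapper root = new PostingsReaderWrapper(query, env, ge);
    root.reset();

    // Repeatedly ask the tree for the smallest unscored docno and score it until every
    // leaf's postings list is exhausted (getNextCandidate then returns Integer.MAX_VALUE).
    int docno = root.getNextCandidate(Integer.MAX_VALUE);
    while (docno < Integer.MAX_VALUE) {
      NodeWeight weight = root.computeScore(docno);
      System.out.println(docno + "\t" + weight.getScore());
      docno = root.getNextCandidate(Integer.MAX_VALUE);
    }
  }
}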