// MRFDocumentRanker.java example
//
// Explorer
// Ivory-master
// - src
//   - java
/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.smrf.retrieval;

import ivory.core.ConfigurationException;
import ivory.smrf.model.Clique;
import ivory.smrf.model.DocumentNode;
import ivory.smrf.model.GraphNode;
import ivory.smrf.model.MarkovRandomField;

import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

import org.apache.log4j.Logger;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

/**
 * Ranks documents against a {@link MarkovRandomField} using document-at-a-time scoring and
 * keeps only the {@code numResults} highest-scoring documents.
 *
 * <p>Cliques are sorted by their maximum score, and a document is abandoned early as soon as
 * its partial score plus the remaining upper bound can no longer exceed the score currently
 * needed to enter the result set.
 *
 * <p>The ranker reuses a fixed pool of {@link Accumulator} objects across calls. The
 * accumulators returned by {@link #rank()} come from that pool and are overwritten by the
 * next call to {@link #rank()}; callers that need to retain results must copy them first.
 *
 * <p>NOTE(review): no synchronization is visible in this class and it holds mutable scratch
 * state, so a single instance should be assumed unsafe for concurrent use.
 *
 * @author Don Metzler
 */
public class MRFDocumentRanker {
  private static final Logger LOG = Logger.getLogger(MRFDocumentRanker.class);

  // Pool of reusable accumulators. Must always hold at least numResults + 1 entries:
  // numResults for the current best documents plus one scratch accumulator.
  private Accumulator[] accumulators;

  // Accumulators currently in the result set, ordered by Accumulator's natural ordering.
  // NOTE(review): rank() relies on poll() removing the lowest-scoring accumulator first;
  // confirm against Accumulator.compareTo.
  private final PriorityQueue<Accumulator> sortedAccumulators = new PriorityQueue<Accumulator>();

  // Comparator used to sort cliques by their max score.
  private final Comparator<Clique> maxscoreComparator = new Clique.MaxScoreComparator();

  // If defined, only documents within this set will be scored.
  private final int[] docs;

  // Markov Random Field that we are using to generate the ranking.
  private final MarkovRandomField mrf;

  // MRF document nodes.
  private final List<DocumentNode> docNodes;

  // Maximum number of results to return.
  private int numResults;

  /**
   * Creates a ranker that scores every candidate document proposed by the MRF.
   *
   * @param mrf Markov Random Field used to score documents; must not be {@code null}
   * @param numResults maximum number of results to return; must be positive
   */
  public MRFDocumentRanker(MarkovRandomField mrf, int numResults) {
    this(mrf, null, numResults);
  }

  /**
   * Creates a ranker that optionally restricts scoring to a fixed set of documents.
   *
   * @param mrf Markov Random Field used to score documents; must not be {@code null}
   * @param docSet document numbers to score, visited in array order, or {@code null} to score
   *        the candidates proposed by the MRF. The array is not copied.
   * @param numResults maximum number of results to return; must be positive
   * @throws IllegalArgumentException if {@code numResults} is not positive
   * @throws NullPointerException if {@code mrf} is {@code null}
   */
  public MRFDocumentRanker(MarkovRandomField mrf, int[] docSet, int numResults) {
    Preconditions.checkArgument(numResults > 0);
    this.mrf = Preconditions.checkNotNull(mrf);
    this.docs = docSet;
    this.numResults = numResults;
    this.docNodes = getDocNodes();

    // Create single pool of reusable accumulators.
    this.accumulators = new Accumulator[0];
    ensureAccumulatorCapacity(numResults + 1);
  }

  /**
   * Scores the candidate documents and returns the best ones.
   *
   * @return up to {@code numResults} accumulators in the reverse of the order in which the
   *         internal priority queue releases them (presumably best score first), or
   *         {@code null} if the MRF could not be initialized. The returned accumulators are
   *         pooled objects that the next call to this method will overwrite.
   */
  public Accumulator[] rank() {
    // Clear priority queue.
    sortedAccumulators.clear();

    // Cliques associated with the MRF.
    List<Clique> cliques = mrf.getCliques();

    // Current (scratch) accumulator.
    Accumulator a = accumulators[0];

    // Initialize the MRF.
    try {
      mrf.initialize();
    } catch (ConfigurationException e) {
      // Keep the cause in the log so the failure can be diagnosed.
      LOG.error("Error initializing MRF. Aborting ranking!", e);
      return null;
    }

    // Maximum possible score that this MRF can achieve.
    float mrfMaxScore = 0.0f;
    for (Clique c : cliques) {
      mrfMaxScore += c.getMaxScore();
    }

    // Sort cliques according to their max scores.
    Collections.sort(cliques, maxscoreComparator);

    // Score that must be achieved to enter result set.
    double scoreThreshold = Double.NEGATIVE_INFINITY;

    // Offset into document set we're currently at (if applicable).
    int docsetOffset = 0;

    // Integer.MAX_VALUE is the sentinel meaning "no more documents".
    int docno = 0;
    if (docs != null) {
      docno = docsetOffset < docs.length ? docs[docsetOffset++] : Integer.MAX_VALUE;
    } else {
      docno = mrf.getNextCandidate();
    }

    while (docno < Integer.MAX_VALUE) {
      float score = 0.0f;

      // Point every document node at the document being scored.
      for (DocumentNode documentNode : docNodes) {
        documentNode.setDocno(docno);
      }

      // Document-at-a-time scoring.
      float docMaxScore = mrfMaxScore;
      boolean skipped = false;
      for (int i = 0; i < cliques.size(); i++) {
        // Current clique that we're scoring.
        Clique c = cliques.get(i);

        // If there's no way that this document can enter the result set
        // then exit.
        if (score + docMaxScore <= scoreThreshold) {
          // Advance postings readers (but don't score).
          for (int j = i; j < cliques.size(); j++) {
            cliques.get(j).setNextCandidate(docno + 1);
          }
          skipped = true;
          break;
        }

        // Document independent cliques do not affect the ranking.
        if (!c.isDocDependent()) {
          continue;
        }

        // Update document score.
        score += c.getWeight() * c.getPotential();

        // Update the max score for the rest of the cliques.
        docMaxScore -= c.getMaxScore();
      }

      // Keep track of the numResults best accumulators.
      if (!skipped && score > scoreThreshold) {
        a.docno = docno;
        a.score = score;
        sortedAccumulators.add(a);

        if (sortedAccumulators.size() == numResults + 1) {
          // Result set overflowed: evict the head of the queue, recycle it as the scratch
          // accumulator, and raise the threshold to the new head's score.
          a = sortedAccumulators.poll();
          scoreThreshold = sortedAccumulators.peek().score;
        } else {
          // Still filling up: the next unused pool entry becomes the scratch accumulator.
          // The pool is guaranteed to hold numResults + 1 entries.
          a = accumulators[sortedAccumulators.size()];
        }
      }

      if (docs != null) {
        docno = docsetOffset < docs.length ? docs[docsetOffset++] : Integer.MAX_VALUE;
      } else {
        docno = mrf.getNextCandidate();
      }
    }

    // Drain the priority queue, filling the result array from the back so that the
    // accumulator polled first ends up last.
    Accumulator[] results = new Accumulator[Math.min(numResults, sortedAccumulators.size())];
    for (int i = 0; i < results.length; i++) {
      results[results.length - 1 - i] = sortedAccumulators.poll();
    }

    return results;
  }

  /**
   * Returns the Markov Random Field associated with this ranker.
   */
  public MarkovRandomField getMRF() {
    return mrf;
  }

  /**
   * Sets the number of results to return.
   *
   * <p>The accumulator pool is grown when needed so that a later {@link #rank()} call with a
   * larger limit does not index past the end of the pool.
   *
   * @param numResults maximum number of results to return; must be positive
   * @throws IllegalArgumentException if {@code numResults} is not positive
   */
  public void setNumResults(int numResults) {
    Preconditions.checkArgument(numResults > 0);
    this.numResults = numResults;
    ensureAccumulatorCapacity(numResults + 1);
  }

  /**
   * Grows the accumulator pool to at least {@code capacity} entries, keeping the existing
   * accumulator objects and appending fresh ones. Never shrinks the pool.
   */
  private void ensureAccumulatorCapacity(int capacity) {
    if (accumulators.length >= capacity) {
      return;
    }

    Accumulator[] grown = new Accumulator[capacity];
    System.arraycopy(accumulators, 0, grown, 0, accumulators.length);
    for (int i = accumulators.length; i < capacity; i++) {
      grown[i] = new Accumulator(0, 0.0f);
    }
    accumulators = grown;
  }

  /**
   * Collects the nodes of the MRF whose type is {@code GraphNode.Type.DOCUMENT}.
   */
  private List<DocumentNode> getDocNodes() {
    List<DocumentNode> result = Lists.newArrayList();

    // Check which of the nodes are DocumentNodes.
    List<GraphNode> nodes = mrf.getNodes();
    for (GraphNode node : nodes) {
      if (node.getType() == GraphNode.Type.DOCUMENT) {
        result.add((DocumentNode) node);
      }
    }
    return result;
  }
}