/*
* Ivory: A Hadoop toolkit for web-scale information retrieval
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package ivory.smrf.retrieval;
import ivory.core.ConfigurationException;
import ivory.smrf.model.Clique;
import ivory.smrf.model.DocumentNode;
import ivory.smrf.model.GraphNode;
import ivory.smrf.model.MarkovRandomField;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import org.apache.log4j.Logger;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
/**
 * Document-at-a-time ranker over a {@link MarkovRandomField}.
 *
 * <p>Each candidate document is scored by summing {@code weight * potential} over the
 * document-dependent cliques of the MRF, and the best {@code numResults} documents are kept
 * in a priority queue. Scoring of a document is abandoned early once the score accumulated
 * so far plus the remaining clique max scores can no longer exceed the current entry
 * threshold.
 *
 * <p>Accumulators are drawn from a pool owned by this ranker and are reused across calls to
 * {@link #rank()}; results returned by one call are overwritten by the next one.
 *
 * @author Don Metzler
 */
public class MRFDocumentRanker {
  private static final Logger LOG = Logger.getLogger(MRFDocumentRanker.class);

  // Pool of reusable accumulators; always holds at least numResults + 1 entries so that one
  // spare accumulator is available while the queue is full.
  private Accumulator[] accumulators = null;

  // Accumulators currently in the result set, in Accumulator's natural order.
  private final PriorityQueue<Accumulator> sortedAccumulators = new PriorityQueue<Accumulator>();

  // Comparator used to sort cliques by their max score.
  private final Comparator<Clique> maxscoreComparator = new Clique.MaxScoreComparator();

  // If defined, only documents within this set will be scored.
  private final int[] docs;

  // Markov Random Field that we are using to generate the ranking.
  private final MarkovRandomField mrf;

  // MRF document nodes.
  private final List<DocumentNode> docNodes;

  // Maximum number of results to return.
  private int numResults;

  /**
   * Creates a ranker that scores every candidate produced by the MRF.
   *
   * @param mrf the Markov Random Field to rank with; must not be {@code null}
   * @param numResults maximum number of results to return; must be positive
   */
  public MRFDocumentRanker(MarkovRandomField mrf, int numResults) {
    this(mrf, null, numResults);
  }

  /**
   * Creates a ranker that is optionally restricted to a fixed set of documents.
   *
   * @param mrf the Markov Random Field to rank with; must not be {@code null}
   * @param docSet docnos to score, visited in array order, or {@code null} to score the
   *        candidates produced by the MRF. The array is stored without copying. An entry
   *        equal to {@link Integer#MAX_VALUE} ends the ranking at that position.
   * @param numResults maximum number of results to return; must be positive
   * @throws IllegalArgumentException if {@code numResults} is not positive
   * @throws NullPointerException if {@code mrf} is {@code null}
   */
  public MRFDocumentRanker(MarkovRandomField mrf, int[] docSet, int numResults) {
    Preconditions.checkArgument(numResults > 0);
    this.mrf = Preconditions.checkNotNull(mrf);
    this.docs = docSet;
    this.numResults = numResults;
    this.docNodes = getDocNodes();

    // Create single pool of reusable accumulators.
    ensureAccumulatorCapacity(numResults + 1);
  }

  /**
   * Ranks documents and returns the best ones found.
   *
   * <p>Side effects: initializes the MRF, sorts the list returned by
   * {@code mrf.getCliques()} in place, sets the docno on every document node, and advances
   * clique candidates for documents that are pruned.
   *
   * @return at most {@code numResults} accumulators, filled from the back of the array as
   *         they are polled from the queue (so, assuming {@code Accumulator} orders by
   *         ascending score, best first); or {@code null} if the MRF could not be
   *         initialized. The returned accumulators belong to this ranker's pool and are
   *         overwritten by the next call.
   */
  public Accumulator[] rank() {
    // Clear priority queue.
    sortedAccumulators.clear();

    // Cliques associated with the MRF.
    List<Clique> cliques = mrf.getCliques();

    // Current (free) accumulator.
    Accumulator a = accumulators[0];

    // Initialize the MRF.
    try {
      mrf.initialize();
    } catch (ConfigurationException e) {
      // Keep the cause in the log so the failure can be diagnosed.
      LOG.error("Error initializing MRF. Aborting ranking!", e);
      return null;
    }

    // Maximum possible score that this MRF can achieve.
    float mrfMaxScore = 0.0f;
    for (Clique c : cliques) {
      mrfMaxScore += c.getMaxScore();
    }

    // Sort cliques according to their max scores.
    Collections.sort(cliques, maxscoreComparator);

    // Score that must be achieved to enter result set.
    double scoreThreshold = Double.NEGATIVE_INFINITY;

    // Offset into document set we're currently at (only meaningful when docs != null).
    int docsetOffset = 0;
    int docno = nextDocno(docsetOffset);

    while (docno < Integer.MAX_VALUE) {
      float score = 0.0f;

      for (DocumentNode documentNode : docNodes) {
        documentNode.setDocno(docno);
      }

      // Document-at-a-time scoring.
      float docMaxScore = mrfMaxScore;
      boolean skipped = false;

      for (int i = 0; i < cliques.size(); i++) {
        // Current clique that we're scoring.
        Clique c = cliques.get(i);

        // If there's no way that this document can enter the result set
        // then exit.
        if (score + docMaxScore <= scoreThreshold) {
          // Advance postings readers (but don't score).
          for (int j = i; j < cliques.size(); j++) {
            cliques.get(j).setNextCandidate(docno + 1);
          }
          skipped = true;
          break;
        }

        // Document independent cliques do not affect the ranking.
        if (!c.isDocDependent()) {
          continue;
        }

        // Update document score.
        score += c.getWeight() * c.getPotential();

        // Update the max score for the rest of the cliques.
        docMaxScore -= c.getMaxScore();
      }

      // Keep track of the numResults best accumulators.
      if (!skipped && score > scoreThreshold) {
        a.docno = docno;
        a.score = score;
        sortedAccumulators.add(a);

        if (sortedAccumulators.size() == numResults + 1) {
          // Queue overflowed: evict its head and recycle it as the next free accumulator.
          // The new head defines the score a document must beat from now on.
          a = sortedAccumulators.poll();
          scoreThreshold = sortedAccumulators.peek().score;
        } else {
          // Queue not yet full: the pool slot at index size() has not been handed out.
          a = accumulators[sortedAccumulators.size()];
        }
      }

      docsetOffset++;
      docno = nextDocno(docsetOffset);
    }

    // Grab the accumulators off the queue, in (reverse) order.
    Accumulator[] results = new Accumulator[Math.min(numResults, sortedAccumulators.size())];
    for (int i = 0; i < results.length; i++) {
      results[results.length - 1 - i] = sortedAccumulators.poll();
    }

    return results;
  }

  /**
   * Returns the Markov Random Field associated with this ranker.
   */
  public MarkovRandomField getMRF() {
    return mrf;
  }

  /**
   * Sets the number of results to return.
   *
   * <p>The accumulator pool is grown when needed so that a later {@link #rank()} call can
   * hold {@code numResults + 1} accumulators; previously a larger value than the one given
   * at construction made {@code rank()} index past the end of the pool.
   *
   * @param numResults maximum number of results to return; must be positive
   * @throws IllegalArgumentException if {@code numResults} is not positive
   */
  public void setNumResults(int numResults) {
    Preconditions.checkArgument(numResults > 0);
    this.numResults = numResults;
    ensureAccumulatorCapacity(numResults + 1);
  }

  // Returns the docno to score next: the entry of the document set at the given offset when
  // a document set is defined (Integer.MAX_VALUE once the set is exhausted), otherwise the
  // next candidate reported by the MRF. The offset is ignored when no document set is defined.
  private int nextDocno(int docsetOffset) {
    if (docs != null) {
      return docsetOffset < docs.length ? docs[docsetOffset] : Integer.MAX_VALUE;
    }
    return mrf.getNextCandidate();
  }

  // Grows the accumulator pool to at least the given capacity, keeping the accumulators that
  // already exist. The pool never shrinks.
  private void ensureAccumulatorCapacity(int capacity) {
    int existing = accumulators == null ? 0 : accumulators.length;
    if (existing >= capacity) {
      return;
    }

    Accumulator[] grown = new Accumulator[capacity];
    if (existing > 0) {
      System.arraycopy(accumulators, 0, grown, 0, existing);
    }
    for (int i = existing; i < capacity; i++) {
      grown[i] = new Accumulator(0, 0.0f);
    }
    accumulators = grown;
  }

  // Collects the nodes of the MRF whose type is DOCUMENT.
  private List<DocumentNode> getDocNodes() {
    List<DocumentNode> docNodes = Lists.newArrayList();

    // Check which of the nodes are DocumentNodes.
    List<GraphNode> nodes = mrf.getNodes();
    for (GraphNode node : nodes) {
      if (node.getType() == GraphNode.Type.DOCUMENT) {
        docNodes.add((DocumentNode) node);
      }
    }
    return docNodes;
  }
}