/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.smrf.model.expander;

import ivory.core.ConfigurationException;
import ivory.core.RetrievalEnvironment;
import ivory.core.RetrievalException;
import ivory.core.data.document.IntDocVector;
import ivory.core.data.index.PostingsList;
import ivory.core.data.index.PostingsReader;
import ivory.core.util.XMLTools;
import ivory.smrf.model.Clique;
import ivory.smrf.model.DocumentNode;
import ivory.smrf.model.GlobalEvidence;
import ivory.smrf.model.GlobalTermEvidence;
import ivory.smrf.model.GraphNode;
import ivory.smrf.model.MarkovRandomField;
import ivory.smrf.model.Parameter;
import ivory.smrf.model.TermNode;
import ivory.smrf.model.VocabFrequencyPair;
import ivory.smrf.model.builder.Expression;
import ivory.smrf.model.builder.ExpressionGenerator;
import ivory.smrf.model.builder.TermExpressionGenerator;
import ivory.smrf.model.importance.ConceptImportanceModel;
import ivory.smrf.model.potential.PotentialFunction;
import ivory.smrf.model.potential.QueryPotential;
import ivory.smrf.model.score.ScoringFunction;
import ivory.smrf.retrieval.Accumulator;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;

import org.w3c.dom.Node;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

/**
 * MRF expander that performs unigram latent concept expansion (LCE): candidate expansion
 * terms are scored over the top-ranked feedback documents and the best ones are added back
 * to the query MRF as weighted term cliques.
 *
 * @author Don Metzler
 * @author Lidan Wang
 */
public class UnigramLatentConceptExpander extends MRFExpander {
  private List<Parameter> parameters = null;
  private List<Node> scoringFunctionNodes = null;
  private List<ConceptImportanceModel> importanceModels = null;

  private final GlobalTermEvidence termEvidence = new GlobalTermEvidence();

  public UnigramLatentConceptExpander(RetrievalEnvironment env, int fbDocs, int fbTerms,
      float expanderWeight, List<Parameter> params, List<Node> scoringFunctionNodes,
      List<ConceptImportanceModel> importanceModels) {
    this.env = Preconditions.checkNotNull(env);
    this.numFeedbackDocs = Preconditions.checkNotNull(fbDocs);
    this.numFeedbackTerms = Preconditions.checkNotNull(fbTerms);
    this.expanderWeight = Preconditions.checkNotNull(expanderWeight);
    this.parameters = Preconditions.checkNotNull(params);
    this.scoringFunctionNodes = Preconditions.checkNotNull(scoringFunctionNodes);
    this.importanceModels = importanceModels;
  }

  @Override
  public MarkovRandomField getExpandedMRF(MarkovRandomField mrf, Accumulator[] results)
      throws ConfigurationException {
    Preconditions.checkNotNull(mrf);
    Preconditions.checkNotNull(results);

    // Begin constructing the expanded MRF.
    MarkovRandomField expandedMRF = new MarkovRandomField(mrf.getQueryTerms(), env);

    // Add cliques corresponding to original MRF.
    List<Clique> cliques = mrf.getCliques();
    for (Clique clique : cliques) {
      expandedMRF.addClique(clique);
    }

    // Get MRF global evidence.
    GlobalEvidence globalEvidence = mrf.getGlobalEvidence();

    // Gather Accumulators we're actually going to use for feedback purposes.
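    // The remainder of this method carries out the expansion proper: (1) take the top
    // numFeedbackDocs results as feedback documents, (2) score every term in their combined
    // vocabulary under each configured scoring function (optionally reweighted by a
    // ConceptImportanceModel), (3) keep the numFeedbackTerms highest-scoring concepts, and
    // (4) add each as a new term clique whose importance is the expander weight times its
    // normalized concept score.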
    Accumulator[] fbResults = new Accumulator[Math.min(results.length, numFeedbackDocs)];
    for (int i = 0; i < Math.min(results.length, numFeedbackDocs); i++) {
      fbResults[i] = results[i];
    }

    // Sort the Accumulators by docid.
    Arrays.sort(fbResults, new Accumulator.DocnoComparator());

    // Get docids that correspond to the accumulators.
    int[] docSet = Accumulator.accumulatorsToDocnos(fbResults);

    // Get document vectors for results.
    IntDocVector[] docVecs = env.documentVectors(docSet);

    // Extract tf and doclen information from document vectors.
    TfDoclengthStatistics stats = null;
    try {
      stats = getTfDoclengthStatistics(docVecs);
    } catch (IOException e) {
      throw new RetrievalException(
          "Error: Unable to extract tf and doclen information from document vectors!");
    }
    VocabFrequencyPair[] vocab = stats.getVocab();
    Map<String, Short>[] tfs = stats.getTfs();
    int[] doclens = stats.getDoclens();

    // Priority queue for the concepts associated with this builder.
    PriorityQueue<Accumulator> sortedConcepts = new PriorityQueue<Accumulator>();

    // Create scoring functions.
    ScoringFunction[] scoringFunctions = new ScoringFunction[scoringFunctionNodes.size()];
    for (int i = 0; i < scoringFunctionNodes.size(); i++) {
      Node functionNode = scoringFunctionNodes.get(i);
      String functionType = XMLTools.getAttributeValueOrThrowException(functionNode,
          "scoreFunction", "conceptscore node must specify a scorefunction attribute!");
      scoringFunctions[i] = ScoringFunction.create(functionType, functionNode);
    }

    // Score each concept.
    for (int conceptID = 0; conceptID < vocab.length; conceptID++) {
      if (maxCandidates > 0 && conceptID >= maxCandidates) {
        break;
      }

      // The current concept.
      String concept = vocab[conceptID].getKey();

      // Get df and cf information for the concept.
      PostingsReader reader = env.getPostingsReader(new Expression(concept));
      if (reader == null) {
        continue;
      }
      PostingsList list = reader.getPostingsList();
      int df = list.getDf();
      long cf = list.getCf();
      env.clearPostingsReaderCache();

      // Construct concept evidence.
      termEvidence.set(df, cf);

      // Score the concept.
      float score = 0.0f;
      for (int i = 0; i < fbResults.length; i++) {
        float docScore = 0.0f;
        for (int j = 0; j < scoringFunctions.length; j++) {
          float weight = parameters.get(j).getWeight();
          ConceptImportanceModel importanceModel = importanceModels.get(j);
          if (importanceModel != null) {
            weight *= importanceModel.getConceptWeight(concept);
          }

          ScoringFunction fn = scoringFunctions[j];
          fn.initialize(termEvidence, globalEvidence);

          Short tf = tfs[i].get(vocab[conceptID].getKey());
          if (tf == null) {
            tf = 0;
          }
          float s = fn.getScore(tf, doclens[i]);
          docScore += weight * s;
        }
        score += Math.exp(fbResults[i].score + docScore);
      }

      int size = sortedConcepts.size();
      if (size < numFeedbackTerms || sortedConcepts.peek().score < score) {
        if (size == numFeedbackTerms) {
          sortedConcepts.poll(); // Remove worst concept.
        }
        sortedConcepts.add(new Accumulator(conceptID, score));
      }
    }

    // Compute the weights of the expanded terms.
    int numTerms = Math.min(numFeedbackTerms, sortedConcepts.size());
    float totalWt = 0.0f;
    Accumulator[] bestConcepts = new Accumulator[numTerms];
    for (int i = 0; i < numTerms; i++) {
      Accumulator a = sortedConcepts.poll();
      bestConcepts[i] = a;
      totalWt += a.score;
    }

    // Document node (shared across all expansion cliques).
    DocumentNode docNode = new DocumentNode();

    // Expression generator (shared across all expansion cliques).
    ExpressionGenerator generator = new TermExpressionGenerator();

    // Add cliques corresponding to best expansion concepts.
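    // Each surviving concept contributes one clique per configured scoring function. The
    // clique importance is expanderWeight * (conceptScore / totalWt), i.e., the expansion
    // weight times the concept's share of the total mass of the selected concepts, further
    // scaled by the concept importance model when one is configured.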
    for (int i = 0; i < numTerms; i++) {
      Accumulator a = bestConcepts[i];

      // Construct the MRF corresponding to this concept.
      String concept = vocab[a.docno].getKey();

      for (int j = 0; j < scoringFunctionNodes.size(); j++) {
        Node functionNode = scoringFunctionNodes.get(j);
        String functionType = XMLTools.getAttributeValue(functionNode, "scoreFunction", null);
        ScoringFunction fn = ScoringFunction.create(functionType, functionNode);

        Parameter parameter = parameters.get(j);
        ConceptImportanceModel importanceModel = importanceModels.get(j);

        List<GraphNode> cliqueNodes = Lists.newArrayList();
        cliqueNodes.add(docNode);

        TermNode termNode = new TermNode(concept);
        cliqueNodes.add(termNode);

        PotentialFunction potential = new QueryPotential(env, generator, fn);

        Clique c = new Clique(cliqueNodes, potential, parameter);
        c.setType(Clique.Type.Term);

        // Scale importance values by LCE likelihood.
        float normalizedScore = expanderWeight * (a.score / totalWt);
        if (importanceModel != null) {
          c.setImportance(normalizedScore * importanceModel.getCliqueWeight(c));
        } else {
          c.setImportance(normalizedScore);
        }

        expandedMRF.addClique(c);
      }
    }

    return expandedMRF;
  }
}
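
/**
 * A minimal usage sketch, not part of the original Ivory source: it illustrates how the
 * expander above might be invoked once an initial retrieval run has produced a ranked
 * Accumulator list. The feedback-document count (10), feedback-term count (20), and
 * expansion weight (0.5f) are illustrative values only, and the parameter list,
 * scoring-function DOM nodes, and importance models are assumed to be parallel lists
 * supplied by the caller's model configuration (one entry per concept-score specification).
 */
class UnigramLatentConceptExpanderUsageSketch {
  static MarkovRandomField expandQueryModel(RetrievalEnvironment env, MarkovRandomField mrf,
      Accumulator[] initialResults, List<Parameter> params, List<Node> scoringFunctionNodes,
      List<ConceptImportanceModel> importanceModels) throws ConfigurationException {
    // Construct the expander with illustrative feedback settings.
    UnigramLatentConceptExpander expander = new UnigramLatentConceptExpander(
        env, 10, 20, 0.5f, params, scoringFunctionNodes, importanceModels);

    // Build an expanded MRF from the original query MRF and the initial ranked results;
    // the expanded MRF can then be scored in place of the original one.
    return expander.getExpandedMRF(mrf, initialResults);
  }
}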