/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.smrf.model.expander;

import ivory.core.ConfigurationException;
import ivory.core.RetrievalEnvironment;
import ivory.core.RetrievalException;
import ivory.core.data.document.IntDocVector;
import ivory.smrf.model.Clique;
import ivory.smrf.model.MarkovRandomField;
import ivory.smrf.model.VocabFrequencyPair;
import ivory.smrf.model.builder.MRFBuilder;
import ivory.smrf.retrieval.Accumulator;
import ivory.smrf.retrieval.MRFDocumentRanker;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.PriorityQueue;

import com.google.common.base.Preconditions;

/**
 * @author Don Metzler
 */
public class NGramLatentConceptExpander extends MRFExpander {
  private List<Integer> gramSizes = null;    // Gram sizes associated with each expansion model.
  private List<Integer> fbDocList = null;    // Number of documents to expand with.
  private List<Integer> fbTermList = null;   // Number of concepts to expand with.
  private List<MRFBuilder> builders = null;  // Builders used to build MRFs from expansion concepts.

  public NGramLatentConceptExpander(RetrievalEnvironment env, List<Integer> gramList,
      List<MRFBuilder> builderList, List<Integer> fbDocsList, List<Integer> fbTermsList) {
    this.env = Preconditions.checkNotNull(env);
    this.gramSizes = Preconditions.checkNotNull(gramList);
    this.builders = Preconditions.checkNotNull(builderList);
    this.fbDocList = Preconditions.checkNotNull(fbDocsList);
    this.fbTermList = Preconditions.checkNotNull(fbTermsList);
  }

  @Override
  public MarkovRandomField getExpandedMRF(MarkovRandomField mrf, Accumulator[] results)
      throws ConfigurationException {
    // Begin constructing the expanded MRF.
    MarkovRandomField expandedMRF = new MarkovRandomField(mrf.getQueryTerms(), env);

    // Add cliques corresponding to the original MRF.
    List<Clique> cliques = mrf.getCliques();
    for (Clique clique : cliques) {
      expandedMRF.addClique(clique);
    }

    // Find the best concepts for each of the expansion models.
    for (int modelNum = 0; modelNum < builders.size(); modelNum++) {
      // Get information about this expansion model.
      int curGramSize = gramSizes.get(modelNum);
      MRFBuilder curBuilder = builders.get(modelNum);
      int curFbDocs = fbDocList.get(modelNum);
      int curFbTerms = fbTermList.get(modelNum);

      // Gather the Accumulators we're actually going to use for feedback purposes.
      Accumulator[] fbResults = new Accumulator[Math.min(results.length, curFbDocs)];
      for (int i = 0; i < fbResults.length; i++) {
        fbResults[i] = results[i];
      }

      // Sort the Accumulators by docno.
      Arrays.sort(fbResults, new Accumulator.DocnoComparator());

      // Get docnos that correspond to the accumulators.
      int[] docSet = Accumulator.accumulatorsToDocnos(fbResults);

      // Get document vectors for the feedback results.
      IntDocVector[] docVecs = env.documentVectors(docSet);

      // Extract vocabulary from results.
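      // Note: getVocabulary(...) is inherited from MRFExpander; it is assumed
      // here to enumerate the n-grams of length curGramSize that occur in the
      // feedback document vectors, paired with their frequencies. These
      // n-grams become the candidate expansion concepts scored below.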
      VocabFrequencyPair[] vocab = null;
      try {
        vocab = getVocabulary(docVecs, curGramSize);
      } catch (IOException e) {
        throw new RuntimeException("Error: Unable to fetch the vocabulary!", e);
      }

      // Priority queue for the concepts associated with this builder.
      PriorityQueue<Accumulator> sortedConcepts = new PriorityQueue<Accumulator>();

      // Score each concept.
      for (int conceptID = 0; conceptID < vocab.length; conceptID++) {
        if (maxCandidates > 0 && conceptID >= maxCandidates) {
          break;
        }

        // The current concept.
        String concept = vocab[conceptID].getKey();
        String[] concepts = concept.split(" ");

        MarkovRandomField conceptMRF = curBuilder.buildMRF(concepts);

        MRFDocumentRanker ranker = new MRFDocumentRanker(conceptMRF, docSet, docSet.length);
        Accumulator[] conceptResults = ranker.rank();
        Arrays.sort(conceptResults, new Accumulator.DocnoComparator());

        float score = 0.0f;
        for (int i = 0; i < conceptResults.length; i++) {
          if (fbResults[i].docno != conceptResults[i].docno) {
            throw new RetrievalException("Error: Mismatch occurred in getExpandedMRF!");
          }
          score += Math.exp(fbResults[i].score + conceptResults[i].score);
        }

        int size = sortedConcepts.size();
        if (size < curFbTerms || sortedConcepts.peek().score < score) {
          if (size == curFbTerms) {
            sortedConcepts.poll(); // Remove the worst concept.
          }
          sortedConcepts.add(new Accumulator(conceptID, score));
        }
      }

      // Compute the weights of the expanded terms.
      int numTerms = Math.min(curFbTerms, sortedConcepts.size());
      float totalWt = 0.0f;
      Accumulator[] bestConcepts = new Accumulator[numTerms];
      for (int i = 0; i < numTerms; i++) {
        Accumulator a = sortedConcepts.poll();
        bestConcepts[i] = a;
        totalWt += a.score;
      }

      // Add cliques corresponding to the best expansion concepts.
      for (int i = 0; i < numTerms; i++) {
        Accumulator a = bestConcepts[i];

        // Construct the MRF corresponding to this concept.
        String[] concepts = vocab[a.docno].getKey().split(" ");
        MarkovRandomField conceptMRF = curBuilder.buildMRF(concepts);

        // Normalized score.
        float normalizedScore = a.score / totalWt;

        // Add cliques.
        cliques = conceptMRF.getCliques();
        for (Clique c : cliques) {
          if (c.isDocDependent() && c.getWeight() != 0.0) {
            c.setImportance(normalizedScore * c.getImportance());
            expandedMRF.addClique(c);
          }
        }
      }
    }

    return expandedMRF;
  }
}
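
// A minimal usage sketch (an illustration, not part of the original source).
// It assumes a RetrievalEnvironment (env), a concrete MRFBuilder (builder),
// an initial MarkovRandomField (mrf), and the Accumulator[] produced by a
// first-pass retrieval run (results) were all obtained elsewhere; only the
// constructor and getExpandedMRF(...) defined above are exercised:
//
//   List<Integer> gramSizes = Arrays.asList(1);     // unigram concepts
//   List<MRFBuilder> builders = Arrays.asList(builder);
//   List<Integer> fbDocs = Arrays.asList(10);       // feedback on the top 10 docs
//   List<Integer> fbTerms = Arrays.asList(20);      // keep the 20 best concepts
//
//   NGramLatentConceptExpander expander =
//       new NGramLatentConceptExpander(env, gramSizes, builders, fbDocs, fbTerms);
//   MarkovRandomField expanded = expander.getExpandedMRF(mrf, results);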