/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.smrf.model.expander;

import ivory.core.ConfigurationException;
import ivory.core.RetrievalEnvironment;
import ivory.core.RetrievalException;
import ivory.core.data.document.IntDocVector;
import ivory.smrf.model.Clique;
import ivory.smrf.model.MarkovRandomField;
import ivory.smrf.model.VocabFrequencyPair;
import ivory.smrf.model.builder.MRFBuilder;
import ivory.smrf.retrieval.Accumulator;
import ivory.smrf.retrieval.MRFDocumentRanker;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.PriorityQueue;

import com.google.common.base.Preconditions;

/**
 * @author Don Metzler
 */
public class NGramLatentConceptExpander extends MRFExpander {
  private List<Integer> gramSizes = null;    // Gram sizes associated with each expansion model.
  private List<Integer> fbDocList = null;    // Number of documents to expand with.
  private List<Integer> fbTermList = null;   // Number of concepts to expand with.
  private List<MRFBuilder> builders = null;  // Builders used to build MRFs from expansion concepts.

  public NGramLatentConceptExpander(RetrievalEnvironment env, List<Integer> gramList,
      List<MRFBuilder> builderList, List<Integer> fbDocsList, List<Integer> fbTermsList) {
    this.env = Preconditions.checkNotNull(env);
    this.gramSizes = Preconditions.checkNotNull(gramList);
    this.builders = Preconditions.checkNotNull(builderList);
    this.fbDocList = Preconditions.checkNotNull(fbDocsList);
    this.fbTermList = Preconditions.checkNotNull(fbTermsList);
  }

  @Override
  public MarkovRandomField getExpandedMRF(MarkovRandomField mrf, Accumulator[] results)
      throws ConfigurationException {
    // Begin constructing the expanded MRF.
    MarkovRandomField expandedMRF = new MarkovRandomField(mrf.getQueryTerms(), env);

    // Add cliques corresponding to the original MRF.
    List<Clique> cliques = mrf.getCliques();
    for (Clique clique : cliques) {
      expandedMRF.addClique(clique);
    }

    // Find the best concepts for each of the expansion models.
    for (int modelNum = 0; modelNum < builders.size(); modelNum++) {
      // Get information about this expansion model.
      int curGramSize = gramSizes.get(modelNum);
      MRFBuilder curBuilder = builders.get(modelNum);
      int curFbDocs = fbDocList.get(modelNum);
      int curFbTerms = fbTermList.get(modelNum);

      // Gather the Accumulators we're actually going to use for feedback purposes.
      Accumulator[] fbResults = new Accumulator[Math.min(results.length, curFbDocs)];
      for (int i = 0; i < fbResults.length; i++) {
        fbResults[i] = results[i];
      }

      // Sort the Accumulators by docno.
      Arrays.sort(fbResults, new Accumulator.DocnoComparator());

      // Get docnos that correspond to the accumulators.
      int[] docSet = Accumulator.accumulatorsToDocnos(fbResults);

      // Get document vectors for the feedback results.
      IntDocVector[] docVecs = env.documentVectors(docSet);

      // Extract vocabulary from results.
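      // Note: getVocabulary(...) is inherited from MRFExpander; it is assumed
      // here to enumerate the n-grams of length curGramSize that occur in the
      // feedback document vectors, paired with their frequencies. These
      // n-grams become the candidate expansion concepts scored below.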
      VocabFrequencyPair[] vocab = null;
      try {
        vocab = getVocabulary(docVecs, curGramSize);
      } catch (IOException e) {
        throw new RuntimeException("Error: Unable to fetch the vocabulary!", e);
      }

      // Priority queue for the concepts associated with this builder.
      PriorityQueue<Accumulator> sortedConcepts = new PriorityQueue<Accumulator>();

      // Score each concept.
      for (int conceptID = 0; conceptID < vocab.length; conceptID++) {
        if (maxCandidates > 0 && conceptID >= maxCandidates) {
          break;
        }

        // The current concept.
        String concept = vocab[conceptID].getKey();
        String[] concepts = concept.split(" ");

        MarkovRandomField conceptMRF = curBuilder.buildMRF(concepts);

        MRFDocumentRanker ranker = new MRFDocumentRanker(conceptMRF, docSet, docSet.length);
        Accumulator[] conceptResults = ranker.rank();
        Arrays.sort(conceptResults, new Accumulator.DocnoComparator());

        float score = 0.0f;
        for (int i = 0; i < conceptResults.length; i++) {
          if (fbResults[i].docno != conceptResults[i].docno) {
            throw new RetrievalException("Error: Mismatch occurred in getExpandedMRF!");
          }
          score += Math.exp(fbResults[i].score + conceptResults[i].score);
        }

        int size = sortedConcepts.size();
        if (size < curFbTerms || sortedConcepts.peek().score < score) {
          if (size == curFbTerms) {
            sortedConcepts.poll(); // Remove the worst concept.
          }
          sortedConcepts.add(new Accumulator(conceptID, score));
        }
      }

      // Compute the weights of the expanded terms.
      int numTerms = Math.min(curFbTerms, sortedConcepts.size());
      float totalWt = 0.0f;
      Accumulator[] bestConcepts = new Accumulator[numTerms];
      for (int i = 0; i < numTerms; i++) {
        Accumulator a = sortedConcepts.poll();
        bestConcepts[i] = a;
        totalWt += a.score;
      }

      // Add cliques corresponding to the best expansion concepts.
      for (int i = 0; i < numTerms; i++) {
        Accumulator a = bestConcepts[i];

        // Construct the MRF corresponding to this concept.
        String[] concepts = vocab[a.docno].getKey().split(" ");
        MarkovRandomField conceptMRF = curBuilder.buildMRF(concepts);

        // Normalized score.
        float normalizedScore = a.score / totalWt;

        // Add cliques.
        cliques = conceptMRF.getCliques();
        for (Clique c : cliques) {
          if (c.isDocDependent() && c.getWeight() != 0.0) {
            c.setImportance(normalizedScore * c.getImportance());
            expandedMRF.addClique(c);
          }
        }
      }
    }

    return expandedMRF;
  }
}
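
// A minimal usage sketch (an illustration, not part of the original source).
// It assumes a RetrievalEnvironment (env), a concrete MRFBuilder (builder),
// an initial MarkovRandomField (mrf), and the Accumulator[] produced by a
// first-pass retrieval run (results) were all obtained elsewhere; only the
// constructor and getExpandedMRF(...) defined above are exercised:
//
//   List<Integer> gramSizes = Arrays.asList(1);     // unigram concepts
//   List<MRFBuilder> builders = Arrays.asList(builder);
//   List<Integer> fbDocs = Arrays.asList(10);       // feedback on the top 10 docs
//   List<Integer> fbTerms = Arrays.asList(20);      // keep the 20 best concepts
//
//   NGramLatentConceptExpander expander =
//       new NGramLatentConceptExpander(env, gramSizes, builders, fbDocs, fbTerms);
//   MarkovRandomField expanded = expander.getExpandedMRF(mrf, results);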