/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.smrf.model.expander;

import ivory.core.ConfigurationException;
import ivory.core.RetrievalEnvironment;
import ivory.core.RetrievalException;
import ivory.core.data.document.IntDocVector;
import ivory.core.data.index.PostingsList;
import ivory.core.data.index.PostingsReader;
import ivory.core.util.XMLTools;
import ivory.smrf.model.Clique;
import ivory.smrf.model.DocumentNode;
import ivory.smrf.model.GlobalEvidence;
import ivory.smrf.model.GlobalTermEvidence;
import ivory.smrf.model.GraphNode;
import ivory.smrf.model.MarkovRandomField;
import ivory.smrf.model.Parameter;
import ivory.smrf.model.TermNode;
import ivory.smrf.model.VocabFrequencyPair;
import ivory.smrf.model.builder.Expression;
import ivory.smrf.model.builder.ExpressionGenerator;
import ivory.smrf.model.builder.TermExpressionGenerator;
import ivory.smrf.model.importance.ConceptImportanceModel;
import ivory.smrf.model.potential.PotentialFunction;
import ivory.smrf.model.potential.QueryPotential;
import ivory.smrf.model.score.ScoringFunction;
import ivory.smrf.retrieval.Accumulator;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;

import org.w3c.dom.Node;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

/**
 * MRF expander that performs unigram latent concept expansion (LCE): candidate expansion
 * terms are scored over the top-ranked feedback documents and the best ones are added back
 * to the query MRF as weighted term cliques.
 *
 * @author Don Metzler
 * @author Lidan Wang
 */
public class UnigramLatentConceptExpander extends MRFExpander {
  private List<Parameter> parameters = null;
  private List<Node> scoringFunctionNodes = null;
  private List<ConceptImportanceModel> importanceModels = null;

  private final GlobalTermEvidence termEvidence = new GlobalTermEvidence();

  public UnigramLatentConceptExpander(RetrievalEnvironment env, int fbDocs, int fbTerms,
      float expanderWeight, List<Parameter> params, List<Node> scoringFunctionNodes,
      List<ConceptImportanceModel> importanceModels) {
    this.env = Preconditions.checkNotNull(env);
    this.numFeedbackDocs = Preconditions.checkNotNull(fbDocs);
    this.numFeedbackTerms = Preconditions.checkNotNull(fbTerms);
    this.expanderWeight = Preconditions.checkNotNull(expanderWeight);
    this.parameters = Preconditions.checkNotNull(params);
    this.scoringFunctionNodes = Preconditions.checkNotNull(scoringFunctionNodes);
    this.importanceModels = importanceModels;
  }

  @Override
  public MarkovRandomField getExpandedMRF(MarkovRandomField mrf, Accumulator[] results)
      throws ConfigurationException {
    Preconditions.checkNotNull(mrf);
    Preconditions.checkNotNull(results);

    // Begin constructing the expanded MRF.
    MarkovRandomField expandedMRF = new MarkovRandomField(mrf.getQueryTerms(), env);

    // Add cliques corresponding to original MRF.
    List<Clique> cliques = mrf.getCliques();
    for (Clique clique : cliques) {
      expandedMRF.addClique(clique);
    }

    // Get MRF global evidence.
    GlobalEvidence globalEvidence = mrf.getGlobalEvidence();

    // Gather Accumulators we're actually going to use for feedback purposes.
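    // The remainder of this method carries out the expansion proper: (1) take the top
    // numFeedbackDocs results as feedback documents, (2) score every term in their combined
    // vocabulary under each configured scoring function (optionally reweighted by a
    // ConceptImportanceModel), (3) keep the numFeedbackTerms highest-scoring concepts, and
    // (4) add each as a new term clique whose importance is the expander weight times its
    // normalized concept score.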
    Accumulator[] fbResults = new Accumulator[Math.min(results.length, numFeedbackDocs)];
    for (int i = 0; i < Math.min(results.length, numFeedbackDocs); i++) {
      fbResults[i] = results[i];
    }

    // Sort the Accumulators by docid.
    Arrays.sort(fbResults, new Accumulator.DocnoComparator());

    // Get docids that correspond to the accumulators.
    int[] docSet = Accumulator.accumulatorsToDocnos(fbResults);

    // Get document vectors for results.
    IntDocVector[] docVecs = env.documentVectors(docSet);

    // Extract tf and doclen information from document vectors.
    TfDoclengthStatistics stats = null;
    try {
      stats = getTfDoclengthStatistics(docVecs);
    } catch (IOException e) {
      throw new RetrievalException(
          "Error: Unable to extract tf and doclen information from document vectors!");
    }
    VocabFrequencyPair[] vocab = stats.getVocab();
    Map<String, Short>[] tfs = stats.getTfs();
    int[] doclens = stats.getDoclens();

    // Priority queue for the concepts associated with this builder.
    PriorityQueue<Accumulator> sortedConcepts = new PriorityQueue<Accumulator>();

    // Create scoring functions.
    ScoringFunction[] scoringFunctions = new ScoringFunction[scoringFunctionNodes.size()];
    for (int i = 0; i < scoringFunctionNodes.size(); i++) {
      Node functionNode = scoringFunctionNodes.get(i);
      String functionType = XMLTools.getAttributeValueOrThrowException(functionNode,
          "scoreFunction", "conceptscore node must specify a scorefunction attribute!");
      scoringFunctions[i] = ScoringFunction.create(functionType, functionNode);
    }

    // Score each concept.
    for (int conceptID = 0; conceptID < vocab.length; conceptID++) {
      if (maxCandidates > 0 && conceptID >= maxCandidates) {
        break;
      }

      // The current concept.
      String concept = vocab[conceptID].getKey();

      // Get df and cf information for the concept.
      PostingsReader reader = env.getPostingsReader(new Expression(concept));
      if (reader == null) {
        continue;
      }
      PostingsList list = reader.getPostingsList();
      int df = list.getDf();
      long cf = list.getCf();
      env.clearPostingsReaderCache();

      // Construct concept evidence.
      termEvidence.set(df, cf);

      // Score the concept.
      float score = 0.0f;
      for (int i = 0; i < fbResults.length; i++) {
        float docScore = 0.0f;
        for (int j = 0; j < scoringFunctions.length; j++) {
          float weight = parameters.get(j).getWeight();
          ConceptImportanceModel importanceModel = importanceModels.get(j);
          if (importanceModel != null) {
            weight *= importanceModel.getConceptWeight(concept);
          }

          ScoringFunction fn = scoringFunctions[j];
          fn.initialize(termEvidence, globalEvidence);

          Short tf = tfs[i].get(vocab[conceptID].getKey());
          if (tf == null) {
            tf = 0;
          }
          float s = fn.getScore(tf, doclens[i]);
          docScore += weight * s;
        }
        score += Math.exp(fbResults[i].score + docScore);
      }

      int size = sortedConcepts.size();
      if (size < numFeedbackTerms || sortedConcepts.peek().score < score) {
        if (size == numFeedbackTerms) {
          sortedConcepts.poll(); // Remove worst concept.
        }
        sortedConcepts.add(new Accumulator(conceptID, score));
      }
    }

    // Compute the weights of the expanded terms.
    int numTerms = Math.min(numFeedbackTerms, sortedConcepts.size());
    float totalWt = 0.0f;
    Accumulator[] bestConcepts = new Accumulator[numTerms];
    for (int i = 0; i < numTerms; i++) {
      Accumulator a = sortedConcepts.poll();
      bestConcepts[i] = a;
      totalWt += a.score;
    }

    // Document node (shared across all expansion cliques).
    DocumentNode docNode = new DocumentNode();

    // Expression generator (shared across all expansion cliques).
    ExpressionGenerator generator = new TermExpressionGenerator();

    // Add cliques corresponding to best expansion concepts.
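    // Each surviving concept contributes one clique per configured scoring function. The
    // clique importance is expanderWeight * (conceptScore / totalWt), i.e., the expansion
    // weight times the concept's share of the total mass of the selected concepts, further
    // scaled by the concept importance model when one is configured.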
    for (int i = 0; i < numTerms; i++) {
      Accumulator a = bestConcepts[i];

      // Construct the MRF corresponding to this concept.
      String concept = vocab[a.docno].getKey();

      for (int j = 0; j < scoringFunctionNodes.size(); j++) {
        Node functionNode = scoringFunctionNodes.get(j);
        String functionType = XMLTools.getAttributeValue(functionNode, "scoreFunction", null);
        ScoringFunction fn = ScoringFunction.create(functionType, functionNode);

        Parameter parameter = parameters.get(j);
        ConceptImportanceModel importanceModel = importanceModels.get(j);

        List<GraphNode> cliqueNodes = Lists.newArrayList();
        cliqueNodes.add(docNode);

        TermNode termNode = new TermNode(concept);
        cliqueNodes.add(termNode);

        PotentialFunction potential = new QueryPotential(env, generator, fn);

        Clique c = new Clique(cliqueNodes, potential, parameter);
        c.setType(Clique.Type.Term);

        // Scale importance values by LCE likelihood.
        float normalizedScore = expanderWeight * (a.score / totalWt);
        if (importanceModel != null) {
          c.setImportance(normalizedScore * importanceModel.getCliqueWeight(c));
        } else {
          c.setImportance(normalizedScore);
        }

        expandedMRF.addClique(c);
      }
    }

    return expandedMRF;
  }
}
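
/**
 * A minimal usage sketch, not part of the original Ivory source: it illustrates how the
 * expander above might be invoked once an initial retrieval run has produced a ranked
 * Accumulator list. The feedback-document count (10), feedback-term count (20), and
 * expansion weight (0.5f) are illustrative values only, and the parameter list,
 * scoring-function DOM nodes, and importance models are assumed to be parallel lists
 * supplied by the caller's model configuration (one entry per concept-score specification).
 */
class UnigramLatentConceptExpanderUsageSketch {
  static MarkovRandomField expandQueryModel(RetrievalEnvironment env, MarkovRandomField mrf,
      Accumulator[] initialResults, List<Parameter> params, List<Node> scoringFunctionNodes,
      List<ConceptImportanceModel> importanceModels) throws ConfigurationException {
    // Construct the expander with illustrative feedback settings.
    UnigramLatentConceptExpander expander = new UnigramLatentConceptExpander(
        env, 10, 20, 0.5f, params, scoringFunctionNodes, importanceModels);

    // Build an expanded MRF from the original query MRF and the initial ranked results;
    // the expanded MRF can then be scored in place of the original one.
    return expander.getExpandedMRF(mrf, initialResults);
  }
}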