/* * Ivory: A Hadoop toolkit for web-scale information retrieval * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package ivory.smrf.model.builder; import ivory.core.ConfigurationException; import ivory.core.RetrievalEnvironment; import ivory.core.RetrievalException; import ivory.core.util.XMLTools; import ivory.smrf.model.Clique; import ivory.smrf.model.MarkovRandomField; import ivory.smrf.model.importance.ConceptImportanceModel; import java.util.List; import java.util.Set; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import com.google.common.base.Preconditions; import com.google.common.collect.Sets; /** * @author Don Metzler */ public class FeatureBasedMRFBuilder extends MRFBuilder { // XML specification of features. private Node model = null; // Whether or not to normalize the feature importance weights. protected boolean normalizeImportance = false; float pruningThresholdBigram = 0.0f; public FeatureBasedMRFBuilder(RetrievalEnvironment env, Node model) { super(env); this.model = Preconditions.checkNotNull(model); // Whether or not we should normalize the feature importance weights. normalizeImportance = XMLTools.getAttributeValue(model, "normalizeImportance", false); pruningThresholdBigram = XMLTools.getAttributeValue(model, "pruningThresholdBigram", 0.0f); } public Node getModel() { return model; } @Override public MarkovRandomField buildMRF(String[] queryTerms) throws ConfigurationException { // This is the MRF we're building. MarkovRandomField mrf = new MarkovRandomField(queryTerms, env); // Construct MRF feature by feature. NodeList children = model.getChildNodes(); // Sum of query-dependent importance weights. float totalImportance = 0.0f; // Cliques that have query-dependent importance weights. Set<Clique> cliquesWithImportance = Sets.newHashSet(); for (int i = 0; i < children.getLength(); i++) { Node child = children.item(i); if ("feature".equals(child.getNodeName())) { // Get the feature id. String featureID = XMLTools.getAttributeValueOrThrowException(child, "id", "Each feature must specify an id attribute!"); // Get feature weight (default = 1.0). float weight = XMLTools.getAttributeValue(child, "weight", 1.0f); // Concept importance model (optional). ConceptImportanceModel importanceModel = null; // Get concept importance source (if applicable). String importanceSource = XMLTools.getAttributeValue(child, "importance", ""); if (!importanceSource.equals("")) { importanceModel = env.getImportanceModel(importanceSource); if (importanceModel == null) { throw new RetrievalException("ImportanceModel " + importanceSource + " not found!"); } } // Get CliqueSet type. String cliqueSetType = XMLTools.getAttributeValue(child, "cliqueSet", ""); // Construct the clique set. CliqueSet cliqueSet = CliqueSet.create(cliqueSetType, env, queryTerms, child); // Get cliques from clique set. List<Clique> cliques = cliqueSet.getCliques(); for (Clique c : cliques) { double w = weight; c.setParameterName(featureID); // Parameter id. c.setParameterWeight(weight); // Weight. c.setType(cliqueSet.getType()); // Clique type. // Get clique weight. if (importanceModel != null) { float importance = importanceModel.getCliqueWeight(c); c.setImportance(importance); totalImportance += importance; cliquesWithImportance.add(c); w = importance; } if (w < pruningThresholdBigram && c.getType() != Clique.Type.Term) { } else { mrf.addClique(c); } } } } // Normalize query-dependent feature importance values. if (normalizeImportance) { for (Clique c : cliquesWithImportance) { c.setImportance(c.getImportance() / totalImportance); } } return mrf; } }