/*
 * Ivory: A Hadoop toolkit for web-scale information retrieval
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package ivory.ltr;

import ivory.core.ConfigurationException;
import ivory.core.RetrievalEnvironment;
import ivory.ltr.operator.Operator;
import ivory.ltr.operator.OperatorUtil;
import ivory.ltr.operator.Sum;
import ivory.smrf.model.Clique;
import ivory.smrf.model.DocumentNode;
import ivory.smrf.model.GraphNode;
import ivory.smrf.model.MarkovRandomField;
import ivory.smrf.model.builder.MRFBuilder;
import ivory.smrf.model.importance.ConceptImportanceModel;
import ivory.smrf.model.importance.LinearImportanceModel;
import ivory.smrf.model.importance.MetaFeature;
import ivory.smrf.retrieval.BatchQueryRunner;

import java.io.IOException;
import java.rmi.NotBoundException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import javax.xml.parsers.ParserConfigurationException;

import com.google.common.collect.Maps;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

import edu.umd.cloud9.collection.DocnoMapping;

/**
 * Extracts a feature table from a set of queries, relevance judgments, and
 * retrieval models, suitable as training data for learning-to-rank methods.
 *
 * @author Don Metzler
 *
 * Modified by Nima Asadi
 */
public class ExtractFeatures {
  private static final double DEFAULT_FEATURE_VALUE = 0.0;     // default feature value
  private static final String QUERY_FEATURE_NAME = "qid";      // query id feature name
  private static final String DOC_FEATURE_NAME = "docid";      // document id feature name
  private static final String JUDGMENT_FEATURE_NAME = "grade"; // relevance grade feature name

  private BatchQueryRunner runner = null;         // batch query runner
  private RetrievalEnvironment env = null;        // retrieval environment
  private Map<String, String> queries = null;     // query id -> query text mapping
  private DocnoMapping docnoMapping = null;       // docno mapping
  private Map<String, Operator> operators = null; // feature name -> aggregation operator

  public ExtractFeatures(String[] args, FileSystem fs) throws SAXException, IOException,
      ParserConfigurationException, NotBoundException, Exception {
    loadQueryRunner(args, fs);
    env = runner.getRetrievalEnvironment();
    queries = runner.getQueries();
    docnoMapping = env.getDocnoMapping();

    // gather the aggregation operators declared in the parameter files
    operators = Maps.newHashMap();
    for (String configPath : args) {
      Map<String, Operator> ops = OperatorUtil.parseOperators(configPath);
      if (ops.size() > 0) {
        for (String key : ops.keySet()) {
          operators.put(key, ops.get(key));
        }
      }
    }

    // drop query terms with no postings list; discard queries that end up empty
    Map<String, String> finalQueries = new HashMap<String, String>();
    for (Entry<String, String> queryEntry : queries.entrySet()) {
      String queryKey = queryEntry.getKey();
      String queryText = queryEntry.getValue();

      String finalQuery = "";
      String[] parts = env.tokenize(queryText);
      for (String part : parts) {
        if (env.getPostingsList(part) != null) {
          finalQuery += part + " ";
        }
      }
      finalQuery = finalQuery.trim();

      if (finalQuery.length() != 0) {
        finalQueries.put(queryKey, finalQuery);
      }
    }
    queries = finalQueries;
  }

  public void loadQueryRunner(String[] args, FileSystem fs) throws ConfigurationException {
    runner = new BatchQueryRunner(args, fs);
  }
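
  /**
   * Extracts features for every query and judged document, writing a
   * tab-separated table to stdout: a header row of feature names (qid, docid,
   * then one column per model/clique parameter), followed by one row per
   * (query, document) pair with the aggregated feature values and the
   * relevance grade in the "grade" column.
   */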
  private void extract() throws Exception {
    // models specified in parameter files
    Set<String> modelNames = runner.getModels();

    // feature importance models
    Collection<ConceptImportanceModel> importanceModels = env.getImportanceModels();

    // we only know how to deal with linear importance models, so filter the rest out
    List<LinearImportanceModel> linearImportanceModels = new ArrayList<LinearImportanceModel>();
    for (ConceptImportanceModel model : importanceModels) {
      if (model instanceof LinearImportanceModel) {
        linearImportanceModels.add((LinearImportanceModel) model);
      }
    }

    // first pass: collect the set of feature names across all queries and models
    SortedSet<String> featureNames = new TreeSet<String>();
    for (Entry<String, String> queryEntry : queries.entrySet()) {
      // query text
      String queryText = queryEntry.getValue();

      // compute features for each model
      for (String modelName : modelNames) {
        // build mrf from model node
        Node modelNode = runner.getModel(modelName);
        MRFBuilder builder = MRFBuilder.get(env, modelNode);
        MarkovRandomField mrf = builder.buildMRF(queryText.split("\\s+"));

        // get mrf cliques
        List<Clique> cliques = mrf.getCliques();

        // add parameter name to feature name set
        for (Clique c : cliques) {
          // parameter id
          String paramId = c.getParameter().getName();

          // handle linear importance model weights
          if (importanceModels.size() != 0) {
            for (LinearImportanceModel model : linearImportanceModels) {
              List<MetaFeature> metaFeatures = model.getMetaFeatures();
              for (MetaFeature metaFeat : metaFeatures) {
                // feature id = modelName-metaFeatId-paramId
                String featId = modelName + "-" + metaFeat.getName() + "-" + paramId;
                featureNames.add(featId);
              }
            }
          }

          // feature id = modelName-paramId
          String featId = modelName + "-" + paramId;
          featureNames.add(featId);
        }
      }
    }

    // add judgment feature name
    featureNames.add(JUDGMENT_FEATURE_NAME);

    // print feature name header
    System.out.print(QUERY_FEATURE_NAME + "\t" + DOC_FEATURE_NAME);
    for (String featureName : featureNames) {
      System.out.print("\t" + featureName);
    }
    System.out.println();

    // extract features query-by-query
    for (Entry<String, String> queryEntry : queries.entrySet()) {
      // feature map (docname -> feature name -> feature value)
      SortedMap<String, SortedMap<String, Operator>> featureValues =
          new TreeMap<String, SortedMap<String, Operator>>();

      // query id and text
      String qid = queryEntry.getKey();
      String queryText = queryEntry.getValue();

      // compute features for each model
      for (String modelName : modelNames) {
        // build mrf from model node
        Node modelNode = runner.getModel(modelName);
        MRFBuilder builder = MRFBuilder.get(env, modelNode);
        MarkovRandomField mrf = builder.buildMRF(queryText.split("\\s+"));

        // initialize mrf
        mrf.initialize();

        // get mrf cliques
        List<Clique> cliques = mrf.getCliques();

        // get docnodes associated with mrf
        ArrayList<DocumentNode> docNodes = new ArrayList<DocumentNode>();
        List<GraphNode> nodes = mrf.getNodes();
        for (GraphNode node : nodes) {
          if (node instanceof DocumentNode) {
            docNodes.add((DocumentNode) node);
          }
        }

        // get document set to extract features for
        Map<String, Double> origJudgments = runner.getJudgmentSet(qid);
        if (origJudgments == null) {
          System.err.println("Warning: no judgments found for qid = " + qid + " -- skipping!");
          continue;
        }

        // convert to docid -> judgment mapping
        SortedMap<Integer, Double> judgments = new TreeMap<Integer, Double>();
        Map<Integer, String> docIdToNameMap = new HashMap<Integer, String>();
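        // the DocnoMapping resolves external document names to internal integer
        // docnos; the TreeMap then yields judged documents in docno order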
        for (Entry<String, Double> judgmentEntry : origJudgments.entrySet()) {
          // document name
          String docName = judgmentEntry.getKey();
          // judgment
          double judgment = judgmentEntry.getValue();
          // doc id
          int docid = docnoMapping.getDocno(docName);
          // update maps
          judgments.put(docid, judgment);
          docIdToNameMap.put(docid, docName);
        }

        for (Entry<Integer, Double> judgmentEntry : judgments.entrySet()) {
          // document id
          int docid = judgmentEntry.getKey();

          // document name
          String docName = docIdToNameMap.get(docid);

          // get feature map for this docname
          SortedMap<String, Operator> docFeatures = featureValues.get(docName);
          if (docFeatures == null) {
            docFeatures = new TreeMap<String, Operator>();
            featureValues.put(docName, docFeatures);
          }

          // document judgment
          double judgment = judgmentEntry.getValue();

          // set judgment feature
          docFeatures.put(JUDGMENT_FEATURE_NAME, new Sum());
          docFeatures.get(JUDGMENT_FEATURE_NAME).addScore(judgment);

          // initialize doc nodes
          for (DocumentNode node : docNodes) {
            node.setDocno(docid);
          }

          // compute potentials for each clique
          for (Clique c : cliques) {
            // parameter id
            String paramId = c.getParameter().getName();

            // handle linear importance model weights (for everything except
            // query-independent clique types)
            if (importanceModels.size() != 0 && c.getType() != Clique.Type.Document) {
              for (LinearImportanceModel model : linearImportanceModels) {
                List<MetaFeature> metaFeatures = model.getMetaFeatures();
                for (MetaFeature metaFeat : metaFeatures) {
                  // feature id = modelName-metaFeatId-paramId
                  String featId = modelName + "-" + metaFeat.getName() + "-" + paramId;
                  // score = meta-feature weight * (raw) clique potential
                  double score = model.computeFeatureValue(c.getConcept(), metaFeat) * c.getPotential();

                  // update feature values
                  if (!docFeatures.containsKey(featId)) {
                    docFeatures.put(featId, operators.get(modelName + "-" + paramId).newInstance());
                  }
                  docFeatures.get(featId).addScore(score);
                }
              }
            }

            // feature id = modelName-paramId
            String featId = modelName + "-" + paramId;
            // score = (raw) clique potential
            double score = c.getPotential();

            // update feature values
            if (!docFeatures.containsKey(featId)) {
              docFeatures.put(featId, operators.get(featId).newInstance());
            }
            docFeatures.get(featId).addScore(score);
          }
        }
      }

      // print feature values for current query
      for (Entry<String, SortedMap<String, Operator>> featureEntry : featureValues.entrySet()) {
        String docName = featureEntry.getKey();
        System.out.print(qid + "\t" + docName);
        Map<String, Operator> docFeatures = featureEntry.getValue();
        for (String featureName : featureNames) {
          Operator op = docFeatures.get(featureName);
          double featVal = DEFAULT_FEATURE_VALUE;
          if (op != null) {
            featVal = op.getFinalScore();
          }
          System.out.print("\t" + featVal);
        }
        System.out.println();
      }
    }
  }

  public static void main(String[] args) throws SAXException, ParserConfigurationException,
      NotBoundException, Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    ExtractFeatures extractor = new ExtractFeatures(args, fs);
    extractor.extract();
  }
}
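
// Usage sketch (the command line below is illustrative, not part of this
// file): args are the XML parameter file paths consumed by BatchQueryRunner
// and OperatorUtil.parseOperators, and the feature table goes to stdout, e.g.
//
//   java ivory.ltr.ExtractFeatures params.xml > features.tsv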