package ivory.ffg.preprocessing; import ivory.bloomir.util.OptionManager; import ivory.core.RetrievalEnvironment; import ivory.core.data.document.IntDocVector; import ivory.ffg.data.DocumentVectorUtility; import ivory.ffg.util.QrelUtility; import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.Logger; import tl.lin.data.map.HMapIV; import com.google.common.collect.Sets; /** * @author Nima Asadi */ public class GenerateDocumentVectors { private static final Logger LOGGER = Logger.getLogger(GenerateDocumentVectors.class); public static void main(String[] args) throws Exception { OptionManager options = new OptionManager(GenerateDocumentVectors.class.getName()); options.addOption(OptionManager.INDEX_ROOT_PATH, "path", "index root", true); options.addOption(OptionManager.DOCUMENT_VECTOR_CLASS, "class_name", "documentVector class", true); options.addOption(OptionManager.OUTPUT_PATH, "path", "output", true); options.addOption(OptionManager.JUDGMENT_PATH, "path", "Tab-Delimited documents", true); try { options.parse(args); } catch(Exception exp) { return; } String indexPath = options.getOptionValue(OptionManager.INDEX_ROOT_PATH); String documentVectorClass = options.getOptionValue(OptionManager.DOCUMENT_VECTOR_CLASS); String outputPath = options.getOptionValue(OptionManager.OUTPUT_PATH); String qrelPath = options.getOptionValue(OptionManager.JUDGMENT_PATH); FileSystem fs = FileSystem.get(new Configuration()); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); env.initialize(true); //Parse queries, judgemnts and features HMapIV<int[]> qrels = QrelUtility.parseQrelsFromTabDelimited(qrelPath); FSDataOutputStream output = fs.create(new Path(outputPath)); Set<Integer> docidHistory = Sets.newHashSet(); //Evaluate queries and/or write the results to an output file for(int qid: qrels.keySet()) { for(int docid: qrels.get(qid)) { if(!docidHistory.contains(docid)) { docidHistory.add(docid); IntDocVector vector = env.documentVectors(new int[]{docid})[0]; output.writeInt(docid); DocumentVectorUtility.newInstance(documentVectorClass, vector).write(output); } } LOGGER.info("Compressed query " + qid); } output.writeInt(-1); output.close(); } }