package ivory.core.index; import ivory.core.RetrievalEnvironment; import ivory.core.data.dictionary.DefaultFrequencySortedDictionary; import ivory.core.data.index.PostingsList; import ivory.core.data.index.PostingsListDocSortedPositional; import ivory.core.data.stat.PrefixEncodedGlobalStats; import java.io.IOException; import java.net.URI; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.filecache.DistributedCache; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.SequenceFileInputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.lib.IdentityReducer; import org.apache.log4j.Logger; import tl.lin.data.pair.PairOfIntLong; import edu.umd.cloud9.util.PowerTool; public class DistributeGlobalStatsToPostings extends PowerTool { private static final Logger sLogger = Logger.getLogger(DistributeGlobalStatsToPostings.class); private static class MyMapper extends MapReduceBase implements Mapper<IntWritable, PostingsList, IntWritable, PostingsList> { private PrefixEncodedGlobalStats gs; private DefaultFrequencySortedDictionary mTermIdMap; public void configure(JobConf job) { try { Path[] localFiles = DistributedCache.getLocalCacheFiles(job); sLogger.info("0: " + localFiles[0]); sLogger.info("1: " + localFiles[1]); sLogger.info("2: " + localFiles[2]); sLogger.info("3: " + localFiles[3]); sLogger.info("4: " + localFiles[4]); sLogger.info("5: " + localFiles[5]); FileSystem fs = FileSystem.getLocal(job); gs = new PrefixEncodedGlobalStats(localFiles[0], fs); gs.loadDFStats(localFiles[1], fs); gs.loadCFStats(localFiles[2], fs); String indexPath = job.get("Ivory.IndexPath"); sLogger.info("loading TermIdMap from " + indexPath); mTermIdMap = new DefaultFrequencySortedDictionary(localFiles[3], localFiles[4], localFiles[5], fs); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException("Error loading global term stats!"); } } public void map(IntWritable key, PostingsList p, OutputCollector<IntWritable, PostingsList> output, Reporter reporter) throws IOException { // map from the id back to text // sLogger.info("termid: " + key); String term = mTermIdMap.getTerm(key.get()); // sLogger.info("term: " + term); PairOfIntLong pair = gs.getStats(term); if (pair == null) { p.setCf(-1); p.setDf(-1); } else { p.setCf(pair.getRightElement()); p.setDf(pair.getLeftElement()); } output.collect(key, p); } } public static final String[] RequiredParameters = { "Ivory.IndexPath", "Ivory.GlobalStatsPath", "Ivory.NumMapTasks" }; public String[] getRequiredParameters() { return RequiredParameters; } public DistributeGlobalStatsToPostings(Configuration conf) { super(conf); } public int runTool() throws Exception { sLogger.info("Distributing df/cf stats..."); JobConf conf = new JobConf(getConf(), DistributeGlobalStatsToPostings.class); FileSystem fs = FileSystem.get(conf); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 1); String indexPath = conf.get("Ivory.IndexPath"); String statsPath = conf.get("Ivory.GlobalStatsPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String collectionName = env.readCollectionName(); sLogger.info(" - CollectionName: " + collectionName); sLogger.info(" - IndexPath: " + indexPath); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - NumReduceTasks: " + reduceTasks); // back up old stats Path p1 = new Path(indexPath + "/property.CollectionDocumentCount"); Path p2 = new Path(indexPath + "/property.CollectionDocumentCount.local"); if (!fs.exists(p2)) { sLogger.info("preserving local " + p1.getName()); fs.rename(p1, p2); } p1 = new Path(indexPath + "/property.CollectionAverageDocumentLength"); p2 = new Path(indexPath + "/property.CollectionAverageDocumentLength.local"); if (!fs.exists(p2)) { sLogger.info("preserving local " + p1.getName()); fs.rename(p1, p2); } p1 = new Path(indexPath + "/property.CollectionLength"); p2 = new Path(indexPath + "/property.CollectionLength.local"); if (!fs.exists(p2)) { sLogger.info("preserving local " + p1.getName()); fs.rename(p1, p2); } // distribute global stats RetrievalEnvironment genv = new RetrievalEnvironment(statsPath, fs); long collectionLength = genv.readCollectionLength(); int docCount = genv.readCollectionDocumentCount(); float avgdl = genv.readCollectionAverageDocumentLength(); sLogger.info("writing global stats from all index segments: "); sLogger.info(" - CollectionLength: " + collectionLength); sLogger.info(" - CollectionDocumentCount: " + docCount); sLogger.info(" - AverageDocumentLength: " + avgdl); env.writeCollectionLength(collectionLength); env.writeCollectionDocumentCount(docCount); env.writeCollectionAverageDocumentLength(avgdl); // preserve old postings Path postingsPath1 = new Path(indexPath + "/postings/"); Path postingsPath2 = new Path(indexPath + "/postings.old/"); if (fs.exists(postingsPath1)) { sLogger.info("renaming " + postingsPath1.getName() + " to " + postingsPath2.getName()); fs.rename(postingsPath1, postingsPath2); } conf.setJobName("DistributeGlobalStatsToPostings:" + collectionName); FileInputFormat.setInputPaths(conf, postingsPath2); FileOutputFormat.setOutputPath(conf, postingsPath1); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); DistributedCache.addCacheFile(new URI(statsPath + "/dict.terms"), conf); DistributedCache.addCacheFile(new URI(statsPath + "/dict.df"), conf); DistributedCache.addCacheFile(new URI(statsPath + "/dict.cf"), conf); DistributedCache.addCacheFile(new URI(env.getIndexTermsData()), conf); DistributedCache.addCacheFile(new URI(env.getIndexTermIdsData()), conf); DistributedCache.addCacheFile(new URI(env.getIndexTermIdMappingData()), conf); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("mapred.map.max.attempts", 10); conf.setInt("mapred.reduce.max.attempts", 10); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PostingsListDocSortedPositional.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(IdentityReducer.class); JobClient.runJob(conf); return 0; } public static void main(String[] args) throws Exception { if (args.length != 2) { System.out.println("usage: [global-stats] [index-path]"); System.exit(-1); } Configuration conf = new Configuration(); String gsPath = args[0]; String indexPath = args[1]; conf.set("Ivory.IndexPath", indexPath); conf.set("Ivory.GlobalStatsPath", gsPath); conf.setInt("Ivory.NumMapTasks", 100); sLogger.info("Distributing global statistics to " + indexPath); new DistributeGlobalStatsToPostings(conf).run(); } }