package ivory.core.util; import ivory.core.RetrievalEnvironment; import ivory.core.data.dictionary.DefaultFrequencySortedDictionary; import ivory.core.data.stat.DfTableArray; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; public class PrintMostFrequentTerms { public static void main(String[] args) throws Exception { if (args.length != 1) { System.out.println("usage: [index-path]"); System.exit(-1); } String indexPath = args[0]; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); DfTableArray dfs = new DfTableArray(new Path(env.getDfByIntData()), fs); Path termsFilePath = new Path(env.getIndexTermsData()); Path termIDsFilePath = new Path(env.getIndexTermIdsData()); Path idToTermFilePath = new Path(env.getIndexTermIdMappingData()); DefaultFrequencySortedDictionary termIDMap = new DefaultFrequencySortedDictionary(termsFilePath, termIDsFilePath, idToTermFilePath, fs); for (int i=1; i<=200; i++) { System.out.println(String.format("%d\t%s\t%d", i, termIDMap.getTerm(i), dfs.getDf(i))); } } }