/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.indexer; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import java.io.OutputStreamWriter; /** Lists the most frequent terms in an index. */ public class HighFreqTerms { public static int numTerms = 100; private static class TermFreq { TermFreq(Term t, int df) { term = t; docFreq = df; } int docFreq; Term term; } private static class TermFreqQueue extends PriorityQueue { TermFreqQueue(int size) { initialize(size); } protected final boolean lessThan(Object a, Object b) { TermFreq termInfoA = (TermFreq)a; TermFreq termInfoB = (TermFreq)b; return termInfoA.docFreq < termInfoB.docFreq; } } public static void main(String[] args) throws Exception { IndexReader reader = null; boolean noFreqs = false; int count = 100; String usage = "HighFreqTerms [-count <n>] [-nofreqs] <index dir>"; if (args.length == 0) { System.err.println(usage); System.exit(-1); } for (int i = 0; i < args.length; i++) { // parse command line if (args[i].equals("-count")) { // found -count option count = Integer.parseInt(args[++i]); } else if (args[i].equals("-nofreqs")) { // found -nofreqs option noFreqs = true; } else { reader = IndexReader.open(args[i]); } } TermFreqQueue tiq = new TermFreqQueue(count); TermEnum terms = reader.terms(); int minFreq = 0; while (terms.next()) { if (terms.docFreq() > minFreq) { tiq.put(new TermFreq(terms.term(), terms.docFreq())); if (tiq.size() >= count) { // if tiq overfull tiq.pop(); // remove lowest in tiq minFreq = ((TermFreq)tiq.top()).docFreq; // reset minFreq } } } OutputStreamWriter out = new OutputStreamWriter(System.out, "UTF-8"); while (tiq.size() != 0) { TermFreq termInfo = (TermFreq)tiq.pop(); out.write(termInfo.term.toString()); if (!noFreqs) { out.write(" "); out.write(Integer.toString(termInfo.docFreq)); } out.write("\n"); } out.flush(); reader.close(); } }