/* * Copyright 2004-2009 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search.spell; import java.io.IOException; import java.util.Iterator; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; /** * HighFrequencyDictionary: terms taken from the given field * of a Lucene index, which appear in a number of documents * above a given threshold. * * When using IndexReader.terms(Term) the code must not call next() on TermEnum * as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6 * * Threshold is a value in [0..1] representing the minimum * number of documents (of the total) where a term should appear. * * Based on LuceneDictionary. */ public class HighFrequencyDictionary implements Dictionary { private IndexReader reader; private String field; private float thresh; public HighFrequencyDictionary(IndexReader reader, String field, float thresh) { this.reader = reader; this.field = field.intern(); this.thresh = thresh; } public final Iterator getWordsIterator() { return new HighFrequencyIterator(); } final class HighFrequencyIterator implements Iterator { private TermEnum termEnum; private Term actualTerm; private boolean hasNextCalled; private int minNumDocs; HighFrequencyIterator() { try { termEnum = reader.terms(new Term(field, "")); minNumDocs = (int) (thresh * (float) reader.numDocs()); } catch (IOException e) { throw new RuntimeException(e); } } private boolean isFrequent(Term term) { try { return reader.docFreq(term) >= minNumDocs; } catch (IOException e) { throw new RuntimeException(e); } } public Object next() { if (!hasNextCalled) { hasNext(); } hasNextCalled = false; try { termEnum.next(); } catch (IOException e) { throw new RuntimeException(e); } return (actualTerm != null) ? actualTerm.text() : null; } public boolean hasNext() { if (hasNextCalled) { return actualTerm != null; } hasNextCalled = true; do { actualTerm = termEnum.term(); // if there are no words return false if (actualTerm == null) { return false; } String currentField = actualTerm.field(); // if the next word doesn't have the same field return false if (currentField != field) { actualTerm = null; return false; } // got a valid term, does it pass the threshold? if (isFrequent(actualTerm)) { return true; } // term not up to threshold try { termEnum.next(); } catch (IOException e) { throw new RuntimeException(e); } } while (true); } public void remove() { throw new UnsupportedOperationException(); } } }