/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.search.spell;

import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.util.StringHelper;

/**
 * HighFrequencyDictionary: terms taken from the given field
 * of a Lucene index, which appear in a number of documents
 * above a given threshold.
 *
 * When using IndexReader.terms(Term) the code must not call next() on TermEnum
 * as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6
 *
 * Threshold is a value in [0..1] representing the minimum
 * fraction of documents (of the total) in which a term should appear.
 * It is multiplied by the reader's document count and truncated to an
 * integer; a term qualifies when its document frequency is at least that value.
 *
 * Based on LuceneDictionary.
 */
public class HighFrequencyDictionary implements Dictionary {
  private final IndexReader reader;
  /** Interned field name, so it can be compared by identity against term fields. */
  private final String field;
  private final float thresh;

  /**
   * Creates a dictionary over the terms of one field of an index.
   *
   * @param reader the index to read terms from
   * @param field  the name of the field whose terms are enumerated
   * @param thresh the minimum fraction of documents a term must appear in
   */
  public HighFrequencyDictionary(IndexReader reader, String field, float thresh) {
    this.reader = reader;
    this.field = StringHelper.intern(field);
    this.thresh = thresh;
  }

  /**
   * Returns a new iterator over the terms of the field that meet the threshold.
   *
   * @return a fresh {@link HighFrequencyIterator}
   * @throws RuntimeException wrapping any {@link IOException} raised while
   *         opening the term enumeration
   */
  public final Iterator<String> getWordsIterator() {
    return new HighFrequencyIterator();
  }

  /**
   * Iterates over the terms of the enclosing dictionary's field whose document
   * frequency reaches the threshold. {@link #next()} returns {@code null} once
   * the terms are used up, and {@link #remove()} is not supported.
   */
  final class HighFrequencyIterator implements TermFreqIterator {
    private final TermEnum termEnum;
    private final int minNumDocs;
    /** Term the enumeration is positioned on, or null when there is none. */
    private Term actualTerm;
    /** Document frequency read together with {@link #actualTerm}. */
    private int actualFreq;
    /** True while the outcome of the last hasNext() has not been consumed by next(). */
    private boolean hasNextCalled;

    HighFrequencyIterator() {
      try {
        // terms(Term) positions the enumeration on the first term at or after
        // (field, ""), so term() must be read before the first next().
        termEnum = reader.terms(new Term(field, ""));
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      minNumDocs = (int) (thresh * (float) reader.numDocs());
    }

    /** Tells whether a document frequency reaches the minimum document count. */
    private boolean isFrequent(int freq) {
      return freq >= minNumDocs;
    }

    /**
     * Marks the iterator as finished and releases the term enumeration.
     * Called once: after it runs, hasNextCalled stays true and actualTerm
     * stays null, so no code path reaches the enumeration again.
     */
    private void exhaust() {
      actualTerm = null;
      actualFreq = 0;
      try {
        termEnum.close();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    /**
     * Returns the text of the next qualifying term and advances the enumeration.
     *
     * @return the term text, or {@code null} when no qualifying term is left
     * @throws RuntimeException wrapping any {@link IOException} from the enumeration
     */
    public String next() {
      // hasNext() is idempotent until next() consumes its result, so it is
      // safe to call unconditionally. Once it reports false, the state is left
      // untouched, which keeps exhaustion sticky.
      if (!hasNext()) {
        return null;
      }
      hasNextCalled = false;
      String text = actualTerm.text();
      try {
        termEnum.next();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return text;
    }

    /**
     * Returns the document frequency recorded for the term most recently
     * located by {@link #hasNext()} or {@link #next()}; 0 once the iterator
     * is exhausted.
     */
    public float freq() {
      return actualFreq;
    }

    /**
     * Positions the enumeration on the next term of the field that reaches
     * the threshold, skipping terms that do not.
     *
     * @return true if such a term exists; false when the enumeration has no
     *         more terms or has moved on to a different field
     * @throws RuntimeException wrapping any {@link IOException} from the enumeration
     */
    public boolean hasNext() {
      if (hasNextCalled) {
        return actualTerm != null;
      }
      hasNextCalled = true;

      while (true) {
        actualTerm = termEnum.term();

        // No term left, or the enumeration has moved on to another field.
        // Identity comparison relies on both field names being interned
        // (as the original code assumes).
        if (actualTerm == null || actualTerm.field() != field) {
          exhaust();
          return false;
        }

        // The enumeration already carries the document frequency of its
        // current term; no second lookup through the reader is needed.
        actualFreq = termEnum.docFreq();
        if (isFrequent(actualFreq)) {
          return true;
        }

        // term not up to threshold, try the following one
        try {
          termEnum.next();
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }
}