/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.action.termvectors;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.search.dfs.AggregatedDfs;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
/**
 * Selects, for each field of a set of term vectors, the terms with the highest
 * {@code freq * idf} score and keeps them for later lookup.
 * <p>
 * Terms are first filtered by word length, by their frequency in the term vector and by
 * their document frequency; the surviving terms compete for at most
 * {@link #getMaxNumTerms()} slots per field. Call {@link #selectBestTerms()} to run the
 * selection, then query the result with {@link #getScoreTerm(Term)},
 * {@link #hasScoreTerm(Term)} and {@link #size(String)}.
 */
public class TermVectorsFilter {
    /** Default upper bound on the number of terms retained per field. */
    public static final int DEFAULT_MAX_QUERY_TERMS = 25;
    /** Default minimum term frequency; the check is only applied for values greater than 0. */
    public static final int DEFAULT_MIN_TERM_FREQ = 0;
    /** Default maximum term frequency. */
    public static final int DEFAULT_MAX_TERM_FREQ = Integer.MAX_VALUE;
    /** Default minimum document frequency; the check is only applied for values greater than 0. */
    public static final int DEFAULT_MIN_DOC_FREQ = 0;
    /** Default maximum document frequency. */
    public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
    /** Default minimum word length; the check is only applied for values greater than 0. */
    public static final int DEFAULT_MIN_WORD_LENGTH = 0;
    /** Default maximum word length; the check is only applied for values greater than 0, so 0 disables it. */
    public static final int DEFAULT_MAX_WORD_LENGTH = 0;

    private int maxNumTerms = DEFAULT_MAX_QUERY_TERMS;
    private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
    private int maxTermFreq = DEFAULT_MAX_TERM_FREQ;
    private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
    private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
    private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
    private int maxWordLength = DEFAULT_MAX_WORD_LENGTH;

    /** Term vectors whose terms are scored. */
    private final Fields fields;
    /** Source of document-frequency statistics when no {@link AggregatedDfs} is supplied. */
    private final Fields topLevelFields;
    /** Fields to process, or {@code null} to process every field. */
    private final Set<String> selectedFields;
    /** Optional pre-aggregated statistics; when non-null they replace the ones from {@link #topLevelFields}. */
    private final AggregatedDfs dfs;
    /** Selected terms, filled by {@link #selectBestTerms()}. */
    private final Map<Term, ScoreTerm> scoreTerms;
    /** Number of selected terms per processed field. */
    private final Map<String, Integer> sizes = new HashMap<>();
    private final TFIDFSimilarity similarity;

    /**
     * @param termVectorsByField the term vectors whose terms are to be filtered
     * @param topLevelFields     fields used to look up term and document counts when {@code dfs} is {@code null}
     * @param selectedFields     names of the fields to process, or {@code null} for all fields
     * @param dfs                statistics to use instead of {@code topLevelFields}, may be {@code null}
     */
    public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields, @Nullable AggregatedDfs dfs) {
        this.fields = termVectorsByField;
        this.topLevelFields = topLevelFields;
        this.selectedFields = selectedFields;
        this.dfs = dfs;
        this.scoreTerms = new HashMap<>();
        this.similarity = new ClassicSimilarity();
    }

    /**
     * Applies every setting that is present (non-null) in {@code settings}; absent
     * settings leave the current value untouched.
     */
    public void setSettings(TermVectorsRequest.FilterSettings settings) {
        if (settings.maxNumTerms != null) {
            setMaxNumTerms(settings.maxNumTerms);
        }
        if (settings.minTermFreq != null) {
            setMinTermFreq(settings.minTermFreq);
        }
        if (settings.maxTermFreq != null) {
            setMaxTermFreq(settings.maxTermFreq);
        }
        if (settings.minDocFreq != null) {
            setMinDocFreq(settings.minDocFreq);
        }
        if (settings.maxDocFreq != null) {
            setMaxDocFreq(settings.maxDocFreq);
        }
        if (settings.minWordLength != null) {
            setMinWordLength(settings.minWordLength);
        }
        if (settings.maxWordLength != null) {
            setMaxWordLength(settings.maxWordLength);
        }
    }

    /**
     * @return the selected entry for {@code term}, or {@code null} if the term was not selected
     */
    public ScoreTerm getScoreTerm(Term term) {
        return scoreTerms.get(term);
    }

    /**
     * @return {@code true} if {@code term} is among the selected terms
     */
    public boolean hasScoreTerm(Term term) {
        return getScoreTerm(term) != null;
    }

    /**
     * @return the number of terms selected for {@code fieldName}, or 0 if that field has not
     *         been processed by {@link #selectBestTerms()}
     */
    public long size(String fieldName) {
        // A plain unboxing of the map value would throw for fields that were never processed.
        Integer selected = sizes.get(fieldName);
        return selected == null ? 0 : selected;
    }

    /** @return the maximum number of terms retained per field */
    public int getMaxNumTerms() {
        return maxNumTerms;
    }

    /** @return the minimum frequency a term must have in the term vector */
    public int getMinTermFreq() {
        return minTermFreq;
    }

    /** @return the maximum frequency a term may have in the term vector */
    public int getMaxTermFreq() {
        return maxTermFreq;
    }

    /** @return the minimum document frequency a term must have */
    public int getMinDocFreq() {
        return minDocFreq;
    }

    /** @return the maximum document frequency a term may have */
    public int getMaxDocFreq() {
        return maxDocFreq;
    }

    /** @return the minimum accepted word length */
    public int getMinWordLength() {
        return minWordLength;
    }

    /** @return the maximum accepted word length; values of 0 or less disable the check */
    public int getMaxWordLength() {
        return maxWordLength;
    }

    /** Sets the maximum number of terms retained per field. */
    public void setMaxNumTerms(int maxNumTerms) {
        this.maxNumTerms = maxNumTerms;
    }

    /** Sets the minimum term frequency; values of 0 or less disable the check. */
    public void setMinTermFreq(int minTermFreq) {
        this.minTermFreq = minTermFreq;
    }

    /** Sets the maximum term frequency. */
    public void setMaxTermFreq(int maxTermFreq) {
        this.maxTermFreq = maxTermFreq;
    }

    /** Sets the minimum document frequency; values of 0 or less disable the check. */
    public void setMinDocFreq(int minDocFreq) {
        this.minDocFreq = minDocFreq;
    }

    /** Sets the maximum document frequency. */
    public void setMaxDocFreq(int maxDocFreq) {
        this.maxDocFreq = maxDocFreq;
    }

    /** Sets the minimum word length; values of 0 or less disable the check. */
    public void setMinWordLength(int minWordLength) {
        this.minWordLength = minWordLength;
    }

    /** Sets the maximum word length; values of 0 or less disable the check. */
    public void setMaxWordLength(int maxWordLength) {
        this.maxWordLength = maxWordLength;
    }

    /** A term of a field together with the score it was selected with. */
    public static final class ScoreTerm {
        public String field;
        public String word;
        public float score;

        ScoreTerm(String field, String word, float score) {
            this.field = field;
            this.word = word;
            this.score = score;
        }

        /** Overwrites all three values in place. */
        void update(String field, String word, float score) {
            this.field = field;
            this.word = word;
            this.score = score;
        }
    }

    /**
     * Scores the terms of every selected field and retains the best ones, at most
     * {@link #getMaxNumTerms()} per field. Results are added to the lookup used by
     * {@link #getScoreTerm(Term)} and the per-field count reported by {@link #size(String)}.
     *
     * @throws IOException if reading terms, postings or statistics fails
     */
    public void selectBestTerms() throws IOException {
        // Handed back to TermsEnum#postings on every call so the implementation may recycle it.
        PostingsEnum postings = null;

        for (String fieldName : fields) {
            if (selectedFields != null && !selectedFields.contains(fieldName)) {
                continue;
            }

            Terms terms = fields.terms(fieldName);
            Terms topLevelTerms = topLevelFields.terms(fieldName);
            // if no terms found, take the retrieved term vector fields for stats
            if (topLevelTerms == null) {
                topLevelTerms = terms;
            }

            long numDocs = getDocCount(fieldName, topLevelTerms);

            // one queue per field name
            int capacity = queueCapacity(terms.size());
            if (capacity == 0) {
                // nothing can be retained, so there is no point in scanning the terms
                sizes.put(fieldName, 0);
                continue;
            }
            ScoreTermsQueue queue = new ScoreTermsQueue(capacity);

            // select terms with highest tf-idf
            TermsEnum termsEnum = terms.iterator();
            TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
            while (termsEnum.next() != null) {
                BytesRef termBytesRef = termsEnum.term();
                boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
                assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";

                Term term = new Term(fieldName, termBytesRef);
                String word = term.bytes().utf8ToString();

                // frequency of the term in the first document of its postings
                postings = termsEnum.postings(postings);
                postings.nextDoc();
                int freq = postings.freq();

                // remove noise words
                if (isNoise(word, freq)) {
                    continue;
                }

                // now call on docFreq
                long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
                if (!isAccepted(docFreq)) {
                    continue;
                }

                // filter based on score
                float score = computeScore(docFreq, freq, numDocs);
                queue.addOrUpdate(new ScoreTerm(term.field(), word, score));
            }

            // retain the best terms for quick lookups
            int count = 0;
            ScoreTerm scoreTerm;
            while ((scoreTerm = queue.pop()) != null) {
                scoreTerms.put(new Term(scoreTerm.field, scoreTerm.word), scoreTerm);
                count++;
            }
            sizes.put(fieldName, count);
        }
    }

    /**
     * Computes the queue capacity for a field: the smaller of {@code maxNumTerms} and the
     * number of terms, never negative.
     *
     * @param numTerms the value of {@code Terms.size()}; a negative value means the number
     *                 of terms is unknown, in which case only {@code maxNumTerms} bounds the queue
     */
    private int queueCapacity(long numTerms) {
        int cap = Math.max(0, maxNumTerms);
        if (numTerms < 0) {
            return cap;
        }
        // compare as long so that a very large term count is not truncated by an int cast
        return (int) Math.min((long) cap, numTerms);
    }

    /**
     * @return {@code true} if the word should be dropped because of its length or its
     *         frequency in the term vector
     */
    private boolean isNoise(String word, int freq) {
        // filter out words based on length
        int len = word.length();
        if (minWordLength > 0 && len < minWordLength) {
            return true;
        }
        if (maxWordLength > 0 && len > maxWordLength) {
            return true;
        }
        // filter out words that don't occur enough times in the source
        if (minTermFreq > 0 && freq < minTermFreq) {
            return true;
        }
        // filter out words that occur too many times in the source
        if (freq > maxTermFreq) {
            return true;
        }
        return false;
    }

    /**
     * @return {@code true} if the document frequency is non-zero and within the configured bounds
     */
    private boolean isAccepted(long docFreq) {
        // filter out words that don't occur in enough docs
        if (minDocFreq > 0 && docFreq < minDocFreq) {
            return false;
        }
        // filter out words that occur in too many docs
        if (docFreq > maxDocFreq) {
            return false;
        }
        // index update problem?
        if (docFreq == 0) {
            return false;
        }
        return true;
    }

    /**
     * @return the document count of the field, taken from {@code dfs} when available and
     *         from {@code topLevelTerms} otherwise
     */
    private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
        if (dfs != null) {
            return dfs.fieldStatistics().get(fieldName).docCount();
        }
        return topLevelTerms.getDocCount();
    }

    /**
     * @return the statistics of {@code term}, taken from {@code dfs} when available and
     *         otherwise read from {@code termsEnum} at its current position
     */
    private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException {
        if (dfs != null) {
            return dfs.termStatistics().get(term);
        }
        return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
    }

    /** @return the term frequency multiplied by the similarity's idf for the given counts */
    private float computeScore(long docFreq, int freq, long numDocs) {
        return freq * similarity.idf(docFreq, numDocs);
    }

    /**
     * Bounded queue ordered by score with the lowest score on top, so that the weakest
     * retained term is the one replaced when a better candidate arrives.
     */
    private static class ScoreTermsQueue extends org.apache.lucene.util.PriorityQueue<ScoreTerm> {
        private final int limit;

        ScoreTermsQueue(int maxSize) {
            super(maxSize);
            this.limit = maxSize;
        }

        @Override
        protected boolean lessThan(ScoreTerm a, ScoreTerm b) {
            return a.score < b.score;
        }

        /**
         * Adds {@code scoreTerm} while the queue has room; once full, replaces the
         * lowest-scoring entry if {@code scoreTerm} scores strictly higher.
         */
        public void addOrUpdate(ScoreTerm scoreTerm) {
            if (limit <= 0) {
                // a queue without capacity has no top element to compare against
                return;
            }
            if (this.size() < limit) {
                // there is still space in the queue
                this.add(scoreTerm);
            } else {
                // otherwise update the smallest in the queue in place and update the queue
                ScoreTerm scoreTermTop = this.top();
                if (scoreTermTop.score < scoreTerm.score) {
                    scoreTermTop.update(scoreTerm.field, scoreTerm.word, scoreTerm.score);
                    this.updateTop();
                }
            }
        }
    }
}