TermVectorsFilter.java example

Explorer
elasticsearch-master
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.action.termvectors;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.search.dfs.AggregatedDfs;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

/**
 * Keeps, for every (selected) field of a set of term vectors, only the terms with the highest
 * tf-idf score.
 * <p>
 * Terms are first filtered by word length, by their frequency in the term vector and by their
 * document frequency; the survivors are scored as {@code freq * idf} and the best
 * {@link #getMaxNumTerms()} of them per field are retained. After {@link #selectBestTerms()} has
 * run, the retained terms can be queried through {@link #getScoreTerm(Term)},
 * {@link #hasScoreTerm(Term)} and {@link #size(String)}.
 * <p>
 * For the {@code min*} settings and for {@code maxWordLength}, a value of {@code 0} (or less)
 * disables the corresponding check.
 */
public class TermVectorsFilter {
    public static final int DEFAULT_MAX_QUERY_TERMS = 25;
    public static final int DEFAULT_MIN_TERM_FREQ = 0;
    public static final int DEFAULT_MAX_TERM_FREQ = Integer.MAX_VALUE;
    public static final int DEFAULT_MIN_DOC_FREQ = 0;
    public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
    public static final int DEFAULT_MIN_WORD_LENGTH = 0;
    /** {@code 0} means "no upper bound on the word length" (see the check in {@code isNoise}). */
    public static final int DEFAULT_MAX_WORD_LENGTH = 0;

    private int maxNumTerms = DEFAULT_MAX_QUERY_TERMS;
    private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
    private int maxTermFreq = DEFAULT_MAX_TERM_FREQ;
    private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
    private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
    private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
    private int maxWordLength = DEFAULT_MAX_WORD_LENGTH;

    /** The term vectors whose terms are being filtered. */
    private final Fields fields;
    /** Source of document-frequency statistics when no {@link AggregatedDfs} is supplied. */
    private final Fields topLevelFields;
    /** Fields to process; {@code null} means every field of {@link #fields}. */
    private final Set<String> selectedFields;
    /** Optional pre-aggregated statistics; when non-null they replace the top-level statistics. */
    private final AggregatedDfs dfs;
    /** Retained terms, filled by {@link #selectBestTerms()}. */
    private final Map<Term, ScoreTerm> scoreTerms;
    /** Number of retained terms per processed field name. */
    private final Map<String, Integer> sizes = new HashMap<>();
    private final TFIDFSimilarity similarity;

    /**
     * @param termVectorsByField the term vectors to filter
     * @param topLevelFields     fields used to look up document frequencies when {@code dfs} is {@code null}
     * @param selectedFields     names of the fields to process, or {@code null} to process all of them
     * @param dfs                aggregated statistics to use instead of {@code topLevelFields}, may be {@code null}
     */
    public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields, @Nullable AggregatedDfs dfs) {
        this.fields = termVectorsByField;
        this.topLevelFields = topLevelFields;
        this.selectedFields = selectedFields;

        this.dfs = dfs;
        this.scoreTerms = new HashMap<>();
        this.similarity = new ClassicSimilarity();
    }

    /**
     * Applies every setting of {@code settings} that is non-null; settings left {@code null}
     * keep their current value.
     */
    public void setSettings(TermVectorsRequest.FilterSettings settings) {
        if (settings.maxNumTerms != null) {
            setMaxNumTerms(settings.maxNumTerms);
        }
        if (settings.minTermFreq != null) {
            setMinTermFreq(settings.minTermFreq);
        }
        if (settings.maxTermFreq != null) {
            setMaxTermFreq(settings.maxTermFreq);
        }
        if (settings.minDocFreq != null) {
            setMinDocFreq(settings.minDocFreq);
        }
        if (settings.maxDocFreq != null) {
            setMaxDocFreq(settings.maxDocFreq);
        }
        if (settings.minWordLength != null) {
            setMinWordLength(settings.minWordLength);
        }
        if (settings.maxWordLength != null) {
            setMaxWordLength(settings.maxWordLength);
        }
    }

    /**
     * @return the retained entry for {@code term}, or {@code null} if the term was not retained
     *         (or {@link #selectBestTerms()} has not been called yet)
     */
    public ScoreTerm getScoreTerm(Term term) {
        return scoreTerms.get(term);
    }

    /** @return {@code true} if {@code term} is among the retained terms */
    public boolean hasScoreTerm(Term term) {
        return getScoreTerm(term) != null;
    }

    /**
     * @return the number of terms retained for {@code fieldName}; {@code 0} when the field was
     *         not processed (unknown field, not selected, or {@link #selectBestTerms()} not yet run)
     */
    public long size(String fieldName) {
        // An absent entry must not be auto-unboxed: that would throw a NullPointerException.
        Integer count = sizes.get(fieldName);
        return count == null ? 0 : count;
    }

    public int getMaxNumTerms() {
        return maxNumTerms;
    }

    public int getMinTermFreq() {
        return minTermFreq;
    }

    public int getMaxTermFreq() {
        return maxTermFreq;
    }

    public int getMinDocFreq() {
        return minDocFreq;
    }

    public int getMaxDocFreq() {
        return maxDocFreq;
    }

    public int getMinWordLength() {
        return minWordLength;
    }

    public int getMaxWordLength() {
        return maxWordLength;
    }

    public void setMaxNumTerms(int maxNumTerms) {
        this.maxNumTerms = maxNumTerms;
    }

    public void setMinTermFreq(int minTermFreq) {
        this.minTermFreq = minTermFreq;
    }

    public void setMaxTermFreq(int maxTermFreq) {
        this.maxTermFreq = maxTermFreq;
    }

    public void setMinDocFreq(int minDocFreq) {
        this.minDocFreq = minDocFreq;
    }

    public void setMaxDocFreq(int maxDocFreq) {
        this.maxDocFreq = maxDocFreq;
    }

    public void setMinWordLength(int minWordLength) {
        this.minWordLength = minWordLength;
    }

    public void setMaxWordLength(int maxWordLength) {
        this.maxWordLength = maxWordLength;
    }

    /** A term of a field together with its score. */
    public static final class ScoreTerm {
        public String field;
        public String word;
        public float score;

        ScoreTerm(String field, String word, float score) {
            this.field = field;
            this.word = word;
            this.score = score;
        }

        /** Overwrites this entry in place; used by the queue to recycle its lowest-scoring entry. */
        void update(String field, String word, float score) {
            this.field = field;
            this.word = word;
            this.score = score;
        }
    }

    /**
     * Scores the terms of every selected field and retains the best ones, at most
     * {@link #getMaxNumTerms()} per field. Results are recorded for {@link #getScoreTerm(Term)}
     * and {@link #size(String)}.
     *
     * @throws IOException if reading the terms or their postings fails
     */
    public void selectBestTerms() throws IOException {
        // Handed back to postings(...) on every call so implementations that support reuse
        // do not have to allocate a new enum per term.
        PostingsEnum postings = null;

        for (String fieldName : fields) {
            if (selectedFields != null && !selectedFields.contains(fieldName)) {
                continue;
            }

            Terms terms = fields.terms(fieldName);
            if (terms == null) {
                // nothing to score for this field
                continue;
            }

            // if no terms found, take the retrieved term vector fields for stats
            Terms topLevelTerms = topLevelFields.terms(fieldName);
            if (topLevelTerms == null) {
                topLevelTerms = terms;
            }

            long numDocs = getDocCount(fieldName, topLevelTerms);

            // one queue per field name
            ScoreTermsQueue queue = new ScoreTermsQueue(queueCapacity(terms.size()));

            // select terms with highest tf-idf
            TermsEnum termsEnum = terms.iterator();
            TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
            while (termsEnum.next() != null) {
                BytesRef termBytesRef = termsEnum.term();
                // positions the top-level enum on the same term so its statistics can be read below
                boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
                assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";

                Term term = new Term(fieldName, termBytesRef);
                String word = term.bytes().utf8ToString();

                // remove noise words
                postings = termsEnum.postings(postings);
                int freq = getTermFreq(postings);
                if (isNoise(word, freq)) {
                    continue;
                }

                // now call on docFreq
                long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
                if (!isAccepted(docFreq)) {
                    continue;
                }

                // filter based on score
                float score = computeScore(docFreq, freq, numDocs);
                queue.addOrUpdate(new ScoreTerm(term.field(), word, score));
            }

            // retain the best terms for quick lookups
            int count = 0;
            ScoreTerm scoreTerm;
            while ((scoreTerm = queue.pop()) != null) {
                scoreTerms.put(new Term(scoreTerm.field, scoreTerm.word), scoreTerm);
                count++;
            }
            sizes.put(fieldName, count);
        }
    }

    /**
     * Computes how many terms the per-field queue may hold: the smaller of {@code maxNumTerms}
     * and the number of terms of the field, never negative.
     *
     * @param termCount the value reported by {@code Terms.size()}; a negative value is treated
     *                  as "unknown", in which case only {@code maxNumTerms} bounds the queue
     */
    private int queueCapacity(long termCount) {
        // Compare as longs: narrowing termCount before the comparison could overflow.
        long capacity = termCount < 0 ? maxNumTerms : Math.min(maxNumTerms, termCount);
        // capacity <= maxNumTerms, so the narrowing cast cannot overflow
        return (int) Math.max(0L, capacity);
    }

    /**
     * @return {@code true} if the word must be dropped because of its length or because of its
     *         frequency in the term vector
     */
    private boolean isNoise(String word, int freq) {
        // filter out words based on length
        int len = word.length();
        if (minWordLength > 0 && len < minWordLength) {
            return true;
        }
        if (maxWordLength > 0 && len > maxWordLength) {
            return true;
        }
        // filter out words that don't occur enough times in the source
        if (minTermFreq > 0 && freq < minTermFreq) {
            return true;
        }
        // filter out words that occur too many times in the source
        if (freq > maxTermFreq) {
            return true;
        }
        return false;
    }

    /** @return {@code true} if the document frequency lies within the configured bounds and is non-zero */
    private boolean isAccepted(long docFreq) {
        // filter out words that don't occur in enough docs
        if (minDocFreq > 0 && docFreq < minDocFreq) {
            return false;
        }
        // filter out words that occur in too many docs
        if (docFreq > maxDocFreq) {
            return false;
        }
        // index update problem?
        if (docFreq == 0) {
            return false;
        }
        return true;
    }

    /**
     * @return the document count of the field, taken from the aggregated statistics when
     *         available and from {@code topLevelTerms} otherwise
     */
    private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
        if (dfs != null) {
            // NOTE(review): assumes dfs holds statistics for every processed field - confirm with callers
            return dfs.fieldStatistics().get(fieldName).docCount();
        }
        return topLevelTerms.getDocCount();
    }

    /**
     * @param termsEnum top-level enum, expected to be positioned on {@code term}
     * @return the statistics of {@code term}, taken from the aggregated statistics when
     *         available and from {@code termsEnum} otherwise
     */
    private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException {
        if (dfs != null) {
            // NOTE(review): assumes dfs holds statistics for every scored term - confirm with callers
            return dfs.termStatistics().get(term);
        }
        return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
    }

    /**
     * Advances {@code postings} to its first document and returns the term frequency there.
     */
    private int getTermFreq(PostingsEnum postings) throws IOException {
        postings.nextDoc();
        return postings.freq();
    }

    /** @return {@code freq} multiplied by the similarity's idf for the given statistics */
    private float computeScore(long docFreq, int freq, long numDocs) {
        return freq * similarity.idf(docFreq, numDocs);
    }

    /**
     * Bounded queue ordered by ascending score, so that {@code top()} is the lowest-scoring
     * entry and is the one recycled when a better candidate arrives.
     */
    private static class ScoreTermsQueue extends org.apache.lucene.util.PriorityQueue<ScoreTerm> {
        private final int limit;

        ScoreTermsQueue(int maxSize) {
            // a negative size is treated as an empty queue
            super(Math.max(0, maxSize));
            this.limit = Math.max(0, maxSize);
        }

        @Override
        protected boolean lessThan(ScoreTerm a, ScoreTerm b) {
            return a.score < b.score;
        }

        /**
         * Offers {@code scoreTerm} to the queue: it is added while there is room, otherwise it
         * replaces the current lowest-scoring entry if it scores strictly higher. Nothing is
         * kept when the queue was created with a limit of zero.
         */
        public void addOrUpdate(ScoreTerm scoreTerm) {
            if (limit <= 0) {
                // nothing may be retained; top() has no element to compare against
                return;
            }
            if (this.size() < limit) {
                // there is still space in the queue
                this.add(scoreTerm);
            } else {
                // otherwise update the smallest in the queue in place and update the queue
                ScoreTerm scoreTermTop = this.top();
                if (scoreTermTop.score < scoreTerm.score) {
                    scoreTermTop.update(scoreTerm.field, scoreTerm.word, scoreTerm.score);
                    this.updateTop();
                }
            }
        }
    }
}