// TermVectorsWriter.java example
//
// Explorer
// elasticsearch-master
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.action.termvectors;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.action.termvectors.TermVectorsRequest.Flag;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.search.dfs.AggregatedDfs;

import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;

// package only - this is an internal class!
final class TermVectorsWriter {
    final List<String> fields = new ArrayList<>();
    final List<Long> fieldOffset = new ArrayList<>();
    final BytesStreamOutput output = new BytesStreamOutput(1); // can we somehow
    // predict the
    // size here?
    private static final String HEADER = "TV";
    private static final int CURRENT_VERSION = -1;
    TermVectorsResponse response = null;

    TermVectorsWriter(TermVectorsResponse termVectorsResponse) throws IOException {
        response = termVectorsResponse;
    }

    void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields,
                   @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException {
        int numFieldsWritten = 0;
        PostingsEnum docsAndPosEnum = null;
        PostingsEnum docsEnum = null;
        boolean hasScores = termVectorsFilter != null;

        for (String field : termVectorsByField) {
            if ((selectedFields != null) && (!selectedFields.contains(field))) {
                continue;
            }

            Terms fieldTermVector = termVectorsByField.terms(field);
            Terms topLevelTerms = topLevelFields.terms(field);

            // if no terms found, take the retrieved term vector fields for stats
            if (topLevelTerms == null) {
                topLevelTerms = EMPTY_TERMS;
            }

            TermsEnum topLevelIterator = topLevelTerms.iterator();
            boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
            boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
            boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();

            long termsSize = fieldTermVector.size();
            if (hasScores) {
                termsSize = Math.min(termsSize, termVectorsFilter.size(field));
            }
            startField(field, termsSize, positions, offsets, payloads);

            if (flags.contains(Flag.FieldStatistics)) {
                if (dfs != null) {
                    writeFieldStatistics(dfs.fieldStatistics().get(field));
                } else {
                    writeFieldStatistics(topLevelTerms);
                }
            }
            TermsEnum iterator = fieldTermVector.iterator();
            final boolean useDocsAndPos = positions || offsets || payloads;
            while (iterator.next() != null) { // iterate all terms of the current field
                BytesRef termBytesRef = iterator.term();
                Term term = new Term(field, termBytesRef);

                // with filtering we only keep the best terms
                if (hasScores && !termVectorsFilter.hasScoreTerm(term)) {
                    continue;
                }

                startTerm(termBytesRef);
                if (flags.contains(Flag.TermStatistics)) {
                    // get the doc frequency
                    if (dfs != null) {
                        final TermStatistics statistics = dfs.termStatistics().get(term);
                        writeTermStatistics(statistics == null ? new TermStatistics(termBytesRef, 0, 0) : statistics);
                    } else {
                        boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
                        if (foundTerm) {
                            writeTermStatistics(topLevelIterator);
                        } else {
                            writeTermStatistics(new TermStatistics(termBytesRef, 0, 0));
                        }
                    }
                }
                if (useDocsAndPos) {
                    // given we have pos or offsets
                    docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets, payloads);
                } else {
                    // if we do not have the positions stored, we need to
                    // get the frequency from a PostingsEnum.
                    docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
                }
                if (hasScores) {
                    writeScoreTerm(termVectorsFilter.getScoreTerm(term));
                }
            }
            numFieldsWritten++;
        }
        response.setTermVectorsField(output);
        response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics), hasScores));
    }

    private BytesReference writeHeader(int numFieldsWritten, boolean getTermStatistics, boolean getFieldStatistics, boolean scores) throws IOException {
        // now, write the information about offset of the terms in the
        // termVectors field
        BytesStreamOutput header = new BytesStreamOutput();
        header.writeString(HEADER);
        header.writeInt(CURRENT_VERSION);
        header.writeBoolean(getTermStatistics);
        header.writeBoolean(getFieldStatistics);
        header.writeBoolean(scores);
        header.writeVInt(numFieldsWritten);
        for (int i = 0; i < fields.size(); i++) {
            header.writeString(fields.get(i));
            header.writeVLong(fieldOffset.get(i).longValue());
        }
        header.close();
        return header.bytes();
    }

    private PostingsEnum writeTermWithDocsOnly(TermsEnum iterator, PostingsEnum docsEnum) throws IOException {
        docsEnum = iterator.postings(docsEnum);
        int nextDoc = docsEnum.nextDoc();
        assert nextDoc != DocIdSetIterator.NO_MORE_DOCS;
        writeFreq(docsEnum.freq());
        nextDoc = docsEnum.nextDoc();
        assert nextDoc == DocIdSetIterator.NO_MORE_DOCS;
        return docsEnum;
    }

    private PostingsEnum writeTermWithDocsAndPos(TermsEnum iterator, PostingsEnum docsAndPosEnum, boolean positions,
                                                         boolean offsets, boolean payloads) throws IOException {
        docsAndPosEnum = iterator.postings(docsAndPosEnum, PostingsEnum.ALL);
        // for each term (iterator next) in this field (field)
        // iterate over the docs (should only be one)
        int nextDoc = docsAndPosEnum.nextDoc();
        assert nextDoc != DocIdSetIterator.NO_MORE_DOCS;
        final int freq = docsAndPosEnum.freq();
        writeFreq(freq);
        for (int j = 0; j < freq; j++) {
            int curPos = docsAndPosEnum.nextPosition();
            if (positions) {
                writePosition(curPos);
            }
            if (offsets) {
                writeOffsets(docsAndPosEnum.startOffset(), docsAndPosEnum.endOffset());
            }
            if (payloads) {
                writePayload(docsAndPosEnum.getPayload());
            }
        }
        nextDoc = docsAndPosEnum.nextDoc();
        assert nextDoc == DocIdSetIterator.NO_MORE_DOCS;
        return docsAndPosEnum;
    }

    private void writePayload(BytesRef payload) throws IOException {
        if (payload != null) {
            output.writeVInt(payload.length);
            output.writeBytes(payload.bytes, payload.offset, payload.length);
        } else {
            output.writeVInt(0);
        }
    }

    private void writeFreq(int termFreq) throws IOException {
        writePotentiallyNegativeVInt(termFreq);
    }

    private void writeOffsets(int startOffset, int endOffset) throws IOException {
        assert (startOffset >= 0);
        assert (endOffset >= 0);
        if ((startOffset >= 0) && (endOffset >= 0)) {
            output.writeVInt(startOffset);
            output.writeVInt(endOffset);
        }
    }

    private void writePosition(int pos) throws IOException {
        assert (pos >= 0);
        if (pos >= 0) {
            output.writeVInt(pos);
        }
    }

    private void startField(String fieldName, long termsSize, boolean writePositions, boolean writeOffsets, boolean writePayloads)
            throws IOException {
        fields.add(fieldName);
        fieldOffset.add(output.position());
        output.writeVLong(termsSize);
        // add information on if positions etc. are written
        output.writeBoolean(writePositions);
        output.writeBoolean(writeOffsets);
        output.writeBoolean(writePayloads);
    }

    private void startTerm(BytesRef term) throws IOException {
        output.writeVInt(term.length);
        output.writeBytes(term.bytes, term.offset, term.length);
    }

    private void writeTermStatistics(TermsEnum topLevelIterator) throws IOException {
        int docFreq = topLevelIterator.docFreq();
        assert (docFreq >= -1);
        writePotentiallyNegativeVInt(docFreq);
        long ttf = topLevelIterator.totalTermFreq();
        assert (ttf >= -1);
        writePotentiallyNegativeVLong(ttf);
    }

    private void writeTermStatistics(TermStatistics termStatistics) throws IOException {
        int docFreq = (int) termStatistics.docFreq();
        assert (docFreq >= -1);
        writePotentiallyNegativeVInt(docFreq);
        long ttf = termStatistics.totalTermFreq();
        assert (ttf >= -1);
        writePotentiallyNegativeVLong(ttf);
    }

    private void writeFieldStatistics(Terms topLevelTerms) throws IOException {
        long sttf = topLevelTerms.getSumTotalTermFreq();
        assert (sttf >= -1);
        writePotentiallyNegativeVLong(sttf);
        long sdf = topLevelTerms.getSumDocFreq();
        assert (sdf >= -1);
        writePotentiallyNegativeVLong(sdf);
        int dc = topLevelTerms.getDocCount();
        assert (dc >= -1);
        writePotentiallyNegativeVInt(dc);
    }

    private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
        long sttf = fieldStats.sumTotalTermFreq();
        assert (sttf >= -1);
        writePotentiallyNegativeVLong(sttf);
        long sdf = fieldStats.sumDocFreq();
        assert (sdf >= -1);
        writePotentiallyNegativeVLong(sdf);
        int dc = (int) fieldStats.docCount();
        assert (dc >= -1);
        writePotentiallyNegativeVInt(dc);
    }

    private void writeScoreTerm(TermVectorsFilter.ScoreTerm scoreTerm) throws IOException {
        output.writeFloat(Math.max(0, scoreTerm.score));
    }

    private void writePotentiallyNegativeVInt(int value) throws IOException {
        // term freq etc. can be negative if not present... we transport that
        // further...
        output.writeVInt(Math.max(0, value + 1));
    }

    private void writePotentiallyNegativeVLong(long value) throws IOException {
        // term freq etc. can be negative if not present... we transport that
        // further...
        output.writeVLong(Math.max(0, value + 1));
    }

    /** Implements an empty {@link Terms}. */
    private static final Terms EMPTY_TERMS = new Terms() {
        @Override public TermsEnum iterator() throws IOException { return TermsEnum.EMPTY; }
        @Override public long size() throws IOException { return 0; }
        @Override public long getSumTotalTermFreq() throws IOException { return 0; }
        @Override public long getSumDocFreq() throws IOException { return 0; }
        @Override public int getDocCount() throws IOException { return 0; }
        @Override public boolean hasFreqs() { return false; }
        @Override public boolean hasOffsets() { return false; }
        @Override public boolean hasPositions() { return false; }
        @Override public boolean hasPayloads() { return false; }
    };

}