/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.search.suggest.completion;

import com.carrotsearch.hppc.ObjectLongHashMap;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester;
import org.apache.lucene.search.suggest.analyzing.XFuzzySuggester;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.CompletionLookupProvider;
import org.elasticsearch.search.suggest.completion.Completion090PostingsFormat.LookupFactory;
import org.elasticsearch.search.suggest.context.ContextMapping.ContextQuery;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
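/**
 * A {@link CompletionLookupProvider} for the "analyzing" completion codec.
 * At index time, {@link #consumer(IndexOutput)} folds each completion field
 * into an {@link XAnalyzingSuggester} FST and appends it to the output
 * together with its build-time settings; a trailing per-field offset table,
 * followed by a fixed-width (8 byte) pointer to that table, lets
 * {@link #load(IndexInput)} seek straight to each field's FST without
 * scanning the file.
 */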
public class AnalyzingCompletionLookupProvider extends CompletionLookupProvider {

    // for serialization
    public static final int SERIALIZE_PRESERVE_SEPARATORS = 1;
    public static final int SERIALIZE_HAS_PAYLOADS = 2;
    public static final int SERIALIZE_PRESERVE_POSITION_INCREMENTS = 4;

    private static final int MAX_SURFACE_FORMS_PER_ANALYZED_FORM = 256;
    private static final int MAX_GRAPH_EXPANSIONS = -1;

    public static final String CODEC_NAME = "analyzing";
    public static final int CODEC_VERSION_START = 1;
    public static final int CODEC_VERSION_SERIALIZED_LABELS = 2;
    public static final int CODEC_VERSION_CHECKSUMS = 3;
    public static final int CODEC_VERSION_LATEST = CODEC_VERSION_CHECKSUMS;

    private final boolean preserveSep;
    private final boolean preservePositionIncrements;
    private final int maxSurfaceFormsPerAnalyzedForm;
    private final int maxGraphExpansions;
    private final boolean hasPayloads;
    private final XAnalyzingSuggester prototype;

    public AnalyzingCompletionLookupProvider(boolean preserveSep, boolean exactFirst, boolean preservePositionIncrements, boolean hasPayloads) {
        this.preserveSep = preserveSep;
        this.preservePositionIncrements = preservePositionIncrements;
        this.hasPayloads = hasPayloads;
        this.maxSurfaceFormsPerAnalyzedForm = MAX_SURFACE_FORMS_PER_ANALYZED_FORM;
        this.maxGraphExpansions = MAX_GRAPH_EXPANSIONS;
        int options = preserveSep ? XAnalyzingSuggester.PRESERVE_SEP : 0;
        // needs to be fixed in the suggester first before it can be supported
        //options |= exactFirst ? XAnalyzingSuggester.EXACT_FIRST : 0;
        prototype = new XAnalyzingSuggester(null, null, null, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions,
                preservePositionIncrements, null, false, 1, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP,
                XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
    }

    @Override
    public String getName() {
        return "analyzing";
    }

    public boolean getPreserveSep() {
        return preserveSep;
    }

    public boolean getPreservePositionsIncrements() {
        return preservePositionIncrements;
    }

    public boolean hasPayloads() {
        return hasPayloads;
    }

    @Override
    public FieldsConsumer consumer(final IndexOutput output) throws IOException {
        CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
        return new FieldsConsumer() {
            private Map<String, Long> fieldOffsets = new HashMap<>();

            @Override
            public void close() throws IOException {
                try {
                    /*
                     * write the offsets per field such that we know where
                     * we need to load the FSTs from
                     */
                    long pointer = output.getFilePointer();
                    output.writeVInt(fieldOffsets.size());
                    for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                        output.writeString(entry.getKey());
                        output.writeVLong(entry.getValue());
                    }
                    output.writeLong(pointer);
                    CodecUtil.writeFooter(output);
                } finally {
                    IOUtils.close(output);
                }
            }
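
            /*
             * Each term of the completion field is an analyzed form; the
             * corresponding surface forms, payloads and weights arrive encoded
             * in the postings payloads and are decoded via parsePayload()
             * (inherited from CompletionLookupProvider) before being handed to
             * the suggester's XBuilder, which folds them into one FST per field.
             */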
            @Override
            public void write(Fields fields) throws IOException {
                for (String field : fields) {
                    Terms terms = fields.terms(field);
                    if (terms == null) {
                        continue;
                    }
                    TermsEnum termsEnum = terms.iterator();
                    PostingsEnum docsEnum = null;
                    final SuggestPayload spare = new SuggestPayload();
                    int maxAnalyzedPathsForOneInput = 0;
                    final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                            maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                    int docCount = 0;
                    while (true) {
                        BytesRef term = termsEnum.next();
                        if (term == null) {
                            break;
                        }
                        docsEnum = termsEnum.postings(docsEnum, PostingsEnum.PAYLOADS);
                        builder.startTerm(term);
                        int docFreq = 0;
                        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            for (int i = 0; i < docsEnum.freq(); i++) {
                                final int position = docsEnum.nextPosition();
                                AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
                                builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                                // multi fields have the same surface form so we sum up here
                                maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                            }
                            docFreq++;
                            docCount = Math.max(docCount, docsEnum.docID() + 1);
                        }
                        builder.finishTerm(docFreq);
                    }
                    /*
                     * Here we are done processing the field and we can
                     * build the FST and write it to disk.
                     */
                    FST<Pair<Long, BytesRef>> build = builder.build();
                    assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
                    /*
                     * it's possible that the FST is null if we have 2 segments that get merged
                     * and all docs that have a value in this field are deleted. This will cause
                     * a consumer to be created but it doesn't consume any values causing the FSTBuilder
                     * to return null.
                     */
                    if (build != null) {
                        fieldOffsets.put(field, output.getFilePointer());
                        build.save(output);
                        /* write some more meta-info */
                        output.writeVInt(maxAnalyzedPathsForOneInput);
                        output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                        output.writeInt(maxGraphExpansions); // can be negative
                        int options = 0;
                        options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                        options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                        options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                        output.writeVInt(options);
                        output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
                        output.writeVInt(XAnalyzingSuggester.END_BYTE);
                        output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                        output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
                    }
                }
            }
        };
    }

    @Override
    public LookupFactory load(IndexInput input) throws IOException {
        long sizeInBytes = 0;
        int version = CodecUtil.checkHeader(input, CODEC_NAME, CODEC_VERSION_START, CODEC_VERSION_LATEST);
        if (version >= CODEC_VERSION_CHECKSUMS) {
            CodecUtil.checksumEntireFile(input);
        }
        final long metaPointerPosition = input.length() - (version >= CODEC_VERSION_CHECKSUMS ? 8 + CodecUtil.footerLength() : 8);
        final Map<String, AnalyzingSuggestHolder> lookupMap = new HashMap<>();
        input.seek(metaPointerPosition);
        long metaPointer = input.readLong();
        input.seek(metaPointer);

        int numFields = input.readVInt();
        Map<Long, String> meta = new TreeMap<>();
        for (int i = 0; i < numFields; i++) {
            String name = input.readString();
            long offset = input.readVLong();
            meta.put(offset, name);
        }

        for (Map.Entry<Long, String> entry : meta.entrySet()) {
            input.seek(entry.getKey());
            FST<Pair<Long, BytesRef>> fst = new FST<>(input, new PairOutputs<>(
                    PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
            int maxAnalyzedPathsForOneInput = input.readVInt();
            int maxSurfaceFormsPerAnalyzedForm = input.readVInt();
            int maxGraphExpansions = input.readInt();
            int options = input.readVInt();
            boolean preserveSep = (options & SERIALIZE_PRESERVE_SEPARATORS) != 0;
            boolean hasPayloads = (options & SERIALIZE_HAS_PAYLOADS) != 0;
            boolean preservePositionIncrements = (options & SERIALIZE_PRESERVE_POSITION_INCREMENTS) != 0;

            // the first version did not include these four fields, so fall back to the old defaults (before the
            // analyzing suggester was updated in Lucene, so we cannot use the suggester defaults)
            int sepLabel, payloadSep, endByte, holeCharacter;
            switch (version) {
                case CODEC_VERSION_START:
                    sepLabel = 0xFF;
                    payloadSep = '\u001f';
                    endByte = 0x0;
                    holeCharacter = '\u001E';
                    break;
                default:
                    sepLabel = input.readVInt();
                    endByte = input.readVInt();
                    payloadSep = input.readVInt();
                    holeCharacter = input.readVInt();
            }

            AnalyzingSuggestHolder holder = new AnalyzingSuggestHolder(preserveSep, preservePositionIncrements,
                    maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, hasPayloads, maxAnalyzedPathsForOneInput,
                    fst, sepLabel, payloadSep, endByte, holeCharacter);
            sizeInBytes += fst.ramBytesUsed();
            lookupMap.put(entry.getValue(), holder);
        }
        final long ramBytesUsed = sizeInBytes;
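        // The holders above share the memory-resident FSTs; each getLookup()
        // call below wraps the requested field's FST in a fresh suggester
        // configured from the per-request suggestion context.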
        return new LookupFactory() {
            @Override
            public Lookup getLookup(CompletionFieldMapper.CompletionFieldType fieldType, CompletionSuggestionContext suggestionContext) {
                AnalyzingSuggestHolder analyzingSuggestHolder = lookupMap.get(fieldType.names().indexName());
                if (analyzingSuggestHolder == null) {
                    return null;
                }
                int flags = analyzingSuggestHolder.getPreserveSeparator() ? XAnalyzingSuggester.PRESERVE_SEP : 0;

                final XAnalyzingSuggester suggester;
                final Automaton queryPrefix = fieldType.requiresContext() ?
                        ContextQuery.toAutomaton(analyzingSuggestHolder.getPreserveSeparator(), suggestionContext.getContextQueries()) : null;

                if (suggestionContext.isFuzzy()) {
                    suggester = new XFuzzySuggester(fieldType.indexAnalyzer(), queryPrefix, fieldType.searchAnalyzer(), flags,
                            analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
                            suggestionContext.getFuzzyEditDistance(), suggestionContext.isFuzzyTranspositions(),
                            suggestionContext.getFuzzyPrefixLength(), suggestionContext.getFuzzyMinLength(),
                            suggestionContext.isFuzzyUnicodeAware(), analyzingSuggestHolder.fst, analyzingSuggestHolder.hasPayloads,
                            analyzingSuggestHolder.maxAnalyzedPathsForOneInput, analyzingSuggestHolder.sepLabel,
                            analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte,
                            analyzingSuggestHolder.holeCharacter);
                } else {
                    suggester = new XAnalyzingSuggester(fieldType.indexAnalyzer(), queryPrefix, fieldType.searchAnalyzer(), flags,
                            analyzingSuggestHolder.maxSurfaceFormsPerAnalyzedForm, analyzingSuggestHolder.maxGraphExpansions,
                            analyzingSuggestHolder.preservePositionIncrements, analyzingSuggestHolder.fst,
                            analyzingSuggestHolder.hasPayloads, analyzingSuggestHolder.maxAnalyzedPathsForOneInput,
                            analyzingSuggestHolder.sepLabel, analyzingSuggestHolder.payloadSep, analyzingSuggestHolder.endByte,
                            analyzingSuggestHolder.holeCharacter);
                }
                return suggester;
            }
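
            // Sums FST heap usage across all completion fields; when field name
            // patterns are supplied, per-field sizes are additionally reported
            // for every field matching one of the (possibly wildcarded) patterns.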
            @Override
            public CompletionStats stats(String... fields) {
                long sizeInBytes = 0;
                ObjectLongHashMap<String> completionFields = null;
                if (fields != null && fields.length > 0) {
                    completionFields = new ObjectLongHashMap<>(fields.length);
                }

                for (Map.Entry<String, AnalyzingSuggestHolder> entry : lookupMap.entrySet()) {
                    sizeInBytes += entry.getValue().fst.ramBytesUsed();
                    if (fields == null || fields.length == 0) {
                        continue;
                    }
                    if (Regex.simpleMatch(fields, entry.getKey())) {
                        long fstSize = entry.getValue().fst.ramBytesUsed();
                        completionFields.addTo(entry.getKey(), fstSize);
                    }
                }

                return new CompletionStats(sizeInBytes, completionFields);
            }

            @Override
            AnalyzingSuggestHolder getAnalyzingSuggestHolder(MappedFieldType fieldType) {
                return lookupMap.get(fieldType.names().indexName());
            }

            @Override
            public long ramBytesUsed() {
                return ramBytesUsed;
            }

            @Override
            public Collection<Accountable> getChildResources() {
                return Accountables.namedAccountables("field", lookupMap);
            }
        };
    }

    static class AnalyzingSuggestHolder implements Accountable {
        final boolean preserveSep;
        final boolean preservePositionIncrements;
        final int maxSurfaceFormsPerAnalyzedForm;
        final int maxGraphExpansions;
        final boolean hasPayloads;
        final int maxAnalyzedPathsForOneInput;
        final FST<Pair<Long, BytesRef>> fst;
        final int sepLabel;
        final int payloadSep;
        final int endByte;
        final int holeCharacter;

        public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm,
                                      int maxGraphExpansions, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
                                      FST<Pair<Long, BytesRef>> fst) {
            this(preserveSep, preservePositionIncrements, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, hasPayloads,
                    maxAnalyzedPathsForOneInput, fst, XAnalyzingSuggester.SEP_LABEL, XAnalyzingSuggester.PAYLOAD_SEP,
                    XAnalyzingSuggester.END_BYTE, XAnalyzingSuggester.HOLE_CHARACTER);
        }

        public AnalyzingSuggestHolder(boolean preserveSep, boolean preservePositionIncrements, int maxSurfaceFormsPerAnalyzedForm,
                                      int maxGraphExpansions, boolean hasPayloads, int maxAnalyzedPathsForOneInput,
                                      FST<Pair<Long, BytesRef>> fst, int sepLabel, int payloadSep, int endByte, int holeCharacter) {
            this.preserveSep = preserveSep;
            this.preservePositionIncrements = preservePositionIncrements;
            this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;
            this.maxGraphExpansions = maxGraphExpansions;
            this.hasPayloads = hasPayloads;
            this.maxAnalyzedPathsForOneInput = maxAnalyzedPathsForOneInput;
            this.fst = fst;
            this.sepLabel = sepLabel;
            this.payloadSep = payloadSep;
            this.endByte = endByte;
            this.holeCharacter = holeCharacter;
        }

        public boolean getPreserveSeparator() {
            return preserveSep;
        }

        public boolean getPreservePositionIncrements() {
            return preservePositionIncrements;
        }

        public boolean hasPayloads() {
            return hasPayloads;
        }

        @Override
        public long ramBytesUsed() {
            if (fst != null) {
                return fst.ramBytesUsed();
            } else {
                return 0;
            }
        }

        @Override
        public Collection<Accountable> getChildResources() {
            if (fst != null) {
                return Collections.singleton(Accountables.namedAccountable("fst", fst));
            } else {
                return Collections.emptyList();
            }
        }
    }

    @Override
    public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
        return prototype.toFiniteStrings(stream);
    }
}