package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.FieldInfo.IndexOptions;

/**
 * Holds state for inverting all occurrences of a single
 * field in the document.  This class doesn't do anything
 * itself; instead, it forwards the tokens produced by
 * analysis to its own consumer
 * (InvertedDocConsumerPerField).  It also interacts with an
 * endConsumer (InvertedDocEndConsumerPerField).
 */

final class DocInverterPerField extends DocFieldConsumerPerField {

  final FieldInfo fieldInfo;
  final InvertedDocConsumerPerField consumer;
  final InvertedDocEndConsumerPerField endConsumer;
  final DocumentsWriterPerThread.DocState docState;
  final FieldInvertState fieldState;

  public DocInverterPerField(DocInverter parent, FieldInfo fieldInfo) {
    this.fieldInfo = fieldInfo;
    docState = parent.docState;
    fieldState = new FieldInvertState(fieldInfo.name);
    this.consumer = parent.consumer.addField(this, fieldInfo);
    this.endConsumer = parent.endConsumer.addField(this, fieldInfo);
  }

  @Override
  void abort() {
    try {
      consumer.abort();
    } finally {
      endConsumer.abort();
    }
  }

  @Override
  public void processFields(final IndexableField[] fields,
                            final int count) throws IOException {

    fieldState.reset();

    final boolean doInvert = consumer.start(fields, count);

    for (int i = 0; i < count; i++) {

      final IndexableField field = fields[i];
      final IndexableFieldType fieldType = field.fieldType();

      // TODO FI: this should be "genericized" to querying
      // consumer if it wants to see this particular field
      // tokenized.
      if (doInvert) {
        final boolean analyzed = fieldType.tokenized() && docState.analyzer != null;

        // if the field omits norms, the boost cannot be indexed.
        if (fieldType.omitNorms() && field.boost() != 1.0f) {
          throw new UnsupportedOperationException("You cannot set an index-time boost: norms are omitted for field '" + field.name() + "'");
        }

        // only bother checking offsets if something will consume them.
        // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
        final boolean checkOffsets = fieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
        int lastStartOffset = 0;

        if (i > 0) {
          fieldState.position += analyzed ? docState.analyzer.getPositionIncrementGap(fieldInfo.name) : 0;
        }

        /*
         * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
         * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
         * but rather a finally that takes note of the problem.
         */

        boolean succeededInProcessingField = false;

        try (TokenStream stream = field.tokenStream(docState.analyzer)) {
          // reset the TokenStream to the first token
          stream.reset();

          boolean hasMoreTokens = stream.incrementToken();

          fieldState.attributeSource = stream;

          OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
          PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);

          if (hasMoreTokens) {
            consumer.start(field);

            do {
              // If we hit an exception in stream.next below
              // (which is fairly common, eg if analyzer
              // chokes on a given document), then it's
              // non-aborting and (above) this one document
              // will be marked as deleted, but still
              // consume a docID

              final int posIncr = posIncrAttribute.getPositionIncrement();
              if (posIncr < 0) {
                throw new IllegalArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.name() + "'");
              }
              if (fieldState.position == 0 && posIncr == 0) {
                throw new IllegalArgumentException("first position increment must be > 0 (got 0) for field '" + field.name() + "'");
              }
              int position = fieldState.position + posIncr;
              if (position > 0) {
                // NOTE: confusing: this "mirrors" the
                // position++ we do below
                position--;
              } else if (position < 0) {
                throw new IllegalArgumentException("position overflow for field '" + field.name() + "'");
              }

              // position is legal, we can safely place it in fieldState now.
              // not sure if anything will use fieldState after non-aborting exc...
              fieldState.position = position;

              if (posIncr == 0) {
                fieldState.numOverlap++;
              }

              if (checkOffsets) {
                int startOffset = fieldState.offset + offsetAttribute.startOffset();
                int endOffset = fieldState.offset + offsetAttribute.endOffset();
                if (startOffset < 0 || endOffset < startOffset) {
                  throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
                      + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.name() + "'");
                }
                if (startOffset < lastStartOffset) {
                  throw new IllegalArgumentException("offsets must not go backwards startOffset=" + startOffset
                      + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.name() + "'");
                }
                lastStartOffset = startOffset;
              }

              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;
              } finally {
                if (!success) {
                  docState.docWriter.setAborting();
                }
              }
              fieldState.length++;
              fieldState.position++;
            } while (stream.incrementToken());
          }
          // trigger streams to perform end-of-stream operations
          stream.end();
          // TODO: maybe add some safety? then again, its already checked
          // when we come back around to the field...
          fieldState.position += posIncrAttribute.getPositionIncrement();
          fieldState.offset += offsetAttribute.endOffset();

          if (docState.maxTermPrefix != null) {
            final String msg = "Document contains at least one immense term in field=\"" + fieldInfo.name
                + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8
                + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  "
                + "The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'";
            if (docState.infoStream.isEnabled("IW")) {
              docState.infoStream.message("IW", "ERROR: " + msg);
            }
            docState.maxTermPrefix = null;
            throw new IllegalArgumentException(msg);
          }

          /* if success was false above there is an exception coming through and we won't get here.*/
          succeededInProcessingField = true;
        } finally {
          if (!succeededInProcessingField && docState.infoStream.isEnabled("DW")) {
            docState.infoStream.message("DW", "An exception was thrown while processing field " + fieldInfo.name);
          }
        }

        fieldState.offset += analyzed ? docState.analyzer.getOffsetGap(fieldInfo.name) : 0;
        fieldState.boost *= field.boost();
      }

      // LUCENE-2387: don't hang onto the field, so GC can
      // reclaim
      fields[i] = null;
    }

    consumer.finish();
    endConsumer.finish();
  }

  @Override
  FieldInfo getFieldInfo() {
    return fieldInfo;
  }
}