package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.FieldInfo.IndexOptions;

/**
 * Holds state for inverting all occurrences of a single
 * field in the document.  This class doesn't do anything
 * itself; instead, it forwards the tokens produced by
 * analysis to its own consumer
 * (InvertedDocConsumerPerField).  It also interacts with an
 * endConsumer (InvertedDocEndConsumerPerField).
 */

final class DocInverterPerField extends DocFieldConsumerPerField {

  final FieldInfo fieldInfo;
  final InvertedDocConsumerPerField consumer;
  final InvertedDocEndConsumerPerField endConsumer;
  final DocumentsWriterPerThread.DocState docState;
  final FieldInvertState fieldState;

  public DocInverterPerField(DocInverter parent, FieldInfo fieldInfo) {
    this.fieldInfo = fieldInfo;
    docState = parent.docState;
    fieldState = new FieldInvertState(fieldInfo.name);
    this.consumer = parent.consumer.addField(this, fieldInfo);
    this.endConsumer = parent.endConsumer.addField(this, fieldInfo);
  }

  @Override
  void abort() {
    try {
      consumer.abort();
    } finally {
      endConsumer.abort();
    }
  }

  @Override
  public void processFields(final IndexableField[] fields,
                            final int count) throws IOException {

    fieldState.reset();

    final boolean doInvert = consumer.start(fields, count);

    for (int i = 0; i < count; i++) {

      final IndexableField field = fields[i];
      final IndexableFieldType fieldType = field.fieldType();

      // TODO FI: this should be "genericized" to querying
      // consumer if it wants to see this particular field
      // tokenized.
      if (doInvert) {
        final boolean analyzed = fieldType.tokenized() && docState.analyzer != null;

        // if the field omits norms, the boost cannot be indexed.
        if (fieldType.omitNorms() && field.boost() != 1.0f) {
          throw new UnsupportedOperationException("You cannot set an index-time boost: norms are omitted for field '" + field.name() + "'");
        }

        // only bother checking offsets if something will consume them.
        // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
        final boolean checkOffsets = fieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
        int lastStartOffset = 0;

        if (i > 0) {
          fieldState.position += analyzed ? docState.analyzer.getPositionIncrementGap(fieldInfo.name) : 0;
        }

        /*
         * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
         * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
         * but rather a finally that takes note of the problem.
         */

        boolean succeededInProcessingField = false;

        try (TokenStream stream = field.tokenStream(docState.analyzer)) {
          // reset the TokenStream to the first token
          stream.reset();

          boolean hasMoreTokens = stream.incrementToken();

          fieldState.attributeSource = stream;

          OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
          PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);

          if (hasMoreTokens) {
            consumer.start(field);

            do {
              // If we hit an exception in stream.next below
              // (which is fairly common, eg if analyzer
              // chokes on a given document), then it's
              // non-aborting and (above) this one document
              // will be marked as deleted, but still
              // consume a docID

              final int posIncr = posIncrAttribute.getPositionIncrement();
              if (posIncr < 0) {
                throw new IllegalArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.name() + "'");
              }
              if (fieldState.position == 0 && posIncr == 0) {
                throw new IllegalArgumentException("first position increment must be > 0 (got 0) for field '" + field.name() + "'");
              }
              int position = fieldState.position + posIncr;
              if (position > 0) {
                // NOTE: confusing: this "mirrors" the
                // position++ we do below
                position--;
              } else if (position < 0) {
                throw new IllegalArgumentException("position overflow for field '" + field.name() + "'");
              }

              // position is legal, we can safely place it in fieldState now.
              // not sure if anything will use fieldState after non-aborting exc...
              fieldState.position = position;

              if (posIncr == 0) {
                fieldState.numOverlap++;
              }

              if (checkOffsets) {
                int startOffset = fieldState.offset + offsetAttribute.startOffset();
                int endOffset = fieldState.offset + offsetAttribute.endOffset();
                if (startOffset < 0 || endOffset < startOffset) {
                  throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
                      + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.name() + "'");
                }
                if (startOffset < lastStartOffset) {
                  throw new IllegalArgumentException("offsets must not go backwards startOffset=" + startOffset
                      + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.name() + "'");
                }
                lastStartOffset = startOffset;
              }

              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;
              } finally {
                if (!success) {
                  docState.docWriter.setAborting();
                }
              }
              fieldState.length++;
              fieldState.position++;
            } while (stream.incrementToken());
          }
          // trigger streams to perform end-of-stream operations
          stream.end();
          // TODO: maybe add some safety? then again, its already checked
          // when we come back around to the field...
          fieldState.position += posIncrAttribute.getPositionIncrement();
          fieldState.offset += offsetAttribute.endOffset();

          if (docState.maxTermPrefix != null) {
            final String msg = "Document contains at least one immense term in field=\"" + fieldInfo.name
                + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8
                + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  "
                + "The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'";
            if (docState.infoStream.isEnabled("IW")) {
              docState.infoStream.message("IW", "ERROR: " + msg);
            }
            docState.maxTermPrefix = null;
            throw new IllegalArgumentException(msg);
          }

          /* if success was false above there is an exception coming through and we won't get here.*/
          succeededInProcessingField = true;
        } finally {
          if (!succeededInProcessingField && docState.infoStream.isEnabled("DW")) {
            docState.infoStream.message("DW", "An exception was thrown while processing field " + fieldInfo.name);
          }
        }

        fieldState.offset += analyzed ? docState.analyzer.getOffsetGap(fieldInfo.name) : 0;
        fieldState.boost *= field.boost();
      }

      // LUCENE-2387: don't hang onto the field, so GC can
      // reclaim
      fields[i] = null;
    }

    consumer.finish();
    endConsumer.finish();
  }

  @Override
  FieldInfo getFieldInfo() {
    return fieldInfo;
  }
}