/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs;

import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.DocIDMerger;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.util.BytesRef;

import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

/**
 * Codec API for writing stored fields:
 * <ol>
 *   <li>For every document, {@link #startDocument()} is called,
 *       informing the Codec that a new document has started.
 *   <li>{@link #writeField(FieldInfo, IndexableField)} is called for
 *       each field in the document.
 *   <li>After all documents have been written, {@link #finish(FieldInfos, int)}
 *       is called for verification/sanity-checks.
 *   <li>Finally the writer is closed ({@link #close()})
 * </ol>
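 *
 * <p>A rough sketch of that lifecycle (the {@code writer}, {@code fieldInfos},
 * {@code numDocs} and {@code storedFieldsOf} names below are illustrative and
 * not part of this API):
 * <pre>
 *   for (int docID = 0; docID &lt; numDocs; docID++) {
 *     writer.startDocument();
 *     for (IndexableField field : storedFieldsOf(docID)) {
 *       writer.writeField(fieldInfos.fieldInfo(field.name()), field);
 *     }
 *     writer.finishDocument();
 *   }
 *   writer.finish(fieldInfos, numDocs);
 *   writer.close();
 * </pre>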
 *
 * @lucene.experimental
 */
public abstract class StoredFieldsWriter implements Closeable {

  /** Sole constructor. (For invocation by subclass
   *  constructors, typically implicit.) */
  protected StoredFieldsWriter() {
  }

  /** Called before writing the stored fields of the document.
   *  {@link #writeField(FieldInfo, IndexableField)} will be called
   *  for each stored field. Note that this is
   *  called even if the document has no stored fields. */
  public abstract void startDocument() throws IOException;

  /** Called when a document and all its fields have been added. */
  public void finishDocument() throws IOException {}

  /** Writes a single stored field. */
  public abstract void writeField(FieldInfo info, IndexableField field) throws IOException;

  /** Called before {@link #close()}, passing in the number
   *  of documents that were written. Note that this is
   *  intentionally redundant (equivalent to the number of
   *  calls to {@link #startDocument()}), but a Codec should
   *  check that this is the case to detect the JRE bug described
   *  in LUCENE-1282.
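   *
   *  <p>For example, an implementation might count {@link #startDocument()} calls in a
   *  field such as {@code numDocsWritten} (the name is illustrative) and verify it here,
   *  along the lines of:
   *  <pre>
   *    if (numDocsWritten != numDocs) {
   *      throw new IllegalStateException("wrote " + numDocsWritten + " docs, expected " + numDocs);
   *    }
   *  </pre>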
   */
  public abstract void finish(FieldInfos fis, int numDocs) throws IOException;

  private static class StoredFieldsMergeSub extends DocIDMerger.Sub {
    private final StoredFieldsReader reader;
    private final int maxDoc;
    private final MergeVisitor visitor;
    int docID = -1;

    public StoredFieldsMergeSub(MergeVisitor visitor, MergeState.DocMap docMap, StoredFieldsReader reader, int maxDoc) {
      super(docMap);
      this.maxDoc = maxDoc;
      this.reader = reader;
      this.visitor = visitor;
    }

    @Override
    public int nextDoc() {
      docID++;
      if (docID == maxDoc) {
        return NO_MORE_DOCS;
      } else {
        return docID;
      }
    }
  }

  /** Merges in the stored fields from the readers in
   *  <code>mergeState</code>. The default implementation skips
   *  over deleted documents, and uses {@link #startDocument()},
   *  {@link #writeField(FieldInfo, IndexableField)}, and {@link #finish(FieldInfos, int)},
   *  returning the number of documents that were written.
   *  Implementations can override this method for more sophisticated
   *  merging (bulk-byte copying, etc). */
  public int merge(MergeState mergeState) throws IOException {
    List<StoredFieldsMergeSub> subs = new ArrayList<>();
    for (int i = 0; i < mergeState.storedFieldsReaders.length; i++) {
      StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[i];
      storedFieldsReader.checkIntegrity();
      subs.add(new StoredFieldsMergeSub(new MergeVisitor(mergeState, i), mergeState.docMaps[i], storedFieldsReader, mergeState.maxDocs[i]));
    }

    final DocIDMerger<StoredFieldsMergeSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);

    int docCount = 0;
    while (true) {
      StoredFieldsMergeSub sub = docIDMerger.next();
      if (sub == null) {
        break;
      }
      assert sub.mappedDocID == docCount;
      startDocument();
      sub.reader.visitDocument(sub.docID, sub.visitor);
      finishDocument();
      docCount++;
    }
    finish(mergeState.mergeFieldInfos, docCount);
    return docCount;
  }

  /**
   * A visitor that adds every field it sees.
   * <p>
   * Use like this:
   * <pre>
   * MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
   * for (...) {
   *   startDocument();
   *   storedFieldsReader.visitDocument(docID, visitor);
   *   finishDocument();
   * }
   * </pre>
   */
  protected class MergeVisitor extends StoredFieldVisitor implements IndexableField {
    BytesRef binaryValue;
    String stringValue;
    Number numericValue;
    FieldInfo currentField;
    FieldInfos remapper;

    /**
     * Create new merge visitor.
     */
    public MergeVisitor(MergeState mergeState, int readerIndex) {
      // if field numbers are aligned, we can save hash lookups
      // on every field access. Otherwise, we need to lookup
      // fieldname each time, and remap to a new number.
      for (FieldInfo fi : mergeState.fieldInfos[readerIndex]) {
        FieldInfo other = mergeState.mergeFieldInfos.fieldInfo(fi.number);
        if (other == null || !other.name.equals(fi.name)) {
          remapper = mergeState.mergeFieldInfos;
          break;
        }
      }
    }

    @Override
    public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
      reset(fieldInfo);
      // TODO: can we avoid new BR here?
      binaryValue = new BytesRef(value);
      write();
    }

    @Override
    public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
      reset(fieldInfo);
      // TODO: can we avoid new String here?
      stringValue = new String(value, StandardCharsets.UTF_8);
      write();
    }

    @Override
    public void intField(FieldInfo fieldInfo, int value) throws IOException {
      reset(fieldInfo);
      numericValue = value;
      write();
    }

    @Override
    public void longField(FieldInfo fieldInfo, long value) throws IOException {
      reset(fieldInfo);
      numericValue = value;
      write();
    }

    @Override
    public void floatField(FieldInfo fieldInfo, float value) throws IOException {
      reset(fieldInfo);
      numericValue = value;
      write();
    }

    @Override
    public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
      reset(fieldInfo);
      numericValue = value;
      write();
    }

    @Override
    public Status needsField(FieldInfo fieldInfo) throws IOException {
      return Status.YES;
    }

    @Override
    public String name() {
      return currentField.name;
    }

    @Override
    public IndexableFieldType fieldType() {
      return StoredField.TYPE;
    }

    @Override
    public BytesRef binaryValue() {
      return binaryValue;
    }

    @Override
    public String stringValue() {
      return stringValue;
    }

    @Override
    public Number numericValue() {
      return numericValue;
    }

    @Override
    public Reader readerValue() {
      return null;
    }

    @Override
    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
      return null;
    }

    void reset(FieldInfo field) {
      if (remapper != null) {
        // field numbers are not aligned, we need to remap to the new field number
        currentField = remapper.fieldInfo(field.name);
      } else {
        currentField = field;
      }
      binaryValue = null;
      stringValue = null;
      numericValue = null;
    }

    void write() throws IOException {
      writeField(currentField, this);
    }
  }

  @Override
  public abstract void close() throws IOException;
}