package org.apache.lucene.codecs.lucene3x;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Comparator;

import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;

/**
 * {@link TermVectorsWriter} that writes term vectors into three per-segment
 * outputs (index, documents and fields), each starting with the
 * {@code Lucene3xTermVectorsReader.FORMAT_CURRENT} header int.
 * Payloads on vectors are rejected.
 */
final class PreFlexRWTermVectorsWriter extends TermVectorsWriter {
  private final Directory directory;
  private final String segment;

  // the three outputs; reset to null by close()
  private IndexOutput tvx = null;
  private IndexOutput tvd = null;
  private IndexOutput tvf = null;

  // tvf file pointer recorded at the start of each field of the current document
  private long[] fps = new long[10];
  // how many fields of the current document have been started so far
  private int fieldCount = 0;
  // how many fields the current document announced in startDocument
  private int numVectorFields = 0;
  // name of the previously started field; only consulted by an assertion
  private String lastFieldName;

  // previous term of the current field, used for prefix sharing
  private final BytesRef lastTerm = new BytesRef(10);

  // When a field stores both positions and offsets, a term's offsets are held
  // here until its last position arrives, so that every position delta of the
  // term is written ahead of its offsets.
  // NOTE(review): an earlier comment at this spot mentioned an addProx
  // override and bulk merges; neither is visible in this class.
  private int[] offsetStartBuffer = new int[10];
  private int[] offsetEndBuffer = new int[10];
  private int offsetIndex = 0;
  private int offsetFreq = 0;

  private boolean positions = false;
  private boolean offsets = false;

  int lastPosition = 0;
  int lastOffset = 0;

  /**
   * Creates the three term vector outputs for {@code segment} and writes the
   * format header int to each of them. If anything fails before all three
   * headers are written, {@link #abort()} is invoked.
   *
   * @param directory directory in which the outputs are created
   * @param segment segment name used to derive the file names
   * @param context context handed to {@code Directory.createOutput}
   * @throws IOException if creating or writing an output fails
   */
  public PreFlexRWTermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
    this.directory = directory;
    this.segment = segment;

    boolean opened = false;
    try {
      // Each output is assigned to its field before the header is written, so
      // that abort() can still close it if the header write fails.
      tvx = directory.createOutput(fileName(Lucene3xTermVectorsReader.VECTORS_INDEX_EXTENSION), context);
      tvx.writeInt(Lucene3xTermVectorsReader.FORMAT_CURRENT);

      tvd = directory.createOutput(fileName(Lucene3xTermVectorsReader.VECTORS_DOCUMENTS_EXTENSION), context);
      tvd.writeInt(Lucene3xTermVectorsReader.FORMAT_CURRENT);

      tvf = directory.createOutput(fileName(Lucene3xTermVectorsReader.VECTORS_FIELDS_EXTENSION), context);
      tvf.writeInt(Lucene3xTermVectorsReader.FORMAT_CURRENT);

      opened = true;
    } finally {
      if (!opened) {
        abort();
      }
    }
  }

  /** Builds this segment's file name for the given extension (empty segment suffix). */
  private String fileName(String extension) {
    return IndexFileNames.segmentFileName(segment, "", extension);
  }

  /**
   * Begins a document: records the current tvd and tvf file pointers in tvx
   * (two longs), writes the field count to tvd and resets per-document state.
   */
  @Override
  public void startDocument(int numVectorFields) throws IOException {
    lastFieldName = null;
    this.numVectorFields = numVectorFields;

    final long docPointer = tvd.getFilePointer();
    final long fieldPointer = tvf.getFilePointer();
    tvx.writeLong(docPointer);
    tvx.writeLong(fieldPointer);

    tvd.writeVInt(numVectorFields);
    fieldCount = 0;
    fps = ArrayUtil.grow(fps, numVectorFields);
  }

  /**
   * Begins a field: writes its number to tvd, and its term count plus a flags
   * byte to tvf. Once the last announced field of the document has been
   * started, the deltas between consecutive fields' tvf pointers are appended
   * to tvd.
   *
   * @throws UnsupportedOperationException if {@code payloads} is true
   */
  @Override
  public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets, boolean payloads) throws IOException {
    assert lastFieldName == null || info.name.compareTo(lastFieldName) > 0
        : "fieldName=" + info.name + " lastFieldName=" + lastFieldName;
    lastFieldName = info.name;

    if (payloads) {
      throw new UnsupportedOperationException("3.x codec does not support payloads on vectors!");
    }

    this.positions = positions;
    this.offsets = offsets;
    lastTerm.length = 0;

    fps[fieldCount] = tvf.getFilePointer();
    fieldCount++;

    tvd.writeVInt(info.number);
    tvf.writeVInt(numTerms);

    byte flags = 0x0;
    if (positions) {
      flags |= Lucene3xTermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
    }
    if (offsets) {
      flags |= Lucene3xTermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
    }
    tvf.writeByte(flags);

    assert fieldCount <= numVectorFields;
    if (fieldCount == numVectorFields) {
      // Last field of the document: the per-field tvf pointer deltas go to tvd
      // only now, after every field number has been written.
      long previous = fps[0];
      for (int field = 1; field < fieldCount; field++) {
        final long current = fps[field];
        tvd.writeVLong(current - previous);
        previous = current;
      }
    }
  }

  /**
   * Begins a term: writes the length of the prefix shared with the previous
   * term, the suffix length, the suffix bytes and the frequency to tvf, then
   * resets the position/offset delta bases. When both positions and offsets
   * are stored, the offset buffers are sized for {@code freq} entries.
   */
  @Override
  public void startTerm(BytesRef term, int freq) throws IOException {
    final int sharedPrefix = StringHelper.bytesDifference(lastTerm, term);
    final int suffixLength = term.length - sharedPrefix;

    tvf.writeVInt(sharedPrefix);
    tvf.writeVInt(suffixLength);
    tvf.writeBytes(term.bytes, term.offset + sharedPrefix, suffixLength);
    tvf.writeVInt(freq);

    lastTerm.copyBytes(term);
    lastPosition = 0;
    lastOffset = 0;

    if (positions && offsets) {
      // offsets of this term are buffered until its last position is added
      offsetStartBuffer = ArrayUtil.grow(offsetStartBuffer, freq);
      offsetEndBuffer = ArrayUtil.grow(offsetEndBuffer, freq);
      offsetIndex = 0;
      offsetFreq = freq;
    }
  }

  /**
   * Adds one occurrence of the current term. Positions are written as deltas
   * from the previous position. Offsets are written as a start delta from the
   * previous end offset followed by the length; they are written immediately
   * when only offsets are stored, and buffered until the term's last
   * occurrence when positions are stored as well.
   */
  @Override
  public void addPosition(int position, int startOffset, int endOffset, BytesRef payload) throws IOException {
    assert payload == null;

    if (positions) {
      // position delta
      tvf.writeVInt(position - lastPosition);
      lastPosition = position;

      if (offsets) {
        // remember the offsets for later
        offsetStartBuffer[offsetIndex] = startOffset;
        offsetEndBuffer[offsetIndex] = endOffset;
        offsetIndex++;

        // all occurrences of the term seen: emit the buffered offsets
        if (offsetIndex == offsetFreq) {
          flushBufferedOffsets();
        }
      }
    } else if (offsets) {
      // offset deltas
      tvf.writeVInt(startOffset - lastOffset);
      tvf.writeVInt(endOffset - startOffset);
      lastOffset = endOffset;
    }
  }

  /** Writes every buffered offset pair of the current term to tvf as deltas. */
  private void flushBufferedOffsets() throws IOException {
    for (int slot = 0; slot < offsetIndex; slot++) {
      final int start = offsetStartBuffer[slot];
      final int end = offsetEndBuffer[slot];
      tvf.writeVInt(start - lastOffset);
      tvf.writeVInt(end - start);
      lastOffset = end;
    }
  }

  /**
   * Best-effort cleanup: closes the outputs, ignoring anything thrown while
   * doing so, and then deletes the three term vector files of this segment.
   */
  @Override
  public void abort() {
    try {
      close();
    } catch (Throwable ignored) {
      // deliberately ignored: the files are deleted below regardless
    }
    IOUtils.deleteFilesIgnoringExceptions(directory,
        fileName(Lucene3xTermVectorsReader.VECTORS_INDEX_EXTENSION),
        fileName(Lucene3xTermVectorsReader.VECTORS_DOCUMENTS_EXTENSION),
        fileName(Lucene3xTermVectorsReader.VECTORS_FIELDS_EXTENSION));
  }

  /**
   * Verifies that tvx has the length implied by {@code numDocs}: the 4-byte
   * header int plus the two longs (16 bytes) written per document.
   *
   * @throws RuntimeException if the tvx file pointer does not match
   */
  @Override
  public void finish(FieldInfos fis, int numDocs) throws IOException {
    final long expectedLength = 4 + ((long) numDocs) * 16;
    final long actualLength = tvx.getFilePointer();
    if (expectedLength != actualLength) {
      // This is most likely a bug in Sun JRE 1.6.0_04/_05;
      // we detect that the bug has struck, here, and
      // throw an exception to prevent the corruption from
      // entering the index. See LUCENE-1282 for
      // details.
      throw new RuntimeException("tvx size mismatch: mergedDocs is " + numDocs + " but tvx size is " + actualLength + " file=" + tvx.toString() + "; now aborting this merge to prevent index corruption");
    }
  }

  /** Close all streams. */
  @Override
  public void close() throws IOException {
    // IOUtils.close is handed all three outputs in a single call; the fields
    // are cleared only if that call returns normally.
    IOUtils.close(tvx, tvd, tvf);
    tvx = null;
    tvd = null;
    tvf = null;
  }

  /** Returns {@code BytesRef.getUTF8SortedAsUTF16Comparator()}. */
  @Override
  public Comparator<BytesRef> getComparator() throws IOException {
    return BytesRef.getUTF8SortedAsUTF16Comparator();
  }
}