package org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.UnicodeUtil; import java.io.IOException; final class TermVectorsWriter { private IndexOutput tvx = null, tvd = null, tvf = null; private FieldInfos fieldInfos; final UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[] {new UnicodeUtil.UTF8Result(), new UnicodeUtil.UTF8Result()}; public TermVectorsWriter(Directory directory, String segment, FieldInfos fieldInfos) throws IOException { // Open files for TermVector storage tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); tvx.writeInt(TermVectorsReader.FORMAT_CURRENT); tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION); tvd.writeInt(TermVectorsReader.FORMAT_CURRENT); tvf = directory.createOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION); tvf.writeInt(TermVectorsReader.FORMAT_CURRENT); this.fieldInfos = fieldInfos; } /** * Add a complete document specified by all its term vectors. If document has no * term vectors, add value for tvx. * * @param vectors * @throws IOException */ public final void addAllDocVectors(TermFreqVector[] vectors) throws IOException { tvx.writeLong(tvd.getFilePointer()); tvx.writeLong(tvf.getFilePointer()); if (vectors != null) { final int numFields = vectors.length; tvd.writeVInt(numFields); long[] fieldPointers = new long[numFields]; for (int i=0; i<numFields; i++) { fieldPointers[i] = tvf.getFilePointer(); final int fieldNumber = fieldInfos.fieldNumber(vectors[i].getField()); // 1st pass: write field numbers to tvd tvd.writeVInt(fieldNumber); final int numTerms = vectors[i].size(); tvf.writeVInt(numTerms); final TermPositionVector tpVector; final byte bits; final boolean storePositions; final boolean storeOffsets; if (vectors[i] instanceof TermPositionVector) { // May have positions & offsets tpVector = (TermPositionVector) vectors[i]; storePositions = tpVector.size() > 0 && tpVector.getTermPositions(0) != null; storeOffsets = tpVector.size() > 0 && tpVector.getOffsets(0) != null; bits = (byte) ((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : 0) + (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : 0)); } else { tpVector = null; bits = 0; storePositions = false; storeOffsets = false; } tvf.writeVInt(bits); final String[] terms = vectors[i].getTerms(); final int[] freqs = vectors[i].getTermFrequencies(); int utf8Upto = 0; utf8Results[1].length = 0; for (int j=0; j<numTerms; j++) { UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].length(), utf8Results[utf8Upto]); int start = StringHelper.bytesDifference(utf8Results[1-utf8Upto].result, utf8Results[1-utf8Upto].length, utf8Results[utf8Upto].result, utf8Results[utf8Upto].length); int length = utf8Results[utf8Upto].length - start; tvf.writeVInt(start); // write shared prefix length tvf.writeVInt(length); // write delta length tvf.writeBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes utf8Upto = 1-utf8Upto; final int termFreq = freqs[j]; tvf.writeVInt(termFreq); if (storePositions) { final int[] positions = tpVector.getTermPositions(j); if (positions == null) throw new IllegalStateException("Trying to write positions that are null!"); assert positions.length == termFreq; // use delta encoding for positions int lastPosition = 0; for(int k=0;k<positions.length;k++) { final int position = positions[k]; tvf.writeVInt(position-lastPosition); lastPosition = position; } } if (storeOffsets) { final TermVectorOffsetInfo[] offsets = tpVector.getOffsets(j); if (offsets == null) throw new IllegalStateException("Trying to write offsets that are null!"); assert offsets.length == termFreq; // use delta encoding for offsets int lastEndOffset = 0; for(int k=0;k<offsets.length;k++) { final int startOffset = offsets[k].getStartOffset(); final int endOffset = offsets[k].getEndOffset(); tvf.writeVInt(startOffset-lastEndOffset); tvf.writeVInt(endOffset-startOffset); lastEndOffset = endOffset; } } } } // 2nd pass: write field pointers to tvd if (numFields > 1) { long lastFieldPointer = fieldPointers[0]; for (int i=1; i<numFields; i++) { final long fieldPointer = fieldPointers[i]; tvd.writeVLong(fieldPointer-lastFieldPointer); lastFieldPointer = fieldPointer; } } } else tvd.writeVInt(0); } /** * Do a bulk copy of numDocs documents from reader to our * streams. This is used to expedite merging, if the * field numbers are congruent. */ final void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException { long tvdPosition = tvd.getFilePointer(); long tvfPosition = tvf.getFilePointer(); long tvdStart = tvdPosition; long tvfStart = tvfPosition; for(int i=0;i<numDocs;i++) { tvx.writeLong(tvdPosition); tvdPosition += tvdLengths[i]; tvx.writeLong(tvfPosition); tvfPosition += tvfLengths[i]; } tvd.copyBytes(reader.getTvdStream(), tvdPosition-tvdStart); tvf.copyBytes(reader.getTvfStream(), tvfPosition-tvfStart); assert tvd.getFilePointer() == tvdPosition; assert tvf.getFilePointer() == tvfPosition; } /** Close all streams. */ final void close() throws IOException { // make an effort to close all streams we can but remember and re-throw // the first exception encountered in this process IOException keep = null; if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; } if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; } if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; } if (keep != null) throw (IOException) keep.fillInStackTrace(); } }