package org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Closeable; import java.io.IOException; import java.util.zip.CRC32; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.ArrayUtil; import org.apache.solr.request.mdrill.MdrillUtils; import org.apache.solr.request.uninverted.TermIndex; import org.apache.solr.request.uninverted.UnInvertedFieldUtils; import org.apache.solr.request.uninverted.UnInvertedFieldUtils.FieldDatatype; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.IndexSchema; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public final class TermInfosWriter implements Closeable { public static Logger LOG = LoggerFactory.getLogger(TermInfosWriter.class); private static IndexSchema schema=null; private static boolean notUseQuick=false; public static boolean isNotUseQuick() { return notUseQuick; } public static void setNotUseQuick(boolean notUseQuick) { TermInfosWriter.notUseQuick = notUseQuick; } public static void setSchema(IndexSchema schema) { TermInfosWriter.schema=schema; } public static final int QUICK_TII = -1210; public static final int FORMAT = -3; public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; private FieldInfos fieldInfos; private IndexOutput output; private IndexOutput outputSize; private IndexOutput outputQuickTii=null; private boolean isquickTis=false; private IndexSchema schemainfo=TermInfosWriter.schema; private TermInfo lastTi = new TermInfo(); private long size; int indexInterval = 128; private static int SKIP_INTERVAL=16;//Integer.MAX_VALUE; public static void setSkipInterVal(int i) { //如果是全文检索模式,为了提升跳跃的效率,该值不宜设置的太大,其他模式设置的DataOutput.BLOGK_SIZE_COMPRESS,能有比较好的压缩比 SKIP_INTERVAL=i; } int skipInterval = 16; int maxSkipLevels = 10; private boolean isIndex; private byte[] lastTermBytes = new byte[10]; private int lastTermBytesLength = 0; private int lastFieldNumber = -1; private TermInfosWriter other; private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); TermInfosWriter(Directory directory, String segment, FieldInfos fis, int interval) throws IOException { initialize(directory, segment, fis, interval, false); boolean success = false; try { other = new TermInfosWriter(directory, segment, fis, interval, true); other.other = this; success = true; } finally { if (!success) { IOUtils.closeWhileHandlingException(output,outputSize, other,outputQuickTii); docValues.close(); } } } private TermInfosWriter(Directory directory, String segment, FieldInfos fis, int interval, boolean isIndex) throws IOException { initialize(directory, segment, fis, interval, isIndex); } private void initialize(Directory directory, String segment, FieldInfos fis, int interval, boolean isi) throws IOException { skipInterval=SKIP_INTERVAL<(Integer.MAX_VALUE-1000)?SKIP_INTERVAL:Integer.MAX_VALUE; docValues=new DocValuesWriteEmpty(); indexInterval = interval; fieldInfos = fis; isIndex = isi; output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis")); outputQuickTii=isIndex?directory.createOutput(segment+"." +IndexFileNames.TERMS_INDEX_EXTENSION_QUICK):null; IndexSchema schema=directory.getSchema(); if(schema!=null) { this.schemainfo=schema; } if(this.schemainfo!=null&&!isNotUseQuick()&&!(directory instanceof RAMDirectory)) { if(!isIndex) { DocValuesWriteImpl impl=new DocValuesWriteImpl(); impl.outputQuickTis=directory.createOutput(segment+"." +IndexFileNames.TERMS_EXTENSION_QUICK); impl.outputQuickTisTxt=directory.createOutput(segment+"." +IndexFileNames.TERMS_EXTENSION_QUICK_TXT); impl.outputQuickTisVal=directory.createOutput(segment+"." +IndexFileNames.TERMS_EXTENSION_QUICK_VAL); docValues=impl; } this.isquickTis=true; } outputSize = directory.createOutput(segment + (isIndex ? "."+IndexFileNames.TERMS_INDEX_EXTENSION_SIZE : "."+IndexFileNames.TERMS_EXTENSION_SIZE)); boolean success = false; try { output.writeInt(FORMAT_CURRENT); // write format output.writeLong(QUICK_TII); // leave space for size output.writeInt(indexInterval); // write indexInterval output.writeInt(skipInterval); // write skipInterval output.writeInt(maxSkipLevels); // write maxSkipLevels assert initUTF16Results(); success = true; } finally { if (!success) { IOUtils.closeWhileHandlingException(output,outputSize,outputQuickTii); docValues.close(); } } } void add(Term term, TermInfo ti) throws IOException { UnicodeUtil.UTF16toUTF8(term.text, 0, term.text.length(), utf8Result); add(term,fieldInfos.fieldNumber(term.field), utf8Result.result, utf8Result.length, ti); } // Currently used only by assert statements UnicodeUtil.UTF16Result utf16Result1; UnicodeUtil.UTF16Result utf16Result2; // Currently used only by assert statements private boolean initUTF16Results() { utf16Result1 = new UnicodeUtil.UTF16Result(); utf16Result2 = new UnicodeUtil.UTF16Result(); return true; } // Currently used only by assert statement private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { if (lastFieldNumber != fieldNumber) { final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); // If there is a field named "" (empty string) then we // will get 0 on this comparison, yet, it's "OK". But // it's not OK if two different field numbers map to // the same name. if (cmp != 0 || lastFieldNumber != -1) return cmp; } UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); final int len; if (utf16Result1.length < utf16Result2.length) len = utf16Result1.length; else len = utf16Result2.length; for(int i=0;i<len;i++) { final char ch1 = utf16Result1.result[i]; final char ch2 = utf16Result2.result[i]; if (ch1 != ch2) return ch1-ch2; } if (utf16Result1.length == 0 && lastFieldNumber == -1) { // If there is a field named "" (empty string) with a term text of "" (empty string) then we // will get 0 on this comparison, yet, it's "OK". return -1; } return utf16Result1.length - utf16Result2.length; } DocValuesWriter docValues; void collect(int docid) { synchronized (lock) { if(!this.isIndex) { docValues.collectDoc(docid,this.termNum); } } } void add(Term term,int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) throws IOException { assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; if(this.isIndex) { this.addtii(fieldNumber, termBytes, termBytesLength, ti); return ; } if ( size % indexInterval == 0) { other.add(term,lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term } writeTerm(fieldNumber, termBytes, termBytesLength); // write term output.writeVInt(ti.docFreq); // write doc freq output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers output.writeVLong(ti.proxPointer - lastTi.proxPointer); if (ti.docFreq >= skipInterval) { output.writeVInt(ti.skipOffset); } lastFieldNumber = fieldNumber; lastTi.set(ti); size++; } static int unIntfieldnum=-9999; int lastquickfieldNumber=unIntfieldnum; FieldDatatype dataType; FieldType ft; int termNum=0; Object lock=new Object(); public void startTerm(Term term, int fieldNumber) throws IOException { synchronized (lock) { if (this.lastquickfieldNumber != fieldNumber) { if (this.lastquickfieldNumber != unIntfieldnum) { this.docValues.flushFieldDoc(this.termNum); } this.docValues.start(fieldNumber,term.field); this.ft = this.schemainfo.getField(term.field).getType(); this.dataType = UnInvertedFieldUtils.getDataType(ft); this.lastquickfieldNumber = fieldNumber; this.termNum = 0; } } } void addTm(Term term,int fieldNumber) throws IOException { if(!this.isquickTis) { return ; } synchronized (lock) { if(this.ft.isMultiValued()) { return ; } long tmValue=0; if (dataType == FieldDatatype.d_long){ tmValue=Long.parseLong(ft.indexedToReadable(term.text())); }else if (dataType == FieldDatatype.d_double) { Double val=MdrillUtils.ParseDouble(ft.indexedToReadable(term.text())); tmValue=Double.doubleToLongBits(val); }else{ CRC32 crc32 = new CRC32(); crc32.update(new String(ft.indexedToReadable(term.text())).getBytes()); tmValue=crc32.getValue(); } docValues.collectTm(tmValue); if ((this.termNum & TermIndex.intervalMask)==0){ docValues.collectTmIndex(term.text()); } this.termNum++; } } void addtii(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) throws IOException { output.writeInt(ti.docFreq); // write doc freq output.writeLong(ti.freqPointer); // write pointers output.writeLong(ti.proxPointer); output.writeInt(ti.skipOffset); output.writeLong(other.output.getFilePointer()); output.writeLong(this.outputQuickTii.getFilePointer()); this.writeTermTii(fieldNumber, termBytes, termBytesLength); size++; } private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) throws IOException { // TODO: UTF16toUTF8 could tell us this prefix // Compute prefix in common with last term: int start = 0; final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength; while(start < limit) { if (termBytes[start] != lastTermBytes[start]) break; start++; } final int length = termBytesLength - start; output.writeVInt(start); // write shared prefix length output.writeVInt(length); // write delta length output.writeBytes(termBytes, start, length); // write delta bytes output.writeVInt(fieldNumber); // write field num if (lastTermBytes.length < termBytesLength) { lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength); } System.arraycopy(termBytes, start, lastTermBytes, start, length); lastTermBytesLength = termBytesLength; } private void writeTermTii(int fieldNumber, byte[] termBytes, int termBytesLength) throws IOException { this.outputQuickTii.writeVInt(termBytesLength); // write delta bytes this.outputQuickTii.writeBytes(termBytes, 0, termBytesLength); // write delta bytes this.outputQuickTii.writeVInt(fieldNumber); // write field num } /** Called to complete TermInfos creation. */ public void close() throws IOException { outputSize.writeLong(size); try { synchronized (lock) { this.docValues.flushFieldDoc(this.termNum); this.docValues.flushPosTo(outputSize); this.docValues.free(); this.docValues.close(); } if(outputQuickTii!=null) { outputQuickTii.close(); } } finally { try { outputSize.close(); output.close(); } finally { if (!isIndex) { other.close(); } } } } }