/*
 * TermInfosWriter.java example
 *
 * Explorer
 * mdrill-master
 * - trunk
 */
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import java.io.Closeable;
import java.io.IOException;
import java.util.zip.CRC32;

import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.ArrayUtil;
import org.apache.solr.request.mdrill.MdrillUtils;
import org.apache.solr.request.uninverted.TermIndex;
import org.apache.solr.request.uninverted.UnInvertedFieldUtils;
import org.apache.solr.request.uninverted.UnInvertedFieldUtils.FieldDatatype;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public final class TermInfosWriter implements Closeable {
	  /** SLF4J logger for this class. */
	  public static Logger LOG = LoggerFactory.getLogger(TermInfosWriter.class);

	// Process-wide default schema; each new instance copies it into its schemainfo field.
	private static IndexSchema schema=null;
	// Process-wide switch: when true, initialize() does not create the DocValuesWriteImpl
	// outputs and leaves isquickTis false.
	private static boolean notUseQuick=false;
	/**
	 * Returns the process-wide flag consulted by initialize() when deciding whether to
	 * enable the quick path.
	 *
	 * @return the current value of the static {@code notUseQuick} flag
	 */
	public static boolean isNotUseQuick() {
		return notUseQuick;
	}

	/**
	 * Sets the process-wide flag read by {@link #isNotUseQuick()}.
	 *
	 * @param notUseQuick new value of the static flag
	 */
	public static void setNotUseQuick(boolean notUseQuick) {
		TermInfosWriter.notUseQuick = notUseQuick;
	}

	/**
	 * Sets the process-wide default schema. Instances created afterwards start with this
	 * value in their {@code schemainfo} field.
	 *
	 * @param schema the default schema, may be null
	 */
	public static void setSchema(IndexSchema schema)
	{
		TermInfosWriter.schema=schema;
	}

	/** Marker written as a long directly after the format int in the file header. */
	public static final int QUICK_TII = -1210;
	/** Format version constant; not referenced elsewhere within this class. */
	public static final int FORMAT = -3;
	/** Format version constant that {@link #FORMAT_CURRENT} points at. */
	public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;
	/** Format value written as the first int of every file header by initialize(). */
	public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;

  // Field number <-> field name mapping supplied to initialize().
  private FieldInfos fieldInfos;
  // Main output: the ".tii" file for the index writer, the ".tis" file otherwise.
  private IndexOutput output;
  // Side file that receives the entry count (and the docValues positions) in close().
  private IndexOutput outputSize;
  // Quick terms-index file; created only for the index writer, null otherwise.
  private IndexOutput outputQuickTii=null;
  

  // Set by initialize() when a schema is available, the quick path is not disabled
  // and the directory is not a RAMDirectory.
  private boolean isquickTis=false;
  // Schema used by startTerm(); starts as the static default and is replaced by the
  // directory's schema when that is non-null.
  private IndexSchema schemainfo=TermInfosWriter.schema;
  // TermInfo of the previously added term; add() writes pointers as deltas against it.
  private TermInfo lastTi = new TermInfo();
  // Number of entries added to this writer so far.
  private long size;

  // When size is a multiple of this value, the main writer also adds an entry to the
  // index writer. Overwritten by initialize().
  int indexInterval = 128;

  // Process-wide default for skipInterval; read by initialize().
  private static int SKIP_INTERVAL=16;//Integer.MAX_VALUE;
  /**
   * Sets the process-wide skip interval that initialize() copies into each writer.
   *
   * @param i the new skip interval
   */
  public static void setSkipInterVal(int i)
  {
    // In full-text search mode this value should not be set too large, so that skipping
    // stays efficient; in other modes setting it to DataOutput.BLOGK_SIZE_COMPRESS gives
    // a better compression ratio.
    TermInfosWriter.SKIP_INTERVAL = i;
  }
  // docFreq threshold at or above which add() also writes the term's skipOffset; written
  // to the file header. Overwritten from SKIP_INTERVAL by initialize().
  int skipInterval = 16; 
  
  // Written to the file header; not otherwise used in this class.
  int maxSkipLevels = 10;

  // True for the index writer, false for the main writer.
  private boolean isIndex;
  // UTF-8 bytes of the term last written by writeTerm(); only the first
  // lastTermBytesLength bytes are valid.
  private byte[] lastTermBytes = new byte[10];
  private int lastTermBytesLength = 0;
  // Field number of the term last added by the main writer; -1 before the first term.
  private int lastFieldNumber = -1;

  // The paired writer: the index writer for the main writer, and vice versa.
  private TermInfosWriter other;
  // Reusable buffer for the UTF-16 to UTF-8 conversion in add(Term, TermInfo).
  private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();

  /**
   * Creates the main writer and its paired index writer for one segment.
   *
   * <p>This instance is initialized as the main (non-index) writer first. The index writer
   * is then created and the two are linked through their {@code other} fields. If creating
   * the index writer fails, the outputs opened for this instance are closed before the
   * failure propagates.
   *
   * @param directory directory the files are created in
   * @param segment segment name used as the file-name prefix
   * @param fis field infos for the segment
   * @param interval index interval
   * @throws IOException if any file cannot be created or its header cannot be written
   */
  TermInfosWriter(Directory directory, String segment, FieldInfos fis,
                  int interval)
       throws IOException {

    initialize(directory, segment, fis, interval, false);

    boolean pairedWriterReady = false;
    try {
      final TermInfosWriter indexWriter = new TermInfosWriter(directory, segment, fis, interval, true);
      other = indexWriter;
      indexWriter.other = this;
      pairedWriterReady = true;
    } finally {
      if (pairedWriterReady == false) {
        // 'other' is still null here when the index writer's constructor threw.
        IOUtils.closeWhileHandlingException(output, outputSize, other, outputQuickTii);
        docValues.close();
      }
    }
  }

  /**
   * Creates a single writer without a paired writer. Used by the package-private
   * constructor to build the index writer.
   *
   * @param directory directory the files are created in
   * @param segment segment name used as the file-name prefix
   * @param fis field infos for the segment
   * @param interval index interval
   * @param isIndex true to create the index (".tii") writer
   * @throws IOException if initialization fails
   */
  private TermInfosWriter(Directory directory, String segment, FieldInfos fis,
                          int interval, boolean isIndex) throws IOException {
    initialize(directory, segment, fis, interval, isIndex);
  }

  /**
   * Opens every output file for this writer and writes the file header.
   *
   * <p>Files created: the main output (".tii" when {@code isi} is true, ".tis" otherwise),
   * the quick terms-index output (index writer only), the three outputs held by a
   * {@code DocValuesWriteImpl} (main writer only, and only when a schema is available, the
   * quick path is not disabled and the directory is not a {@code RAMDirectory}), and the
   * size output.
   *
   * <p>All file creation happens inside the guarded region, so a failure while creating a
   * later file closes the files already opened instead of leaking them.
   *
   * @param directory directory the files are created in; its schema, when non-null,
   *                  replaces the static default schema for this instance
   * @param segment segment name used as the file-name prefix
   * @param fis field infos for the segment
   * @param interval index interval, written to the header
   * @param isi true when this instance is the index writer
   * @throws IOException if a file cannot be created or the header cannot be written
   */
  private void initialize(Directory directory, String segment, FieldInfos fis,
                          int interval, boolean isi) throws IOException {
    // Values within 1000 of Integer.MAX_VALUE are clamped to exactly Integer.MAX_VALUE.
    skipInterval = SKIP_INTERVAL < (Integer.MAX_VALUE - 1000) ? SKIP_INTERVAL : Integer.MAX_VALUE;

    docValues = new DocValuesWriteEmpty();
    indexInterval = interval;
    fieldInfos = fis;
    isIndex = isi;

    // Tracked separately so a partially populated instance can still be cleaned up.
    DocValuesWriteImpl impl = null;
    boolean success = false;
    try {
      output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis"));
      outputQuickTii = isIndex
          ? directory.createOutput(segment + "." + IndexFileNames.TERMS_INDEX_EXTENSION_QUICK)
          : null;

      IndexSchema directorySchema = directory.getSchema();
      if (directorySchema != null) {
        this.schemainfo = directorySchema;
      }

      if (this.schemainfo != null && !isNotUseQuick() && !(directory instanceof RAMDirectory)) {
        if (!isIndex) {
          impl = new DocValuesWriteImpl();
          impl.outputQuickTis = directory.createOutput(segment + "." + IndexFileNames.TERMS_EXTENSION_QUICK);
          impl.outputQuickTisTxt = directory.createOutput(segment + "." + IndexFileNames.TERMS_EXTENSION_QUICK_TXT);
          impl.outputQuickTisVal = directory.createOutput(segment + "." + IndexFileNames.TERMS_EXTENSION_QUICK_VAL);
          docValues = impl;
        }
        this.isquickTis = true;
      }

      outputSize = directory.createOutput(segment + (isIndex ? "." + IndexFileNames.TERMS_INDEX_EXTENSION_SIZE : "." + IndexFileNames.TERMS_EXTENSION_SIZE));

      output.writeInt(FORMAT_CURRENT);              // write format
      output.writeLong(QUICK_TII);                  // write the QUICK_TII marker; the size goes to outputSize in close()
      output.writeInt(indexInterval);               // write indexInterval
      output.writeInt(skipInterval);                // write skipInterval
      output.writeInt(maxSkipLevels);               // write maxSkipLevels
      assert initUTF16Results();
      success = true;
    } finally {
      if (!success) {
        // closeWhileHandlingException is already passed null references elsewhere in this
        // class, so outputs that were never opened are passed here as well.
        IOUtils.closeWhileHandlingException(output, outputSize, outputQuickTii);
        if (impl != null && docValues != impl) {
          // Failure happened while populating impl: docValues.close() below would not
          // reach these outputs, so close whichever were opened directly.
          IOUtils.closeWhileHandlingException(impl.outputQuickTis, impl.outputQuickTisTxt, impl.outputQuickTisVal);
        }
        docValues.close();
      }
    }
  }

  /**
   * Adds a term given as a {@code Term}: converts its text to UTF-8, resolves its field
   * number and delegates to the byte-based {@code add}.
   *
   * @param term the term to add
   * @param ti the term's info (doc freq and file pointers)
   * @throws IOException if writing fails
   */
  void add(Term term, TermInfo ti) throws IOException {
    final String text = term.text;
    UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8Result);

    final int fieldNumber = fieldInfos.fieldNumber(term.field);
    add(term, fieldNumber, utf8Result.result, utf8Result.length, ti);
  }

  // Scratch buffers for compareToLastTerm(). They are allocated by initUTF16Results(),
  // which is only invoked from an assert, so they stay null when assertions are disabled.
  UnicodeUtil.UTF16Result utf16Result1;
  UnicodeUtil.UTF16Result utf16Result2;

  /**
   * Allocates the two UTF-16 scratch buffers used by compareToLastTerm().
   * Currently used only by assert statements.
   *
   * @return always true, so the call can be wrapped in an {@code assert}
   */
  private boolean initUTF16Results() {
    utf16Result1 = new UnicodeUtil.UTF16Result();
    utf16Result2 = new UnicodeUtil.UTF16Result();
    return true;
  }

  /**
   * Compares the last recorded term (field name first, then UTF-16 code units of the text)
   * with the given term. Currently used only by an assert statement.
   *
   * @param fieldNumber field number of the candidate term
   * @param termBytes UTF-8 bytes of the candidate term
   * @param termBytesLength number of valid bytes in {@code termBytes}
   * @return a negative value when the last term sorts before the candidate, zero when they
   *         compare equal, a positive value otherwise
   */
  private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {

    if (lastFieldNumber != fieldNumber) {
      final String previousField = fieldInfos.fieldName(lastFieldNumber);
      final String currentField = fieldInfos.fieldName(fieldNumber);
      final int fieldOrder = previousField.compareTo(currentField);
      // A field named "" (empty string) compares equal to the name reported for the
      // initial field number -1, which is acceptable. Two different real field numbers
      // mapping to the same name is not, so that 0 is returned to the caller.
      if (fieldOrder != 0 || lastFieldNumber != -1) {
        return fieldOrder;
      }
    }

    UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
    UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);

    final int previousLength = utf16Result1.length;
    final int currentLength = utf16Result2.length;
    final int shared = Math.min(previousLength, currentLength);

    for (int pos = 0; pos < shared; pos++) {
      final int diff = utf16Result1.result[pos] - utf16Result2.result[pos];
      if (diff != 0) {
        return diff;
      }
    }

    if (previousLength == 0 && lastFieldNumber == -1) {
      // Nothing has been recorded yet: an empty term in a field named "" would compare
      // equal to the initial state, yet it is a legal first term.
      return -1;
    }
    return previousLength - currentLength;
  }
  


  

  // Sink for the quick-path data. initialize() sets it to a DocValuesWriteEmpty and, when
  // the quick path is enabled on the main writer, replaces it with a DocValuesWriteImpl.
  DocValuesWriter docValues;
  /**
   * Forwards a document id, together with the current term number, to {@code docValues}.
   * Does nothing on the index writer.
   *
   * @param docid the document id to collect
   */
  void collect(int docid)
  {
    synchronized (lock) {
      if (this.isIndex) {
        return;
      }
      docValues.collectDoc(docid, this.termNum);
    }
  }
  
 
  
  /**
   * Adds one term entry.
   *
   * <p>On the index writer this delegates to {@code addtii} and returns. On the main
   * writer it first adds the previously recorded term to the index writer whenever
   * {@code size} is a multiple of {@code indexInterval}. It then writes the
   * prefix-compressed term, the doc freq, the freq/prox pointer deltas against the previous
   * term, and the skip offset when {@code docFreq >= skipInterval}.
   *
   * @param term the term being added (only passed through to the index writer)
   * @param fieldNumber field number of the term
   * @param termBytes UTF-8 bytes of the term text
   * @param termBytesLength number of valid bytes in {@code termBytes}
   * @param ti the term's info
   * @throws IOException if writing fails
   */
  void add(Term term,int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
    throws IOException {

    assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
      (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
      "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
        " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
        " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");

    assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
    assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";

    if (isIndex) {
      // Index entries use their own layout.
      addtii(fieldNumber, termBytes, termBytesLength, ti);
      return;
    }

    final boolean atIndexBoundary = (size % indexInterval) == 0;
    if (atIndexBoundary) {
      // The index entry is the term recorded *before* this one.
      other.add(term, lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi);
    }

    writeTerm(fieldNumber, termBytes, termBytesLength);

    final long freqDelta = ti.freqPointer - lastTi.freqPointer;
    final long proxDelta = ti.proxPointer - lastTi.proxPointer;
    output.writeVInt(ti.docFreq);
    output.writeVLong(freqDelta);
    output.writeVLong(proxDelta);

    if (skipInterval <= ti.docFreq) {
      output.writeVInt(ti.skipOffset);
    }

    lastFieldNumber = fieldNumber;
    lastTi.set(ti);
    size++;
  }
  
   // Sentinel for lastquickfieldNumber meaning that startTerm() has not seen a field yet.
   static int unIntfieldnum=-9999;
  // Field number most recently recorded by startTerm().
  int lastquickfieldNumber=unIntfieldnum;
  // Data type and schema type of the current field; both are set by startTerm().
  FieldDatatype dataType;
  FieldType ft;
  // Number of terms counted by addTm() for the current field; startTerm() resets it to 0
  // when the field changes.
  int termNum=0;


  // Monitor held by collect(), startTerm(), addTm() and close() around docValues access.
  Object lock=new Object();
	/**
	 * Switches the quick-path state to a new field when {@code fieldNumber} differs from the
	 * field recorded by the previous call. The previous field (if any) is flushed, the new
	 * field is started on {@code docValues}, its schema type and data type are looked up,
	 * and the per-field term counter is reset. Calls for the same field are no-ops.
	 *
	 * @param term a term of the field; only its field name is read
	 * @param fieldNumber field number of the term
	 * @throws IOException if flushing or starting the field fails
	 */
	public void startTerm(Term term, int fieldNumber) throws IOException {
		synchronized (lock) {
			if (this.lastquickfieldNumber == fieldNumber) {
				return;
			}

			final boolean hadPreviousField = this.lastquickfieldNumber != unIntfieldnum;
			if (hadPreviousField) {
				this.docValues.flushFieldDoc(this.termNum);
			}

			this.docValues.start(fieldNumber, term.field);
			this.ft = this.schemainfo.getField(term.field).getType();
			this.dataType = UnInvertedFieldUtils.getDataType(this.ft);
			this.lastquickfieldNumber = fieldNumber;
			this.termNum = 0;
		}
	}
  
  
  /**
   * Records the per-term value of the current field on {@code docValues}.
   *
   * <p>Does nothing when the quick path is disabled or the current field is multi-valued.
   * Otherwise the term is turned into a long (see {@code quickTermValue}) and collected.
   * When the current term number has none of the {@code TermIndex.intervalMask} bits set,
   * the raw term text is additionally collected as an index entry. The term number is then
   * incremented.
   *
   * @param term the term to record
   * @param fieldNumber field number of the term (not read by this method)
   * @throws IOException if collecting fails
   */
  void addTm(Term term,int fieldNumber) throws IOException
  {
    if (!this.isquickTis) {
      return;
    }

    synchronized (lock) {
      if (this.ft.isMultiValued()) {
        return;
      }

      docValues.collectTm(quickTermValue(term));

      final boolean onIndexSlot = (this.termNum & TermIndex.intervalMask) == 0;
      if (onIndexSlot) {
        docValues.collectTmIndex(term.text());
      }

      this.termNum++;
    }
  }

  /**
   * Maps a term of the current field to the long stored for it: the parsed value for long
   * fields, the IEEE-754 bit pattern for double fields, and a CRC32 of the readable text
   * for every other data type.
   */
  private long quickTermValue(Term term) {
    if (dataType == FieldDatatype.d_long) {
      return Long.parseLong(ft.indexedToReadable(term.text()));
    }
    if (dataType == FieldDatatype.d_double) {
      Double parsed = MdrillUtils.ParseDouble(ft.indexedToReadable(term.text()));
      return Double.doubleToLongBits(parsed);
    }
    // NOTE(review): getBytes() uses the platform default charset, so the checksum of
    // non-ASCII text depends on the JVM configuration. Confirm how the reading side
    // computes it before pinning an explicit charset here.
    CRC32 checksum = new CRC32();
    checksum.update(new String(ft.indexedToReadable(term.text())).getBytes());
    return checksum.getValue();
  }
  

  
  /**
   * Writes one index entry. Invoked by {@code add} when this instance is the index writer.
   *
   * <p>The entry written to the main output consists of the doc freq, the freq and prox
   * pointers, the skip offset, the paired writer's current output position and the current
   * position of the quick terms-index output. The term itself is then appended to the
   * quick terms-index output.
   *
   * @param fieldNumber field number of the term
   * @param termBytes UTF-8 bytes of the term text
   * @param termBytesLength number of valid bytes in {@code termBytes}
   * @param ti the term's info
   * @throws IOException if writing fails
   */
  void addtii(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
  throws IOException {
    final IndexOutput entryOut = output;

    entryOut.writeInt(ti.docFreq);
    entryOut.writeLong(ti.freqPointer);
    entryOut.writeLong(ti.proxPointer);
    entryOut.writeInt(ti.skipOffset);

    final long pairedPosition = other.output.getFilePointer();
    entryOut.writeLong(pairedPosition);

    final long quickTiiPosition = this.outputQuickTii.getFilePointer();
    entryOut.writeLong(quickTiiPosition);

    writeTermTii(fieldNumber, termBytes, termBytesLength);
    size++;
  }

  /**
   * Writes a term prefix-compressed against the previously written term: shared prefix
   * length, suffix length, suffix bytes, then the field number. Afterwards the term is
   * remembered as the new "last term".
   *
   * @param fieldNumber field number of the term
   * @param termBytes UTF-8 bytes of the term text
   * @param termBytesLength number of valid bytes in {@code termBytes}
   * @throws IOException if writing fails
   */
  private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
       throws IOException {

    // TODO: UTF16toUTF8 could tell us this prefix
    final int comparable = Math.min(termBytesLength, lastTermBytesLength);
    int prefix = 0;
    while (prefix < comparable && termBytes[prefix] == lastTermBytes[prefix]) {
      prefix++;
    }
    final int suffix = termBytesLength - prefix;

    output.writeVInt(prefix);                      // shared prefix length
    output.writeVInt(suffix);                      // number of bytes that differ
    output.writeBytes(termBytes, prefix, suffix);  // the differing bytes
    output.writeVInt(fieldNumber);                 // field number

    if (termBytesLength > lastTermBytes.length) {
      lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength);
    }
    // The first 'prefix' bytes already match, so only the suffix needs copying.
    System.arraycopy(termBytes, prefix, lastTermBytes, prefix, suffix);
    lastTermBytesLength = termBytesLength;
  }
  

  
	/**
	 * Appends a term to the quick terms-index output without prefix compression: the full
	 * byte length, all term bytes, then the field number.
	 *
	 * @param fieldNumber field number of the term
	 * @param termBytes UTF-8 bytes of the term text
	 * @param termBytesLength number of valid bytes in {@code termBytes}
	 * @throws IOException if writing fails
	 */
	private void writeTermTii(int fieldNumber, byte[] termBytes,
			int termBytesLength) throws IOException {
		final IndexOutput quickOut = this.outputQuickTii;
		quickOut.writeVInt(termBytesLength);               // full term length
		quickOut.writeBytes(termBytes, 0, termBytesLength); // full term bytes
		quickOut.writeVInt(fieldNumber);                   // field number
	}

  /**
   * Called to complete TermInfos creation.
   *
   * <p>Writes the entry count to the size output, flushes and closes {@code docValues},
   * then closes the quick terms-index output (if any), the size output and the main
   * output. The main writer finally closes its paired index writer.
   *
   * <p>Every close is attempted even when an earlier step throws, so a failure while
   * writing or flushing no longer leaves the remaining outputs open. When several steps
   * fail, the exception from the last failing step is the one that propagates.
   *
   * @throws IOException if writing, flushing or closing fails
   */
  public void close() throws IOException {
    try {
      try {
        outputSize.writeLong(size);

        synchronized (lock) {
          try {
            this.docValues.flushFieldDoc(this.termNum);
            this.docValues.flushPosTo(outputSize);
            this.docValues.free();
          } finally {
            // Mirrors the failure paths in the constructor and initialize(), which also
            // call close() without a preceding free().
            this.docValues.close();
          }
        }
      } finally {
        // Same close order as on the success path: quick tii, size, main output.
        try {
          if (outputQuickTii != null) {
            outputQuickTii.close();
          }
        } finally {
          try {
            outputSize.close();
          } finally {
            output.close();
          }
        }
      }
    } finally {
      if (!isIndex) {
        other.close();
      }
    }
  }

}