/**
 * This software is licensed to you under the Apache License, Version 2.0 (the
 * "Apache License").
 *
 * LinkedIn's contributions are made under the Apache License. If you contribute
 * to the Software, the contributions will be deemed to have been made under the
 * Apache License, unless you expressly indicate otherwise. Please do not make any
 * contributions that would be inconsistent with the Apache License.
 *
 * You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, this software
 * distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
 * License for the specific language governing permissions and limitations for the
 * software governed under the Apache License.
 *
 * © 2012 LinkedIn Corp. All Rights Reserved.
 */
package com.senseidb.indexing.hadoop.keyvalueformat;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.RAMDirectorySerializer;

import com.senseidb.indexing.hadoop.reduce.RAMDirectoryUtil;
import com.senseidb.indexing.hadoop.util.SenseiJobConfig;

/**
 * An intermediate form for one or more parsed Lucene documents and/or delete
 * terms. The intermediate form is stored in the Lucene index file format,
 * held as files in a RAM directory.
 *
 * Note: if process(*) is ever called, closeWriter() must be called before the
 * form is serialized; otherwise there is no need to call closeWriter().
 */
public class IntermediateForm implements Writable {

  private Configuration conf = null;
  private RAMDirectory dir;
  private IndexWriter writer;
  private int numDocs;

  /**
   * Constructor
   * @throws IOException
   */
  public IntermediateForm() throws IOException {
    dir = new RAMDirectory();
    writer = null;
    numDocs = 0;
  }

  /**
   * Configure using an index update configuration.
   * @param iconf the index update configuration
   */
  public void configure(Configuration iconf) {
    this.conf = iconf;
  }

  /**
   * Get the RAM directory of the intermediate form.
   * @return the RAM directory
   */
  public Directory getDirectory() {
    return dir;
  }

  /**
   * Used by the index update mapper: processes a document operation into the
   * current intermediate form.
   * @param doc input document operation
   * @param analyzer the analyzer
   * @throws IOException
   */
  public void process(Document doc, Analyzer analyzer) throws IOException {
    if (writer == null) {
      // the writer is created with a null analyzer because an analyzer
      // is passed explicitly to each addDocument call
      writer = createWriter();
    }

    writer.addDocument(doc, analyzer);
    numDocs++;
  }
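
  // A minimal mapper-side usage sketch, not part of this class's API. The
  // names "jobConf", "doc", "analyzer", "shardKey", and "output" are
  // hypothetical placeholders for whatever the surrounding mapper provides:
  //
  //   IntermediateForm form = new IntermediateForm();
  //   form.configure(jobConf);         // optional; picks up MAX_FIELD_LENGTH
  //   form.process(doc, analyzer);     // may be called for several documents
  //   form.closeWriter();              // required once process(..) was called
  //   output.collect(shardKey, form);  // Writable: serialized via write(DataOutput)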
  /**
   * Used by the index update combiner: processes an intermediate form into the
   * current intermediate form. More specifically, each input intermediate form
   * is a single-document RAM index and/or a single delete term.
   * @param form the input intermediate form
   * @throws IOException
   */
  public void process(IntermediateForm form) throws IOException {
    if (form.dir.sizeInBytes() > 0) {
      if (writer == null) {
        writer = createWriter();
      }

      writer.addIndexesNoOptimize(new Directory[] { form.dir });
      numDocs++;
    }
  }

  /**
   * Close the Lucene index writer associated with the intermediate form,
   * if created. Do not close the RAM directory; in fact, there is no need
   * to close a RAM directory.
   * @throws IOException
   */
  public void closeWriter() throws IOException {
    if (writer != null) {
      writer.optimize();
      writer.close();
      writer = null;
    }
  }

  /**
   * The total size of the files in the directory plus the RAM used by the
   * index writer. It does not include memory used by the delete list.
   * @return the total size in bytes
   */
  public long totalSizeInBytes() throws IOException {
    long size = dir.sizeInBytes();
    if (writer != null) {
      size += writer.ramSizeInBytes();
    }
    return size;
  }

  /* (non-Javadoc)
   * @see java.lang.Object#toString()
   */
  public String toString() {
    StringBuilder buffer = new StringBuilder();
    buffer.append(this.getClass().getSimpleName());
    buffer.append("[numDocs=");
    buffer.append(numDocs);
    buffer.append("]");
    return buffer.toString();
  }

  private IndexWriter createWriter() throws IOException {
    IndexWriter writer =
        // new IndexWriter(dir, false, null, new KeepOnlyLastCommitDeletionPolicy());
        new IndexWriter(dir, null, new KeepOnlyLastCommitDeletionPolicy(),
            MaxFieldLength.UNLIMITED);
    writer.setUseCompoundFile(true); // use the compound file format to speed things up
    if (conf != null) {
      int maxFieldLength = conf.getInt(SenseiJobConfig.MAX_FIELD_LENGTH, -1);
      if (maxFieldLength > 0) {
        writer.setMaxFieldLength(maxFieldLength);
      }
    }
    return writer;
  }

  private void resetForm() throws IOException {
    if (dir.sizeInBytes() > 0) {
      // it's ok if we don't close a RAM directory
      dir.close();
      // an alternative is to delete all the files and reuse the RAM directory
      dir = new RAMDirectory();
    }
    assert (writer == null);
    numDocs = 0;
  }

  // ///////////////////////////////////
  // Writable
  // ///////////////////////////////////

  /* (non-Javadoc)
   * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
   */
  public void write(DataOutput out) throws IOException {
    String[] files = dir.listAll();
    RAMDirectoryUtil.writeRAMFiles(out, dir, files);
    // RAMDirectorySerializer.toDataOutput(out, dir);
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
   */
  public void readFields(DataInput in) throws IOException {
    resetForm();
    RAMDirectoryUtil.readRAMFiles(in, dir);
    // numDocs = 0;
    // dir = RAMDirectorySerializer.fromDataInput(in);
  }

}
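
// A minimal combiner/reducer-side sketch of how forms are merged and shipped,
// assuming "forms" is an Iterator<IntermediateForm> over single-document forms
// and "out" is a DataOutput; both names are illustrative, not part of this class:
//
//   IntermediateForm merged = new IntermediateForm();
//   while (forms.hasNext()) {
//     merged.process(forms.next()); // addIndexesNoOptimize on the RAM directory
//   }
//   merged.closeWriter();           // optimize and release the writer
//   merged.write(out);              // streams the RAM directory files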