/**
* This software is licensed to you under the Apache License, Version 2.0 (the
* "Apache License").
*
* LinkedIn's contributions are made under the Apache License. If you contribute
* to the Software, the contributions will be deemed to have been made under the
* Apache License, unless you expressly indicate otherwise. Please do not make any
* contributions that would be inconsistent with the Apache License.
*
* You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, this software
* distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
* License for the specific language governing permissions and limitations for the
* software governed under the Apache License.
*
* © 2012 LinkedIn Corp. All Rights Reserved.
*/
package com.senseidb.indexing.hadoop.keyvalueformat;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.RAMDirectorySerializer;
import com.senseidb.indexing.hadoop.reduce.RAMDirectoryUtil;
import com.senseidb.indexing.hadoop.util.SenseiJobConfig;
/**
* An intermediate form for one or more parsed Lucene documents and/or
* delete terms. It actually uses Lucene file format as the format for
* the intermediate form by using RAM dir files.
*
* Note: If process(*) is ever called, closeWriter() should be called.
* Otherwise, no need to call closeWriter().
*/
public class IntermediateForm implements Writable {

  private Configuration conf = null;
  // In-memory directory holding the partial Lucene index built so far.
  private RAMDirectory dir;
  // Lazily created on the first process(*) call; null when no writer is open.
  private IndexWriter writer;
  // Number of documents contributed to this form, directly or via merged forms.
  private int numDocs;

  /**
   * Constructor: starts with an empty RAM directory, no open writer and
   * a document count of zero.
   * @throws IOException
   */
  public IntermediateForm() throws IOException {
    dir = new RAMDirectory();
    writer = null;
    numDocs = 0;
  }

  /**
   * Configure using an index update configuration. Currently only used to
   * look up {@link SenseiJobConfig#MAX_FIELD_LENGTH} when the writer is created.
   * @param iconf the index update configuration
   */
  public void configure(Configuration iconf) {
    this.conf = iconf;
  }

  /**
   * Get the ram directory of the intermediate form.
   * @return the ram directory
   */
  public Directory getDirectory() {
    return dir;
  }

  /**
   * This method is used by the index update mapper and processes a document
   * operation into the current intermediate form.
   * @param doc input document operation
   * @param analyzer the analyzer used to index this document
   * @throws IOException
   */
  public void process(Document doc, Analyzer analyzer) throws IOException {
    if (writer == null) {
      // The writer itself is created with a null analyzer because an
      // analyzer is passed explicitly with each addDocument call.
      writer = createWriter();
    }
    writer.addDocument(doc, analyzer);
    numDocs++;
  }

  /**
   * This method is used by the index update combiner and processes an
   * intermediate form into the current intermediate form. More specifically,
   * the input intermediate forms are a single-document ram index and/or a
   * single delete term.
   * @param form the input intermediate form
   * @throws IOException
   */
  public void process(IntermediateForm form) throws IOException {
    if (form.dir.sizeInBytes() > 0) {
      if (writer == null) {
        writer = createWriter();
      }
      writer.addIndexesNoOptimize(new Directory[] { form.dir });
      // Account for every document in the merged form, not just one;
      // the previous numDocs++ under-counted after a combine step.
      numDocs += form.numDocs;
    }
  }

  /**
   * Close the Lucene index writer associated with the intermediate form,
   * if created. Do not close the ram directory. In fact, there is no need
   * to close a ram directory.
   * @throws IOException
   */
  public void closeWriter() throws IOException {
    if (writer != null) {
      writer.optimize();
      writer.close();
      writer = null;
    }
  }

  /**
   * The total size of files in the directory and ram used by the index writer.
   * It does not include memory used by the delete list.
   * @return the total size in bytes
   */
  public long totalSizeInBytes() throws IOException {
    long size = dir.sizeInBytes();
    if (writer != null) {
      size += writer.ramSizeInBytes();
    }
    return size;
  }

  /* (non-Javadoc)
   * @see java.lang.Object#toString()
   */
  @Override
  public String toString() {
    // The dangling ", numDeletes=" fragment (with no value) was removed:
    // this class does not track delete terms.
    StringBuilder buffer = new StringBuilder();
    buffer.append(this.getClass().getSimpleName());
    buffer.append("[numDocs=");
    buffer.append(numDocs);
    buffer.append("]");
    return buffer.toString();
  }

  /**
   * Create an index writer over the ram directory, keeping only the last
   * commit and honoring the configured max field length, if any.
   */
  private IndexWriter createWriter() throws IOException {
    IndexWriter writer =
        new IndexWriter(dir, null, new KeepOnlyLastCommitDeletionPolicy(), MaxFieldLength.UNLIMITED);
    writer.setUseCompoundFile(true); // use compound file format to speed up
    if (conf != null) {
      // A non-positive value means "not configured"; keep UNLIMITED in that case.
      int maxFieldLength = conf.getInt(SenseiJobConfig.MAX_FIELD_LENGTH, -1);
      if (maxFieldLength > 0) {
        writer.setMaxFieldLength(maxFieldLength);
      }
    }
    return writer;
  }

  /**
   * Discard any buffered index data so this instance can be reused
   * (e.g. before deserializing into it). The writer must already be closed.
   */
  private void resetForm() throws IOException {
    if (dir.sizeInBytes() > 0) {
      // it's ok if we don't close a ram directory
      dir.close();
      // an alternative is to delete all the files and reuse the ram directory
      dir = new RAMDirectory();
    }
    assert (writer == null);
    numDocs = 0;
  }

  // ///////////////////////////////////
  // Writable
  // ///////////////////////////////////

  /* (non-Javadoc)
   * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
   */
  public void write(DataOutput out) throws IOException {
    String[] files = dir.listAll();
    RAMDirectoryUtil.writeRAMFiles(out, dir, files);
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
   */
  public void readFields(DataInput in) throws IOException {
    resetForm();
    RAMDirectoryUtil.readRAMFiles(in, dir);
  }
}