package com.alimama.mdrill.index.utils;
import java.io.IOException;
import java.util.Date;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Trash;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.Directory;
import com.alimama.mdrill.hdfsDirectory.FileSystemDirectory;
/**
 * Writes one index shard into a directory on a Hadoop {@link FileSystem}.
 *
 * <p>The writer is opened on a {@link FileSystemDirectory} rooted at the output
 * path. In-memory batches ({@code RamWriter}) or arbitrary Lucene
 * {@link Directory} instances are merged in through {@link #process}, and the
 * shard is finalized with {@link #optimize()} followed by {@link #close()}.
 *
 * <p>NOTE(review): nothing in this class synchronizes access to its fields;
 * presumably each instance is used from a single thread - confirm with callers.
 */
public class ShardWriter {
    private static final Logger logger = Logger.getLogger(ShardWriter.class);

    /** Output location of the shard on the file system. */
    private Path perm;
    /** Lucene directory backed by {@link #perm}; set to null by {@link #close()}. */
    private FileSystemDirectory dir;
    /** Underlying Lucene writer; set to null by {@link #close()}. */
    private IndexWriter writer;
    /** Number of non-empty RamWriter batches merged so far. */
    private long numForms = 0;
    /** Total number of documents reported by the merged RamWriter batches. */
    private long numDocs = 0;

    /**
     * Creates a shard writer for the given output path.
     *
     * <p>If the path already exists it is first moved to the Hadoop trash via
     * {@link #moveToTrash(Configuration, Path)}; in both cases the path is then
     * (re)created with {@code mkdirs}.
     *
     * @param fs                 file system that hosts the index
     * @param indexOutputPathStr output path of the shard
     * @param iconf              configuration passed to the trash and to the directory
     * @throws IOException if the path cannot be prepared or the writer cannot be opened
     */
    public ShardWriter(FileSystem fs, String indexOutputPathStr, Configuration iconf) throws IOException {
        logger.info("Construct a shard writer " + indexOutputPathStr);
        perm = new Path(indexOutputPathStr);
        if (fs.exists(perm)) {
            // Get the previous output out of the way before writing a fresh index.
            moveToTrash(iconf, perm);
        }
        fs.mkdirs(perm);

        this.dir = new FileSystemDirectory(fs, perm, true, iconf);
        // No analyzer is supplied (null); the only document this class builds
        // itself (see addEmptyDoc) uses a NOT_ANALYZED field.
        writer = new IndexWriter(dir, null,
                new KeepOnlyLastCommitDeletionPolicy(),
                MaxFieldLength.UNLIMITED);
        // The merge scheduler is deliberately left at the IndexWriter default.
        writer.setMergeFactor(256);
        writer.setTermIndexInterval(128);
        writer.setUseCompoundFile(false);
        logger.info("finish Construct a shard writer " + indexOutputPathStr);
    }

    /**
     * @return the Lucene directory this shard writes to, or null after {@link #close()}
     */
    public FileSystemDirectory getDir() {
        return dir;
    }

    /**
     * @return total document count accumulated through {@link #process(RamWriter)}
     */
    public long getNumDocs() {
        return numDocs;
    }

    /**
     * Adds a single placeholder document carrying only the un-stored, un-analyzed
     * field {@code higoempty_emptydoc_s=1}.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException           on a low-level I/O error
     */
    public void addEmptyDoc() throws CorruptIndexException, IOException {
        Document empty = new Document();
        empty.add(new Field("higoempty_emptydoc_s", "1", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS));
        writer.addDocument(empty);
    }

    /**
     * Merges an in-memory batch into the shard. Batches that report zero
     * documents are ignored and do not affect the counters.
     *
     * @param form in-memory batch to merge
     * @throws IOException if merging fails
     */
    public void process(RamWriter form) throws IOException {
        int docs = form.getNumDocs();
        if (docs <= 0) {
            return;
        }
        numDocs += docs;
        numForms++;
        this.process(form.getDirectory());
    }

    /**
     * Merges all segments of the given directory into the shard (without
     * optimizing) and logs the elapsed wall-clock time in milliseconds.
     *
     * <p>This overload does not update the document/batch counters.
     *
     * @param dir directory whose index is added to this shard
     * @throws CorruptIndexException if either index is corrupt
     * @throws IOException           on a low-level I/O error
     */
    public void process(Directory dir) throws CorruptIndexException, IOException {
        long startMs = System.currentTimeMillis();
        writer.addIndexesNoOptimize(new Directory[] { dir });
        long elapsedMs = System.currentTimeMillis() - startMs;
        logger.info("process time " + elapsedMs);
    }

    /**
     * Closes the index writer and then the underlying directory. The directory
     * is closed even if closing the writer fails. Calling this method again
     * after it has run is a no-op.
     *
     * @throws IOException if closing the writer or the directory fails
     */
    public void close() throws IOException {
        if (writer == null) {
            // Already closed: both fields were nulled by the first call.
            return;
        }
        logger.info("Closing the shard writer, processed " + numForms + " forms " + numDocs + " docs ");
        try {
            // The reader handed out by getReader() is owned by the caller and must
            // be closed; previously it was discarded and leaked.
            // NOTE(review): the call is kept only for whatever side effect
            // (presumably a flush) the original author wanted - confirm whether
            // it is still needed before writer.close().
            IndexReader reader = writer.getReader();
            reader.close();
            writer.close();
        } finally {
            this.writer = null;
            this.dir.close();
            this.dir = null;
            logger.info("Closed Lucene index writer");
        }
        logger.info("Moved new index files to " + perm);
    }

    @Override
    public String toString() {
        return this.getClass().getName() + "@" + perm + "&";
    }

    /**
     * Optimizes the index, retrying on failure.
     *
     * <p>Up to 10 attempts are made with a 5 second pause between attempts.
     * There is no pause after the final attempt. If the thread is interrupted
     * while pausing, the interrupt status is restored and no further attempts
     * are made.
     *
     * @throws IOException wrapping the last failure if no attempt succeeded
     */
    public void optimize() throws IOException {
        final int maxAttempts = 10;
        final long retryDelayMs = 5000L;

        Throwable lastError = null;
        for (int attempt = 1; attempt <= maxAttempts; attempt++) {
            try {
                writer.optimize();
                return;
            } catch (CorruptIndexException e) {
                logger.error("optimize Corrupt Index error. ", e);
                lastError = e;
            } catch (IOException e) {
                logger.error("optimize IOException . ", e);
                lastError = e;
            }

            if (attempt == maxAttempts) {
                // Nothing left to retry, so do not delay the failure report.
                break;
            }
            try {
                Thread.sleep(retryDelayMs);
            } catch (InterruptedException e) {
                logger.error(e);
                // Preserve the interrupt for callers and stop retrying.
                Thread.currentThread().interrupt();
                break;
            }
        }
        // Reaching this point means every attempt made has failed.
        throw new IOException(lastError);
    }

    /**
     * Moves {@code path} to the Hadoop trash and then calls
     * {@link Trash#expunge()}. If the move is reported as not performed, an
     * error is logged and the method returns normally.
     *
     * @param conf configuration used to construct the {@link Trash}
     * @param path path to move to the trash
     * @throws IOException if the trash operations fail
     */
    public static void moveToTrash(Configuration conf, Path path)
            throws IOException {
        Trash trash = new Trash(conf);
        boolean moved = trash.moveToTrash(path);
        trash.expunge();
        if (!moved) {
            logger.error("Trash is not enabled or file is already in the trash.");
        }
    }
}