/*
 * Copyright 2010 Bizosys Technologies Limited
 *
 * Licensed to the Bizosys Technologies Limited (Bizosys) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The Bizosys licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.bizosys.hsearch.inpipe;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.bizosys.hsearch.filter.Storable;
import com.bizosys.hsearch.hbase.HWriter;
import com.bizosys.hsearch.hbase.NV;
import com.bizosys.hsearch.index.Doc;
import com.bizosys.hsearch.index.IndexLog;
import com.bizosys.hsearch.index.Term;
import com.bizosys.hsearch.index.TermColumns;
import com.bizosys.hsearch.index.TermFamilies;
import com.bizosys.hsearch.index.TermList;
import com.bizosys.hsearch.index.TermTables;
import com.bizosys.hsearch.schema.ILanguageMap;
import com.bizosys.hsearch.schema.SchemaManager;
import com.bizosys.hsearch.util.ObjectFactory;
import com.bizosys.hsearch.util.Record;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.conf.Configuration;
import com.bizosys.oneline.pipes.PipeIn;

/**
 * Saves the term vector to the index table in batch mode. Batch mode expects that:
 * 1) The whole batch falls under one bucket only.
 * 2) Each bucket is overwritten with no merging.
 * 3) Only one client operates on a bucket at a time, so thread safety is not provided.
 * @author karan
 */
public class SaveToIndexBatch implements PipeIn {

	int docMergeFactor = 1000;

	/** Term tables arranged by bucket id */
	Map<Long, TermTables> mergedTermTables = null;

	/** Byte images of already flushed term lists, arranged by bucket id */
	Map<Long, Map<String, byte[]>> mergedBytes = new HashMap<Long, Map<String, byte[]>>(2);

	int bufferSize = 0;
	boolean isIdMappingEnabled = true;

	public SaveToIndexBatch() {
	}

	public SaveToIndexBatch(int docMergeFactor, boolean isIdMappingEnabled) {
		this.docMergeFactor = docMergeFactor;
		this.isIdMappingEnabled = isIdMappingEnabled;
	}

	public void visit(Object docObj, boolean multiWriter) throws ApplicationFault, SystemFault {
		if (null == docObj) throw new ApplicationFault("No document");
		Doc doc = (Doc) docObj;
		if (null == doc.bucketId || null == doc.docSerialId)
			throw new ApplicationFault("Ids missed from document :" + doc.toString());

		if (null == doc.terms) return;
		if (null == doc.terms.all) return;

		ILanguageMap map = SchemaManager.getInstance().getLanguageMap(doc.meta.locale);

		// The guard above guarantees bucketId is non-null here.
		TermTables termTable = new TermTables(new Storable(doc.bucketId), multiWriter);

		short docId = doc.docSerialId;
		for (Term term : doc.terms.getTermList()) {
			term.setDocumentPosition(docId);
			termTable.add(term, map);
		}
		bufferSize = bufferSize + doc.terms.getTermList().size();
		doc.terms.closeTermList();

		// Arrange all the terms from the documents into term buckets.
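		// The bucket-level map below is pooled through ObjectFactory: it is
		// borrowed lazily on the first visit and returned in commit(). Each
		// document's TermTables is folded into the entry for its bucket, so
		// repeated visits to the same bucket accumulate into a single
		// TermTables instance that commit() later persists.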
		if (null == mergedTermTables)
			mergedTermTables = ObjectFactory.getInstance().getBucketTermTable();
		buildTermTables(doc.bucketId, mergedTermTables, termTable);

		// ************** Check for intermediate flushing **************
		if (bufferSize > 1000000) {
			if (IndexLog.l.isInfoEnabled())
				IndexLog.l.info("Flush Intermediate Size > " + bufferSize);
			flushIntermediate(this.mergedBytes, mergedTermTables);
		}
	}

	/**
	 * Persists the index.
	 */
	public void commit(boolean multiWriter) throws ApplicationFault, SystemFault {
		if (null == mergedTermTables) return;

		if (IndexLog.l.isDebugEnabled())
			IndexLog.l.debug(printMergedTT(mergedTermTables));

		if (mergedTermTables.size() > 0) {
			flushIntermediate(this.mergedBytes, mergedTermTables);
			this.mergedTermTables.clear();
		}
		ObjectFactory.getInstance().putBucketTermTable(mergedTermTables);

		if (this.mergedBytes.size() == 0) return;

		try {
			for (long bucketId : this.mergedBytes.keySet()) {
				if (IndexLog.l.isInfoEnabled()) IndexLog.l.info("Commit > " + bucketId);
				Map<String, byte[]> values = this.mergedBytes.get(bucketId);

				// Regroup the flushed bytes by table; the three-character key
				// encodes table, family and column (see flushIntermediate).
				Map<Character, List<NV>> tableNVs = new HashMap<Character, List<NV>>();
				for (String tfc : values.keySet()) {
					char tab = tfc.charAt(0);
					List<NV> nvs = tableNVs.get(tab);
					if (null == nvs) {
						nvs = ObjectFactory.getInstance().getNVList();
						tableNVs.put(tab, nvs);
					}
					nvs.add(new NV(
						new byte[]{(byte) tfc.charAt(1)},
						new byte[]{(byte) tfc.charAt(2)},
						new Storable(values.get(tfc))));
				}
				values.clear();

				try {
					for (Character tab : tableNVs.keySet()) {
						List<NV> tabNV = tableNVs.get(tab);
						Record rec = new Record(new Storable(bucketId), tabNV);
						HWriter.getInstance(multiWriter).insert(tab.toString(), rec);
						ObjectFactory.getInstance().putNVList(tabNV);
					}
				} catch (IOException ex) {
					throw new SystemFault(ex);
				} finally {
					tableNVs.clear();
				}
			}
		} finally {
			// Clear the buffers only after the iteration over mergedBytes has
			// completed; clearing the map from inside the loop would make the
			// key-set iterator throw ConcurrentModificationException.
			for (Map<String, byte[]> bucketM : mergedBytes.values()) bucketM.clear();
			mergedBytes.clear();
		}
	}

	/**
	 * Merges a single document's term table into the bucket-wise map.
	 * @param bucketId The bucket the document belongs to
	 * @param mergedTermTables The term tables accumulated so far, keyed by bucket id
	 * @param docTermTable The term table built from the just-visited document
	 */
	private void buildTermTables(long bucketId,
			Map<Long, TermTables> mergedTermTables, TermTables docTermTable) {

		if (mergedTermTables.containsKey(bucketId)) {
			TermTables mtt = mergedTermTables.get(bucketId);
			mtt.addInSameBucket(docTermTable);
		} else {
			mergedTermTables.put(bucketId, docTermTable);
		}
	}

	/**
	 * Transforms the buffered term objects into their byte representation.
	 * This releases the term objects early and allows more documents to be
	 * merged into the buffer.
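	 * <p>
	 * Each entry of the per-bucket byte map is keyed by a three-character
	 * string of table, column family and column qualifier. When the same key
	 * is flushed again, the fresh terms are merged with the previously
	 * flushed image through TermList.setExistingBytes(byte[]) before the
	 * combined bytes are stored back.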
	 * @param allByteValues Byte images of flushed term lists, keyed by bucket id
	 * @param mtt The merged term tables to flush
	 */
	private void flushIntermediate(Map<Long, Map<String, byte[]>> allByteValues,
			Map<Long, TermTables> mtt) {

		if (null == mtt) return;
		if (0 == mtt.size()) return;

		long s = System.currentTimeMillis();
		StringBuilder sb = new StringBuilder(10);

		for (long bucketId : mtt.keySet()) {
			TermTables termTables = mtt.get(bucketId);
			for (char tableName : termTables.tables.keySet()) {
				TermFamilies tf = termTables.tables.get(tableName);
				for (char family : tf.families.keySet()) {
					TermColumns tcs = tf.families.get(family);
					for (char col : tcs.columns.keySet()) {

						sb.append(tableName).append(family).append(col);
						String tfc = sb.toString();
						sb.setLength(0); // reset for the next table/family/column key

						TermList tl = tcs.columns.get(col);
						Map<String, byte[]> bytesV = allByteValues.get(bucketId);
						if (null == bytesV) {
							bytesV = new HashMap<String, byte[]>(1);
							bytesV.put(tfc, tl.toBytes());
							allByteValues.put(bucketId, bytesV);
						} else if (bytesV.containsKey(tfc)) {
							// Merge with the bytes flushed earlier for this key.
							tl.setExistingBytes(bytesV.get(tfc));
							bytesV.put(tfc, tl.toBytes());
						} else {
							bytesV.put(tfc, tl.toBytes());
						}
						tl.cleanup();
					}
				}
			}
			termTables.cleanup();
		}
		bufferSize = 0;

		if (InpipeLog.l.isDebugEnabled()) InpipeLog.l.debug(
			"SaveToIndexBatch > flushIntermediate execution time " +
			(System.currentTimeMillis() - s) + " ms");
	}

	public void init(Configuration conf) throws ApplicationFault, SystemFault {
		this.docMergeFactor = conf.getInt("index.documents.merge", 10000);
		this.isIdMappingEnabled = conf.getBoolean("idmapping.enable", true);
	}

	public PipeIn getInstance() {
		return new SaveToIndexBatch(this.docMergeFactor, this.isIdMappingEnabled);
	}

	public String getName() {
		return "SaveToIndexBatch";
	}

	/**
	 * Creates a string representation of the merged term tables.
	 * @param mergedTermTables The bucket-wise term tables
	 * @return A bucket/table/family/column dump, one level per indent
	 */
	private String printMergedTT(Map<Long, TermTables> mergedTermTables) {
		StringBuilder sb = new StringBuilder();
		for (long bucket : mergedTermTables.keySet()) {
			sb.append("Bucket:").append(bucket);
			TermTables tt = mergedTermTables.get(bucket);
			for (char table : tt.tables.keySet()) {
				sb.append("\n\tTable:").append(table);
				TermFamilies tf = tt.tables.get(table);
				for (char family : tf.families.keySet()) {
					sb.append("\n\t\tFamily:").append(family);
					TermColumns tc = tf.families.get(family);
					for (char col : tc.columns.keySet()) {
						sb.append("\n\t\t\tColumn:").append(col);
						sb.append(tc.columns.get(col).toString());
					}
				}
			}
		}
		return sb.toString();
	}
}
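/*
 * Usage sketch, assuming visit(Object, boolean), commit(boolean) and
 * init(Configuration) above form the PipeIn contract invoked by the pipeline
 * runner, and that each Doc arrives with bucketId, docSerialId and terms set:
 *
 *   PipeIn pipe = new SaveToIndexBatch().getInstance();
 *   pipe.init(conf);                               // index.documents.merge, idmapping.enable
 *   for (Doc doc : docs) pipe.visit(doc, false);   // buffer term vectors per bucket
 *   pipe.commit(false);                            // flush merged bytes through HWriter
 */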