SequenceProcessorFindHBase.java example

Explorer
hsearch-obsolete-master
- src
/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.outpipe;

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
import java.util.concurrent.Callable;

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;

import com.bizosys.hsearch.filter.Storable;
import com.bizosys.hsearch.filter.TermFilter;
import com.bizosys.hsearch.hbase.HBaseFacade;
import com.bizosys.hsearch.hbase.HTableWrapper;
import com.bizosys.hsearch.index.DocumentType;
import com.bizosys.hsearch.index.TermList;
import com.bizosys.hsearch.index.TermType;
import com.bizosys.hsearch.query.QueryLog;
import com.bizosys.hsearch.query.QueryTerm;

/**
 * Scans one HBase term table for a single query term and collects the
 * matching buckets (rows) together with their term lists.
 * It implements {@link Callable} so that several terms can be looked up
 * in parallel.
 *
 * The remote {@link TermFilter} is initialised in the constructor with:
 * 4 bytes hash code of the stemmed word, 1 byte document type code,
 * 1 byte term type code, followed by 8 bytes for each bucket the search
 * is restricted to.
 *
 * @author karan
 *
 */
class SequenceProcessorFindHBase implements Callable<Object> {
	
	/** Bytes taken by Hashcode (4) + DocType (1) + TermType (1) in the filter */
	private static final int FILTER_HEADER_BYTES = 6;
	
	/** Bytes copied from each bucket id into the filter */
	private static final int BUCKET_ID_BYTES = 8;
	
	/** Filter handed over to the scan; built once in the constructor */
	public TermFilter tf;
	
	/** Row keys of all buckets which had at least one match */
	public List<byte[]> foundBuckets = new Vector<byte[]>();
	
	protected boolean isBlockCache = true;
	protected int scanIpcLimit = 300;
	
	/** Term lists of the previous must term, keyed by bucket id. null = no restriction */
	protected Map<Long, TermList> lastTermLists = null;
	
	/** Lower bound of the scan time range. -1 = no time range is applied */
	protected long fromTime = -1;
	protected long toTime = System.currentTimeMillis();
	protected QueryTerm term = null;
	
	/**
	 * Builds the filter bytes for the given term.
	 * @param term	The query term to look up. It is dereferenced here, so it must not be null.
	 * @param findWithinBuckets	Buckets to restrict the search to. null = no restriction.
	 * The first 8 bytes of every bucket are copied, so each entry needs at least 8 bytes.
	 */
	public SequenceProcessorFindHBase(QueryTerm term, List<byte[]> findWithinBuckets) {
		
		this.term = term;
		
		int totalBytes = FILTER_HEADER_BYTES;
		if ( null != findWithinBuckets) {
			totalBytes = totalBytes + findWithinBuckets.size() * BUCKET_ID_BYTES;
		}
		
		int pos = 0;

		byte[] filterBytes = new byte[totalBytes];
		byte[] hashBytes = Storable.putInt(term.wordStemmed.hashCode());
		System.arraycopy(hashBytes, 0, filterBytes, pos, 4);
		pos = pos + 4;
		filterBytes[pos++] = term.docTypeCode;
		filterBytes[pos++] = term.termTypeCode;
		
		if ( null != findWithinBuckets) {
			for (byte[] bucket: findWithinBuckets) {
				System.arraycopy(bucket, 0, filterBytes, pos, BUCKET_ID_BYTES);
				pos = pos + BUCKET_ID_BYTES;
			}
		}
		this.tf = new TermFilter(filterBytes);
	}
	
	/**
	 * Remembers the ids found for the last must term. While reading the 
	 * scanner, a non optional term keeps only those buckets which are also 
	 * present here and whose term list intersects with the remembered one.
	 * Nothing is changed when no ids are available.
	 * @param lastMustTerm	The previously processed must term
	 */
	public void setFilterByIds(QueryTerm lastMustTerm) {
		if ( null == lastMustTerm) return;
		if ( null == lastMustTerm.foundIds) return;
		if ( 0 == lastMustTerm.foundIds.size()) return;
		
		this.lastTermLists = lastMustTerm.foundIds;
	}

	/**
	 * Go to respective table, colFamily, call
	 * Pass the Matching IDs, Term Type, Document Type, Security Information
	 * Collect only matching Document Sequences.
	 * The results are stored in {@link #foundBuckets} and in the foundIds 
	 * of the term. An IOException is logged and not propagated.
	 * @return	Always null
	 */
	public Object call() throws Exception {
		
		QueryLog.l.debug("SequenceProcessorFindHBase > Call START");
		if ( null == this.term) return null;
		
		/**
		 * Step 1 Identify table, family and column
		 */
		char tableName = this.term.lang.getTableName(this.term.wordStemmed);
		char familyName = this.term.lang.getColumnFamily(this.term.wordStemmed);
		char colName = this.term.lang.getColumn(this.term.wordStemmed);
		if ( QueryLog.l.isDebugEnabled()) {
			StringBuilder sb = new StringBuilder();
			sb.append("SequenceProcessorFindHBase > Term:").append(this.term.wordOrig);
			sb.append(" , Table [").append(tableName);
			sb.append("] , Family [").append(familyName);
			sb.append("] , Column [").append(colName).append(']');
			QueryLog.l.debug(sb.toString());
		}
		
		/**
		 * Step 2 Configure Filtering mechanism 
		 */
		HTableWrapper table = null;
		HBaseFacade facade = null;
		ResultScanner scanner = null;

		try {

			facade = HBaseFacade.getInstance();
			table = facade.getTable(String.valueOf(tableName));
			
			/**
			 * Configure the scanning mechanism.
			 */
			byte[] familyB = new byte[]{(byte)familyName};
			byte[] nameB = new byte[]{(byte)colName};
			Scan scan = configScanner(familyB, nameB);
			scanner = table.getScanner(scan);
			
			boolean hasTypeFilter = ( DocumentType.NONE_TYPECODE != term.docTypeCode 
				|| TermType.NONE_TYPECODE != term.termTypeCode);
			
			/**
			 * The stored bytes and row arguments are only scratch values 
			 * for readScanner, so there is nothing to pass in.
			 */
			readScanner(scanner, familyB, nameB, null, null, hasTypeFilter);
			
		} catch ( IOException ex) {
			QueryLog.l.fatal("SequenceProcessorFindHBase:", ex);
			return null;
		} finally {
			/**
			 * Return the table even when closing the scanner fails.
			 */
			try {
				if ( null != scanner) scanner.close();
			} finally {
				if ( null != table ) facade.putTable(table);
			}
		}		
		return null;
	}

	/**
	 * Parses the scanner. Defer loading terms as much as possible: 
	 * a term list is loaded only after the bucket has passed the 
	 * last must term lookup.
	 * @param scanner	The scanner to iterate over
	 * @param familyB	Column family to read the stored value from
	 * @param nameB	Column name to read the stored value from
	 * @param storedB	Ignored. The value is read from each result.
	 * @param row	Ignored. The row key is read from each result.
	 * @param hasTypeFilter	Not used by this implementation
	 */
	protected void readScanner(ResultScanner scanner, byte[] familyB, byte[] nameB, byte[] storedB, byte[] row, boolean hasTypeFilter) {
		
		boolean restrictToLastTerm = 
			( null != this.lastTermLists && !this.term.isOptional );
		
		for (Result r: scanner) {
			if ( null == r) continue;
			if ( r.isEmpty()) continue;
			byte[] valueB = r.getValue(familyB, nameB);
			if ( null == valueB) continue;
			
			byte[] rowB = r.getRow();
			long rowId = Storable.getLong(0, rowB);
			
			TermList lastTermL = null;
			if ( restrictToLastTerm ) { 
				lastTermL = this.lastTermLists.get(rowId);
				if ( null == lastTermL) continue;
			}
			
			TermList foundTermL = new TermList();
			foundTermL.loadTerms(valueB);
			if ( null != lastTermL) {
				boolean hasElementsLeft = foundTermL.intersect(lastTermL);
				if ( ! hasElementsLeft ) continue;
			}

			/**
			 * There are definite ID subsets in this bucket.
			 * NOTE(review): foundIds is dereferenced here although the 
			 * debug block below allows it to be null - confirm that 
			 * QueryTerm always initializes it.
			 */ 
			this.foundBuckets.add(rowB);
			this.term.foundIds.put(rowId, foundTermL);
		}
		
		if ( QueryLog.l.isDebugEnabled()) {
			int foundT = ( null == this.term.foundIds) ? 
				0 : this.term.foundIds.size();
			QueryLog.l.debug(
				"SequenceProcessorFindHBase > Matching Terms > " + foundT);	
		}
	}

	/**
	 * Configure the remote filtering mechanism. The scan reads a single 
	 * version of the given column, applies the time range only when 
	 * fromTime is set and carries the term filter. 
	 * The column is also registered with the term filter.
	 * @param familyB	Column family to scan
	 * @param nameB	Column name to scan
	 * @return	The configured scan
	 * @throws IOException
	 */
	protected Scan configScanner(byte[] familyB, byte[] nameB) throws IOException {
		Scan scan = new Scan();
		scan.setCacheBlocks(isBlockCache);
		scan.setCaching(scanIpcLimit);
		scan = scan.addColumn(familyB, nameB);
		scan.setMaxVersions(1);
		if ( -1 != fromTime) scan = scan.setTimeRange(fromTime, toTime);
		
		this.tf.addColumn(familyB, nameB);
		scan = scan.setFilter(this.tf);
		
		return scan;
	}
}