/* * Copyright 2010 Bizosys Technologies Limited * * Licensed to the Bizosys Technologies Limited (Bizosys) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The Bizosys licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.bizosys.hsearch.outpipe; import java.io.IOException; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Vector; import java.util.concurrent.Callable; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.ResultScanner; import org.apache.hadoop.hbase.client.Scan; import com.bizosys.hsearch.filter.Storable; import com.bizosys.hsearch.filter.TermFilter; import com.bizosys.hsearch.hbase.HBaseFacade; import com.bizosys.hsearch.hbase.HTableWrapper; import com.bizosys.hsearch.index.DocumentType; import com.bizosys.hsearch.index.TermList; import com.bizosys.hsearch.index.TermType; import com.bizosys.hsearch.query.QueryLog; import com.bizosys.hsearch.query.QueryTerm; /** * This implements callable interface for execution in parallel * This actually executes and fetches IDs from the HBase table. * @author karan * */ class SequenceProcessorFindHBase implements Callable<Object> { public TermFilter tf; public List<byte[]> foundBuckets = new Vector<byte[]>(); protected boolean isBlockCache = true; protected int scanIpcLimit = 300; protected Map<Long, TermList> lastTermLists = null; protected long fromTime = -1; protected long toTime = System.currentTimeMillis(); protected QueryTerm term = null; public SequenceProcessorFindHBase(QueryTerm term, List<byte[]> findWithinBuckets) { this.term = term; int totalBytes = 6 /** Hashcode + DocType + TermType*/; if ( null != findWithinBuckets) { totalBytes = totalBytes + findWithinBuckets.size() * 8; } int pos = 0; byte[] filterBytes = new byte[totalBytes]; byte[] hashBytes = Storable.putInt(term.wordStemmed.hashCode()); System.arraycopy(hashBytes, 0, filterBytes, pos, 4); pos = pos + 4; filterBytes[pos++] = term.docTypeCode; filterBytes[pos++] = term.termTypeCode; if ( null != findWithinBuckets) { for (byte[] bucket: findWithinBuckets) { System.arraycopy(bucket, 0, filterBytes, pos, 8); pos = pos + 8; } } this.tf = new TermFilter(filterBytes); } /** * This filters based on the last term ids * The subset is only kept. * Non matching buckets are removed and Document positions marked -1 * @param lastMustTerm */ public void setFilterByIds(QueryTerm lastMustTerm) { if ( null == lastMustTerm.foundIds) return; if ( 0 == lastMustTerm.foundIds.size()) return; this.lastTermLists = lastMustTerm.foundIds; } /** * Go to respective table, colFamily, call * Pass the Matching IDs, Term Type, Document Type, Security Information * Collect only matching Document Sequences */ public Object call() throws Exception { QueryLog.l.debug("SequenceProcessorFindHBase > Call START"); if ( null == this.term) return null; /** * Step 1 Identify table, family and column */ char tableName = this.term.lang.getTableName(this.term.wordStemmed); char familyName = this.term.lang.getColumnFamily(this.term.wordStemmed); char colName = this.term.lang.getColumn(this.term.wordStemmed); if ( QueryLog.l.isDebugEnabled()) { StringBuilder sb = new StringBuilder(); sb.append("SequenceProcessorFindHBase > Term:").append(this.term.wordOrig); sb.append(" , Table [").append(tableName); sb.append("] , Family [").append(familyName); sb.append("] , Column [").append(colName).append(']'); QueryLog.l.debug(sb.toString()); } /** * Step 2 Configure Filtering mechanism */ HTableWrapper table = null; HBaseFacade facade = null; ResultScanner scanner = null; try { facade = HBaseFacade.getInstance(); table = facade.getTable(new String(new char[]{tableName})); /** * Configure the scanning mechanism. */ byte[] familyB = new byte[]{(byte)familyName}; byte[] nameB = new byte[]{(byte)colName}; Scan scan = configScanner(familyB, nameB); scanner = table.getScanner(scan); byte[] storedB = null; byte[] row = null; boolean hasTypeFilter = ( DocumentType.NONE_TYPECODE != term.docTypeCode || TermType.NONE_TYPECODE != term.termTypeCode); readScanner(scanner, familyB, nameB, storedB, row, hasTypeFilter); } catch ( IOException ex) { QueryLog.l.fatal("SequenceProcessorFindHBase:", ex); return null; } finally { if ( null != scanner) scanner.close(); if ( null != table ) facade.putTable(table); } return null; } /** * Parses the scanner. Differ loading terms as much as possible. * @param scanner * @param familyB * @param nameB * @param storedB * @param row * @param hasTypeFilter * @param ignorePos */ protected void readScanner(ResultScanner scanner, byte[] familyB, byte[] nameB, byte[] storedB, byte[] row, boolean hasTypeFilter) { TermList lastTermL; boolean hasElementsLeft; for (Result r: scanner) { if ( null == r) continue; if ( r.isEmpty()) continue; storedB = r.getValue(familyB, nameB); if ( null == storedB) continue; row = r.getRow(); long rowId = Storable.getLong(0, row); lastTermL = null; if ( !(null == this.lastTermLists || this.term.isOptional) ) { lastTermL = this.lastTermLists.get(rowId); if ( null == lastTermL) continue; } TermList foundTermL = new TermList(); foundTermL.loadTerms(storedB); if ( null != lastTermL) { hasElementsLeft = foundTermL.intersect(lastTermL); if ( ! hasElementsLeft ) continue; } /** * There are definite ID subsets in this bucket. */ this.foundBuckets.add(row); this.term.foundIds.put(rowId, foundTermL); } if ( QueryLog.l.isDebugEnabled()) { int foundT = ( null == this.term.foundIds) ? 0 : this.term.foundIds.size(); QueryLog.l.debug( "SequenceProcessorFindHBase > Matching Terms > " + foundT); } } /** * Configure the remote filtering mechanism. * @param familyB * @param nameB * @return * @throws IOException */ protected Scan configScanner(byte[] familyB, byte[] nameB) throws IOException { Scan scan = new Scan(); scan.setCacheBlocks(isBlockCache); scan.setCaching(scanIpcLimit); scan = scan.addColumn(familyB, nameB); scan.setMaxVersions(1); if ( -1 != fromTime) scan = scan.setTimeRange(fromTime, toTime); this.tf.addColumn(familyB, nameB); scan = scan.setFilter(this.tf); return scan; } }