/* * Copyright 2010 Bizosys Technologies Limited * * Licensed to the Bizosys Technologies Limited (Bizosys) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The Bizosys licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.bizosys.hsearch.filter; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.List; import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.filter.Filter; /** * Sending the complete document over the wire may Jam the network on a * heavy concurrent user base. This filter ensures sending the most * relevant section only. It also uses multiple Region servers to create * the teasers to serve a search request. * @author karan */ public class TeaserFilterMerged implements Filter { private static final char TEASER_HEADER = 't'; private static final char TEASER_DETAIL = 'u'; private static final byte[] TEASER_HEADER_BYTES = "t".getBytes(); private static final byte[] TEASER_DETAIL_BYTES = "u".getBytes(); /** * Default teaser section length */ short cutLength = 360; /** * Searched words */ byte[][] bWords = null; /** * Only extract these document serial numbers */ private int[] docSerials = null; /** * Default constructor * */ public TeaserFilterMerged(){} /** * Constructor * @param bWords Searched words * @param cutLength Teaser section length */ public TeaserFilterMerged(byte[][] bWords, short cutLength){ this.bWords = bWords; this.cutLength = cutLength; } public void setDocSerials(int[] docSerials) { this.docSerials = docSerials; } public boolean filterAllRemaining() { return false; } public boolean filterRow() { return false; } /** * last chance to drop entire row based on the sequence of filterValue() * calls. Eg: filter a row if it doesn't contain a specified column */ public void filterRow(List<KeyValue> kvL) { if ( null == kvL || null == docSerials ) return; if ( 0 == kvL.size()) return; MergedBlocks.Block teaserBlocks = new MergedBlocks.Block(); KeyValue prestine = getExistingBlocks(kvL, teaserBlocks); if ( null == prestine) return; byte[] header = teaserBlocks.header; if ( null == header) return; byte[] data = teaserBlocks.data; if ( null == data) return; int totalDocs = docSerials.length; int start=0; List<TeaserMarker> markings = FilterObjectFactory.getInstance().getTeaserMarker(); TeaserFilterCommon tf = new TeaserFilterCommon(bWords); for ( int i=0; i< totalDocs; i++) { int docSerial = docSerials[i]; start = MergedBlocks.readHeader(header, docSerial); if ( -1 == start) continue; TeaserMarker marker = new TeaserMarker(docSerial,data,start,tf,cutLength); markings.add(marker); } if ( 0 == markings.size()) { kvL.clear(); FilterObjectFactory.getInstance().putTeaserMarker(markings); return; } int dataLenTeaser = 0; for (TeaserMarker marker : markings) { dataLenTeaser = dataLenTeaser + marker.getNewSize(); } byte[] teaserHeader = new byte[markings.size() * 2]; byte[] teaserData = new byte[dataLenTeaser]; int teaserDataPos = 0, teaserHeaderPos=0; for (TeaserMarker marker : markings) { teaserHeader[teaserHeaderPos++] = (byte)(marker.serial >> 8 & 0xff); teaserHeader[teaserHeaderPos++] = (byte)(marker.serial & 0xff); teaserDataPos = marker.extract(teaserBlocks.data, teaserData , teaserDataPos); } FilterObjectFactory.getInstance().putTeaserMarker(markings); kvL.clear(); byte[] r = prestine.getRow(); byte[] f = prestine.getFamily(); kvL.add(new KeyValue(r,f,TEASER_HEADER_BYTES,teaserHeader)); kvL.add(new KeyValue(r,f,TEASER_DETAIL_BYTES,teaserData)); } private KeyValue getExistingBlocks(List<KeyValue> kvL, MergedBlocks.Block teaserBlocks) { KeyValue prestine = null; for (KeyValue kv : kvL) { if ( null == prestine) { prestine = new KeyValue(kv.getRow(),kv.getFamily(), kv.getQualifier()); } switch ( kv.getQualifier()[0] ) { case TEASER_HEADER: teaserBlocks.header = kv.getValue(); break; case TEASER_DETAIL: teaserBlocks.data = kv.getValue(); break; default: System.err.println("\n\n Error : Unknown Column :" + new String(kv.getQualifier())); break; } } return prestine; } /** * true to drop this row, if false, we will also call */ public boolean filterRowKey(byte[] rowKey, int offset, int length) { return false; } public KeyValue getNextKeyHint(KeyValue arg0) { return null; } public boolean hasFilterRow() { return true; } public void reset() { } @Override public void readFields(DataInput in) throws IOException { int totalDocs = in.readInt(); docSerials = new int[totalDocs]; for ( int i=0; i<totalDocs; i++ ) { docSerials[i] = in.readInt(); } this.cutLength = in.readShort(); int len = in.readByte(); int index = 1; this.bWords = new byte[len][]; for ( int i=0; i<len; i++ ) { int wLen = in.readByte() ; index++; this.bWords[i] = new byte[wLen]; in.readFully(this.bWords[i], 0, wLen); index = index + wLen; } } @Override public void write(DataOutput out) throws IOException { int totalDocs = ( null == this.docSerials) ? 0 : docSerials.length; out.writeInt(totalDocs); if ( totalDocs > 0) { for (int serial : docSerials) { out.writeInt(serial); } } out.writeShort(cutLength); out.writeByte(bWords.length); for ( int i=0; i<bWords.length; i++ ) { out.writeByte(bWords[i].length); out.write(bWords[i]); } } public ReturnCode filterKeyValue(KeyValue arg0) { return ReturnCode.INCLUDE; } public TeaserFilterMerged clone() { TeaserFilterMerged another = new TeaserFilterMerged(); another.bWords = this.bWords; another.cutLength = this.cutLength; return another; } }