TeaserFilterCommon.java example

Explorer
hsearch-obsolete-master
- src
/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.filter;

import java.util.List;

/**
 * Extracts the most relevant section (the "teaser") of a document for
 * showing in the search result.
 * <p>
 * Matching folds the ASCII letters A-Z of the <em>content</em> to lower case;
 * the words are compared exactly as supplied, so callers presumably pass them
 * already lower-cased (TODO confirm against the callers).
 * <p>
 * An instance keeps the current content and words in plain mutable fields
 * without any synchronization.
 *
 * @author karan
 *
 */
public class TeaserFilterCommon {

	/**
	 * Bytes that separate two words when cutting the document.
	 * The [;] character is deliberately absent because it conflicts with
	 * XML escapes, e.g. {@code &gt;}
	 */
	private static final byte[] WORD_DELIMITERS = {' ', '.', ',', '\r', '\n', '-'};

	/**
	 * Bytes after which a teaser may start (end of a sentence or line).
	 */
	private static final byte[] LINE_DELIMITERS = {'.', '\r', '\n', '-'};

	/**
	 * Byte content
	 */
	private byte[] bContent;

	/**
	 * Location from which to start reading (inclusive)
	 */
	private int offset = 0;

	/**
	 * Last location that may be read (inclusive)
	 */
	private int endPos = 0;

	/**
	 * Matching words in bytes
	 */
	private byte[][] bWords;

	/**
	 * Number of content bytes that are considered, starting at the offset
	 */
	private int csize;

	/**
	 * Matching words sizes; 0 for a missing (null) word
	 */
	private int[] wsize;

	/**
	 * Default Constructor. Content and words have to be supplied through
	 * {@link #setContent(byte[], int, int)} and {@link #setWords(byte[][])}.
	 */
	public TeaserFilterCommon() {
	}

	/**
	 * Constructor
	 * @param words	Matching words
	 */
	public TeaserFilterCommon(byte[][] words) {
		setWords(words);
	}

	/**
	 * Constructor which considers the complete content
	 * @param content	Content bytes
	 * @param words	Matching words
	 */
	public TeaserFilterCommon(byte[] content, byte[][] words) {
		setContent(content, 0, -1);
		setWords(words);
	}

	/**
	 * Constructor
	 * @param content	Content bytes
	 * @param offset	First content byte to consider
	 * @param length	Number of bytes to consider, -1 for "till the end"
	 * @param words	Matching words
	 */
	public TeaserFilterCommon(byte[] content, int offset, int length, byte[][] words) {
		setContent(content, offset, length);
		setWords(words);
	}

	/**
	 * Set the matching words
	 * @param words	The matching words. <code>null</code> is treated as "no words";
	 * <code>null</code> or empty entries never match.
	 */
	public void setWords(byte[][] words) {
		final byte[][] safeWords = (null == words) ? new byte[0][] : words;
		this.bWords = safeWords;

		final int wordsT = safeWords.length;
		this.wsize = new int[wordsT];
		for (int i = 0; i < wordsT; i++) {
			this.wsize[i] = (null == safeWords[i]) ? 0 : safeWords[i].length;
		}
	}

	/**
	 * Set the content and the window of it which is searched and cut.
	 * The window is clamped so that it always lies inside the array.
	 * @param content	Content bytes. With <code>null</code> every search returns <code>null</code>
	 * @param offset	First content byte to consider
	 * @param length	Number of bytes to consider. -1 means "till the end of
	 * the content", any other negative value is treated as 0
	 */
	public void setContent(byte[] content, int offset, int length) {
		this.bContent = content;
		if (null == content) {
			return;
		}

		int start = offset;
		if (start < 0) {
			start = 0;
		}
		if (start > content.length) {
			start = content.length;
		}
		final int available = content.length - start;

		int size;
		if (-1 == length) {
			size = available;
		} else if (length < 0) {
			size = 0;
		} else {
			size = (length > available) ? available : length;
		}

		this.offset = start;
		this.csize = size;
		// Inclusive index of the last readable byte; offset - 1 for an empty window
		this.endPos = start + size - 1;
	}

	/**
	 * Extract the most suitable section of matching words
	 * @param sectionSize	The teaser section size in bytes
	 * @return	The content section in bytes, <code>null</code> when there is
	 * no content or none of the words was found
	 */
	public byte[] find(int sectionSize) {
		if (null == this.bContent) {
			return null;
		}
		List<WordPosition> wpL = findTerms();
		// No list was borrowed from the factory when nothing matched,
		// so there is nothing to cut and nothing to hand back.
		if (null == wpL) {
			return null;
		}
		try {
			return cutSection(wpL, sectionSize);
		} finally {
			FilterObjectFactory.getInstance().putWordPosition(wpL);
		}
	}

	/**
	 * Same as {@link #find(int)} but only reports the boundaries of the
	 * section instead of copying it.
	 * @param sectionSize	The teaser section size in bytes
	 * @return	int array. 0th Location = start position, 1st Location = end
	 * position; <code>null</code> when there is no content or no match
	 */
	public int[] mark(int sectionSize) {
		if (null == this.bContent) {
			return null;
		}
		List<WordPosition> wpL = findTerms();
		if (null == wpL) {
			return null;
		}
		try {
			return markSection(wpL, sectionSize);
		} finally {
			FilterObjectFactory.getInstance().putWordPosition(wpL);
		}
	}

	/**
	 * Finds all positions of occurrences of the supplied words.
	 * A word is only reported when it starts a token (the first byte after a
	 * word delimiter or the start of the window) and is directly followed by
	 * a word delimiter that still lies inside the window.
	 * @return	Found word positions, <code>null</code> when nothing was found.
	 * The list is obtained from the {@link FilterObjectFactory}.
	 */
	public List<WordPosition> findTerms() {
		if (null == this.bContent || null == this.bWords || null == this.wsize) {
			return null;
		}

		final int wordCount = this.wsize.length;
		List<WordPosition> posL = null;

		int ci = this.offset;
		while (ci < this.endPos) {
			if (isWordDelimiter(this.bContent[ci])) {
				ci++;
				continue;
			}

			// ci is the first byte of a token: try every word at this position.
			int matched = -1;
			for (int wi = 0; wi < wordCount; wi++) {
				if (matchesAt(wi, ci)) {
					matched = wi;
					break;
				}
			}

			if (matched >= 0) {
				if (null == posL) {
					posL = FilterObjectFactory.getInstance().getWordPosition();
				}
				final int wordEnd = ci + this.wsize[matched];
				posL.add(new WordPosition(matched, ci, wordEnd));
				// wordEnd is the delimiter behind the word; the loop steps over it
				ci = wordEnd;
				continue;
			}

			// Not one of our words: skip the rest of this token
			ci++;
			while (ci <= this.endPos && !isWordDelimiter(this.bContent[ci])) {
				ci++;
			}
		}
		return posL;
	}

	/**
	 * Cut the most suitable section
	 * @param wpL	Multiple sighted word positions
	 * @param sectionSize	The length of the teaser section
	 * @return	The best found section, <code>null</code> when
	 * {@link #markSection(List, int)} finds none
	 */
	public byte[] cutSection (List<WordPosition> wpL, int sectionSize) {
		int[] section = markSection(wpL, sectionSize);
		if (null == section) {
			return null;
		}
		int length = section[1] - section[0];
		if (length < 0) {
			length = 0;
		}
		byte[] sectionB = new byte[length];
		System.arraycopy(this.bContent, section[0], sectionB, 0, length);
		return sectionB;
	}

	/**
	 * Mark the section which requires to be taken out for the teaser.
	 * This can be used for direct array copy than creating temporary
	 * byte arrays.
	 * <p>
	 * The area between the first and the last sighting is divided into zones
	 * of <code>sectionSize</code> bytes and the first zone with the most
	 * sightings wins. The section starts up to half a section before that
	 * zone, preferably right after a line delimiter or a space, and ends on
	 * the last word delimiter within <code>sectionSize</code> bytes. When
	 * there is no such delimiter the section is cut hard at that size.
	 * @param wpL	Multiple sighted word positions
	 * @param sectionSize	The length of the teaser section, has to be positive
	 * @return int array. 0th Location = start position, 1st Location = end
	 * position (not part of the section, never beyond the last readable byte).
	 * <code>null</code> when there is no content, no sighting or no positive
	 * section size.
	 */
	public int[] markSection (List<WordPosition> wpL, int sectionSize) {
		if (null == this.bContent) {
			return null;
		}
		if (null == wpL || wpL.isEmpty()) {
			return null;
		}
		// A non positive size cannot be divided into zones
		if (sectionSize <= 0) {
			return null;
		}

		/**
		 * Find the zone where all matchings are noticed.
		 */
		int matchingWordStart = Integer.MAX_VALUE;
		int matchingWordEnd = Integer.MIN_VALUE;
		for (WordPosition wp : wpL) {
			if (wp.start < matchingWordStart) {
				matchingWordStart = wp.start;
			}
			if (wp.end > matchingWordEnd) {
				matchingWordEnd = wp.end;
			}
		}

		/**
		 * Divide this area to multiple zones and find the degree of concentration
		 */
		int matchingSectionLen = matchingWordEnd - matchingWordStart;
		if (0 == matchingSectionLen) {
			return null;
		}
		if (matchingSectionLen < sectionSize) {
			matchingSectionLen = sectionSize;
		}
		// Round up so that sightings in a trailing partial zone are counted too
		int zonesT = matchingSectionLen / sectionSize;
		if (0 != matchingSectionLen % sectionSize) {
			zonesT++;
		}
		int[] zoneHits = new int[zonesT];
		for (WordPosition wp : wpL) {
			int zoneIndex = (wp.start - matchingWordStart) / sectionSize;
			if (zoneIndex >= 0 && zoneIndex < zonesT) {
				zoneHits[zoneIndex]++;
			}
		}

		/**
		 * Extract maximum concentration zone, the first one wins on a tie
		 */
		int maxZoneIndex = 0;
		int maxZoneValue = 0;
		for (int zoneIndex = 0; zoneIndex < zonesT; zoneIndex++) {
			if (zoneHits[zoneIndex] > maxZoneValue) {
				maxZoneValue = zoneHits[zoneIndex];
				maxZoneIndex = zoneIndex;
			}
		}
		final int zoneStart = matchingWordStart + (maxZoneIndex * sectionSize);

		/**
		 * Give little more scope: look back half a section and start right
		 * after the first line delimiter, else right after the first space.
		 */
		int lookBackStart = zoneStart - (sectionSize / 2);
		if (lookBackStart < this.offset) {
			lookBackStart = this.offset;
		}

		int startPos = -1;
		for (int ci = lookBackStart; ci < zoneStart; ci++) {
			if (isLineDelimiter(this.bContent[ci])) {
				startPos = ci + 1;
				break;
			}
		}
		if (startPos < 0) {
			for (int ci = lookBackStart; ci < zoneStart; ci++) {
				if (' ' == this.bContent[ci]) {
					startPos = ci + 1;
					break;
				}
			}
		}
		if (startPos < 0) {
			startPos = lookBackStart;
		}

		/**
		 * Create an ending position on a word delimiter, so that no word is
		 * cut in the middle. Compared by subtraction to stay clear of an int
		 * overflow for very large section sizes.
		 */
		int end;
		if (sectionSize > (this.endPos - startPos)) {
			end = this.endPos;
		} else {
			end = startPos + sectionSize;
		}
		if (end < startPos) {
			end = startPos;
		}

		// Without a delimiter behind the start the hard cut at "end" stays
		for (int ci = end; ci > startPos; ci--) {
			if (isWordDelimiter(this.bContent[ci])) {
				end = ci;
				break;
			}
		}
		return new int[]{startPos, end};
	}

	/**
	 * Checks whether a word sits at the given content position and is
	 * directly followed by a word delimiter inside the window.
	 * @param wi	Index of the word
	 * @param pos	Content position of the first byte to compare
	 * @return	True when the word matches at that position
	 */
	private boolean matchesAt(int wi, int pos) {
		final int length = this.wsize[wi];
		if (0 == length) {
			return false;
		}
		final int boundary = pos + length;
		// The byte behind the word has to be readable
		if (boundary > this.endPos) {
			return false;
		}
		final byte[] word = this.bWords[wi];
		for (int wj = 0; wj < length; wj++) {
			if (word[wj] != toLowerAscii(this.bContent[pos + wj])) {
				return false;
			}
		}
		return isWordDelimiter(this.bContent[boundary]);
	}

	/**
	 * Folds the ASCII letters A-Z to lower case, every other byte stays as it is.
	 */
	private static byte toLowerAscii(byte b) {
		if (b >= 'A' && b <= 'Z') {
			return (byte) (b + 32);
		}
		return b;
	}

	/**
	 * Tells whether the byte is one of the word delimiters.
	 */
	private static boolean isWordDelimiter(byte b) {
		for (byte delimiter : WORD_DELIMITERS) {
			if (b == delimiter) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Tells whether the byte is one of the line delimiters.
	 */
	private static boolean isLineDelimiter(byte b) {
		for (byte delimiter : LINE_DELIMITERS) {
			if (b == delimiter) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Carries sighting information of a word inside the content
	 * @author karan
	 *
	 */
	public static class WordPosition {

		/**
		 * Query keyword position E.g. (abinash karan hbase = 0,1,2)
		 */
		public int index;

		/**
		 * Start position of the word in the given corpus
		 */
		public int start;

		/**
		 * End position: start position + word length, i.e. the position
		 * directly behind the word
		 */
		public int end;

		/**
		 * Constructor
		 * @param index	Query keyword position
		 * @param start	Start position of the word
		 * @param end	End position
		 */
		public WordPosition(int index, int start, int end) {
			this.index = index;
			this.start = start;
			this.end = end;
		}

		@Override
		public String toString() {
			return "index:" + index + ", start:" + start + ", end:" + end;
		}
	}
}