/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.filter;
import java.util.List;
/**
* Extracts the relevant section of a document for display in the search result.
* @author karan
*
*/
public class TeaserFilterCommon {

    /**
     * Bytes that separate words when cutting the document.
     * The semicolon is deliberately not listed because it conflicts with
     * XML escape sequences, which end with that character.
     * Written as byte literals so the table does not depend on the
     * platform default charset.
     */
    private static final byte[] WORD_DELIMITERS = {' ', '.', ',', '\r', '\n', '-'};

    /**
     * Bytes that are preferred as the starting boundary of a teaser.
     */
    private static final byte[] LINE_DELIMITERS = {'.', '\r', '\n', '-'};

    /**
     * Byte content
     */
    private byte[] bContent;

    /**
     * Location from which to start reading (inclusive)
     */
    private int offset = 0;

    /**
     * Index of the last readable byte (inclusive).
     * It is <code>offset - 1</code> when the readable range is empty.
     */
    private int endPos = 0;

    /**
     * Matching words in bytes. Content bytes are lower cased (A-Z only)
     * before comparison while these bytes are compared as they are, so a
     * word containing an upper case ASCII letter can never match.
     */
    private byte[][] bWords;

    /**
     * Number of readable content bytes starting at {@link #offset}
     */
    private int csize;

    /**
     * Matching words sizes. A <code>null</code> word is recorded with size 0.
     */
    private int wsize[];

    /**
     * Default Constructor. Content and words have to be supplied through
     * {@link #setContent(byte[], int, int)} and {@link #setWords(byte[][])}.
     */
    public TeaserFilterCommon() {
    }

    /**
     * Constructor
     * @param words Matching words
     */
    public TeaserFilterCommon(byte[][] words) {
        setWords(words);
    }

    /**
     * Constructor which reads the complete content
     * @param content Content bytes
     * @param words Matching words
     */
    public TeaserFilterCommon(byte[] content, byte[][] words) {
        setContent(content, 0, -1);
        setWords(words);
    }

    /**
     * Constructor
     * @param content Content bytes
     * @param offset First content byte to read
     * @param length Number of bytes to read, -1 to read till the end
     * @param words Matching words
     */
    public TeaserFilterCommon(byte[] content, int offset, int length, byte[][] words) {
        setContent(content, offset, length);
        setWords(words);
    }

    /**
     * Set the matching words
     * @param words The matching words. <code>null</code>, as well as
     * <code>null</code> or empty entries, are accepted and never match.
     */
    public void setWords(byte[][] words) {
        this.bWords = words;
        if (null == words) {
            this.wsize = new int[0];
            return;
        }
        int wordsT = words.length;
        this.wsize = new int[wordsT];
        for (int i = 0; i < wordsT; i++) {
            this.wsize[i] = (null == words[i]) ? 0 : words[i].length;
        }
    }

    /**
     * Set the content and the range of it which is readable.
     * The range is clamped so that it always lies inside the content.
     * @param content Content bytes. On <code>null</code> the previous range is left untouched.
     * @param offset First content byte to read
     * @param length Number of bytes to read. -1 reads till the end of the
     * content, any other negative value gives an empty range.
     */
    public void setContent(byte[] content, int offset, int length) {
        this.bContent = content;
        if (null == content) return;

        int start = offset;
        if (start < 0) start = 0;
        if (start > content.length) start = content.length;
        int available = content.length - start;

        int size;
        if (-1 == length) size = available;       // Read everything after the offset
        else if (length < 0) size = 0;
        else if (length > available) size = available;
        else size = length;

        this.offset = start;
        this.csize = size;
        this.endPos = start + size - 1;
    }

    /**
     * Extract the most suitable section of matching words
     * @param sectionSize The teaser section size in bytes
     * @return The content section in bytes, <code>null</code> when there is
     * no content, no word is sighted or no section could be marked
     */
    public byte[] find(int sectionSize) {
        if (null == this.bContent) return null;
        List<WordPosition> wpL = findTerms();
        if (null == wpL) return null; // Nothing sighted, nothing to hand back
        try {
            return cutSection(wpL, sectionSize);
        } finally {
            // Hand the list back to the factory even when cutting has failed
            FilterObjectFactory.getInstance().putWordPosition(wpL);
        }
    }

    /**
     * Find the boundaries of the most suitable section of matching words
     * @param sectionSize The teaser section size in bytes
     * @return int array as described in {@link #markSection(List, int)},
     * <code>null</code> when there is no content, no word is sighted or no
     * section could be marked
     */
    public int[] mark(int sectionSize) {
        if (null == this.bContent) return null;
        List<WordPosition> wpL = findTerms();
        if (null == wpL) return null; // Nothing sighted, nothing to hand back
        try {
            return markSection(wpL, sectionSize);
        } finally {
            // Hand the list back to the factory even when marking has failed
            FilterObjectFactory.getInstance().putWordPosition(wpL);
        }
    }

    /**
     * Finds all positions of occurrences of the supplied words.
     * A word is sighted only when it begins at the start of a token (the
     * reading offset or the byte following a word delimiter) and is followed
     * by a word delimiter or by the end of the readable content.
     * The list is taken from {@link FilterObjectFactory#getWordPosition()}.
     * @return Found word positions, <code>null</code> when content or words
     * are missing or nothing is found
     */
    public List<WordPosition> findTerms() {
        if (null == this.bContent || null == this.bWords) return null;

        List<WordPosition> posL = null;
        int ci = this.offset;
        while (ci <= this.endPos) {
            if (isDelimiter(this.bContent[ci], WORD_DELIMITERS)) {
                ci++;
                continue;
            }

            int wi = matchWordAt(ci);
            if (wi < 0) {
                // No word begins here. Skip the rest of this token
                while (ci <= this.endPos && !isDelimiter(this.bContent[ci], WORD_DELIMITERS)) ci++;
                continue;
            }

            // Found the word, so add the position
            if (null == posL) posL = FilterObjectFactory.getInstance().getWordPosition();
            posL.add(new WordPosition(wi, ci, ci + this.wsize[wi]));

            // Move the reader to the byte following the word
            ci = ci + this.wsize[wi];
        }
        return posL;
    }

    /**
     * Cut the most suitable section
     * @param wpL Multiple sighted word positions
     * @param sectionSize The length of the teaser section in bytes
     * @return The best found section, <code>null</code> when no section could be marked
     */
    public byte[] cutSection(List<WordPosition> wpL, int sectionSize) {
        int[] section = markSection(wpL, sectionSize);
        if (null == section) return null;
        int length = section[1] - section[0];
        byte[] sectionB = new byte[length];
        System.arraycopy(this.bContent, section[0], sectionB, 0, length);
        return sectionB;
    }

    /**
     * Mark the section which requires to be taken out for the teaser.
     * This can be used for direct array copy than creating temporary
     * byte arrays.
     * @param wpL Multiple sighted word positions
     * @param sectionSize The length of the teaser section in bytes
     * @return int array. 0th Location = start position (inclusive),
     * 1st Location = end position (exclusive). <code>null</code> when there
     * is no content, no word position or the section size is not positive.
     */
    public int[] markSection(List<WordPosition> wpL, int sectionSize) {
        if (null == this.bContent) return null;
        if (null == wpL || wpL.isEmpty()) return null;
        if (sectionSize <= 0) return null; // Nothing can be cut, also avoids a division by zero

        /**
         * Find the zone where all matchings are noticed.
         */
        int matchingWordStart = Integer.MAX_VALUE;
        int matchingWordEnd = Integer.MIN_VALUE;
        for (WordPosition wp : wpL) {
            if (wp.start < matchingWordStart) matchingWordStart = wp.start;
            if (wp.end > matchingWordEnd) matchingWordEnd = wp.end;
        }

        /**
         * Divide this zone to multiple parts and find the degree of concentration.
         * The number of parts is rounded up so that the sightings of the
         * last, partly filled, part are counted as well.
         */
        int matchingSectionLen = matchingWordEnd - matchingWordStart;
        if (matchingSectionLen == 0) return null;
        if (matchingSectionLen < sectionSize) matchingSectionLen = sectionSize;
        int matchingSectionsT = matchingSectionLen / sectionSize;
        if (matchingSectionLen % sectionSize != 0) matchingSectionsT++;

        int[] matchingSections = new int[matchingSectionsT];
        for (WordPosition wp : wpL) {
            int zoneIndex = (wp.start - matchingWordStart) / sectionSize;
            if (zoneIndex < matchingSectionsT) matchingSections[zoneIndex]++;
        }

        /**
         * Extract maximum concentration area. The earliest part wins a tie.
         */
        int maxZoneIndex = 0;
        int maxZoneValue = 0;
        for (int zoneIndex = 0; zoneIndex < matchingSectionsT; zoneIndex++) {
            if (matchingSections[zoneIndex] > maxZoneValue) {
                maxZoneValue = matchingSections[zoneIndex];
                maxZoneIndex = zoneIndex;
            }
        }
        int zoneStart = matchingWordStart + (maxZoneIndex * sectionSize);

        // Give little more scope to find an appropriate matching section
        int startPos = zoneStart - (sectionSize / 2);
        if (startPos < this.offset) startPos = this.offset;

        /**
         * Create a starting position from a separator
         */
        startPos = alignStart(startPos, zoneStart);

        /**
         * The section reaches the end of the readable content.
         * Everything remaining is taken, there is no word to protect from cutting.
         */
        int remaining = this.endPos + 1 - startPos;
        if (sectionSize >= remaining) return new int[]{startPos, this.endPos + 1};

        /**
         * Create an ending position on a separator, so that no word is cut.
         * The starting position is not examined as ending there would give
         * an empty section.
         */
        int end = startPos + sectionSize;
        for (int ci = end; ci > startPos; ci--) {
            if (isDelimiter(this.bContent[ci], WORD_DELIMITERS)) return new int[]{startPos, ci};
        }

        // One unbroken token fills the whole section. Cut it at the section size
        return new int[]{startPos, end};
    }

    /**
     * Finds the query word which is sighted at the given content position.
     * @param pos Content position holding the first byte of a token
     * @return Index of the first matching query word, -1 if none matches
     */
    private int matchWordAt(int pos) {
        byte first = toLowerAscii(this.bContent[pos]);
        int wordCount = this.wsize.length;

        for (int wi = 0; wi < wordCount; wi++) {
            int len = this.wsize[wi];
            if (len == 0) continue;                        // Null or empty word
            if (len - 1 > this.endPos - pos) continue;     // Word is longer than the remaining content

            byte[] word = this.bWords[wi];
            if (first != word[0]) continue;

            int wj = 1;
            while (wj < len && word[wj] == toLowerAscii(this.bContent[pos + wj])) wj++;
            if (wj < len) continue;

            /**
             * The word has matched. The next byte has to be a word
             * delimiter unless the readable content finishes here.
             */
            int next = pos + len;
            if (next > this.endPos) return wi;
            if (isDelimiter(this.bContent[next], WORD_DELIMITERS)) return wi;
        }
        return -1;
    }

    /**
     * Moves a starting position forward to a clean boundary.
     * The byte following the first line delimiter is preferred. Without any
     * line delimiter the byte following the first space is taken.
     * @param from First position to examine (inclusive)
     * @param to Position at which to stop examining (exclusive)
     * @return The aligned position, <code>from</code> if there is no boundary
     */
    private int alignStart(int from, int to) {
        int firstSpace = -1;
        for (int ci = from; ci < to; ci++) {
            byte cbyte = this.bContent[ci];
            if (isDelimiter(cbyte, LINE_DELIMITERS)) return ci + 1;
            if (firstSpace < 0 && cbyte == ' ') firstSpace = ci;
        }
        return (firstSpace < 0) ? from : firstSpace + 1;
    }

    /**
     * Checks the presence of a byte in a delimiter table
     * @param value The byte to look for
     * @param delimiters The delimiter table
     * @return True if the table contains the byte
     */
    private static boolean isDelimiter(byte value, byte[] delimiters) {
        for (int i = 0; i < delimiters.length; i++) {
            if (value == delimiters[i]) return true;
        }
        return false;
    }

    /**
     * Lower cases a byte. Converts only the english letters A-Z.
     * @param value The byte to convert
     * @return The lower cased byte, the same byte for everything else
     */
    private static byte toLowerAscii(byte value) {
        if (value >= 'A' && value <= 'Z') return (byte) (value + 32);
        return value;
    }

    /**
     * Carries sighting information of a word inside the content
     * @author karan
     *
     */
    public static class WordPosition {

        /**
         * Query keyword position E.g. (abinash karan hbase = 0,1,2)
         */
        public int index;

        /**
         * Start position of the word in the given corpus
         */
        public int start;

        /**
         * End position = start position + word length
         */
        public int end;

        /**
         * Default constructor
         * @param index Query keyword position
         * @param start Start position of the word
         * @param end End position
         */
        public WordPosition(int index, int start, int end) {
            this.index = index;
            this.start = start;
            this.end = end;
        }

        @Override
        public String toString() {
            return "index:" + index + ", start:" + start + ", end:" + end;
        }
    }
}