/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.index;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.bizosys.hsearch.filter.IStorable;
import com.bizosys.hsearch.filter.Storable;
import com.bizosys.hsearch.schema.ILanguageMap;
import com.bizosys.hsearch.util.ObjectFactory;
/**
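* Groups multiple terms into a term list that can be loaded from bytes,
* merged, intersected with another list and serialized back to bytes.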
* @author karan
*
*/
public class TermList implements IStorable {

    /**
     * Serialized size in bytes of one term when term vectors are stored:
     * doc type (1) + term type (1) + weight (1) + frequency (1)
     * + term position (2) + document position (2).
     */
    public static final int TERM_SIZE_VECTOR = 8;

    /**
     * Serialized size in bytes of one term when term vectors are NOT stored:
     * doc type (1) + term type (1) + weight (1) + document position (2).
     */
    public static final int TERM_SIZE_NOVECTOR = 5;

    /**
     * Selects which of the two serialized layouts is read and written.
     * When true, term frequency and term position are part of each term.
     */
    public static boolean termVectorStorageEnabled = false;

    /**
     * Total terms present for this keyword
     */
    public int totalTerms;

    /**
     * Document type codes in an array for all terms
     */
    public byte[] docTypesCodes;

    /**
     * Term type codes in an array for all terms
     */
    public byte[] termTypeCodes;

    /**
     * Term weight in an array for all terms
     */
    public byte[] termWeight;

    /**
     * How many times the term is sighted in the document.
     * Only populated while term vector storage is enabled.
     */
    public byte[] termFreq;

    /**
     * Which location of the document the term is positioned at.
     * Only populated while term vector storage is enabled.
     */
    public short[] termPosition;

    /**
     * This document is at which location of the bucket.
     * {@link #intersect(TermList)} and {@link #subset(TermList)} overwrite
     * eliminated entries with -1.
     */
    public short[] docPos;

    /**
     * All terms are listed here, keyed by the hash code of the term text.
     * We merge and keep the same term only once in the list.
     */
    public Map<Integer, List<Term>> lstKeywords = null;

    /** Previously serialized bytes, returned as-is when nothing was added. */
    private byte[] existingB = null;

    public TermList() {
    }

    /**
     * Remembers already serialized bytes of this list.
     * @param existingB previously serialized bytes, may be null
     */
    public void setExistingBytes(byte[] existingB) {
        this.existingB = existingB;
    }

    /**
     * Load by deserializing bytes, keeping only the terms which pass the
     * document type / term type filters and which are not listed in
     * <code>ignoreLocation</code>.
     *
     * The byte layout is column oriented: all document type codes, all term
     * type codes, all weights, then (term vector mode only) all frequencies
     * and all 2 byte term positions, and finally all 2 byte document positions.
     *
     * @param bytes serialized terms; nothing happens when null
     * @param ignoreLocation term indexes to skip. Indexes rejected by the
     *        filters are added to it. May be null (nothing pre-ignored,
     *        nothing reported back).
     * @param docType document type code to keep; null or
     *        <code>DocumentType.NONE_TYPECODE</code> disables the filter
     * @param termType term type code to keep; null or
     *        <code>TermType.NONE_TYPECODE</code> disables the filter
     */
    public void loadTerms(byte[] bytes, Set<Integer> ignoreLocation,
            Byte docType, Byte termType) {

        if (null == bytes) return;

        // Read the global switch once so the whole call uses one layout.
        final boolean vector = termVectorStorageEnabled;
        final int total = bytes.length / (vector ? TERM_SIZE_VECTOR : TERM_SIZE_NOVECTOR);
        this.totalTerms = total;

        // Compare by value: a reference comparison of boxed bytes is fragile
        // and a null filter must not blow up.
        final boolean filterDoc =
            (null != docType) && (DocumentType.NONE_TYPECODE != docType.byteValue());
        final boolean filterTerm =
            (null != termType) && (TermType.NONE_TYPECODE != termType.byteValue());
        final byte docTypeCode = filterDoc ? docType.byteValue() : 0;
        final byte termTypeCode = filterTerm ? termType.byteValue() : 0;

        // Decide row by row. Counting kept rows directly (instead of using
        // ignoreLocation.size()) stays correct when the caller's set holds
        // indexes outside of this list.
        final boolean[] skip = new boolean[total];
        int kept = 0;
        for (int i = 0; i < total; i++) {
            boolean rejected = (filterDoc && docTypeCode != bytes[i])
                || (filterTerm && termTypeCode != bytes[total + i]);
            if (rejected && null != ignoreLocation) ignoreLocation.add(i);

            skip[i] = rejected
                || (null != ignoreLocation && ignoreLocation.contains(i));
            if (!skip[i]) kept++;
        }

        if (kept == total) {
            loadTerms(bytes);
            return;
        }

        this.docTypesCodes = new byte[kept];
        this.termTypeCodes = new byte[kept];
        this.termWeight = new byte[kept];
        if (vector) {
            this.termFreq = new byte[kept];
            this.termPosition = new short[kept];
        }
        this.docPos = new short[kept];

        int row = 0;
        for (int i = 0; i < total; i++) {
            if (skip[i]) continue;

            this.docTypesCodes[row] = bytes[i];
            this.termTypeCodes[row] = bytes[total + i];
            this.termWeight[row] = bytes[(2 * total) + i];
            if (vector) {
                this.termFreq[row] = bytes[(3 * total) + i];
                this.termPosition[row] = readShort(bytes, (4 * total) + (2 * i));
                this.docPos[row] = readShort(bytes, (6 * total) + (2 * i));
            } else {
                this.docPos[row] = readShort(bytes, (3 * total) + (2 * i));
            }
            row++;
        }
        this.totalTerms = kept;
    }

    /**
     * Load by deserializing bytes. All terms are kept.
     * Trailing bytes which do not make up a complete term are ignored.
     * @param bytes serialized terms; nothing happens when null
     */
    public void loadTerms(byte[] bytes) {
        if (null == bytes) return;

        final boolean vector = termVectorStorageEnabled;
        final int total = bytes.length / (vector ? TERM_SIZE_VECTOR : TERM_SIZE_NOVECTOR);
        this.totalTerms = total;

        int readPosition = 0;

        /** Document types codes */
        this.docTypesCodes = slice(bytes, readPosition, total);
        readPosition += total;

        /** Term types codes */
        this.termTypeCodes = slice(bytes, readPosition, total);
        readPosition += total;

        /** Term weight */
        this.termWeight = slice(bytes, readPosition, total);
        readPosition += total;

        if (vector) {
            /** Term frequency */
            this.termFreq = slice(bytes, readPosition, total);
            readPosition += total;

            /** Term Position */
            this.termPosition = readShorts(bytes, readPosition, total);
            readPosition += 2 * total;
        }

        /** Document Position */
        this.docPos = readShorts(bytes, readPosition, total);
    }

    /**
     * Add a keyword. Repetitions are taken care of: the term is first offered
     * to every term already stored under the same keyword hash through
     * <code>Term.merge</code>, and only appended when none of them merged it.
     * @param aTerm term to add; ignored when null
     */
    public void add(Term aTerm) {
        if (null == aTerm) return;
        int keywordHash = aTerm.term.hashCode();

        if (null == lstKeywords) {
            lstKeywords = new HashMap<Integer, List<Term>>(ILanguageMap.ALL_COLS.length);
            List<Term> lstTerms = ObjectFactory.getInstance().getTermList();
            lstTerms.add(aTerm);
            lstKeywords.put(keywordHash, lstTerms);
            return;
        }

        List<Term> sameKeyword = lstKeywords.get(keywordHash);
        if (null == sameKeyword) {
            List<Term> terms = new ArrayList<Term>(3);
            terms.add(aTerm);
            lstKeywords.put(keywordHash, terms);
            return;
        }

        // Ids from the same document are merged.
        for (Term existing : sameKeyword) {
            if (existing.merge(aTerm)) return;
        }
        sameKeyword.add(aTerm);
    }

    /**
     * Add a complete list here, term by term.
     * @param anotherList list whose keywords are added; ignored when null
     *        or when it holds no keywords
     */
    public void add(TermList anotherList) {
        if (null == anotherList || null == anotherList.lstKeywords) return;
        for (List<Term> anotherTerms : anotherList.lstKeywords.values()) {
            for (Term anotherTerm : anotherTerms) {
                this.add(anotherTerm);
            }
        }
    }

    /**
     * Remove all the ids from another which are absent here.
     * Remove all the ids from here which are absent in another.
     * Removed ids are marked by writing -1 into <code>docPos</code>; the
     * other arrays and <code>totalTerms</code> are left untouched. When either
     * side is empty or nothing matches, both lists are cleaned up.
     * @param another the list to intersect with
     * @return After intersect has any element left?
     */
    public boolean intersect(TermList another) {
        if (hasNoPositions(this) || hasNoPositions(another)) {
            this.cleanup();
            another.cleanup();
            return false;
        }

        /**
         * This is a costlier looking cycle.
         * TODO: Evaluate Set to make it faster
         */
        int totalMatching = 0;
        for (int i = 0; i < this.docPos.length; i++) {
            short aPos = this.docPos[i];
            if (-1 == aPos) continue;
            if (contains(another.docPos, aPos)) totalMatching++;
            else this.docPos[i] = -1;
        }

        /**
         * No terms matched
         */
        if (0 == totalMatching) {
            this.cleanup();
            another.cleanup();
            return false;
        }

        /**
         * Set other document positions also as -1.
         * Every entry has to be visited: positions may repeat, so counting
         * matches does not tell when the remaining entries are irrelevant.
         */
        for (int i = 0; i < another.docPos.length; i++) {
            short bPos = another.docPos[i];
            if (-1 == bPos) continue;
            if (!contains(this.docPos, bPos)) another.docPos[i] = -1;
        }
        return true;
    }

    /**
     * This keeps only the ids which are also present in another term list.
     * Eliminated ids are marked by writing -1 into <code>docPos</code>.
     * The other list is never modified.
     * @param another the list holding the ids to keep
     * @return After subsetting has any element left?
     */
    public boolean subset(TermList another) {
        if (hasNoPositions(another)) this.cleanup();
        if (hasNoPositions(this)) return false;

        boolean noneFound = true;
        for (int i = 0; i < this.docPos.length; i++) { //This term
            short aPos = this.docPos[i];
            if (-1 == aPos) continue;

            // aPos is never -1 here, so eliminated entries of another
            // can not produce a match.
            if (contains(another.docPos, aPos)) noneFound = false;
            else this.docPos[i] = -1;
        }

        /**
         * No terms matched
         */
        if (noneFound) {
            this.cleanup();
            return false;
        }
        return true;
    }

    /**
     * The given document position will be applied to every added term.
     * @param position document position; it is narrowed to a short
     */
    public void assignDocPos(int position) {
        if (null == this.lstKeywords) return;
        short pos = (short) position;
        for (List<Term> terms : lstKeywords.values()) {
            for (Term term : terms) {
                term.setDocumentPosition(pos);
            }
        }
    }

    /**
     * @return true when the keyword map was never created, in which case
     *         {@link #toBytes()} returns the existing bytes untouched
     */
    public boolean isExistingUnchanged() {
        return null == lstKeywords;
    }

    /**
     * Serialize this. When no keyword was added the existing bytes are
     * returned as they are. Otherwise the existing bytes are first merged
     * into the keyword map through <code>InvertedIndex.merge</code> and each
     * keyword is written as
     * <pre>
     * KeywordHash(4 bytes) / count / term columns
     * </pre>
     * where count is a single byte when there are less than
     * <code>Byte.MAX_VALUE</code> terms, else the marker byte -1 followed by
     * a 4 byte count. The term columns follow the layout read by
     * {@link #loadTerms(byte[])}.
     */
    public byte[] toBytes() {
        if (null == lstKeywords) return this.existingB;
        InvertedIndex.merge(this.existingB, lstKeywords);

        final boolean vector = termVectorStorageEnabled;
        final int termSize = vector ? TERM_SIZE_VECTOR : TERM_SIZE_NOVECTOR;

        int totalBytes = 0;
        for (List<Term> lstTerms : lstKeywords.values()) {
            int termsT = lstTerms.size();
            totalBytes += 4; /**Keyword Hash*/
            totalBytes += (termsT < Byte.MAX_VALUE) ? 1 /**Low density*/ : 5 /**High density*/;
            totalBytes += termsT * termSize; /**Terms*/
        }

        byte[] bytes = new byte[totalBytes];
        int pos = 0;

        for (Map.Entry<Integer, List<Term>> entry : lstKeywords.entrySet()) {
            int hash = entry.getKey().intValue();
            List<Term> lstTerms = entry.getValue();
            int termsT = lstTerms.size();

            /** Add the keyword hash */
            System.arraycopy(Storable.putInt(hash), 0, bytes, pos, 4);
            pos = pos + 4;

            /** Add the total terms */
            if (termsT < Byte.MAX_VALUE) {
                bytes[pos++] = (byte) (termsT);
            } else {
                bytes[pos++] = (byte) (-1);
                System.arraycopy(Storable.putInt(termsT), 0, bytes, pos, 4);
                pos = pos + 4;
            }

            /** Document types codes */
            for (Term t : lstTerms) {
                bytes[pos++] = t.getDocumentTypeCode();
            }

            /** Term types codes */
            for (Term t : lstTerms) {
                bytes[pos++] = t.getTermTypeCode();
            }

            /** Term weight */
            for (Term t : lstTerms) {
                bytes[pos++] = t.getTermWeight();
            }

            if (vector) {
                /** Term frequency */
                for (Term t : lstTerms) {
                    bytes[pos++] = t.getTermFrequency();
                }

                /** Term Position */
                for (Term t : lstTerms) {
                    pos = writeShort(bytes, pos, t.getTermPosition());
                }
            }

            /** Document Position */
            for (Term t : lstTerms) {
                pos = writeShort(bytes, pos, t.getDocumentPosition());
            }
        }
        return bytes;
    }

    /**
     * Keeps everything from <code>pos</code> to the end of the array as the
     * existing serialized bytes. Nothing is parsed here. When reading from
     * position 0 the given array itself is retained (no copy).
     * @param bytes input bytes; nothing happens when null
     * @param pos starting read position
     * @return the position after the consumed bytes, i.e. the array length;
     *         <code>pos</code> unchanged when bytes is null
     */
    public int fromBytes(byte[] bytes, int pos) {
        if (null == bytes) return pos;
        if (0 == pos) {
            this.existingB = bytes;
            // Everything was consumed, same as in the copying branch below.
            return bytes.length;
        }
        int size = bytes.length - pos;
        this.existingB = new byte[size];
        System.arraycopy(bytes, pos, this.existingB, 0, size);
        return (pos + size);
    }

    /**
     * Does this list contain the keyword.
     * @param bytes Input bytes
     * @param keywordHash search word hashcode in bytes (4 bytes are compared)
     * @param pos starting read position
     * @return True if matched. False when either array is null or does not
     *         hold 4 bytes from the compared position.
     */
    public static boolean isMatchedTerm(byte[] bytes,
            byte[] keywordHash, int pos) {

        if (null == bytes || null == keywordHash) return false;
        if (pos < 0 || bytes.length - pos < 4 || keywordHash.length < 4) return false;

        return (bytes[pos] == keywordHash[0]) &&
            (bytes[pos + 1] == keywordHash[1]) &&
            (bytes[pos + 2] == keywordHash[2]) &&
            (bytes[pos + 3] == keywordHash[3]);
    }

    /**
     * Resets this list: the count goes to 0, all loaded arrays and the
     * existing bytes are dropped, every keyword list is handed to
     * <code>ObjectFactory.putTermList</code> and the keyword map is cleared
     * (it stays non null).
     */
    public void cleanup() {
        totalTerms = 0;
        this.docTypesCodes = null;
        this.termTypeCodes = null;
        this.termWeight = null;
        this.termFreq = null;
        this.termPosition = null;
        this.docPos = null;

        if (null != lstKeywords) {
            for (List<Term> lt : lstKeywords.values()) {
                if (null == lt) continue;
                ObjectFactory.getInstance().putTermList(lt);
            }
            lstKeywords.clear();
        }
        this.existingB = null;
    }

    /**
     * Prints the added keywords when there are any, else the loaded
     * document positions and weights.
     */
    @Override
    public String toString() {
        if (null != this.lstKeywords) {
            StringBuilder sb = new StringBuilder(" TermList : ");
            for (Map.Entry<Integer, List<Term>> entry : this.lstKeywords.entrySet()) {
                sb.append(entry.getKey().intValue()).append(" : ");
                for (Term aTerm : entry.getValue()) {
                    sb.append("\n\t\t\t\t\t").append(aTerm.toString());
                }
            }
            return sb.toString();
        }

        StringBuilder sb = new StringBuilder("\nTermList Total : ");
        sb.append(totalTerms);

        // totalTerms is a public field, never trust it beyond the arrays.
        int printable = 0;
        if (null != this.docPos && null != this.termWeight) {
            printable = Math.min(totalTerms,
                Math.min(this.docPos.length, this.termWeight.length));
        }
        for (int i = 0; i < printable; i++) {
            sb.append("\nPositions: ").append(this.docPos[i]);
            sb.append(" Weight: ").append(this.termWeight[i]);
        }
        return sb.toString();
    }

    /** True when the list has no loaded document positions to work with. */
    private static boolean hasNoPositions(TermList list) {
        return 0 == list.totalTerms || null == list.docPos;
    }

    /** Linear search of a position inside the given positions. */
    private static boolean contains(short[] positions, short wanted) {
        for (short candidate : positions) {
            if (candidate == wanted) return true;
        }
        return false;
    }

    /** Copies <code>count</code> bytes starting at <code>from</code> into a new array. */
    private static byte[] slice(byte[] source, int from, int count) {
        byte[] target = new byte[count];
        System.arraycopy(source, from, target, 0, count);
        return target;
    }

    /** Reads a 2 byte, high byte first, short at the given offset. */
    private static short readShort(byte[] source, int offset) {
        return (short) ((source[offset] << 8) + (source[offset + 1] & 0xff));
    }

    /** Reads <code>count</code> consecutive 2 byte shorts starting at <code>from</code>. */
    private static short[] readShorts(byte[] source, int from, int count) {
        short[] values = new short[count];
        for (int i = 0; i < count; i++) {
            values[i] = readShort(source, from + (2 * i));
        }
        return values;
    }

    /** Writes a short as 2 bytes, high byte first, and returns the next write position. */
    private static int writeShort(byte[] target, int pos, short value) {
        target[pos] = (byte) (value >> 8 & 0xff);
        target[pos + 1] = (byte) (value & 0xff);
        return pos + 2;
    }
}