/*
 * Copyright 2010 Bizosys Technologies Limited
 *
 * Licensed to the Bizosys Technologies Limited (Bizosys) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The Bizosys licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.bizosys.hsearch.index;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.bizosys.hsearch.filter.Storable;
import com.bizosys.hsearch.util.ObjectFactory;

/**
 * The inverted index data structure and its byte operations.
 * @author karan
 */
public class InvertedIndex {

    private static final byte[] EMPTY_BYTES = new byte[]{};

    public int hash;
    public byte[] dtc;       //document type code
    public byte[] ttc;       //term type code
    public byte[] tw;        //term weight
    public byte[] termFreq;  //term frequency
    public short[] termPos;  //term position
    public short[] docPos;   //document position

    public InvertedIndex(int hash, byte[] dtc, byte[] ttc, byte[] tw,
            byte[] termFreq, short[] termPos, short[] docPos) {

        this.hash = hash;
        this.dtc = dtc;
        this.ttc = ttc;
        this.tw = tw;
        this.termFreq = termFreq;
        this.termPos = termPos;
        this.docPos = docPos;
    }

    /**
     * Reads the bytes to reconstruct the inverted index.
     * @param bytes Input bytes
     * @return Inverted index entries, or null for empty input
     */
    public static List<InvertedIndex> read(byte[] bytes) {
        if ( null == bytes) return null;
        int cursor = 0;
        int bytesT = bytes.length;
        if ( 0 == bytesT) return null;

        List<InvertedIndex> invIndex = new ArrayList<InvertedIndex>();
        while (cursor < bytesT) {
            int hash = Storable.getInt(cursor, bytes);
            cursor = cursor + 4;

            //The term count is one byte; -1 flags an int count for > 127 terms.
            int termsT = (byte) bytes[cursor];
            cursor++;
            if ( -1 == termsT) {
                termsT = Storable.getInt(cursor, bytes);
                cursor = cursor + 4;
            }

            byte[] dtc = new byte[termsT];
            System.arraycopy(bytes, cursor, dtc, 0, termsT);
            cursor = cursor + termsT;

            byte[] ttc = new byte[termsT];
            System.arraycopy(bytes, cursor, ttc, 0, termsT);
            cursor = cursor + termsT;

            byte[] tw = new byte[termsT];
            System.arraycopy(bytes, cursor, tw, 0, termsT);
            cursor = cursor + termsT;

            byte[] tf = null;
            short[] tp = null;
            if ( TermList.termVectorStorageEnabled ) {
                tf = new byte[termsT];
                System.arraycopy(bytes, cursor, tf, 0, termsT);
                cursor = cursor + termsT;

                tp = new short[termsT];
                for (int i=0; i< termsT; i++) {
                    tp[i] = Storable.getShort(cursor, bytes);
                    cursor = cursor + 2;
                }
            }

            short[] dp = new short[termsT];
            for (int i=0; i< termsT; i++) {
                dp[i] = Storable.getShort(cursor, bytes);
                cursor = cursor + 2;
            }

            InvertedIndex ii = new InvertedIndex(hash, dtc, ttc, tw, tf, tp, dp);
            invIndex.add(ii);
        }
        return invIndex;
    }
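
    /*
     * For reference, the wire format that read(..) walks over. Each record
     * is laid out as below (inferred from the parsing code above; the tf
     * and tp sections exist only when TermList.termVectorStorageEnabled):
     *
     *   hash    int      4 bytes    keyword hash
     *   termsT  byte     1 byte     term count, or -1 followed by an int count
     *   dtc     byte[]   termsT     document type codes
     *   ttc     byte[]   termsT     term type codes
     *   tw      byte[]   termsT     term weights
     *   tf      byte[]   termsT     term frequencies  (vector storage only)
     *   tp      short[]  termsT*2   term positions    (vector storage only)
     *   dp      short[]  termsT*2   document positions
     *
     * A minimal usage sketch, assuming "cellBytes" holds a serialized bucket
     * cell fetched from the index store (the variable name is hypothetical):
     *
     *   List<InvertedIndex> entries = InvertedIndex.read(cellBytes);
     *   if ( null != entries)
     *       for (InvertedIndex entry : entries) System.out.println(entry);
     */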
    /**
     * Remove the document at the specified position.
     * @param bytes Input bytes
     * @param docPos Document position
     * @return The index bytes with the document purged
     */
    public static byte[] delete(byte[] bytes, short docPos) {
        if ( null == bytes) return null;
        int cursor = 0;
        int bytesT = bytes.length;
        if ( 0 == bytesT) return null;

        //First pass: for each keyword row, locate the column holding docPos.
        //A column of -1 marks a single-term row, which is dropped entirely.
        Map<Integer,Integer> rowcol = new HashMap<Integer,Integer>();
        int row = 0;
        int termsT = 0;
        int col = -1;
        short dp;
        while (cursor < bytesT) {
            row++;
            cursor = cursor + 4; //Hash
            termsT = (byte) bytes[cursor];
            cursor++;
            if ( -1 == termsT) {
                termsT = Storable.getInt(cursor, bytes);
                cursor = cursor + 4;
            }
            cursor = cursor + (termsT * 3); //dtc + ttc + tw
            if ( TermList.termVectorStorageEnabled )
                cursor = cursor + (termsT * 3); //tf + tp

            col = Integer.MIN_VALUE;
            for (int i=0; i< termsT; i++) {
                dp = Storable.getShort(cursor, bytes);
                cursor = cursor + 2;
                if ( dp == docPos) {
                    cursor = cursor + (termsT - i - 1) * 2; //Remaining bytes
                    col = ( termsT == 1 ) ? -1 : i;
                    break;
                }
            }
            if ( Integer.MIN_VALUE != col ) rowcol.put(row, col);
        }

        if ( IndexLog.l.isTraceEnabled())
            IndexLog.l.trace("InvertedIndex:delete Rows :" + rowcol.keySet().toString() +
                "\tCols :" + rowcol.values().toString());

        //Second pass: now cut the actual values.
        cursor = 0;
        row = 0;
        ByteBuffer bb = ByteBuffer.allocate(bytes.length);
        while (cursor < bytesT) {
            row++;
            boolean cutRow = rowcol.containsKey(row);

            if ( cutRow && rowcol.get(row) == -1 ) {
                //Single-term row: skip it without copying.
                cursor = cursor + 4; //Hashcode
                termsT = (byte) bytes[cursor++];
                if ( -1 == termsT) {
                    termsT = Storable.getInt(cursor, bytes);
                    cursor = cursor + 4;
                }
                if ( TermList.termVectorStorageEnabled ) cursor = cursor + termsT * 8;
                else cursor = cursor + termsT * 5;
                continue;
            }

            bb.put(bytes, cursor, 4); //Hashcode
            cursor = cursor + 4;
            termsT = (byte) bytes[cursor++];
            if ( -1 == termsT) {
                bb.put( (byte) -1);
                termsT = Storable.getInt(cursor, bytes);
                if ( cutRow ) bb.putInt(termsT - 1);
                else bb.put(bytes, cursor, 4);
                cursor = cursor + 4;
            } else {
                if ( cutRow ) bb.put( (byte) (termsT - 1) );
                else bb.put( (byte) termsT );
            }

            if ( cutRow ) {
                col = rowcol.get(row);

                //Copy Document Type Code, skipping the cut column
                if ( col != 0 ) bb.put(bytes, cursor, col);
                bb.put(bytes, cursor + col + 1, termsT - col - 1);
                cursor = cursor + termsT;

                //Copy Term Type Code
                if ( col != 0 ) bb.put(bytes, cursor, col);
                bb.put(bytes, cursor + col + 1, termsT - col - 1);
                cursor = cursor + termsT;

                //Copy Term Weight
                if ( col != 0 ) bb.put(bytes, cursor, col);
                bb.put(bytes, cursor + col + 1, termsT - col - 1);
                cursor = cursor + termsT;

                if ( TermList.termVectorStorageEnabled ) {
                    //Copy Term Frequency
                    if ( col != 0 ) bb.put(bytes, cursor, col);
                    bb.put(bytes, cursor + col + 1, termsT - col - 1);
                    cursor = cursor + termsT;

                    //Copy Term Position (2 bytes each)
                    if ( col != 0 ) bb.put(bytes, cursor, col * 2);
                    bb.put(bytes, cursor + (col + 1) * 2, (termsT - col - 1) * 2);
                    cursor = cursor + termsT * 2;
                }

                //Copy Document Position (2 bytes each)
                if ( col != 0 ) bb.put(bytes, cursor, col * 2);
                bb.put(bytes, cursor + (col + 1) * 2, (termsT - col - 1) * 2);
                cursor = cursor + termsT * 2;

            } else {
                if ( TermList.termVectorStorageEnabled ) {
                    bb.put(bytes, cursor, termsT * 8);
                    cursor = cursor + termsT * 8;
                } else {
                    bb.put(bytes, cursor, termsT * 5);
                    cursor = cursor + termsT * 5;
                }
            }
        }

        int len = bb.position();
        if ( IndexLog.l.isTraceEnabled() )
            IndexLog.l.trace("InvertedIndex : Original / Cut Byte Size = " +
                bytes.length + "/" + len);
        if ( 0 == len ) return EMPTY_BYTES;

        byte[] deletedB = new byte[len];
        bb.position(0);
        bb.get(deletedB, 0, len);
        bb.clear();
        return deletedB;
    }
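
    /*
     * A minimal usage sketch for delete(..), assuming "cellBytes" holds a
     * serialized bucket cell and document position 7 is being purged (both
     * the name and the value are hypothetical):
     *
     *   byte[] remaining = InvertedIndex.delete(cellBytes, (short) 7);
     *   if ( null != remaining && 0 == remaining.length) {
     *       //Every keyword row pointed only at the purged document.
     *   }
     */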
    /**
     * Merge the supplied document list with the documents
     * already present in the bucket.
     *
     * Ignores the freshly supplied documents while loading the existing
     * ones from bytes, then folds the surviving prior terms into the
     * supplied term lists.
     */
    public static void merge(byte[] existingB, Map<Integer, List<Term>> lstKeywords) {
        if ( null == existingB) return;

        short docPos;
        Set<Short> freshDocs = getFreshDocs(lstKeywords);

        if ( IndexLog.l.isDebugEnabled()) {
            for (int hash : lstKeywords.keySet()) {
                IndexLog.l.debug("List : " + hash + " = " + lstKeywords.get(hash).toString());
            }
        }

        int bytesT = existingB.length;
        List<Term> priorDocTerms = ObjectFactory.getInstance().getTermList();
        int keywordHash = -1, termsT = -1, shift = 0, pos = 0, readPos = 0;
        byte docType = 0, termType = 0, termWeight = 0, termFreq = 0;
        short termPos = 0;

        if ( IndexLog.l.isDebugEnabled() )
            IndexLog.l.debug("TermList Byte Marshalling: bytesT = " + bytesT);

        while ( pos < bytesT) {
            priorDocTerms.clear();
            keywordHash = Storable.getInt(pos, existingB);
            pos = pos + 4;

            //Compute the number of terms present.
            termsT = existingB[pos++];
            if ( -1 == termsT ) {
                termsT = Storable.getInt(pos, existingB);
                pos = pos + 4;
            }

            //Compute each term. The shift is the per-term byte size,
            //and the document positions sit in the last 2 * termsT bytes.
            shift = TermList.TERM_SIZE_NOVECTOR;
            if ( TermList.termVectorStorageEnabled ) shift = TermList.TERM_SIZE_VECTOR;

            for ( int i=0; i<termsT; i++) {
                readPos = pos + ((shift - 2) * termsT) + (i * 2);
                docPos = (short) ((existingB[readPos] << 8) + (existingB[++readPos] & 0xff));
                if ( freshDocs.contains(docPos)) continue;

                docType = existingB[pos + i];
                termType = existingB[pos + termsT + i];
                termWeight = existingB[pos + (2 * termsT) + i];
                if ( TermList.termVectorStorageEnabled ) {
                    termFreq = existingB[pos + (3 * termsT) + i];
                    readPos = pos + (4 * termsT) + (i * 2); //Term positions are 2 bytes each
                    termPos = (short) ((existingB[readPos] << 8) + (existingB[++readPos] & 0xff));
                }

                Term priorTerm = new Term(docPos, docType, termType, termWeight, termPos, termFreq);
                priorDocTerms.add(priorTerm);
            }

            if ( TermList.termVectorStorageEnabled ) pos = pos + (8 * termsT);
            else pos = pos + (5 * termsT);

            mergePrior(lstKeywords, priorDocTerms, keywordHash);
            ObjectFactory.getInstance().putTermList(priorDocTerms);
        }
    }

    /**
     * Merge the prior documents of a keyword into the supplied term lists.
     * @param lstKeywords Keyword hash to term list mapping
     * @param priorDocTerms Terms of the prior documents
     * @param keywordHash The keyword hash
     */
    private static void mergePrior(Map<Integer, List<Term>> lstKeywords,
            List<Term> priorDocTerms, int keywordHash) {

        if ( priorDocTerms.size() > 0 ) {
            List<Term> terms = null;
            if ( lstKeywords.containsKey(keywordHash) ) {
                //This keyword already exists
                terms = lstKeywords.get(keywordHash);
                terms.addAll(priorDocTerms);
            } else {
                List<Term> docTerms = new ArrayList<Term>(priorDocTerms.size());
                docTerms.addAll(priorDocTerms);
                lstKeywords.put(keywordHash, docTerms);
            }

            if ( IndexLog.l.isDebugEnabled()) {
                IndexLog.l.debug("#### KEYWORDS HASH READ START");
                for (int hash : lstKeywords.keySet()) {
                    IndexLog.l.debug("Merged : " + hash + " = " + lstKeywords.get(hash).toString());
                }
                IndexLog.l.debug("KEYWORDS HASH READ ENDS ####");
            }
        }
    }
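
    /*
     * A minimal usage sketch for merge(..). The keyword hash is assumed here
     * to be String.hashCode(); the real pipeline may hash differently. The
     * byte arguments of the Term constructor (document type, term type,
     * weight) are placeholder values, and "existingB" is assumed to hold the
     * serialized bucket cell:
     *
     *   Map<Integer, List<Term>> keywords = new HashMap<Integer, List<Term>>();
     *   List<Term> fresh = new ArrayList<Term>();
     *   fresh.add(new Term((short) 3, (byte) 1, (byte) 1, (byte) 50, (short) 0, (byte) 1));
     *   keywords.put("java".hashCode(), fresh);
     *   InvertedIndex.merge(existingB, keywords);
     *   //keywords now also carries the prior terms of all other documents.
     */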
    /**
     * Get the fresh documents (document positions absent in the existing bytes).
     * @param lstKeywords Keyword hash to term list mapping
     * @return The set of fresh document positions
     */
    private static Set<Short> getFreshDocs(Map<Integer, List<Term>> lstKeywords) {
        Set<Short> freshDocs = new HashSet<Short>();
        short docPos;
        for (int hash : lstKeywords.keySet()) {
            List<Term> terms = lstKeywords.get(hash);
            for (Term term : terms) {
                docPos = term.getDocumentPosition();
                if ( freshDocs.contains(docPos)) continue;
                freshDocs.add(docPos);
            }
        }
        if ( IndexLog.l.isDebugEnabled() )
            IndexLog.l.debug("Fresh Documents:" + freshDocs.toString());
        return freshDocs;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("Hash: [").append(hash);
        sb.append("]\nDocument Type: [");
        if ( null != dtc) for (byte c : dtc) sb.append(c).append(',');
        sb.append("]\nTerm Type: [");
        if ( null != ttc) for (byte c : ttc) sb.append(c).append(',');
        sb.append("]\nTerm Weight: [");
        if ( null != tw) for (byte w : tw) sb.append(w).append(',');
        sb.append("]\nTerm Frequency: [");
        if ( null != termFreq) for (byte tf : termFreq) sb.append(tf).append(',');
        sb.append("]\nTerm Position: [");
        if ( null != termPos) for (short tp : termPos) sb.append(tp).append(',');
        sb.append("]\nDocument Position: [");
        if ( null != docPos) for (short dp : docPos) sb.append(dp).append(',');
        sb.append(']');
        return sb.toString();
    }
}