/*
 * Copyright 2010 Bizosys Technologies Limited
 *
 * Licensed to the Bizosys Technologies Limited (Bizosys) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The Bizosys licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.bizosys.hsearch.index;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.bizosys.hsearch.filter.Storable;
import com.bizosys.hsearch.util.ObjectFactory;

/**
 * The inverted index data structure and its byte operations.
 * @author karan
 */
public class InvertedIndex {

    private static final byte[] EMPTY_BYTES = new byte[]{};

    public int hash;
    public byte[] dtc;       //document type code
    public byte[] ttc;       //term type code
    public byte[] tw;        //term weight
    public byte[] termFreq;  //term frequency
    public short[] termPos;  //term position
    public short[] docPos;   //document position

    public InvertedIndex(int hash, byte[] dtc, byte[] ttc, byte[] tw,
            byte[] termFreq, short[] termPos, short[] docPos) {

        this.hash = hash;
        this.dtc = dtc;
        this.ttc = ttc;
        this.tw = tw;
        this.termFreq = termFreq;
        this.termPos = termPos;
        this.docPos = docPos;
    }

    /**
     * Reads the bytes to reconstruct the inverted index.
     * @param bytes Input bytes
     * @return Inverted index entries, or null for empty input
     */
    public static List<InvertedIndex> read(byte[] bytes) {
        if ( null == bytes) return null;
        int cursor = 0;
        int bytesT = bytes.length;
        if ( 0 == bytesT) return null;

        List<InvertedIndex> invIndex = new ArrayList<InvertedIndex>();
        while (cursor < bytesT) {
            int hash = Storable.getInt(cursor, bytes);
            cursor = cursor + 4;

            //The term count is one byte; -1 flags an int count for > 127 terms.
            int termsT = (byte) bytes[cursor];
            cursor++;
            if ( -1 == termsT) {
                termsT = Storable.getInt(cursor, bytes);
                cursor = cursor + 4;
            }

            byte[] dtc = new byte[termsT];
            System.arraycopy(bytes, cursor, dtc, 0, termsT);
            cursor = cursor + termsT;

            byte[] ttc = new byte[termsT];
            System.arraycopy(bytes, cursor, ttc, 0, termsT);
            cursor = cursor + termsT;

            byte[] tw = new byte[termsT];
            System.arraycopy(bytes, cursor, tw, 0, termsT);
            cursor = cursor + termsT;

            byte[] tf = null;
            short[] tp = null;
            if ( TermList.termVectorStorageEnabled ) {
                tf = new byte[termsT];
                System.arraycopy(bytes, cursor, tf, 0, termsT);
                cursor = cursor + termsT;

                tp = new short[termsT];
                for (int i=0; i< termsT; i++) {
                    tp[i] = Storable.getShort(cursor, bytes);
                    cursor = cursor + 2;
                }
            }

            short[] dp = new short[termsT];
            for (int i=0; i< termsT; i++) {
                dp[i] = Storable.getShort(cursor, bytes);
                cursor = cursor + 2;
            }

            InvertedIndex ii = new InvertedIndex(hash, dtc, ttc, tw, tf, tp, dp);
            invIndex.add(ii);
        }
        return invIndex;
    }
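
    /*
     * For reference, the wire format that read(..) walks over. Each record
     * is laid out as below (inferred from the parsing code above; the tf
     * and tp sections exist only when TermList.termVectorStorageEnabled):
     *
     *   hash    int      4 bytes    keyword hash
     *   termsT  byte     1 byte     term count, or -1 followed by an int count
     *   dtc     byte[]   termsT     document type codes
     *   ttc     byte[]   termsT     term type codes
     *   tw      byte[]   termsT     term weights
     *   tf      byte[]   termsT     term frequencies  (vector storage only)
     *   tp      short[]  termsT*2   term positions    (vector storage only)
     *   dp      short[]  termsT*2   document positions
     *
     * A minimal usage sketch, assuming "cellBytes" holds a serialized bucket
     * cell fetched from the index store (the variable name is hypothetical):
     *
     *   List<InvertedIndex> entries = InvertedIndex.read(cellBytes);
     *   if ( null != entries)
     *       for (InvertedIndex entry : entries) System.out.println(entry);
     */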
    /**
     * Remove the document at the specified position.
     * @param bytes Input bytes
     * @param docPos Document position
     * @return The index bytes with the document purged
     */
    public static byte[] delete(byte[] bytes, short docPos) {
        if ( null == bytes) return null;
        int cursor = 0;
        int bytesT = bytes.length;
        if ( 0 == bytesT) return null;

        //First pass: for each keyword row, locate the column holding docPos.
        //A column of -1 marks a single-term row, which is dropped entirely.
        Map<Integer,Integer> rowcol = new HashMap<Integer,Integer>();
        int row = 0;
        int termsT = 0;
        int col = -1;
        short dp;
        while (cursor < bytesT) {
            row++;
            cursor = cursor + 4; //Hash
            termsT = (byte) bytes[cursor];
            cursor++;
            if ( -1 == termsT) {
                termsT = Storable.getInt(cursor, bytes);
                cursor = cursor + 4;
            }
            cursor = cursor + (termsT * 3); //dtc + ttc + tw
            if ( TermList.termVectorStorageEnabled )
                cursor = cursor + (termsT * 3); //tf + tp

            col = Integer.MIN_VALUE;
            for (int i=0; i< termsT; i++) {
                dp = Storable.getShort(cursor, bytes);
                cursor = cursor + 2;
                if ( dp == docPos) {
                    cursor = cursor + (termsT - i - 1) * 2; //Remaining bytes
                    col = ( termsT == 1 ) ? -1 : i;
                    break;
                }
            }
            if ( Integer.MIN_VALUE != col ) rowcol.put(row, col);
        }

        if ( IndexLog.l.isTraceEnabled())
            IndexLog.l.trace("InvertedIndex:delete Rows :" + rowcol.keySet().toString() +
                "\tCols :" + rowcol.values().toString());

        //Second pass: now cut the actual values.
        cursor = 0;
        row = 0;
        ByteBuffer bb = ByteBuffer.allocate(bytes.length);
        while (cursor < bytesT) {
            row++;
            boolean cutRow = rowcol.containsKey(row);

            if ( cutRow && rowcol.get(row) == -1 ) {
                //Single-term row: skip it without copying.
                cursor = cursor + 4; //Hashcode
                termsT = (byte) bytes[cursor++];
                if ( -1 == termsT) {
                    termsT = Storable.getInt(cursor, bytes);
                    cursor = cursor + 4;
                }
                if ( TermList.termVectorStorageEnabled ) cursor = cursor + termsT * 8;
                else cursor = cursor + termsT * 5;
                continue;
            }

            bb.put(bytes, cursor, 4); //Hashcode
            cursor = cursor + 4;
            termsT = (byte) bytes[cursor++];
            if ( -1 == termsT) {
                bb.put( (byte) -1);
                termsT = Storable.getInt(cursor, bytes);
                if ( cutRow ) bb.putInt(termsT - 1);
                else bb.put(bytes, cursor, 4);
                cursor = cursor + 4;
            } else {
                if ( cutRow ) bb.put( (byte) (termsT - 1) );
                else bb.put( (byte) termsT );
            }

            if ( cutRow ) {
                col = rowcol.get(row);

                //Copy Document Type Code, skipping the cut column
                if ( col != 0 ) bb.put(bytes, cursor, col);
                bb.put(bytes, cursor + col + 1, termsT - col - 1);
                cursor = cursor + termsT;

                //Copy Term Type Code
                if ( col != 0 ) bb.put(bytes, cursor, col);
                bb.put(bytes, cursor + col + 1, termsT - col - 1);
                cursor = cursor + termsT;

                //Copy Term Weight
                if ( col != 0 ) bb.put(bytes, cursor, col);
                bb.put(bytes, cursor + col + 1, termsT - col - 1);
                cursor = cursor + termsT;

                if ( TermList.termVectorStorageEnabled ) {
                    //Copy Term Frequency
                    if ( col != 0 ) bb.put(bytes, cursor, col);
                    bb.put(bytes, cursor + col + 1, termsT - col - 1);
                    cursor = cursor + termsT;

                    //Copy Term Position (2 bytes each)
                    if ( col != 0 ) bb.put(bytes, cursor, col * 2);
                    bb.put(bytes, cursor + (col + 1) * 2, (termsT - col - 1) * 2);
                    cursor = cursor + termsT * 2;
                }

                //Copy Document Position (2 bytes each)
                if ( col != 0 ) bb.put(bytes, cursor, col * 2);
                bb.put(bytes, cursor + (col + 1) * 2, (termsT - col - 1) * 2);
                cursor = cursor + termsT * 2;

            } else {
                if ( TermList.termVectorStorageEnabled ) {
                    bb.put(bytes, cursor, termsT * 8);
                    cursor = cursor + termsT * 8;
                } else {
                    bb.put(bytes, cursor, termsT * 5);
                    cursor = cursor + termsT * 5;
                }
            }
        }

        int len = bb.position();
        if ( IndexLog.l.isTraceEnabled() )
            IndexLog.l.trace("InvertedIndex : Original / Cut Byte Size = " +
                bytes.length + "/" + len);
        if ( 0 == len ) return EMPTY_BYTES;

        byte[] deletedB = new byte[len];
        bb.position(0);
        bb.get(deletedB, 0, len);
        bb.clear();
        return deletedB;
    }
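
    /*
     * A minimal usage sketch for delete(..), assuming "cellBytes" holds a
     * serialized bucket cell and document position 7 is being purged (both
     * the name and the value are hypothetical):
     *
     *   byte[] remaining = InvertedIndex.delete(cellBytes, (short) 7);
     *   if ( null != remaining && 0 == remaining.length) {
     *       //Every keyword row pointed only at the purged document.
     *   }
     */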
    /**
     * Merge the supplied document list with the documents
     * already present in the bucket.
     *
     * Ignores the freshly supplied documents while loading the existing
     * ones from bytes, then folds the surviving prior terms into the
     * supplied term lists.
     */
    public static void merge(byte[] existingB, Map<Integer, List<Term>> lstKeywords) {
        if ( null == existingB) return;

        short docPos;
        Set<Short> freshDocs = getFreshDocs(lstKeywords);

        if ( IndexLog.l.isDebugEnabled()) {
            for (int hash : lstKeywords.keySet()) {
                IndexLog.l.debug("List : " + hash + " = " + lstKeywords.get(hash).toString());
            }
        }

        int bytesT = existingB.length;
        List<Term> priorDocTerms = ObjectFactory.getInstance().getTermList();
        int keywordHash = -1, termsT = -1, shift = 0, pos = 0, readPos = 0;
        byte docType = 0, termType = 0, termWeight = 0, termFreq = 0;
        short termPos = 0;

        if ( IndexLog.l.isDebugEnabled() )
            IndexLog.l.debug("TermList Byte Marshalling: bytesT = " + bytesT);

        while ( pos < bytesT) {
            priorDocTerms.clear();
            keywordHash = Storable.getInt(pos, existingB);
            pos = pos + 4;

            //Compute the number of terms present.
            termsT = existingB[pos++];
            if ( -1 == termsT ) {
                termsT = Storable.getInt(pos, existingB);
                pos = pos + 4;
            }

            //Compute each term. The shift is the per-term byte size,
            //and the document positions sit in the last 2 * termsT bytes.
            shift = TermList.TERM_SIZE_NOVECTOR;
            if ( TermList.termVectorStorageEnabled ) shift = TermList.TERM_SIZE_VECTOR;

            for ( int i=0; i<termsT; i++) {
                readPos = pos + ((shift - 2) * termsT) + (i * 2);
                docPos = (short) ((existingB[readPos] << 8) + (existingB[++readPos] & 0xff));
                if ( freshDocs.contains(docPos)) continue;

                docType = existingB[pos + i];
                termType = existingB[pos + termsT + i];
                termWeight = existingB[pos + (2 * termsT) + i];
                if ( TermList.termVectorStorageEnabled ) {
                    termFreq = existingB[pos + (3 * termsT) + i];
                    readPos = pos + (4 * termsT) + (i * 2); //Term positions are 2 bytes each
                    termPos = (short) ((existingB[readPos] << 8) + (existingB[++readPos] & 0xff));
                }

                Term priorTerm = new Term(docPos, docType, termType, termWeight, termPos, termFreq);
                priorDocTerms.add(priorTerm);
            }

            if ( TermList.termVectorStorageEnabled ) pos = pos + (8 * termsT);
            else pos = pos + (5 * termsT);

            mergePrior(lstKeywords, priorDocTerms, keywordHash);
            ObjectFactory.getInstance().putTermList(priorDocTerms);
        }
    }

    /**
     * Merge the prior documents of a keyword into the supplied term lists.
     * @param lstKeywords Keyword hash to term list mapping
     * @param priorDocTerms Terms of the prior documents
     * @param keywordHash The keyword hash
     */
    private static void mergePrior(Map<Integer, List<Term>> lstKeywords,
            List<Term> priorDocTerms, int keywordHash) {

        if ( priorDocTerms.size() > 0 ) {
            List<Term> terms = null;
            if ( lstKeywords.containsKey(keywordHash) ) {
                //This keyword already exists
                terms = lstKeywords.get(keywordHash);
                terms.addAll(priorDocTerms);
            } else {
                List<Term> docTerms = new ArrayList<Term>(priorDocTerms.size());
                docTerms.addAll(priorDocTerms);
                lstKeywords.put(keywordHash, docTerms);
            }

            if ( IndexLog.l.isDebugEnabled()) {
                IndexLog.l.debug("#### KEYWORDS HASH READ START");
                for (int hash : lstKeywords.keySet()) {
                    IndexLog.l.debug("Merged : " + hash + " = " + lstKeywords.get(hash).toString());
                }
                IndexLog.l.debug("KEYWORDS HASH READ ENDS ####");
            }
        }
    }
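
    /*
     * A minimal usage sketch for merge(..). The keyword hash is assumed here
     * to be String.hashCode(); the real pipeline may hash differently. The
     * byte arguments of the Term constructor (document type, term type,
     * weight) are placeholder values, and "existingB" is assumed to hold the
     * serialized bucket cell:
     *
     *   Map<Integer, List<Term>> keywords = new HashMap<Integer, List<Term>>();
     *   List<Term> fresh = new ArrayList<Term>();
     *   fresh.add(new Term((short) 3, (byte) 1, (byte) 1, (byte) 50, (short) 0, (byte) 1));
     *   keywords.put("java".hashCode(), fresh);
     *   InvertedIndex.merge(existingB, keywords);
     *   //keywords now also carries the prior terms of all other documents.
     */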
    /**
     * Get the fresh documents (document positions absent in the existing bytes).
     * @param lstKeywords Keyword hash to term list mapping
     * @return The set of fresh document positions
     */
    private static Set<Short> getFreshDocs(Map<Integer, List<Term>> lstKeywords) {
        Set<Short> freshDocs = new HashSet<Short>();
        short docPos;
        for (int hash : lstKeywords.keySet()) {
            List<Term> terms = lstKeywords.get(hash);
            for (Term term : terms) {
                docPos = term.getDocumentPosition();
                if ( freshDocs.contains(docPos)) continue;
                freshDocs.add(docPos);
            }
        }
        if ( IndexLog.l.isDebugEnabled() )
            IndexLog.l.debug("Fresh Documents:" + freshDocs.toString());
        return freshDocs;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("Hash: [").append(hash);
        sb.append("]\nDocument Type: [");
        if ( null != dtc) for (byte c : dtc) sb.append(c).append(',');
        sb.append("]\nTerm Type: [");
        if ( null != ttc) for (byte c : ttc) sb.append(c).append(',');
        sb.append("]\nTerm Weight: [");
        if ( null != tw) for (byte w : tw) sb.append(w).append(',');
        sb.append("]\nTerm Frequency: [");
        if ( null != termFreq) for (byte tf : termFreq) sb.append(tf).append(',');
        sb.append("]\nTerm Position: [");
        if ( null != termPos) for (short tp : termPos) sb.append(tp).append(',');
        sb.append("]\nDocument Position: [");
        if ( null != docPos) for (short dp : docPos) sb.append(dp).append(',');
        sb.append(']');
        return sb.toString();
    }
}