/*
 * Copyright 2010 Bizosys Technologies Limited
 *
 * Licensed to the Bizosys Technologies Limited (Bizosys) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The Bizosys licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.bizosys.hsearch.inpipe;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.bizosys.hsearch.filter.Storable;
import com.bizosys.hsearch.hbase.HWriter;
import com.bizosys.hsearch.hbase.NV;
import com.bizosys.hsearch.index.Doc;
import com.bizosys.hsearch.index.IndexLog;
import com.bizosys.hsearch.index.Term;
import com.bizosys.hsearch.index.TermColumns;
import com.bizosys.hsearch.index.TermFamilies;
import com.bizosys.hsearch.index.TermList;
import com.bizosys.hsearch.index.TermTables;
import com.bizosys.hsearch.schema.ILanguageMap;
import com.bizosys.hsearch.schema.SchemaManager;
import com.bizosys.hsearch.util.ObjectFactory;
import com.bizosys.hsearch.util.Record;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.conf.Configuration;
import com.bizosys.oneline.pipes.PipeIn;

/**
 * Saves the term vector to the index table in batch mode. Batch mode expects that:
 * 1) The whole batch falls under one bucket only.
 * 2) Each bucket is overwritten with no merging.
 * 3) Only one client operates on a bucket at a time, so thread safety is not provided.
 * @author karan
 */
public class SaveToIndexBatch implements PipeIn {

	int docMergeFactor = 1000;

	/** Term tables arranged by bucket id */
	Map<Long, TermTables> mergedTermTables = null;

	/** Byte images of already flushed term lists, arranged by bucket id */
	Map<Long, Map<String, byte[]>> mergedBytes = new HashMap<Long, Map<String, byte[]>>(2);

	int bufferSize = 0;
	boolean isIdMappingEnabled = true;

	public SaveToIndexBatch() {
	}

	public SaveToIndexBatch(int docMergeFactor, boolean isIdMappingEnabled) {
		this.docMergeFactor = docMergeFactor;
		this.isIdMappingEnabled = isIdMappingEnabled;
	}

	public void visit(Object docObj, boolean multiWriter) throws ApplicationFault, SystemFault {
		if (null == docObj) throw new ApplicationFault("No document");
		Doc doc = (Doc) docObj;
		if (null == doc.bucketId || null == doc.docSerialId)
			throw new ApplicationFault("Ids missed from document :" + doc.toString());

		if (null == doc.terms) return;
		if (null == doc.terms.all) return;

		ILanguageMap map = SchemaManager.getInstance().getLanguageMap(doc.meta.locale);

		// The guard above guarantees bucketId is non-null here.
		TermTables termTable = new TermTables(new Storable(doc.bucketId), multiWriter);

		short docId = doc.docSerialId;
		for (Term term : doc.terms.getTermList()) {
			term.setDocumentPosition(docId);
			termTable.add(term, map);
		}
		bufferSize = bufferSize + doc.terms.getTermList().size();
		doc.terms.closeTermList();

		// Arrange all the terms from the documents into term buckets.
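		// The bucket-level map below is pooled through ObjectFactory: it is
		// borrowed lazily on the first visit and returned in commit(). Each
		// document's TermTables is folded into the entry for its bucket, so
		// repeated visits to the same bucket accumulate into a single
		// TermTables instance that commit() later persists.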
		if (null == mergedTermTables)
			mergedTermTables = ObjectFactory.getInstance().getBucketTermTable();
		buildTermTables(doc.bucketId, mergedTermTables, termTable);

		// ************** Check for intermediate flushing **************
		if (bufferSize > 1000000) {
			if (IndexLog.l.isInfoEnabled())
				IndexLog.l.info("Flush Intermediate Size > " + bufferSize);
			flushIntermediate(this.mergedBytes, mergedTermTables);
		}
	}

	/**
	 * Persists the index.
	 */
	public void commit(boolean multiWriter) throws ApplicationFault, SystemFault {
		if (null == mergedTermTables) return;

		if (IndexLog.l.isDebugEnabled())
			IndexLog.l.debug(printMergedTT(mergedTermTables));

		if (mergedTermTables.size() > 0) {
			flushIntermediate(this.mergedBytes, mergedTermTables);
			this.mergedTermTables.clear();
		}
		ObjectFactory.getInstance().putBucketTermTable(mergedTermTables);

		if (this.mergedBytes.size() == 0) return;

		try {
			for (long bucketId : this.mergedBytes.keySet()) {
				if (IndexLog.l.isInfoEnabled()) IndexLog.l.info("Commit > " + bucketId);
				Map<String, byte[]> values = this.mergedBytes.get(bucketId);

				// Regroup the flushed bytes by table; the three-character key
				// encodes table, family and column (see flushIntermediate).
				Map<Character, List<NV>> tableNVs = new HashMap<Character, List<NV>>();
				for (String tfc : values.keySet()) {
					char tab = tfc.charAt(0);
					List<NV> nvs = tableNVs.get(tab);
					if (null == nvs) {
						nvs = ObjectFactory.getInstance().getNVList();
						tableNVs.put(tab, nvs);
					}
					nvs.add(new NV(
						new byte[]{(byte) tfc.charAt(1)},
						new byte[]{(byte) tfc.charAt(2)},
						new Storable(values.get(tfc))));
				}
				values.clear();

				try {
					for (Character tab : tableNVs.keySet()) {
						List<NV> tabNV = tableNVs.get(tab);
						Record rec = new Record(new Storable(bucketId), tabNV);
						HWriter.getInstance(multiWriter).insert(tab.toString(), rec);
						ObjectFactory.getInstance().putNVList(tabNV);
					}
				} catch (IOException ex) {
					throw new SystemFault(ex);
				} finally {
					tableNVs.clear();
				}
			}
		} finally {
			// Clear the buffers only after the iteration over mergedBytes has
			// completed; clearing the map from inside the loop would make the
			// key-set iterator throw ConcurrentModificationException.
			for (Map<String, byte[]> bucketM : mergedBytes.values()) bucketM.clear();
			mergedBytes.clear();
		}
	}

	/**
	 * Merges a single document's term table into the bucket-wise map.
	 * @param bucketId The bucket the document belongs to
	 * @param mergedTermTables The term tables accumulated so far, keyed by bucket id
	 * @param docTermTable The term table built from the just-visited document
	 */
	private void buildTermTables(long bucketId,
			Map<Long, TermTables> mergedTermTables, TermTables docTermTable) {

		if (mergedTermTables.containsKey(bucketId)) {
			TermTables mtt = mergedTermTables.get(bucketId);
			mtt.addInSameBucket(docTermTable);
		} else {
			mergedTermTables.put(bucketId, docTermTable);
		}
	}

	/**
	 * Transforms the buffered term objects into their byte representation.
	 * This releases the term objects early and allows more documents to be
	 * merged into the buffer.
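	 * <p>
	 * Each entry of the per-bucket byte map is keyed by a three-character
	 * string of table, column family and column qualifier. When the same key
	 * is flushed again, the fresh terms are merged with the previously
	 * flushed image through TermList.setExistingBytes(byte[]) before the
	 * combined bytes are stored back.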
	 * @param allByteValues Byte images of flushed term lists, keyed by bucket id
	 * @param mtt The merged term tables to flush
	 */
	private void flushIntermediate(Map<Long, Map<String, byte[]>> allByteValues,
			Map<Long, TermTables> mtt) {

		if (null == mtt) return;
		if (0 == mtt.size()) return;

		long s = System.currentTimeMillis();
		StringBuilder sb = new StringBuilder(10);

		for (long bucketId : mtt.keySet()) {
			TermTables termTables = mtt.get(bucketId);
			for (char tableName : termTables.tables.keySet()) {
				TermFamilies tf = termTables.tables.get(tableName);
				for (char family : tf.families.keySet()) {
					TermColumns tcs = tf.families.get(family);
					for (char col : tcs.columns.keySet()) {

						sb.append(tableName).append(family).append(col);
						String tfc = sb.toString();
						sb.setLength(0); // reset for the next table/family/column key

						TermList tl = tcs.columns.get(col);
						Map<String, byte[]> bytesV = allByteValues.get(bucketId);
						if (null == bytesV) {
							bytesV = new HashMap<String, byte[]>(1);
							bytesV.put(tfc, tl.toBytes());
							allByteValues.put(bucketId, bytesV);
						} else if (bytesV.containsKey(tfc)) {
							// Merge with the bytes flushed earlier for this key.
							tl.setExistingBytes(bytesV.get(tfc));
							bytesV.put(tfc, tl.toBytes());
						} else {
							bytesV.put(tfc, tl.toBytes());
						}
						tl.cleanup();
					}
				}
			}
			termTables.cleanup();
		}
		bufferSize = 0;

		if (InpipeLog.l.isDebugEnabled()) InpipeLog.l.debug(
			"SaveToIndexBatch > flushIntermediate execution time " +
			(System.currentTimeMillis() - s) + " ms");
	}

	public void init(Configuration conf) throws ApplicationFault, SystemFault {
		this.docMergeFactor = conf.getInt("index.documents.merge", 10000);
		this.isIdMappingEnabled = conf.getBoolean("idmapping.enable", true);
	}

	public PipeIn getInstance() {
		return new SaveToIndexBatch(this.docMergeFactor, this.isIdMappingEnabled);
	}

	public String getName() {
		return "SaveToIndexBatch";
	}

	/**
	 * Creates a string representation of the merged term tables.
	 * @param mergedTermTables The bucket-wise term tables
	 * @return A bucket/table/family/column dump, one level per indent
	 */
	private String printMergedTT(Map<Long, TermTables> mergedTermTables) {
		StringBuilder sb = new StringBuilder();
		for (long bucket : mergedTermTables.keySet()) {
			sb.append("Bucket:").append(bucket);
			TermTables tt = mergedTermTables.get(bucket);
			for (char table : tt.tables.keySet()) {
				sb.append("\n\tTable:").append(table);
				TermFamilies tf = tt.tables.get(table);
				for (char family : tf.families.keySet()) {
					sb.append("\n\t\tFamily:").append(family);
					TermColumns tc = tf.families.get(family);
					for (char col : tc.columns.keySet()) {
						sb.append("\n\t\t\tColumn:").append(col);
						sb.append(tc.columns.get(col).toString());
					}
				}
			}
		}
		return sb.toString();
	}
}
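/*
 * Usage sketch, assuming visit(Object, boolean), commit(boolean) and
 * init(Configuration) above form the PipeIn contract invoked by the pipeline
 * runner, and that each Doc arrives with bucketId, docSerialId and terms set:
 *
 *   PipeIn pipe = new SaveToIndexBatch().getInstance();
 *   pipe.init(conf);                               // index.documents.merge, idmapping.enable
 *   for (Doc doc : docs) pipe.visit(doc, false);   // buffer term vectors per bucket
 *   pipe.commit(false);                            // flush merged bytes through HWriter
 */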