/*
 * Copyright 2010 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.bizosys.hsearch.inpipe;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.conf.Configuration;
import com.bizosys.oneline.pipes.PipeIn;

import com.bizosys.hsearch.common.IStorable;
import com.bizosys.hsearch.common.RecordScalar;
import com.bizosys.hsearch.common.Storable;
import com.bizosys.hsearch.index.BucketIsFullException;
import com.bizosys.hsearch.index.Doc;
import com.bizosys.hsearch.index.IdMapping;
import com.bizosys.hsearch.index.IndexLog;
import com.bizosys.hsearch.index.Term;
import com.bizosys.hsearch.index.TermColumns;
import com.bizosys.hsearch.index.TermFamilies;
import com.bizosys.hsearch.index.TermTables;
import com.bizosys.hsearch.schema.ILanguageMap;
import com.bizosys.hsearch.schema.SchemaManager;

/**
 * Saves the term vector to the index table.
 *
 * Pipeline stage: {@link #visit(Object)} buffers one {@link TermTables} per
 * {@link Doc}; {@link #commit()} assigns bucket/serial ids to new documents,
 * merges all term tables by bucket, and persists the id mappings and terms.
 *
 * NOTE(review): {@code docTermTables} is never cleared after {@link #commit()};
 * this assumes each instance returned by {@link #getInstance()} is used for a
 * single batch — confirm against the pipeline driver.
 *
 * @author karan
 */
public class SaveToIndex implements PipeIn {

	/**
	 * Document serial-position threshold; when the serial counter of the
	 * current bucket moves past this, a fresh bucket id is created.
	 * Overridden from configuration key "index.documents.merge" in init().
	 */
	int docMergeFactor = 1000;

	/** Term tables buffered per visited document, flushed by commit(). */
	Map<Doc, TermTables> docTermTables = new HashMap<Doc, TermTables>();

	public SaveToIndex() {
	}

	public SaveToIndex(int docMergeFactor) {
		this.docMergeFactor = docMergeFactor;
	}

	/**
	 * Buffers the term table of one document for later persistence.
	 * Documents without terms are silently accepted and skipped.
	 *
	 * @param objDoc expected to be a {@link Doc}; null is ignored
	 * @return always true (the pipe never vetoes)
	 */
	public boolean visit(Object objDoc) throws ApplicationFault, SystemFault {
		if (null == objDoc) return true;
		Doc doc = (Doc) objDoc;
		if (null == doc.terms) return true;
		if (null == doc.terms.all) return true;

		ILanguageMap map = SchemaManager.getInstance().getLanguageMap(doc.meta.locale);

		// Existing documents carry a bucket id; new ones get theirs in commit().
		TermTables termTable = (null == doc.bucketId)
				? new TermTables()
				: new TermTables(new Storable(doc.bucketId));
		for (Term term : doc.terms.all) {
			termTable.add(term, map);
		}
		this.docTermTables.put(doc, termTable);
		return true;
	}

	/**
	 * Creating the term bucket to save the changes.
	 *
	 * Steps:
	 * <ol>
	 *   <li>Merge term tables of existing documents (non-null bucket id).</li>
	 *   <li>Reserve serial positions in the current bucket for new documents;
	 *       roll over to a new bucket once past {@code docMergeFactor}.</li>
	 *   <li>Assign bucket id + serial position to each new document and record
	 *       an {@link IdMapping} from its original id.</li>
	 *   <li>Persist the id mappings, then the merged term tables.</li>
	 * </ol>
	 *
	 * @throws ApplicationFault if the bucket's short serial range is exhausted
	 */
	public boolean commit() throws ApplicationFault, SystemFault {
		if (null == this.docTermTables) return true;

		/**
		 * We need to arrange all terms from documents to arrange in term buckets.
		 */
		Map<Long, TermTables> mergedTermTables = new HashMap<Long, TermTables>();
		int totalDocsCount = this.docTermTables.size();
		int updateDocsCount = 0;

		/**
		 * Existing Document : Look for existing documents with valid bucket Id
		 */
		for (TermTables docTermTable : this.docTermTables.values()) {
			if (null == docTermTable.bucketId) continue; // New Record
			updateDocsCount++;
			buildTermTables(mergedTermTables, docTermTable);
		}

		/**
		 * New records - Generate Keys for the bucket and documents
		 */
		int newDocsCount = totalDocsCount - updateDocsCount;
		long currentBucket = -1;
		short docPos = Short.MIN_VALUE;
		try {
			currentBucket = TermTables.getCurrentBucketId();
			docPos = TermTables.createDocumentSerialIds(currentBucket, newDocsCount);
			if (IndexLog.l.isInfoEnabled()) IndexLog.l.info(
				"StoreToIndex > Document Serial Position moved till :" + docPos);
			// Past the merge threshold: start a fresh bucket for the next batch.
			if (docPos > docMergeFactor) TermTables.createBucketId();
		} catch (BucketIsFullException ex) {
			throw new ApplicationFault(
				"StoreToIndex : Reduce the merge Factor. It is beyond the short data range.", ex);
		}

		/**
		 * Assign the created bucketId and document position to new docs
		 * Create a Key Map with the original Ids
		 */
		IStorable storableBucketId = new Storable(currentBucket);
		List<IdMapping> docMappedIds = null;
		if (newDocsCount > 0) docMappedIds = new ArrayList<IdMapping>(newDocsCount);

		// entrySet() avoids the second map lookup per document.
		for (Map.Entry<Doc, TermTables> entry : this.docTermTables.entrySet()) {
			Doc doc = entry.getKey();
			TermTables docTermTable = entry.getValue();
			if (null != docTermTable.bucketId) continue; // already indexed

			// Assign Id and Serial Position (positions are handed out descending).
			docTermTable.bucketId = storableBucketId;
			short thisDocPosition = docPos--;
			docTermTable.assignDocumentPosition(thisDocPosition);

			// Set bucket id and doc serial id for original document.
			doc.bucketId = Storable.getLong(0, storableBucketId.toBytes());
			doc.docSerialId = thisDocPosition;

			// Store the mapping from the document's original id.
			docMappedIds.add(new IdMapping(doc.teaser.id, currentBucket, thisDocPosition));

			// Dedup Terms
			buildTermTables(mergedTermTables, docTermTable);
		}

		if (IndexLog.l.isDebugEnabled()) IndexLog.l.debug(printMergedTT(mergedTermTables));

		/**
		 * Persist Ids
		 */
		if (null != docMappedIds) {
			List<RecordScalar> mapRecords = new ArrayList<RecordScalar>(docMappedIds.size());
			for (IdMapping mapping : docMappedIds) {
				mapping.build(mapRecords);
			}
			IdMapping.persist(mapRecords);
		}

		/**
		 * Persist Terms
		 */
		if (mergedTermTables.isEmpty()) return true;
		for (TermTables merged : mergedTermTables.values()) {
			merged.persist(true);
		}
		return true;
	}

	/**
	 * Merges one document's term table into the per-bucket accumulator,
	 * keyed by the numeric bucket id.
	 *
	 * @param mergedTermTables accumulator, bucket id to merged term table
	 * @param docTermTable one document's term table (bucketId must be set)
	 */
	private void buildTermTables(
			Map<Long, TermTables> mergedTermTables, TermTables docTermTable) {

		byte[] bucketIdB = docTermTable.bucketId.toBytes();
		long bucketId = Storable.getLong(0, bucketIdB);
		if (mergedTermTables.containsKey(bucketId)) {
			TermTables mtt = mergedTermTables.get(bucketId);
			mtt.add(docTermTable);
		} else {
			mergedTermTables.put(bucketId, docTermTable);
		}
	}

	/**
	 * Reads the merge factor from configuration (key "index.documents.merge",
	 * default 10000).
	 */
	public boolean init(Configuration conf) throws ApplicationFault, SystemFault {
		this.docMergeFactor = conf.getInt("index.documents.merge", 10000);
		return true;
	}

	public PipeIn getInstance() {
		return new SaveToIndex(this.docMergeFactor);
	}

	public String getName() {
		return "SaveToIndex";
	}

	/**
	 * Creates a string representation of the table.
	 *
	 * @param mergedTermTables bucket id to merged term table
	 * @return hierarchical dump: bucket / table / family / column
	 */
	private String printMergedTT(Map<Long, TermTables> mergedTermTables) {
		StringBuilder sb = new StringBuilder();
		for (Map.Entry<Long, TermTables> bucketEntry : mergedTermTables.entrySet()) {
			sb.append("Bucket:").append(bucketEntry.getKey());
			TermTables tt = bucketEntry.getValue();
			for (Map.Entry<Character, TermFamilies> tableEntry : tt.tables.entrySet()) {
				sb.append("\n\tTable:").append(tableEntry.getKey());
				TermFamilies tf = tableEntry.getValue();
				for (Map.Entry<Character, TermColumns> familyEntry : tf.families.entrySet()) {
					sb.append("\n\t\tfamily:").append(familyEntry.getKey());
					TermColumns tc = familyEntry.getValue();
					for (char col : tc.columns.keySet()) {
						sb.append("\n\t\t\tColumn:").append(col);
						sb.append(tc.columns.get(col).toString());
					}
				}
			}
		}
		return sb.toString();
	}
}