/*
* Copyright 2010 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.inpipe;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.conf.Configuration;
import com.bizosys.oneline.pipes.PipeIn;
import com.bizosys.hsearch.common.IStorable;
import com.bizosys.hsearch.common.RecordScalar;
import com.bizosys.hsearch.common.Storable;
import com.bizosys.hsearch.index.BucketIsFullException;
import com.bizosys.hsearch.index.Doc;
import com.bizosys.hsearch.index.IdMapping;
import com.bizosys.hsearch.index.IndexLog;
import com.bizosys.hsearch.index.Term;
import com.bizosys.hsearch.index.TermColumns;
import com.bizosys.hsearch.index.TermFamilies;
import com.bizosys.hsearch.index.TermTables;
import com.bizosys.hsearch.schema.ILanguageMap;
import com.bizosys.hsearch.schema.SchemaManager;
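/**
* Pipe stage that collects the terms of each visited document into term tables
* and, on commit, merges them per bucket and persists them to the index.
* @author karan
*
*/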
public class SaveToIndex implements PipeIn {
int docMergeFactor = 1000;
/** Arranged by document */
Map<Doc, TermTables> docTermTables = new HashMap<Doc, TermTables>();
public SaveToIndex() {
}
public SaveToIndex(int docMergeFactor) {
this.docMergeFactor = docMergeFactor;
}
public boolean visit(Object objDoc) throws ApplicationFault, SystemFault {
if ( null == objDoc) return true;
Doc doc = (Doc) objDoc;
if ( null == doc.terms) return true;
if ( null == doc.terms.all) return true;
ILanguageMap map = SchemaManager.getInstance().getLanguageMap(doc.meta.locale);
TermTables termTable = ( null == doc.bucketId ) ?
new TermTables() : new TermTables( new Storable(doc.bucketId));
for (Term term : doc.terms.all) {
termTable.add(term, map);
}
this.docTermTables.put(doc, termTable);
return true;
}
/**
* Creating the term bucket to save the changes.
*/
public boolean commit() throws ApplicationFault, SystemFault {
if ( null == this.docTermTables) return true;
/**
* We need to arrange all terms from documents to arrange in term buckets.
*/
Map<Long, TermTables> mergedTermTables = new HashMap<Long, TermTables>();
int totalDocsCount = this.docTermTables.size();
int updateDocsCount = 0;
/**
* Existing Document : Look for existing documents with valid bucket Id
*/
for (TermTables docTermTable : this.docTermTables.values()) {
if ( null == docTermTable.bucketId ) continue; //New Record
updateDocsCount++;
buildTermTables(mergedTermTables, docTermTable);
}
/**
* New records - Generate Keys for the bucket and documents
*/
int newDocsCount = totalDocsCount - updateDocsCount;
long currentBucket = -1;
short docPos = Short.MIN_VALUE;
try {
currentBucket = TermTables.getCurrentBucketId();
docPos = TermTables.createDocumentSerialIds(
currentBucket,newDocsCount);
if ( IndexLog.l.isInfoEnabled()) IndexLog.l.info("StoreToIndex > Document Serial Position moved till :" + docPos);
if ( docPos > docMergeFactor ) TermTables.createBucketId();
} catch (BucketIsFullException ex) {
throw new ApplicationFault("StoreToIndex : Reduce the merge Factor. It is beyond the short data range.", ex);
}
/**
* Assign the created bucketId and document position to new docs
* Create a Key Map with the original Ids
*/
IStorable storableBucketId = new Storable(currentBucket);
List<IdMapping> docMappedIds = null;
if (newDocsCount > 0 ) docMappedIds = new ArrayList<IdMapping>(newDocsCount);
for ( Doc doc: this.docTermTables.keySet()) {
TermTables docTermTable = this.docTermTables.get(doc);
if ( null != docTermTable.bucketId ) continue;
//Assign Id and Serial Position
docTermTable.bucketId = storableBucketId;
short thisDocPosition = docPos--;
docTermTable.assignDocumentPosition(thisDocPosition);
//Set bucket if and doc serial id for original document.
doc.bucketId = Storable.getLong(0, storableBucketId.toBytes());
doc.docSerialId = thisDocPosition;
//Store the mapping
docMappedIds.add(new IdMapping(
doc.teaser.id,currentBucket,thisDocPosition));
//Dedup Terms
buildTermTables(mergedTermTables, docTermTable);
}
if ( IndexLog.l.isDebugEnabled()) IndexLog.l.debug(printMergedTT(mergedTermTables));
/**
* Persist Ids
*/
if ( null != docMappedIds) {
List<RecordScalar> mapRecords = new ArrayList<RecordScalar>(docMappedIds.size());
for (IdMapping mapping : docMappedIds) {
mapping.build(mapRecords);
}
IdMapping.persist(mapRecords);
}
/**
* Persist Terms
*/
if ( 0 == mergedTermTables.size()) return true;
for (Long bucketId : mergedTermTables.keySet()) {
mergedTermTables.get(bucketId).persist(true);
}
return true;
}
/**
*
* @param mergedTermTables
* @param docTermTable
*/
private void buildTermTables(
Map<Long, TermTables> mergedTermTables, TermTables docTermTable) {
byte[] bucketIdB = docTermTable.bucketId.toBytes();
long bucketId = Storable.getLong(0, bucketIdB);
if ( mergedTermTables.containsKey(bucketId)) {
TermTables mtt = mergedTermTables.get(bucketId);
mtt.add(docTermTable);
} else {
mergedTermTables.put(bucketId, docTermTable);
}
}
public boolean init(Configuration conf) throws ApplicationFault, SystemFault {
this.docMergeFactor =
conf.getInt("index.documents.merge", 10000);
return true;
}
public PipeIn getInstance() {
return new SaveToIndex(this.docMergeFactor);
}
public String getName() {
return "SaveToIndex";
}
/**
* Creates a string representation of the table.
* @param mergedTermTables
* @return
*/
private String printMergedTT(Map<Long, TermTables> mergedTermTables) {
StringBuilder sb = new StringBuilder();
for (long bucket : mergedTermTables.keySet()) {
sb.append("Bucket:").append(bucket);
TermTables tt = mergedTermTables.get(bucket);
for (char table: tt.tables.keySet()) {
sb.append("\n\tTable:").append(table);
TermFamilies tf = tt.tables.get(table);
for (char family : tf.families.keySet()) {
sb.append("\n\t\tfamily:").append(family);
TermColumns tc = tf.families.get(family);
for (char col : tc.columns.keySet()) {
sb.append("\n\t\t\tColumn:").append(col);
sb.append(tc.columns.get(col).toString());
}
}
}
}
return sb.toString();
}
}