/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.inpipe;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import com.bizosys.hsearch.filter.Storable;
import com.bizosys.hsearch.index.Doc;
import com.bizosys.hsearch.index.IdMapping;
import com.bizosys.hsearch.index.IndexLog;
import com.bizosys.hsearch.index.Term;
import com.bizosys.hsearch.index.TermColumns;
import com.bizosys.hsearch.index.TermFamilies;
import com.bizosys.hsearch.index.TermTables;
import com.bizosys.hsearch.schema.ILanguageMap;
import com.bizosys.hsearch.schema.SchemaManager;
import com.bizosys.hsearch.util.ObjectFactory;
import com.bizosys.hsearch.util.RecordScalar;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.conf.Configuration;
import com.bizosys.oneline.pipes.PipeIn;
/**
 * Saves the term vector to the index table.
 * <p>
 * {@link #visit(Object, boolean)} builds one {@link TermTables} per document and
 * keeps it in {@link #docTermTables}. {@link #commit(boolean)} merges those
 * tables per bucket id, persists the id mappings (when enabled), persists the
 * merged term tables and finally hands the pooled maps back to
 * {@link ObjectFactory}.
 * @author karan
 *
 */
public class SaveToIndex implements PipeIn {

    /**
     * Set from the "index.documents.merge" configuration key in
     * {@link #init(Configuration)}; not consulted anywhere else in this class.
     */
    int docMergeFactor = 1000;

    /** When false, {@link #commit(boolean)} skips persisting the id mappings. */
    boolean isIdMappingEnabled = true;

    static boolean isDebug = InpipeLog.l.isDebugEnabled();

    /** Arranged by document */
    Map<Doc, TermTables> docTermTables = null;

    public SaveToIndex() {
    }

    /**
     * @param docMergeFactor value stored in {@link #docMergeFactor}
     * @param isIdMappingEnabled whether commit should persist id mappings
     */
    public SaveToIndex(int docMergeFactor, boolean isIdMappingEnabled) {
        this.docMergeFactor = docMergeFactor;
        this.isIdMappingEnabled = isIdMappingEnabled;
    }

    /**
     * Builds the term table of a single document and queues it for the next commit.
     * Documents without terms are ignored.
     *
     * @param docObj the document; must be a {@link Doc}
     * @param concurrency forwarded to the {@link TermTables} constructor
     * @throws ApplicationFault when <code>docObj</code> is null
     */
    public void visit(Object docObj, boolean concurrency) throws ApplicationFault, SystemFault {

        if (null == docObj) throw new ApplicationFault("No document");
        Doc doc = (Doc) docObj;

        if (null == doc.terms || null == doc.terms.all) return;

        ILanguageMap map = SchemaManager.getInstance().getLanguageMap(doc.meta.locale);

        TermTables termTable;
        if (null == doc.bucketId) {
            termTable = new TermTables(concurrency);
        } else {
            termTable = new TermTables(new Storable(doc.bucketId), concurrency);
        }

        for (Term term : doc.terms.all) {
            term.setDocumentPosition(doc.docSerialId);
            termTable.add(term, map);
        }
        doc.terms.closeTermList();

        // The per-document map is borrowed lazily from the object pool.
        if (null == this.docTermTables) {
            this.docTermTables = ObjectFactory.getInstance().getDocTermTable();
        }
        this.docTermTables.put(doc, termTable);
    }

    /**
     * Creating the term bucket to save the changes.
     * <p>
     * Every queued document table is merged exactly once into the table of its
     * bucket, id mappings are persisted when enabled, and the merged tables are
     * persisted. The pooled maps are released in a <code>finally</code> block, so
     * they are returned to {@link ObjectFactory} on every exit path, including
     * failures. After this call the instance holds no queued documents.
     *
     * @param multiWriter forwarded to <code>IdMapping.persist</code>
     * @throws ApplicationFault when a queued term table has no bucket id
     */
    public void commit(boolean multiWriter) throws ApplicationFault, SystemFault {

        if (null == this.docTermTables) return;

        Map<Long, TermTables> mergedTermTables = null;
        try {
            /**
             * We need to arrange all terms from documents to arrange in term buckets.
             */
            mergedTermTables = ObjectFactory.getInstance().getBucketTermTable();

            List<IdMapping> docMappedIds = new ArrayList<IdMapping>(this.docTermTables.size());
            for (Map.Entry<Doc, TermTables> entry : this.docTermTables.entrySet()) {
                Doc aDoc = entry.getKey();
                TermTables docTermTable = entry.getValue();

                //Sanity check. Expecting bucket Id for all.
                if (null == docTermTable.bucketId)
                    throw new ApplicationFault("SaveToIndex >> Bucket Id Missing.");

                if (isDebug) InpipeLog.l.debug("Build term tables and assign document position");
                docTermTable.assignDocumentPosition(aDoc.docSerialId);

                if (isDebug) InpipeLog.l.debug("Id Mapping, Necessary for mapping from the original Key");
                docMappedIds.add(new IdMapping(aDoc.tenant, aDoc.teaser.id, aDoc.bucketId, aDoc.docSerialId));

                // Merge once, and only after the document position is assigned.
                // Merging twice would add a table to itself, or add the same
                // document into its bucket a second time.
                if (isDebug) InpipeLog.l.debug("Dedup Terms");
                buildTermTables(mergedTermTables, docTermTable);
            }

            if (IndexLog.l.isDebugEnabled()) IndexLog.l.debug(printMergedTT(mergedTermTables));

            /**
             * Persist Ids
             */
            if (isIdMappingEnabled && !docMappedIds.isEmpty()) {
                if (isDebug) InpipeLog.l.debug("Persisting Id Mappings");
                List<RecordScalar> mapRecords = ObjectFactory.getInstance().getScalarRecordList();
                for (IdMapping mapping : docMappedIds) {
                    mapping.build(mapRecords);
                }
                IdMapping.persist(mapRecords, multiWriter);
                ObjectFactory.getInstance().putScalarRecordList(mapRecords);
            }

            /**
             * Persist Terms
             */
            for (Map.Entry<Long, TermTables> bucket : mergedTermTables.entrySet()) {
                if (isDebug) InpipeLog.l.debug("Persisting terms : " + bucket.getKey());
                TermTables termTables = bucket.getValue();
                // NOTE(review): multiWriter is not forwarded here; the flags are
                // hard-coded as in the original. Confirm against TermTables.persist.
                termTables.persist(true, false);
                termTables.cleanup();
            }
        } finally {
            cleanup(mergedTermTables);
        }
    }

    /**
     * Clears the queued and merged term tables and returns both maps to
     * {@link ObjectFactory}. The reference to the per-document map is dropped so
     * that this instance never touches a map it has already given back.
     *
     * @param mergedTermTables the per-bucket tables of the current commit, may be null
     */
    private void cleanup(Map<Long, TermTables> mergedTermTables) {

        /**
         * Clear the resources.
         */
        if (null != docTermTables) {
            // NOTE(review): a table may be cleaned up more than once (it can also
            // be a value of mergedTermTables); presumably cleanup() tolerates that,
            // as the original code relied on it too.
            for (TermTables tt : docTermTables.values()) {
                if (null != tt) tt.cleanup();
            }
            ObjectFactory.getInstance().putDocTermTable(docTermTables);
            docTermTables = null;
        }

        if (null != mergedTermTables) {
            for (TermTables tt : mergedTermTables.values()) {
                if (null != tt) tt.cleanup();
            }
            ObjectFactory.getInstance().putBucketTermTable(mergedTermTables);
        }
    }

    /**
     * Adds a document's term table to the table registered for its bucket, or
     * registers it as that bucket's table when the bucket is not present yet.
     *
     * @param mergedTermTables per-bucket tables, keyed by the bucket id as a long
     * @param docTermTable term table of one document; its bucketId must be set
     */
    private void buildTermTables(
        Map<Long, TermTables> mergedTermTables, TermTables docTermTable) {

        byte[] bucketIdB = docTermTable.bucketId.toBytes();
        long bucketId = Storable.getLong(0, bucketIdB);

        TermTables bucketTable = mergedTermTables.get(bucketId);
        if (null == bucketTable) {
            mergedTermTables.put(bucketId, docTermTable);
        } else {
            bucketTable.add(docTermTable);
        }
    }

    /**
     * Reads "index.documents.merge" (fallback argument 10000) and
     * "idmapping.enable" (fallback argument false) from the configuration.
     * NOTE(review): these fallbacks differ from the field initializers
     * (1000 / true); confirm which defaults are intended.
     */
    public void init(Configuration conf) throws ApplicationFault, SystemFault {
        this.docMergeFactor =
            conf.getInt("index.documents.merge", 10000);
        this.isIdMappingEnabled = conf.getBoolean("idmapping.enable", false);
    }

    /**
     * @return a new SaveToIndex carrying this instance's merge factor and
     * id-mapping flag; queued documents are not shared
     */
    public PipeIn getInstance() {
        return new SaveToIndex(this.docMergeFactor, this.isIdMappingEnabled);
    }

    public String getName() {
        return "SaveToIndex";
    }

    /**
     * Creates a string representation of the table.
     * Walks bucket, table, family and column, appending each key and the
     * <code>toString()</code> of every column value.
     * @param mergedTermTables per-bucket term tables to print
     * @return the multi-line dump
     */
    private String printMergedTT(Map<Long, TermTables> mergedTermTables) {
        StringBuilder sb = new StringBuilder();
        for (long bucket : mergedTermTables.keySet()) {
            sb.append("Bucket:").append(bucket);
            TermTables tt = mergedTermTables.get(bucket);
            for (char table : tt.tables.keySet()) {
                sb.append("\n\tTable:").append(table);
                TermFamilies tf = tt.tables.get(table);
                for (char family : tf.families.keySet()) {
                    sb.append("\n\t\tfamily:").append(family);
                    TermColumns tc = tf.families.get(family);
                    for (char col : tc.columns.keySet()) {
                        sb.append("\n\t\t\tColumn:").append(col);
                        sb.append(tc.columns.get(col).toString());
                    }
                }
            }
        }
        return sb.toString();
    }
}