/*
 * Copyright 2010 Bizosys Technologies Limited
 *
 * Licensed to the Bizosys Technologies Limited (Bizosys) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The Bizosys licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.bizosys.hsearch.outpipe;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.core.Cluster;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.ProcessingResult;

import com.bizosys.hsearch.facet.FacetField;
import com.bizosys.hsearch.facet.IFacetField;
import com.bizosys.hsearch.index.IndexLog;
import com.bizosys.hsearch.query.DocTeaserWeight;
import com.bizosys.hsearch.query.HQuery;
import com.bizosys.hsearch.query.QueryContext;
import com.bizosys.hsearch.query.QueryResult;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.conf.Configuration;
import com.bizosys.oneline.pipes.PipeOut;

/**
 * Clusters the result teasers by topic using the Carrot2 Lingo algorithm
 * and publishes the clusters as an "nlp" facet on the query result.
 * @author karan
 */
public class BuildNlp implements PipeOut {

    private static final boolean DEBUG_ENABLED = IndexLog.l.isDebugEnabled();
    private static final String NLP = "nlp";

    /** Maximum number of teasers fed to the clustering engine (cluster.cutoff.size). */
    int maxDocuments = 50;

    public BuildNlp() {
    }

    public void visit(Object objQuery, boolean multiWriter) throws ApplicationFault, SystemFault {
        HQuery query = (HQuery) objQuery;
        QueryContext ctx = query.ctx;
        QueryResult res = query.result;

        // Run only when NLP clustering was requested and there are teasers to cluster.
        if (null == res) return;
        if (null == ctx.clusters) return;
        if (!ctx.clusters.contains(NLP)) return;

        int teasersT = (null == res.teasers) ? 0 : res.teasers.length;
        if (teasersT < 1) return;

        // Wrap each usable teaser as a Carrot2 document (title, summary, id),
        // skipping teasers that lack an id or a title, and capping the input
        // at the configured cutoff.
        Object[] teasers = res.teasers;
        int docsT = Math.min(teasersT, this.maxDocuments);
        ArrayList<Document> documents = new ArrayList<Document>(docsT);
        for (int i = 0; i < docsT; i++) {
            DocTeaserWeight teaser = (DocTeaserWeight) teasers[i];
            if (null == teaser.id) continue;
            if (teaser.id.length() == 0) continue;
            if (null == teaser.title) continue;
            if (teaser.title.length() == 0) continue;

            String summary = (null == teaser.cacheText) ? teaser.preview : teaser.cacheText;
            if (null == summary) continue;
            if (summary.length() == 0) summary = "empty";
            documents.add(new Document(teaser.title, summary, teaser.id));
        }

        if (DEBUG_ENABLED) {
            IndexLog.l.debug("Total Carrot2 Documents Created : " + documents.size());
        }

        /* A controller to manage the processing pipeline. */
        final Controller controller = ControllerFactory.createSimple();
        if (DEBUG_ENABLED) {
            IndexLog.l.debug("Carrot2 Controller Created.");
        }

        /*
         * Perform clustering by topic using the Lingo algorithm. Lingo can
         * take advantage of the original query, so a query string is passed
         * along with the documents ("data mining" here is a stand-in for the
         * user's query).
         */
        final ProcessingResult byTopicClusters = controller.process(documents, "data mining", LingoClusteringAlgorithm.class);
        if (DEBUG_ENABLED) {
            IndexLog.l.debug("Carrot2 Processing is over.");
        }

        final List<Cluster> clustersByTopic = byTopicClusters.getClusters();
        int clustersByTopicT = (null == clustersByTopic) ? 0 : clustersByTopic.size();
        if (0 == clustersByTopicT) return;

        // Publish each cluster as a facet: label, member count, and a
        // '|'-separated list of the member document ids.
        StringBuilder sb = new StringBuilder(512);
        if (null == res.facets) res.facets = new HashMap<String, IFacetField[]>(ctx.clusters.size());

        IFacetField[] nlpFacets = new FacetField[clustersByTopicT];
        int counter = 0;
        for (Cluster cluster : clustersByTopic) {
            cluster.setOtherTopics(false);
            for (Document doc : cluster.getAllDocuments()) {
                sb.append(doc.getId()).append('|');
            }
            FacetField ff = new FacetField(cluster.getLabel(), cluster.getAllDocuments().size(), sb.toString());
            nlpFacets[counter++] = ff;
            sb.setLength(0); // Reset the buffer for the next cluster.
        }
        res.facets.put(NLP, nlpFacets);
    }

    public void commit(boolean multiWriter) throws ApplicationFault, SystemFault {
    }

    public void init(Configuration conf) throws ApplicationFault, SystemFault {
        this.maxDocuments = conf.getInt("cluster.cutoff.size", 50);
    }

    public PipeOut getInstance() {
        return this;
    }

    public String getName() {
        return "BuildNlp";
    }
}
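
/*
 * Usage sketch (an assumption about the host wiring, not part of the original
 * pipeline code): a PipeOut such as BuildNlp is typically initialized once
 * with the application Configuration and then visited per query. The variable
 * names below are hypothetical.
 *
 *   Configuration conf = ...;                   // host config; supplies cluster.cutoff.size
 *   PipeOut nlp = new BuildNlp().getInstance();
 *   nlp.init(conf);                             // reads cluster.cutoff.size (default 50)
 *   nlp.visit(hQuery, false);                   // hQuery: an HQuery whose result holds teasers
 *   nlp.commit(false);                          // no-op for this pipe
 *
 * After visit(), the topic clusters are available from
 * hQuery.result.facets.get("nlp") as an IFacetField[].
 */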