/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.examples.clustering;
import java.util.HashMap;
import java.util.Map;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithmDescriptor;
import org.carrot2.core.*;
import org.carrot2.core.attribute.CommonAttributesDescriptor;
import org.carrot2.examples.ConsoleFormatter;
import org.carrot2.matrix.factorization.IterationNumberGuesser.FactorizationQuality;
import org.carrot2.text.preprocessing.pipeline.CompletePreprocessingPipelineDescriptor;
import org.carrot2.shaded.guava.common.collect.Maps;
import org.carrot2.source.microsoft.v5.Bing5DocumentSource;
/**
* It is possible to initialize a {@link Controller} to host a number of different
* configurations of the same {@link IDocumentSource} or {@link IClusteringAlgorithm} and
* invoke them as appropriate. This is achieved by assigning a string identifier to each
* configuration and then passing the identifier to the
* {@link Controller#process(Map, String...)} method.
* <p>
* One example where this setting may be useful is when your application serves multiple
* customers, each of which needs a different configuration of a document source or a
* clustering algorithm.
* </p>
*/
public class MoreConfigurationsOfOneAlgorithmInCachingController
{
    /**
     * Runs the example: registers two differently configured instances of
     * {@link LingoClusteringAlgorithm} ("lingo-fast" and "lingo-accurate") in a single
     * caching controller, clusters the results of the same query with each of them and
     * prints both sets of clusters to the console.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String [] args)
    {
        /*
         * Create a controller that caches all documents.
         */
        final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

        try
        {
            /*
             * You can define global values for some attributes. These will apply to all
             * configurations we will define below, unless the specific configuration
             * overrides the global attributes.
             */
            final Map<String, Object> globalAttributes = new HashMap<String, Object>();
            CompletePreprocessingPipelineDescriptor.attributeBuilder(globalAttributes)
                .documentAssigner()
                    .exactPhraseAssignment(false);

            /*
             * Now we will define two different configurations of the Lingo algorithm. One
             * will be optimized for speed of clustering, while the other will optimize the
             * quality of clusters.
             */
            final Map<String, Object> fastAttributes = Maps.newHashMap();
            LingoClusteringAlgorithmDescriptor.attributeBuilder(fastAttributes)
                .desiredClusterCountBase(20)
                .matrixReducer()
                    .factorizationQuality(FactorizationQuality.LOW);
            CompletePreprocessingPipelineDescriptor.attributeBuilder(fastAttributes)
                .caseNormalizer()
                    .dfThreshold(2);

            final Map<String, Object> accurateAttributes = Maps.newHashMap();
            LingoClusteringAlgorithmDescriptor.attributeBuilder(accurateAttributes)
                .desiredClusterCountBase(40)
                .matrixReducer()
                    .factorizationQuality(FactorizationQuality.HIGH);
            CompletePreprocessingPipelineDescriptor.attributeBuilder(accurateAttributes)
                .documentAssigner()
                    .exactPhraseAssignment(true);
            /*
             * The document frequency threshold of the accurate configuration must be
             * written to accurateAttributes. Writing it to fastAttributes would set the
             * threshold of the fast configuration a second time and leave the accurate
             * configuration without its own value.
             */
            CompletePreprocessingPipelineDescriptor.attributeBuilder(accurateAttributes)
                .caseNormalizer()
                    .dfThreshold(1);

            /*
             * We initialize the controller passing the global attributes and the two
             * configurations. Notice that a configuration consists of the component
             * class (can be a document source as well as a clustering algorithm), its
             * string identifier and attributes.
             */
            controller.init(globalAttributes,
                new ProcessingComponentConfiguration(LingoClusteringAlgorithm.class,
                    "lingo-fast", fastAttributes),
                new ProcessingComponentConfiguration(LingoClusteringAlgorithm.class,
                    "lingo-accurate", accurateAttributes)
            );

            /*
             * Now we can call the two different clustering configurations. Notice that
             * because we now use string identifiers instead of classes, we pass the document
             * source class name rather than the class itself.
             */
            final Map<String, Object> attributes = new HashMap<String, Object>();
            CommonAttributesDescriptor.attributeBuilder(attributes)
                .query("data mining");

            final ProcessingResult fastResult = controller.process(attributes,
                Bing5DocumentSource.class.getName(), "lingo-fast");
            ConsoleFormatter.displayClusters(fastResult.getClusters());

            final ProcessingResult accurateResult = controller.process(attributes,
                Bing5DocumentSource.class.getName(), "lingo-accurate");
            ConsoleFormatter.displayClusters(accurateResult.getClusters());
        }
        finally
        {
            /*
             * Release the controller once we are done with it, also when initialization
             * or processing fails with an exception.
             */
            controller.dispose();
        }
    }
}