/* * Carrot2 project. * * Copyright (C) 2002-2010, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.examples.clustering; import java.util.HashMap; import java.util.Map; import org.carrot2.clustering.lingo.LingoClusteringAlgorithm; import org.carrot2.clustering.lingo.LingoClusteringAlgorithmDescriptor; import org.carrot2.core.*; import org.carrot2.core.attribute.CommonAttributesDescriptor; import org.carrot2.examples.ConsoleFormatter; import org.carrot2.matrix.factorization.IterationNumberGuesser.FactorizationQuality; import org.carrot2.source.microsoft.BingDocumentSource; import com.google.common.collect.Maps; /** * It is possible to initialize a {@link Controller} to host a number of different * configurations of the same {@link IDocumentSource} or {@link IClusteringAlgorithm} and * invoke them as appropriate. This is achieved by assigning a string identifier to each * configuration and then passing the identifier to the * {@link Controller#process(Map, String...)} method. * <p> * One example where this setting may be useful is when your application serves multiple * customers, each of which need a different configuration of a document source or a * clustering algorithm. * </p> */ public class MoreConfigurationsOfOneAlgorithmInCachingController { @SuppressWarnings( { "unchecked" }) public static void main(String [] args) { /* * Create a controller that caches all documents. */ final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class); /* * You can define global values for some attributes. These will apply to all * configurations we will define below, unless the specific configuration * overrides the global attributes. */ final Map<String, Object> globalAttributes = new HashMap<String, Object>(); LingoClusteringAlgorithmDescriptor.attributeBuilder(globalAttributes) .preprocessingPipeline() .documentAssigner() .exactPhraseAssignment(false); /* * Now we will define two different configurations of the Lingo algorithm. One * will be optimized for speed of clustering, while the other will optimize the * quality of clusters. */ final Map<String, Object> fastAttributes = Maps.newHashMap(); LingoClusteringAlgorithmDescriptor.attributeBuilder(fastAttributes) .matrixReducer() .desiredClusterCountBase(20) .factorizationQuality(FactorizationQuality.LOW); LingoClusteringAlgorithmDescriptor.attributeBuilder(fastAttributes) .preprocessingPipeline() .caseNormalizer() .dfThreshold(2); final Map<String, Object> accurateAttributes = Maps.newHashMap(); LingoClusteringAlgorithmDescriptor.attributeBuilder(accurateAttributes) .matrixReducer() .desiredClusterCountBase(40) .factorizationQuality(FactorizationQuality.HIGH); LingoClusteringAlgorithmDescriptor.attributeBuilder(accurateAttributes) .preprocessingPipeline() .documentAssigner() .exactPhraseAssignment(true); LingoClusteringAlgorithmDescriptor.attributeBuilder(fastAttributes) .preprocessingPipeline() .caseNormalizer() .dfThreshold(1); /* * We initialize the controller passing the global attributes and the two * configurations. Notice that a configuration consists of the component * class (can be a document source as well as a clustering algorithm), its * string identifier and attributes. */ controller.init(globalAttributes, new ProcessingComponentConfiguration(LingoClusteringAlgorithm.class, "lingo-fast", fastAttributes), new ProcessingComponentConfiguration(LingoClusteringAlgorithm.class, "lingo-accurate", accurateAttributes) ); /* * Now we can call the two different clustering configurations. Notice that * because we now use string identifiers instead of classes, we pass the document * source class name rather than the class itself. */ final Map<String, Object> attributes = new HashMap<String, Object>(); CommonAttributesDescriptor.attributeBuilder(attributes) .query("data mining"); final ProcessingResult fastResult = controller.process(attributes, BingDocumentSource.class.getName(), "lingo-fast"); ConsoleFormatter.displayClusters(fastResult.getClusters()); final ProcessingResult accurateResult = controller.process(attributes, BingDocumentSource.class.getName(), "lingo-accurate"); ConsoleFormatter.displayClusters(accurateResult.getClusters()); } }