/*
* Carrot2 project.
*
* Copyright (C) 2002-2010, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.examples.clustering;
import java.util.Map;
import java.util.Set;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.clustering.stc.STCClusteringAlgorithm;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.IDocumentSource;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.attribute.CommonAttributesDescriptor;
import org.carrot2.examples.ConsoleFormatter;
import org.carrot2.examples.SampleDocumentData;
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ILanguageModel;
import org.carrot2.text.linguistic.ILanguageModelFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
import org.carrot2.text.util.MutableCharArray;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
/**
 * This example shows how to perform clustering using a custom language model, including
 * stop words, stop labels and stemmer.
 * <p>
 * The custom model is supplied through {@link CustomLanguageModelFactory}, which is
 * passed to the controller as an initialization-time attribute.
 */
public class UsingCustomLanguageModel
{
    /**
     * Runs the example: initializes a controller with the custom language model factory
     * and clusters the sample documents with Lingo and STC.
     *
     * @param args command line arguments, not used
     */
    public static void main(String [] args)
    {
        @SuppressWarnings("unchecked")
        final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class);

        try
        {
            // We will pass our custom language model factory class as an
            // initialization-time attribute. This is preferred to passing it as a
            // processing-time attribute because the instance created at
            // initialization time is reused for all further requests.
            final Map<String, Object> attrs = Maps.newHashMap();
            BasicPreprocessingPipelineDescriptor.attributeBuilder(attrs)
                .languageModelFactory(CustomLanguageModelFactory.class);
            controller.init(attrs);

            // Cluster some data with Lingo and STC. Notice how the cluster quality
            // degrades when the stop word list is nearly empty, as it is in our custom
            // language model (especially for STC).
            clusterAndDisplayClusters(controller, LingoClusteringAlgorithm.class);
            clusterAndDisplayClusters(controller, STCClusteringAlgorithm.class);
        }
        finally
        {
            // Release the resources held by the caching/pooling controller, even if
            // initialization or clustering failed.
            controller.dispose();
        }
    }

    /**
     * Clusters results for query "data mining" and displays the clusters.
     *
     * @param controller the initialized controller to perform processing with
     * @param clusteringAlgorithm the clustering algorithm class to apply to the sample
     *            documents
     */
    private static void clusterAndDisplayClusters(final Controller controller,
        final Class<? extends IClusteringAlgorithm> clusteringAlgorithm)
    {
        final Map<String, Object> processingAttributes = Maps.newHashMap();
        CommonAttributesDescriptor.attributeBuilder(processingAttributes)
            .documents(Lists.newArrayList(SampleDocumentData.DOCUMENTS_DATA_MINING))
            .query("data mining");

        final ProcessingResult result = controller.process(processingAttributes,
            clusteringAlgorithm);
        ConsoleFormatter.displayClusters(result.getClusters(), 0);
    }

    /**
     * A custom language model factory. It hands out a {@link CustomLanguageModel} for
     * every requested language.
     */
    public static class CustomLanguageModelFactory implements ILanguageModelFactory
    {
        /** The complete stop word list of the custom language model. */
        private static final Set<? extends CharSequence> STOP_WORDS =
            ImmutableSet.of("text");

        /**
         * Returns the custom language model.
         *
         * @param language the requested language, ignored by this implementation
         * @return a new {@link CustomLanguageModel} instance
         */
        @Override
        public ILanguageModel getLanguageModel(LanguageCode language)
        {
            // Here we always return the same language model, regardless of the requested
            // language. In your implementation you may want to return different models
            // based on the language, if needed.
            return new CustomLanguageModel();
        }

        /**
         * Custom language model implementation. This one uses some contrived algorithms
         * and stop words just to demonstrate how they work.
         */
        private static final class CustomLanguageModel implements ILanguageModel
        {
            /**
             * Returns a stemmer that strips the last two characters from words longer
             * than three characters.
             */
            @Override
            public IStemmer getStemmer()
            {
                return new IStemmer()
                {
                    @Override
                    public CharSequence stem(CharSequence word)
                    {
                        // Some contrived stemming algorithm: drop the two trailing
                        // characters of any word longer than 3 characters.
                        // NOTE(review): null is returned for shorter words, presumably
                        // meaning "no stem available" -- confirm against IStemmer.
                        return word.length() > 3 ? word.subSequence(0, word.length() - 2)
                            : null;
                    }
                };
            }

            /**
             * Returns a new {@link ExtendedWhitespaceTokenizer} on every call.
             */
            @Override
            public ITokenizer getTokenizer()
            {
                return new ExtendedWhitespaceTokenizer();
            }

            /**
             * Returns <code>null</code>: this model does not declare any language.
             */
            @Override
            public LanguageCode getLanguageCode()
            {
                // NOTE(review): assumes callers tolerate a null language code -- confirm
                // against the ILanguageModel contract.
                return null;
            }

            /**
             * Returns lexical data with contrived rules: labels of up to four characters
             * are stop labels and only the words listed in {@link #STOP_WORDS} are
             * common words.
             */
            @Override
            public ILexicalData getLexicalData()
            {
                return new ILexicalData()
                {
                    @Override
                    public boolean isStopLabel(CharSequence formattedLabel)
                    {
                        return formattedLabel.length() <= 4;
                    }

                    @Override
                    public boolean isCommonWord(MutableCharArray word)
                    {
                        // The set holds Strings, so convert before the lookup.
                        return STOP_WORDS.contains(word.toString());
                    }
                };
            }
        }
    }
}