/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.examples.clustering; import java.util.List; import java.util.Map; import org.carrot2.clustering.lingo.LingoClusteringAlgorithm; import org.carrot2.core.Controller; import org.carrot2.core.ControllerFactory; import org.carrot2.core.Document; import org.carrot2.core.IDocumentSource; import org.carrot2.core.LanguageCode; import org.carrot2.core.ProcessingResult; import org.carrot2.core.attribute.CommonAttributesDescriptor; import org.carrot2.examples.ConsoleFormatter; import org.carrot2.examples.SampleDocumentData; import org.carrot2.shaded.guava.common.collect.Lists; import org.carrot2.shaded.guava.common.collect.Maps; import org.carrot2.source.microsoft.v5.Bing5DocumentSource; import org.carrot2.source.microsoft.v5.Bing5DocumentSourceDescriptor; import org.carrot2.source.microsoft.v5.MarketOption; /** * [[[start:clustering-non-english-content-intro]]] * <div> * <p> * This example shows how to cluster non-English content. By default Carrot2 assumes that * the documents provided for clustering are written in English. When clustering content * written in some different language, it is important to indicate the language to * Carrot2, so that it can use the lexical resources (stop words, tokenizer, stemmer) * appropriate for that language. * </p> * <p> * There are two ways to indicate the desired clustering language to Carrot2: * </p> * <ol> * <li>By setting the language of each document in their * {@link org.carrot2.core.Document#LANGUAGE} field. The language does not necessarily * have to be the same for all documents on the input, Carrot2 can handle multiple * languages in one document set as well. Please see the * {@link org.carrot2.text.clustering.MultilingualClustering#languageAggregationStrategy} * attribute for more details.</li> * <li>By setting the fallback language. For documents with undefined * {@link org.carrot2.core.Document#LANGUAGE} field, Carrot2 will assume the some fallback * language, which is English by default. You can change the fallback language by setting * the {@link org.carrot2.text.clustering.MultilingualClustering#defaultLanguage} * attribute.</li> * </ol> * Additionally, a number of document sources automatically set the * {@link org.carrot2.core.Document#LANGUAGE} of documents they produce based on their * specific language-related attributes. Currently, three documents support this scenario: * <ol> * <li>{@link org.carrot2.source.microsoft.v5.Bing5DocumentSource} through the * {@link org.carrot2.source.microsoft.v5.Bing5DocumentSource#market} attribute,</li> * <li>{@link org.carrot2.source.etools.EToolsDocumentSource} through the * {@link org.carrot2.source.etools.EToolsDocumentSource#language} attribute.</li> * </ol> * For the document sources that do not set the documents' language automatically, the * easiest way to set the clustering language is through the * {@link org.carrot2.text.clustering.MultilingualClustering#defaultLanguage} attribute. * </div> * [[[end:clustering-non-english-content-intro]]] */ public class ClusteringNonEnglishContent { public static void main(String [] args) { // [[[start:clustering-non-english-content]]] /* * We use a Controller that reuse instances of Carrot2 processing components * and caches results produced by document sources. */ final Controller controller = ControllerFactory.createCachingPooling(IDocumentSource.class); /* * In the first call, we'll cluster a document list, setting the language for each * document separately. */ final List<Document> documents = Lists.newArrayList(); for (Document document : SampleDocumentData.DOCUMENTS_DATA_MINING) { documents.add(new Document(document.getTitle(), document.getSummary(), document.getContentUrl(), LanguageCode.ENGLISH)); } final Map<String, Object> attributes = Maps.newHashMap(); CommonAttributesDescriptor.attributeBuilder(attributes) .documents(documents); final ProcessingResult englishResult = controller.process( attributes, LingoClusteringAlgorithm.class); ConsoleFormatter.displayResults(englishResult); /* * In the second call, we will fetch results for a Chinese query from Bing, * setting explicitly the Bing's specific language attribute. Based on that * attribute, the document source will set the appropriate language for each * document. */ attributes.clear(); CommonAttributesDescriptor.attributeBuilder(attributes) .query("聚类" /* clustering? */) .results(100); Bing5DocumentSourceDescriptor.attributeBuilder(attributes) .market(MarketOption.CHINESE_CHINA) .apiKey(BingKeyAccess.getKey()); // use your own ID here! final ProcessingResult chineseResult = controller.process(attributes, Bing5DocumentSource.class, LingoClusteringAlgorithm.class); ConsoleFormatter.displayResults(chineseResult); // [[[end:clustering-non-english-content]]] } }