/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.examples.clustering; import java.util.HashMap; import java.util.List; import java.util.Map; import org.carrot2.clustering.lingo.LingoClusteringAlgorithm; import org.carrot2.clustering.stc.STCClusteringAlgorithm; import org.carrot2.core.Cluster; import org.carrot2.core.Controller; import org.carrot2.core.ControllerFactory; import org.carrot2.core.Document; import org.carrot2.core.IDocumentSource; import org.carrot2.core.ProcessingResult; import org.carrot2.core.attribute.CommonAttributesDescriptor; import org.carrot2.examples.ConsoleFormatter; import org.carrot2.source.etools.EToolsDocumentSource; import org.carrot2.source.microsoft.v5.Bing5DocumentSource; import org.carrot2.source.microsoft.v5.Bing5DocumentSourceDescriptor; /** * This example shows how to cluster {@link Document}s retrieved from * {@link IDocumentSource}s. There are a number of implementations of this interface in the * Carrot2 project, in this example we will cluster results from Microsoft Live (Web * search). * * <p> * It is assumed that you are familiar with {@link ClusteringDocumentList} example. * </p> * * @see ClusteringDocumentList * @see UsingCachingController */ public class ClusteringDataFromDocumentSources { @SuppressWarnings("unused") public static void main(String [] args) { /* [[[start:clustering-data-from-document-sources-simple-intro]]] * * <div> * One common way to use Carrot2 Java API is to fetch a number of documents * from some {@link org.carrot2.core.IDocumentSource} and cluster them using some * {@link org.carrot2.core.IClusteringAlgorithm}. The simplest yet least flexible * way to do it is to use the {@link org.carrot2.core.Controller#process(String, Integer, Class...)} * method from the {@link org.carrot2.core.Controller}. The code shown below retrieves * 100 search results for query <em>data mining</em> from * {@link org.carrot2.source.etools.EToolsDocumentSource} and clusters them using * the {@link org.carrot2.clustering.lingo.LingoClusteringAlgorithm}. * </div> * * [[[end:clustering-data-from-document-sources-simple-intro]]] */ { /// [[[start:clustering-data-from-document-sources-simple]]] /* A controller to manage the processing pipeline. */ final Controller controller = ControllerFactory.createSimple(); /* Perform processing */ final ProcessingResult result = controller.process("data mining", 100, EToolsDocumentSource.class, LingoClusteringAlgorithm.class); /* Documents fetched from the document source, clusters created by Carrot2. */ final List<Document> documents = result.getDocuments(); final List<Cluster> clusters = result.getClusters(); /// [[[end:clustering-data-from-document-sources-simple]]] ConsoleFormatter.displayResults(result); } /* [[[start:clustering-data-from-document-sources-advanced-intro]]] * * If your production code needs to fetch documents from popular search engines, * it is very important that you generate and use your own API key rather than Carrot2's * default one. You can pass the API key along with the query and the requested * number of results in an attribute map. Carrot2 manual lists all supported attributes * along with their keys, types and allowed values. The code shown below, fetches and clusters * 50 results from {@link org.carrot2.source.microsoft.v5.Bing5DocumentSource}. * * [[[end:clustering-data-from-document-sources-advanced-intro]]] */ { /// [[[start:clustering-data-from-document-sources-advanced]]] /* A controller to manage the processing pipeline. */ final Controller controller = ControllerFactory.createSimple(); /* Prepare attributes */ final Map<String, Object> attributes = new HashMap<String, Object>(); /* Put your own API key here! */ Bing5DocumentSourceDescriptor.attributeBuilder(attributes) .apiKey(BingKeyAccess.getKey()); /* Query an the required number of results */ attributes.put(CommonAttributesDescriptor.Keys.QUERY, "clustering"); attributes.put(CommonAttributesDescriptor.Keys.RESULTS, 50); /* Perform processing */ final ProcessingResult result = controller.process(attributes, Bing5DocumentSource.class, STCClusteringAlgorithm.class); /* Documents fetched from the document source, clusters created by Carrot2. */ final List<Document> documents = result.getDocuments(); final List<Cluster> clusters = result.getClusters(); /// [[[end:clustering-data-from-document-sources-advanced]]] ConsoleFormatter.displayResults(result); } } }