/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.dcs; import java.io.*; import java.net.*; import java.util.*; import org.dom4j.*; import org.dom4j.io.SAXReader; /** * Contains several examples of clustering using the Document Clustering Server. */ final class Examples { /** Shared path to the XML file to be clustered. */ private static final String XML_FILE_PATH = "../shared/data-mining.xml"; /** Shared XML feed. */ private static final String XML_FEED = "http://search.carrot2.org/stable/xml?source=web&type=CARROT2&q=test&results=20"; /** Shared DCS address. */ private static final URI dcsURI; static { try { dcsURI = new URI("http://localhost:8080/dcs/rest"); } catch (URISyntaxException e) { throw new RuntimeException(e); } } /** * Anything capable of formatting, sending and parsing a multipart HTTP POST. */ private final IHttpMultipartPostProvider httpPoster; /* * */ public Examples(IHttpMultipartPostProvider httpPoster) { this.httpPoster = httpPoster; } /** * Cluster data from an XML file (local). */ public void clusterFromFile() throws IOException { final Map<String, String> attributes = new LinkedHashMap<String, String>(); System.out.println("## Clustering documents from a local file"); /* * Note the optional query attribute, we can provide it to avoid creation of * trivial clusters. */ attributes.put("dcs.c2stream", new String( StreamUtils.readFullyAndClose( new FileInputStream(XML_FILE_PATH)), "UTF-8")); attributes.put("query", "data mining"); displayResults(httpPoster.post(dcsURI, attributes)); } /** * Cluster data from an external XML stream feed (providing an URL to that feed). */ public void clusterFromRemoteXML() throws IOException { final Map<String, String> attributes = new LinkedHashMap<String, String>(); System.out.println("## Clustering documents from a remote XML feed"); attributes.put("dcs.source", "xml"); attributes.put("dcs.algorithm", "stc"); attributes.put("XmlDocumentSource.xml", XML_FEED); displayResults(httpPoster.post(dcsURI, attributes)); } /** * Cluster data retrieved from a search engine or some other source registered in the * DCS as a document source. */ public void clusterFromSearchEngine() throws IOException { final Map<String, String> attributes = new LinkedHashMap<String, String>(); /* * For this request, we will pass some additional attributes to the default * algorithm and ask to skip the fetched documents in the output (retrieve * clusters only). */ System.out.println("## Clustering search results from a search engine"); // We use etools meta search engine input component. attributes.put("dcs.source", "etools"); attributes.put("query", "test"); attributes.put("results", "20"); attributes.put("dcs.algorithm", "lingo"); attributes.put("dcs.clusters.only", "true"); // Some customized algorithm parameters. attributes.put("LingoClusteringAlgorithm.desiredClusterCountBase", "10"); attributes.put("LingoClusteringAlgorithm.factorizationQuality", "LOW"); attributes.put("LingoClusteringAlgorithm.factorizationFactory", "org.carrot2.matrix.factorization.PartialSingularValueDecompositionFactory"); displayResults(httpPoster.post(dcsURI, attributes)); } /** * Runs all examples. */ public void runAllExamples() throws IOException { clusterFromFile(); clusterFromRemoteXML(); clusterFromSearchEngine(); } /** * Run all examples with all HTTP POST providers. */ public static void main(String [] args) throws IOException { IHttpMultipartPostProvider [] providers = { new HttpClientPostProvider(), new JaxRsPostProvider() }; for (IHttpMultipartPostProvider provider : providers) { new Examples(provider).runAllExamples(); } } /** * Simple parsing and display of the response. This method uses dom4j for parsing XML, * feel free to use anything that comes handy. */ @SuppressWarnings("unchecked") private static void displayResults(InputStream results) throws IOException { try { final SAXReader reader = new SAXReader(); final Document document = reader.read(results); final Iterator<Element> i = document.getRootElement().elementIterator("group"); while (i.hasNext()) { final Element group = i.next(); display(group, 1); } System.out.println(); } catch (DocumentException e) { throw new IOException("Could not parse response: " + e.getMessage()); } finally { if (results != null) { results.close(); } } } /** * Display a single cluster and its sub-clusters. */ @SuppressWarnings({"unchecked"}) private static void display(Element group, int level) { final int recursiveDocumentCount = Integer.parseInt(group.attribute("size").getValue()); final String label = group.element("title").elementText("phrase"); for (int i = 0; i < level; i++) System.out.print(" "); System.out.println(label + " [" + recursiveDocumentCount + " document(s)]"); final Iterator<Element> i = group.elementIterator("group"); while (i.hasNext()) { display(i.next(), level + 1); } } }