/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.clustering.carrot2; import java.io.InputStream; import java.util.ArrayList; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import junit.framework.TestCase; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.clustering.HitsCluster; import org.apache.nutch.searcher.HitDetails; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; /** * A test case for the Carrot2-based clusterer plugin to Nutch. */ public class TestClusterer extends TestCase { private Clusterer c; public TestClusterer(String testName) { super(testName); } protected void setUp() throws Exception { c = new Clusterer(); c.setConf(new Configuration()); } /** * The clusterer should not fail on empty input, returning * an empty array of {@link HitsCluster}. */ public void testEmptyInput() { final HitDetails [] hitDetails = new HitDetails[0]; final String [] descriptions = new String [0]; final HitsCluster [] clusters = c.clusterHits(hitDetails, descriptions); assertTrue(clusters != null && clusters.length == 0); } /** * Tests the clusterer on some cached data. */ public void testOnCachedData() throws Exception { final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); final DocumentBuilder parser = factory.newDocumentBuilder(); final InputStream is = getClass().getResourceAsStream("test-input.xml"); assertNotNull("test-input.xml not found", is); final Document document = parser.parse(is); is.close(); final Element data = document.getDocumentElement(); final NodeList docs = data.getElementsByTagName("document"); final ArrayList summaries = new ArrayList(); final ArrayList hitDetails = new ArrayList(); assertTrue(docs.getLength() > 0); for (int i = 0; i < docs.getLength(); i++) { final Element doc = (Element) docs.item(i); assertTrue(doc.getNodeType() == Node.ELEMENT_NODE); final Element urlElement = (Element) doc.getElementsByTagName("url").item(0); final Element snippetElement = (Element) doc.getElementsByTagName("snippet").item(0); final Element titleElement = (Element) doc.getElementsByTagName("title").item(0); summaries.add(toText(titleElement) + " " + toText(snippetElement)); hitDetails.add(new HitDetails( new String [] {"url"}, new String [] {toText(urlElement)})); } HitsCluster [] clusters = c.clusterHits( (HitDetails[]) hitDetails.toArray(new HitDetails[hitDetails.size()]), (String[]) summaries.toArray(new String[summaries.size()])); // There should be SOME clusters in the input... words distribution // should not be random because some words have higher probability. assertTrue(clusters != null); assertTrue("Clusters expected, but not found.", clusters.length > 0); // Check hit references inside clusters. for (int i = 0; i < clusters.length; i++) { assertTrue(clusters[i].getHits().length > 0); } /* // Dump cluster content if you need to. System.out.println("Clusters: " + clusters.length); for (int i = 0; i < clusters.length; i++) { dump(0, clusters[i]); } */ } /** * Converts a {@link Element} to plain text. */ private String toText(Element snippetElement) { final StringBuffer buffer = new StringBuffer(); final NodeList list = snippetElement.getChildNodes(); for (int i = 0; i < list.getLength(); i++) { Node n = list.item(i); if (n.getNodeType() == Node.TEXT_NODE) { buffer.append(n.getNodeValue()); } else if (n.getNodeType() == Node.CDATA_SECTION_NODE) { n.getNodeValue(); } else throw new RuntimeException("Unexpected nested element when converting to text."); } return buffer.toString(); } /** * Dumps the content of {@link HitsCluster} to system output stream. */ private void dump(int level, HitsCluster cluster) { String [] labels = cluster.getDescriptionLabels(); for (int indent = 0; indent<level; indent++) { System.out.print( " " ); } System.out.print(">> "); if (cluster.isJunkCluster()) System.out.print("(Junk) "); System.out.print("CLUSTER: "); for (int i=0;i<labels.length;i++) { System.out.print( labels[i] + "; " ); } System.out.println(); HitsCluster [] subclusters = cluster.getSubclusters(); if (subclusters != null) { for (int i=0;i<subclusters.length;i++) { dump(level + 1, subclusters[i]); } } // dump documents. HitDetails [] hits = cluster.getHits(); if (hits != null) { for (int i=0;i<hits.length;i++ ) { for (int indent = 0; indent<level; indent++) { System.out.print( " " ); } System.out.print( hits[i].getValue("url") ); System.out.print( "; " ); System.out.println( hits[i].getValue("title") ); } } } }