/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.clustering.stc;

import java.io.InputStream;
import java.util.Collection;
import java.util.List;
import java.util.Set;

import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.test.ClusteringAlgorithmTestBase;
import org.carrot2.core.test.SampleDocumentData;
import org.carrot2.text.preprocessing.CaseNormalizer;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;

import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.shaded.guava.common.io.Resources;

import static org.junit.Assert.*;

/**
 * Test cases for the {@link STCClusteringAlgorithm}.
 */
public class STCClusteringAlgorithmTest extends
    ClusteringAlgorithmTestBase<STCClusteringAlgorithm>
{
    @Override
    public Class<STCClusteringAlgorithm> getComponentClass()
    {
        return STCClusteringAlgorithm.class;
    }

    /**
     * Sets the <code>dfThreshold</code> attribute of {@link CaseNormalizer} to 20,
     * clusters the data mining sample documents and checks that at least one cluster
     * is returned.
     */
    @Test
    public void testClusteringWithDfThreshold()
    {
        processingAttributes.put(
            AttributeUtils.getKey(CaseNormalizer.class, "dfThreshold"), 20);

        final Collection<Cluster> clustersWithThreshold = cluster(
            SampleDocumentData.DOCUMENTS_DATA_MINING).getClusters();

        // Clustering with df threshold must not fail
        assertThat(clustersWithThreshold.size()).isGreaterThan(0);
    }

    /**
     * Sets the <code>maxClusters</code> attribute of {@link STCClusteringAlgorithm}
     * and checks the number of clusters returned for the data mining sample documents.
     */
    @Test
    public void testMaxClusters()
    {
        final int maxClusters = 9;
        processingAttributes.put(
            AttributeUtils.getKey(STCClusteringAlgorithm.class, "maxClusters"),
            maxClusters);

        final Collection<Cluster> clusters = cluster(
            SampleDocumentData.DOCUMENTS_DATA_MINING).getClusters();

        // The requested number of clusters + others
        assertThat(clusters.size()).isEqualTo(maxClusters + 1);
    }

    /**
     * Checks {@link STCClusteringAlgorithm#computeIntersection} on ranges taken from
     * a single array.
     */
    @Test
    public void testComputeIntersection()
    {
        // NOTE(review): the arguments look like two (array, start, length) triples;
        // confirm against the declaration of computeIntersection.
        int [] t1;

        t1 = new int [] {0, 1, 2, 1, 2, 3};
        assertEquals(2, STCClusteringAlgorithm.computeIntersection(t1, 0, 3, t1, 3, 3));

        t1 = new int [] {0, 1, 2, 3, 5, 6};
        assertEquals(0, STCClusteringAlgorithm.computeIntersection(t1, 0, 3, t1, 3, 3));

        t1 = new int [] {0, 1, 2, -1, 2, 6};
        assertEquals(1, STCClusteringAlgorithm.computeIntersection(t1, 0, 3, t1, 3, 3));

        t1 = new int [] {0, 1, 2, 0};
        assertEquals(1, STCClusteringAlgorithm.computeIntersection(t1, 0, 3, t1, 3, 1));
    }

    /**
     * Clusters documents that contain either "good programs" or "good programming"
     * and checks that "Good Programs" is among the resulting cluster labels while
     * "Good Programming" is not.
     */
    @Test
    public void testMergingBaseClustersWithStemEquivalentPhrases()
    {
        final List<Document> documents = Lists.newArrayList();
        documents.add(new Document("good programs . foo1"));
        documents.add(new Document("foo2 good programs . foo2"));
        documents.add(new Document("good programs taste good"));
        documents.add(new Document("good programs are good"));

        documents.add(new Document("good programming . foo3"));
        documents.add(new Document("foo4 good programming . foo4"));
        documents.add(new Document("good programming makes you feel better"));

        // Lower base cluster score.
        STCClusteringAlgorithmDescriptor.attributeBuilder(processingAttributes)
            .minBaseClusterScore(0);

        final ProcessingResult pr = cluster(documents);

        final Set<String> clusterLabels = collectClusterLabels(pr);
        assertThat("Good Programs").isIn(clusterLabels);
        assertThat("Good Programming").isNotIn(clusterLabels);
    }

    /**
     * CARROT-1008: STC is not using term stems.
     * <p>
     * Clusters the documents stored in the <code>CARROT-1008.xml</code> resource with
     * <code>maxClusters</code> set to 30 and checks that the labels "Guns" and "Gun"
     * do not both appear in the result.
     *
     * @throws Exception if the resource cannot be read or deserialized
     */
    @Test
    public void testCarrot1008() throws Exception
    {
        // The stream is closed explicitly: nothing visible in this file indicates
        // that deserialize() closes the stream it is given.
        final ProcessingResult input;
        final InputStream stream = Resources.asByteSource(
            Resources.getResource(this.getClass(), "CARROT-1008.xml"))
            .openBufferedStream();
        try
        {
            input = ProcessingResult.deserialize(stream);
        }
        finally
        {
            stream.close();
        }

        STCClusteringAlgorithmDescriptor.attributeBuilder(processingAttributes)
            .maxClusters(30);

        final ProcessingResult clustered = cluster(input.getDocuments());
        dumpClusterLabels(clustered);

        final Set<String> clusterLabels = collectClusterLabels(clustered);
        assertThat(
            clusterLabels.contains("Guns") && clusterLabels.contains("Gun")).isFalse();
    }
}