/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.clustering;
import static org.carrot2.core.test.assertions.Carrot2CoreAssertions.assertThatClusters;
import java.util.*;
import org.carrot2.core.*;
import org.carrot2.text.clustering.MultilingualClustering.LanguageAggregationStrategy;
import org.carrot2.util.tests.CarrotTestCase;
import org.junit.Before;
import org.junit.Test;
import org.carrot2.shaded.guava.common.base.Function;
import org.carrot2.shaded.guava.common.base.Predicate;
import org.carrot2.shaded.guava.common.collect.*;
/**
*
*/
public class MultilingualClusteringTest extends CarrotTestCase
{
private MultilingualClustering multilingualClustering;
private TestMultilingualClusteringAlgorithm testMultilingualClusteringAlgorithm;
@Before
public void setUp()
{
multilingualClustering = new MultilingualClustering();
testMultilingualClusteringAlgorithm = new TestMultilingualClusteringAlgorithm();
}
@Test
public void testEmptyFlattenAll()
{
checkEmpty(LanguageAggregationStrategy.FLATTEN_ALL);
}
@Test
public void testEmptyFlattenMajorLanguage()
{
checkEmpty(LanguageAggregationStrategy.FLATTEN_MAJOR_LANGUAGE);
}
@Test
public void testEmptyFlattenNone()
{
checkEmpty(LanguageAggregationStrategy.FLATTEN_NONE);
}
@Test
public void testEmptyMajorityLanguage()
{
checkEmpty(LanguageAggregationStrategy.CLUSTER_IN_MAJORITY_LANGUAGE);
}
@Test
public void testNoLanguageFlattenAll()
{
checkNoLanguage(LanguageAggregationStrategy.FLATTEN_ALL);
}
@Test
public void testNoLanguageFlattenMajorLanguage()
{
checkNoLanguage(LanguageAggregationStrategy.FLATTEN_MAJOR_LANGUAGE);
}
@Test
public void testNoLanguageFlattenNone()
{
checkNoLanguage(LanguageAggregationStrategy.FLATTEN_NONE);
}
@Test
public void testNoLanguageMajorityLanguage()
{
checkNoLanguage(LanguageAggregationStrategy.CLUSTER_IN_MAJORITY_LANGUAGE);
}
@Test
public void testOneLanguageNontrivialClustersFlattenAll()
{
checkOneLanguageNontrivialClusters(LanguageAggregationStrategy.FLATTEN_ALL,
LanguageCode.GERMAN);
}
@Test
public void testOneLanguageNontrivialClustersFlattenMajorLanguage()
{
checkOneLanguageNontrivialClusters(LanguageAggregationStrategy.FLATTEN_MAJOR_LANGUAGE,
LanguageCode.GERMAN);
}
@Test
public void testOneLanguageNontrivialClustersFlattenNone()
{
checkOneLanguageNontrivialClusters(LanguageAggregationStrategy.FLATTEN_NONE,
LanguageCode.GERMAN);
}
@Test
public void testOneLanguageNontrivialClustersMajorityLanguage()
{
checkOneLanguageNontrivialClusters(LanguageAggregationStrategy.CLUSTER_IN_MAJORITY_LANGUAGE,
LanguageCode.GERMAN);
}
@Test
public void testOneLanguageOtherTopicsClusterFlattenAll()
{
checkOneLanguageOtherTopicsCluster(LanguageAggregationStrategy.FLATTEN_ALL);
}
@Test
public void testOneLanguageOtherTopicsClusterFlattenMajorLanguage()
{
checkOneLanguageOtherTopicsCluster(LanguageAggregationStrategy.FLATTEN_MAJOR_LANGUAGE);
}
@Test
public void testOneLanguageOtherTopicsClusterFlattenNone()
{
checkOneLanguageOtherTopicsCluster(LanguageAggregationStrategy.FLATTEN_NONE);
}
@Test
public void testOneLanguageOtherTopicsClusterMajorityLanguage()
{
checkOneLanguageOtherTopicsCluster(LanguageAggregationStrategy.CLUSTER_IN_MAJORITY_LANGUAGE);
}
@Test
public void testMoreLanguagesFlattenAll()
{
final List<Document> documents = documentsWithLanguages(LanguageCode.POLISH,
LanguageCode.POLISH, LanguageCode.POLISH, LanguageCode.GERMAN,
LanguageCode.GERMAN);
final Cluster c1 = new Cluster("Cluster 2").addDocuments(documents.get(4));
final Cluster co = new Cluster("Other Topics").addDocuments(
documents.subList(0, 4)).setOtherTopics(true);
final List<Cluster> expectedClusters = Lists.newArrayList(c1, co);
check(documents, expectedClusters, Lists.newArrayList(LanguageCode.POLISH,
LanguageCode.GERMAN), LanguageAggregationStrategy.FLATTEN_ALL);
}
@Test
public void testMoreLanguagesMajorityLanguage()
{
final List<Document> documents = documentsWithLanguages(LanguageCode.POLISH,
LanguageCode.POLISH, LanguageCode.GERMAN, LanguageCode.GERMAN,
LanguageCode.GERMAN);
final Cluster c1 = new Cluster("Cluster 1").addDocuments(documents.get(2), documents.get(4));
final Cluster c2 = new Cluster("Cluster 2").addDocuments(documents.get(1), documents.get(3));
final Cluster co = new Cluster("Other Topics").addDocuments(
documents.get(0)).setOtherTopics(true);
final List<Cluster> expectedClusters = Lists.newArrayList(c1, c2, co);
check(documents, expectedClusters, Lists.newArrayList(LanguageCode.GERMAN),
LanguageAggregationStrategy.CLUSTER_IN_MAJORITY_LANGUAGE);
}
@Test
public void testMoreLanguagesFlattenMajorLanguage()
{
final List<Document> documents = documentsWithLanguages(LanguageCode.POLISH,
LanguageCode.POLISH, LanguageCode.POLISH, LanguageCode.GERMAN,
LanguageCode.GERMAN);
final Cluster c1 = new Cluster("Cluster 2").addDocuments(documents.get(4));
final Cluster co = new Cluster("Other Topics").addDocuments(documents.get(3))
.setOtherTopics(true);
final Cluster cl = new Cluster("Other Languages").addSubclusters(new Cluster(
"Polish").addDocuments(documents.subList(0, 3)));
final List<Cluster> expectedClusters = Lists.newArrayList(c1, co, cl);
check(documents, expectedClusters, Lists.newArrayList(LanguageCode.POLISH,
LanguageCode.GERMAN), LanguageAggregationStrategy.FLATTEN_MAJOR_LANGUAGE);
}
@Test
public void testMoreLanguagesFlattenMajorNone()
{
final List<Document> documents = documentsWithLanguages(LanguageCode.POLISH,
LanguageCode.POLISH, LanguageCode.POLISH, LanguageCode.GERMAN,
LanguageCode.GERMAN);
final Cluster cg = new Cluster("German").addSubclusters(new Cluster("Cluster 2")
.addDocuments(documents.get(4)), new Cluster("Other Topics").addDocuments(
documents.get(3)).setOtherTopics(true));
final Cluster cp = new Cluster("Polish").addDocuments(documents.subList(0, 3));
final List<Cluster> expectedClusters = Lists.newArrayList(cp, cg);
check(documents, expectedClusters, Lists.newArrayList(LanguageCode.POLISH,
LanguageCode.GERMAN), LanguageAggregationStrategy.FLATTEN_NONE);
}
@Test
public void testMoreLanguagesTrivialOrNoClusters()
{
final List<Document> documents = documentsWithLanguages(LanguageCode.POLISH,
LanguageCode.POLISH, LanguageCode.POLISH, LanguageCode.NORWEGIAN,
LanguageCode.NORWEGIAN);
final Cluster cn = new Cluster("Norwegian").addDocuments(
documents.subList(3, 5));
final Cluster cp = new Cluster("Polish").addDocuments(documents.subList(0, 3));
final List<Cluster> expectedClusters = Lists.newArrayList(cp, cn);
check(documents, expectedClusters, Lists.newArrayList(LanguageCode.POLISH,
LanguageCode.NORWEGIAN), LanguageAggregationStrategy.FLATTEN_MAJOR_LANGUAGE);
}
@Test
public void testMoreLanguagesTrivialOrNoClustersMajorityLanguage()
{
final List<Document> documents = documentsWithLanguages(LanguageCode.POLISH,
LanguageCode.POLISH, LanguageCode.POLISH, LanguageCode.NORWEGIAN,
LanguageCode.NORWEGIAN);
final Cluster co = new Cluster("Other Topics").addDocuments(
documents).setOtherTopics(true);
final List<Cluster> expectedClusters = Lists.newArrayList(co);
check(documents, expectedClusters, Lists.newArrayList(LanguageCode.POLISH),
LanguageAggregationStrategy.CLUSTER_IN_MAJORITY_LANGUAGE);
}
private void checkEmpty(final LanguageAggregationStrategy strategy)
{
final List<Document> documents = documentsWithLanguages();
final List<Cluster> expectedClusters = Lists.newArrayList();
check(documents, expectedClusters, Lists.<LanguageCode> newArrayList(), strategy);
}
private void checkNoLanguage(final LanguageAggregationStrategy strategy)
{
checkOneLanguageNontrivialClusters(strategy, null,
multilingualClustering.defaultLanguage);
}
private void checkOneLanguageNontrivialClusters(final LanguageAggregationStrategy strategy,
LanguageCode language)
{
checkOneLanguageNontrivialClusters(strategy, language, language);
}
private void checkOneLanguageNontrivialClusters(final LanguageAggregationStrategy strategy,
LanguageCode language, LanguageCode clusteringLanguage)
{
final List<Document> documents = documentsWithLanguages(language, language,
language);
final Cluster c1 = new Cluster("Cluster 1").addDocuments(documents.get(1));
final Cluster c2 = new Cluster("Cluster 2").addDocuments(documents.get(2));
final Cluster co = new Cluster("Other Topics").addDocuments(documents.get(0))
.setOtherTopics(true);
final List<Cluster> expectedClusters = Lists.newArrayList(c1, c2, co);
check(documents, expectedClusters, Lists.newArrayList(clusteringLanguage),
strategy);
}
private void checkOneLanguageOtherTopicsCluster(final LanguageAggregationStrategy strategy)
{
final List<Document> documents = documentsWithLanguages(LanguageCode.NORWEGIAN,
LanguageCode.NORWEGIAN, LanguageCode.NORWEGIAN);
final Cluster co = new Cluster("Other Topics").addDocuments(documents.get(0),
documents.get(1), documents.get(2)).setOtherTopics(true);
final List<Cluster> expectedClusters = Lists.newArrayList(co);
check(documents, expectedClusters, Lists.newArrayList(LanguageCode.NORWEGIAN),
strategy);
}
private void check(final List<Document> documents,
final List<Cluster> expectedClusters,
final List<LanguageCode> expectedClusteringLanguages,
LanguageAggregationStrategy languageClusteringStrategy)
{
multilingualClustering.languageAggregationStrategy = languageClusteringStrategy;
final List<Cluster> actualClusters = multilingualClustering.process(documents,
testMultilingualClusteringAlgorithm);
assertThatClusters(actualClusters).isEquivalentTo(expectedClusters);
assertThat(testMultilingualClusteringAlgorithm.clusteringLanguages).containsOnly(
expectedClusteringLanguages.toArray());
}
/**
* Returns a list of documents with the provided languages.
*/
private static List<Document> documentsWithLanguages(LanguageCode... languages)
{
return Lists.newArrayList(Lists.transform(Arrays.asList(languages),
new Function<LanguageCode, Document>()
{
public Document apply(LanguageCode language)
{
return new Document().setLanguage(language);
}
}));
}
/**
* A mock multilingual clustering algorithm.
*/
private static class TestMultilingualClusteringAlgorithm implements
IMonolingualClusteringAlgorithm
{
private Set<LanguageCode> clusteringLanguages = Sets.newHashSet();
public List<Cluster> process(List<Document> documents, LanguageCode language)
{
final List<Cluster> clusters = Lists.newArrayList();
clusteringLanguages.add(language);
if (LanguageCode.POLISH.equals(language))
{
// No clusters at all
}
else if (LanguageCode.NORWEGIAN.equals(language))
{
// Return one junk cluster
Cluster.appendOtherTopics(documents, clusters);
}
else
{
// Create some clusters
clusters.add(new Cluster("Cluster 1"));
clusters.add(new Cluster("Cluster 2"));
for (int i = 1; i < documents.size(); i++)
{
clusters.get(i % clusters.size()).addDocuments(documents.get(i));
}
Cluster.appendOtherTopics(documents, clusters);
return Lists.newArrayList(Collections2.filter(clusters,
new Predicate<Cluster>()
{
public boolean apply(Cluster cluster)
{
return !cluster.getDocuments().isEmpty();
}
}));
}
return clusters;
}
}
}