/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.examples.source;
import java.util.*;
import org.carrot2.core.*;
import org.carrot2.core.attribute.*;
import org.carrot2.util.attribute.*;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.shaded.guava.common.collect.Maps;
/**
 * A sample clustering algorithm component: each document is placed in a cluster
 * selected by the first code point of its trimmed title.
 */
@Bindable(prefix = "ByFirstLetter", inherit = CommonAttributes.class)
public class ByFirstTitleLetterClusteringAlgorithm
    extends ProcessingComponentBase
    implements IClusteringAlgorithm
{
    /**
     * Input documents that {@link #process()} iterates over.
     */
    @Processing
    @Input
    @Internal
    @Attribute(key = AttributeNames.DOCUMENTS, inherit = true)
    public List<Document> documents;

    /**
     * Output clusters; assigned by {@link #process()}, <code>null</code> before that.
     */
    @Processing
    @Output
    @Internal
    @Attribute(key = AttributeNames.CLUSTERS, inherit = true)
    public List<Cluster> clusters = null;

    /**
     * Controls whether letter case distinguishes clusters. When <code>true</code>
     * (the default) the first code point of the title is used as-is; when
     * <code>false</code> it is lower-cased first, so upper- and lower-case
     * variants of a letter end up in the same cluster.
     */
    @Init
    @Processing
    @Input
    @Attribute
    public boolean caseSensitive = true;

    /**
     * Groups {@link #documents} by the first code point of their trimmed title
     * and stores the resulting clusters in {@link #clusters}, sorted with
     * {@link Cluster#BY_REVERSED_SIZE_AND_LABEL_COMPARATOR}. Documents whose
     * title is <code>null</code> or blank after trimming are not assigned to
     * any letter cluster here; the final call to
     * {@link Cluster#appendOtherTopics} receives the full document list and
     * the letter clusters.
     */
    @Override
    public void process() throws ProcessingException
    {
        final Map<Integer, Cluster> buckets = Maps.newHashMap();

        for (final Document doc : documents)
        {
            final String rawTitle = doc.getTitle();
            final String trimmedTitle = (rawTitle != null) ? rawTitle.trim() : "";
            if (trimmedTitle.isEmpty())
            {
                // Nothing to key on -- this document is left out of the letter clusters.
                continue;
            }

            final int first = trimmedTitle.codePointAt(0);
            // Plain lower-casing is overly simplistic, but sufficient for an example.
            final Integer key = caseSensitive ? first : Character.toLowerCase(first);

            Cluster bucket = buckets.get(key);
            if (bucket == null)
            {
                // First document seen for this code point: create its cluster.
                bucket = new Cluster(labelFor(key));
                buckets.put(key, bucket);
            }
            bucket.addDocuments(doc);
        }

        clusters = Lists.newArrayList(buckets.values());
        Collections.sort(clusters, Cluster.BY_REVERSED_SIZE_AND_LABEL_COMPARATOR);
        Cluster.appendOtherTopics(documents, clusters);
    }

    /**
     * Builds the label of the cluster that collects titles starting with the
     * given code point.
     */
    private static String labelFor(int codepoint)
    {
        final String letter = new String(Character.toChars(codepoint));
        return "Starting with letter '" + letter + "'";
    }
}