/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.clustering.synthetic;
import java.util.*;
import org.carrot2.core.*;
import org.carrot2.core.attribute.*;
import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.NotBlank;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.shaded.guava.common.collect.Maps;
/**
 * Clusters documents into a flat structure based on the values of some field of the
 * documents. By default the {@link Document#SOURCES} field is used.
 * <p>
 * Every distinct non-null field value produces one cluster containing all documents
 * carrying that value. The resulting clusters are ordered using
 * {@link Cluster#BY_REVERSED_SIZE_AND_LABEL_COMPARATOR}.
 */
@Bindable(prefix = "ByAttributeClusteringAlgorithm", inherit = CommonAttributes.class)
@Label("By Attribute Clustering")
public class ByFieldClusteringAlgorithm extends ProcessingComponentBase implements
    IClusteringAlgorithm
{
    /**
     * Documents to cluster.
     */
    @Processing
    @Input
    @Internal
    @Attribute(key = AttributeNames.DOCUMENTS, inherit = true)
    public List<Document> documents;

    /**
     * Clusters created by the algorithm.
     */
    @Processing
    @Output
    @Internal
    @Attribute(key = AttributeNames.CLUSTERS, inherit = true)
    public List<Cluster> clusters = null;

    /**
     * Name of the field to cluster by. Each distinct non-null scalar field value
     * (distinct in terms of <code>equals()</code> / <code>hashCode()</code>) will give
     * rise to a single cluster, named using the value returned by
     * {@link #buildClusterLabel(Object)}. If the field value is a collection, the
     * document will be assigned to all clusters corresponding to the values in the
     * collection, once per distinct value. Note that arrays will not be 'unfolded' in
     * this way.
     */
    @Processing
    @Input
    @Attribute
    @Required
    @NotBlank
    @Level(AttributeLevel.BASIC)
    @Group("Fields")
    @Label("Field name")
    public String fieldName = Document.SOURCES;

    /**
     * Groups {@link #documents} by the value(s) of the field named {@link #fieldName}
     * and stores the result in {@link #clusters}.
     * <p>
     * Documents whose field value is <code>null</code> are not assigned to any
     * field-based cluster. After sorting, the complete document list and the clusters
     * are handed over to {@link Cluster#appendOtherTopics(List, List)}.
     */
    @Override
    public void process() throws ProcessingException
    {
        final Map<Object, Cluster> clusterMap = Maps.newHashMap();
        for (Document document : documents)
        {
            final Object field = document.getField(fieldName);
            if (field instanceof Collection<?>)
            {
                // De-duplicate the values first: a document listing the same value
                // several times must still be added to that value's cluster only once.
                // LinkedHashSet uses the same equals/hashCode semantics as the cluster
                // map's keys and keeps the original encounter order.
                for (Object value : new LinkedHashSet<Object>((Collection<?>) field))
                {
                    addToCluster(clusterMap, value, document);
                }
            }
            else
            {
                addToCluster(clusterMap, field, document);
            }
        }

        clusters = Lists.newArrayList(clusterMap.values());
        Collections.sort(clusters, Cluster.BY_REVERSED_SIZE_AND_LABEL_COMPARATOR);
        Cluster.appendOtherTopics(documents, clusters);
    }

    /**
     * Adds <code>document</code> to the cluster associated with
     * <code>fieldValue</code>, creating and labeling that cluster on first use.
     * <code>null</code> values are ignored.
     *
     * @param clusters clusters created so far, keyed by field value
     * @param fieldValue field value identifying the target cluster, may be
     *            <code>null</code>
     * @param document document to add
     */
    private void addToCluster(Map<Object, Cluster> clusters, Object fieldValue,
        Document document)
    {
        if (fieldValue == null)
        {
            return;
        }

        Cluster cluster = clusters.get(fieldValue);
        if (cluster == null)
        {
            cluster = new Cluster();
            cluster.addPhrases(buildClusterLabel(fieldValue));
            clusters.put(fieldValue, cluster);
        }
        cluster.addDocuments(document);
    }

    /**
     * Builds cluster label based on the field value. This implementation returns
     * <code>fieldValue.toString()</code>.
     *
     * @param fieldValue non-null field value the cluster is created for
     * @return label to use for the cluster
     */
    protected String buildClusterLabel(Object fieldValue)
    {
        return fieldValue.toString();
    }
}