/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.output.metrics;
import java.util.*;
import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.util.MathUtils;
import org.carrot2.util.attribute.*;
import org.carrot2.shaded.guava.common.collect.*;
/**
 * Computes precision, recall and F-metric for all partitions against the provided
 * clusters.
 * <p>
 * Metrics will be calculated only if all input documents have non-blank
 * {@link Document#PARTITIONS}.
 * </p>
 * <p>
 * For every partition, the cluster with the highest F-measure (harmonic mean of
 * precision and recall) against that partition is selected. Clusters flagged as
 * "other topics" and clusters without documents are not considered. The
 * per-partition values are then averaged, weighted by the number of documents in each
 * partition.
 * </p>
 */
@Bindable
public class PrecisionRecallMetric extends IdealPartitioningBasedMetric
{
    /**
     * Partition on which the cluster achieved best F-Score value. Value type:
     * <code>Object</code>. See {@link Document#PARTITIONS} for more information.
     */
    public final static String BEST_F_MEASURE_PARTITION = "best-f-measure-partition";

    /**
     * Average precision of the whole cluster set, weighted by partition size.
     */
    @Processing
    @Output
    @Attribute
    public Double weightedAveragePrecision;

    /**
     * Average recall of the whole cluster set, weighted by partition size.
     */
    @Processing
    @Output
    @Attribute
    public Double weightedAverageRecall;

    /**
     * Average F-measure of the whole cluster set, weighted by partition size.
     */
    @Processing
    @Output
    @Attribute
    public Double weightedAverageFMeasure;

    /**
     * Precision by partition.
     */
    @Processing
    @Output
    @Attribute
    public Map<Object, Double> precisionByPartition;

    /**
     * Recall by partition.
     */
    @Processing
    @Output
    @Attribute
    public Map<Object, Double> recallByPartition;

    /**
     * F-measure by partition.
     */
    @Processing
    @Output
    @Attribute
    public Map<Object, Double> fMeasureByPartition;

    /**
     * Calculate F-measure.
     */
    @Processing
    @Input
    @Attribute
    public boolean enabled = true;

    // Documents whose partitions form the reference ("ideal") grouping.
    @Processing
    @Input
    @Attribute(key = AttributeNames.DOCUMENTS)
    public List<Document> documents;

    // Clusters to be evaluated against the partitions.
    @Processing
    @Input
    @Attribute(key = AttributeNames.CLUSTERS)
    public List<Cluster> clusters;

    /**
     * Calculates per-partition and weighted-average precision, recall and F-measure
     * and stores them in the output fields of this class. Additionally, each cluster
     * that turned out to be the best match for some partition gets that partition set
     * as its {@link #BEST_F_MEASURE_PARTITION} attribute.
     * <p>
     * The method returns without touching any output field when the partition count
     * reported for {@link #documents} is zero or when {@link #clusters} is
     * <code>null</code> or empty.
     * </p>
     */
    public void calculate()
    {
        final int partitionCount = getPartitionsCount(documents);
        if (partitionCount == 0)
        {
            return;
        }

        // A missing cluster list is treated the same way as an empty one.
        if (clusters == null || clusters.isEmpty())
        {
            return;
        }

        final SetMultimap<Object, Document> documentsByPartition = getDocumentsByPartition(documents);
        final Set<Object> partitions = getPartitions(documents);

        // Cluster contents do not depend on the partition being evaluated, so resolve
        // them once up front instead of once per (partition, cluster) pair.
        final List<ClusterDocuments> candidates = collectCandidateClusters(clusters);

        precisionByPartition = Maps.newHashMap();
        recallByPartition = Maps.newHashMap();
        fMeasureByPartition = Maps.newHashMap();

        double recallSum = 0;
        double precisionSum = 0;
        double fMeasureSum = 0;
        int partitionDocumentsCountSum = 0;

        for (Object partition : partitions)
        {
            final Set<Document> partitionDocuments = documentsByPartition.get(partition);
            final int partitionDocumentsCount = partitionDocuments.size();

            double partitionFMeasure = 0;
            double partitionPrecision = 0;
            double partitionRecall = 0;
            Cluster bestFMeasureCluster = null;

            for (ClusterDocuments candidate : candidates)
            {
                final int commonCount = countCommonDocuments(partitionDocuments,
                    candidate.documentSet);

                final double precision = commonCount / (double) candidate.documentCount;
                final double recall = commonCount / (double) partitionDocumentsCount;
                final double fMeasure = MathUtils.harmonicMean(precision, recall);

                // Strict comparison: on ties the earlier cluster wins, and a cluster
                // with a zero F-measure is never selected.
                if (fMeasure > partitionFMeasure)
                {
                    partitionFMeasure = fMeasure;
                    partitionPrecision = precision;
                    partitionRecall = recall;
                    bestFMeasureCluster = candidate.cluster;
                }
            }

            recallSum += partitionRecall * partitionDocumentsCount;
            precisionSum += partitionPrecision * partitionDocumentsCount;
            fMeasureSum += partitionFMeasure * partitionDocumentsCount;
            partitionDocumentsCountSum += partitionDocumentsCount;

            recallByPartition.put(partition, partitionRecall);
            precisionByPartition.put(partition, partitionPrecision);
            fMeasureByPartition.put(partition, partitionFMeasure);

            if (bestFMeasureCluster != null)
            {
                bestFMeasureCluster.setAttribute(BEST_F_MEASURE_PARTITION, partition);
            }
        }

        // Dividing by partitionDocumentsCountSum rather than by the number of documents
        // because partitionDocumentsCountSum can be larger than the number of documents
        // if the partitions have overlapping documents.
        weightedAveragePrecision = precisionSum / partitionDocumentsCountSum;
        weightedAverageRecall = recallSum / partitionDocumentsCountSum;
        weightedAverageFMeasure = fMeasureSum / partitionDocumentsCountSum;
    }

    public boolean isEnabled()
    {
        return enabled;
    }

    /**
     * Builds the list of clusters eligible for matching against partitions, skipping
     * "other topics" clusters and clusters that contain no documents. The order of
     * the input list is preserved.
     */
    private static List<ClusterDocuments> collectCandidateClusters(List<Cluster> allClusters)
    {
        final List<ClusterDocuments> candidates = Lists.newArrayList();
        for (Cluster cluster : allClusters)
        {
            final List<Document> clusterDocuments = cluster.getAllDocuments();
            if (cluster.isOtherTopics() || clusterDocuments.size() == 0)
            {
                continue;
            }
            candidates.add(new ClusterDocuments(cluster, clusterDocuments));
        }
        return candidates;
    }

    /**
     * Counts the documents of the partition that are also present in the cluster.
     */
    private static int countCommonDocuments(Set<Document> partitionDocuments,
        Set<Document> clusterDocumentSet)
    {
        int common = 0;
        for (Document document : partitionDocuments)
        {
            if (clusterDocumentSet.contains(document))
            {
                common++;
            }
        }
        return common;
    }

    /**
     * A cluster together with a hash-based view of its documents, prepared once so
     * that membership checks during matching are cheap.
     */
    private static final class ClusterDocuments
    {
        final Cluster cluster;

        /** Documents of the cluster, for membership checks. */
        final Set<Document> documentSet;

        /** Size of the cluster's document list, used as the precision denominator. */
        final int documentCount;

        ClusterDocuments(Cluster cluster, List<Document> clusterDocuments)
        {
            this.cluster = cluster;
            this.documentSet = Sets.newHashSet(clusterDocuments);
            this.documentCount = clusterDocuments.size();
        }
    }
}