package org.apache.lucene.ClusterBasedPsuedoRelevanceFeedback;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.ScoreDoc;
import java.util.ArrayList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: Antonio
* Date: 5/5/11
* Time: 7:00 PM
* To change this template use File | Settings | File Templates.
*/
public class DocumentCluster {
public List<ScoreDoc> Docs = new ArrayList<ScoreDoc>();
private List<TermFreqVector> points = new ArrayList<TermFreqVector>();
private TermFreqVector center;
private final int MAX_POINTS = 3;
private final double SIM_THRESHOLD = 0.5;
/* Creates a new Cluster with sd as its first doc and the given vector as its center, presumably
* they are both the same document
*/
public DocumentCluster(ScoreDoc sd, TermFreqVector tfv) {
Docs.add(sd);
points.add(tfv);
center = tfv;
}
//add the doc <sd,tfv> to our cluster iff the cluster is not full
//and it is within the similarity threshold
public void addPoint(ScoreDoc sd, TermFreqVector tfv) {
if(Docs.size() == MAX_POINTS)
return;
if(true){//CosineSimilarity(center, tfv) > SIM_THRESHOLD) {
Docs.add(sd);
points.add(tfv);
}
}
public double getScore() {
double score = 1.0;
for(ScoreDoc sd : Docs) {
score *= sd.score;
}
return score;
}
}