/* * This file is part of Caliph & Emir. * * Caliph & Emir is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Caliph & Emir is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Caliph & Emir; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Copyright statement: * -------------------- * (c) 2002-2005 by Mathias Lux (mathias@juggle.at) * http://www.juggle.at, http://caliph-emir.sourceforge.net */ package at.lux.retrieval.clustering.suffixtree; import at.lux.retrieval.clustering.TextSuffixTree; import java.util.HashSet; import java.util.Iterator; /** * @author Mathias Lux, mathias@juggle.at * Date: 03.06.2004 * Time: 15:33:45 */ public class BaseCluster implements Comparable { private StcNode node = null; private HashSet<StcDocument> documents = null; private String phrase = null; private double score = Double.MIN_VALUE; private double phraseScore = 0d; public BaseCluster(HashSet<StcDocument> documents, StcNode node, String phrase, WordIndex index, HashSet<String> stopwords) { this.documents = documents; this.node = node; this.phrase = phrase; score = calculateScore(index, stopwords); assert documents.size() > 1; } /** * Calculates the score as mentioned in paper. * * @param index * @param stopwords * @return */ private double calculateScore(WordIndex index, HashSet<String> stopwords) { double numberOfDocuments = (double) documents.size(); String[] tokens = phrase.trim().split("\\W"); int numberOfWords = tokens.length; for (int i = 0; i < tokens.length; i++) { String token = tokens[i]; if (stopwords.contains(token)) numberOfWords--; else if (index.getDocumentCount(token) < TextSuffixTree.CONFIGURATION_MINIMUM_TERM_FREQUENCY_BORDER) numberOfWords--; else if (index.getDocumentFrequency(token) > TextSuffixTree.CONFIGURATION_DOCUMENT_FREQUENCY_BORDER) numberOfWords--; } if (numberOfWords < 0) numberOfWords = 0; else if (numberOfWords > TextSuffixTree.CONFIGURATION_MAXIMUM_PHRASE_LENGTH) numberOfWords = TextSuffixTree.CONFIGURATION_MAXIMUM_PHRASE_LENGTH; double wordweighting = 0.0; if (numberOfWords == 1) { wordweighting = TextSuffixTree.CONFIGURATION_SINGLE_WORD_PENALTY; } else wordweighting = (double) numberOfWords; phraseScore = wordweighting; return (numberOfDocuments * wordweighting); } public String toString() { String result = score + ": \"" + phrase + "\""; for (Iterator<StcDocument> iterator = documents.iterator(); iterator.hasNext();) { StcDocument stcDocument = iterator.next(); result += " " + stcDocument.toString(); } return result; } public double getScore() { return score; } public HashSet<StcDocument> getDocuments() { return documents; } public StcNode getNode() { return node; } public String getPhrase() { return phrase; } /** * Makes Base Clusters comparable. * * @param o * @return */ public int compareTo(Object o) { int result = 0; if (o.equals(this)) return 0; else if (o instanceof BaseCluster) { BaseCluster b = (BaseCluster) o; if (score > b.score) return -1; else if (score < b.score) return 1; else return System.identityHashCode(this) - System.identityHashCode(o); } else { result = System.identityHashCode(this) - System.identityHashCode(o); } return result; } /** * Calculates the binary similarity between 2 base clusters. * * @param cluster * @return */ public int binarySimilarity(BaseCluster cluster) { HashSet<StcDocument> mergedDocuments = new HashSet<StcDocument>(); HashSet<StcDocument> toIterate, toCheck; int thisDocumentsSize = this.documents.size(); int clusterDocumentsSize = cluster.documents.size(); // iterate using the smaller set! if (thisDocumentsSize < clusterDocumentsSize) { toIterate = this.documents; toCheck = cluster.documents; } else { toCheck = this.documents; toIterate = cluster.documents; } for (Iterator<StcDocument> iterator = toIterate.iterator(); iterator.hasNext();) { StcDocument stcDocument = iterator.next(); if (toCheck.contains(stcDocument)) mergedDocuments.add(stcDocument); } double tmp1 = ((double) mergedDocuments.size()) / ((double) thisDocumentsSize); double tmp2 = ((double) mergedDocuments.size()) / ((double) clusterDocumentsSize); if (tmp1 > 0.5d && tmp2 > 0.5d) return 1; else return 0; } /** * Returns the score of the base cluster without multiplication with document count, * eventually needed for final cluster score :) * * @return */ public double getPhraseScore() { return phraseScore; } }