/*
 * This file is part of Caliph & Emir.
 *
 * Caliph & Emir is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Caliph & Emir is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Caliph & Emir; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Copyright statement:
 * --------------------
 * (c) 2002-2005 by Mathias Lux (mathias@juggle.at)
 * http://www.juggle.at, http://caliph-emir.sourceforge.net
 */
package at.lux.retrieval.clustering;

import at.lux.retrieval.clustering.suffixtree.*;

import java.util.*;

/**
 * Abstract base class for suffix tree clustering: documents are inserted phrase by
 * phrase, all suffixes of each sentence are added to the tree, and base clusters
 * are maintained and merged through a binary similarity matrix.
 *
 * @author Mathias Lux, mathias@juggle.at
 * Date: 03.06.2004
 * Time: 13:42:41
 */
public abstract class AbstractSuffixTree {
    protected StcNode rootNode = null;
    protected HashSet<String> stopwords = new HashSet<String>();
    protected HashSet<StcDocument> documents = null;
    /**
     * Sorted set of base clusters, as we only need the 500 with the highest score.
     */
    protected TreeSet<BaseCluster> baseClusters = null;
    /**
     * Set of clusters that have been touched by the current insertion.
     */
    protected HashSet<BaseCluster> touchedClusters = null;
    /**
     * Map from nodes to base clusters, used as an index.
     */
    protected HashMap<StcNode, BaseCluster> nodeToClusterIndex = null;
    /**
     * Binary similarity matrix for merging base clusters.
     */
    protected EdgeIndex similarityMatrix = null;
    protected WordIndex index = null;

    // Defines in how many of the documents a term is allowed to occur (as a fraction
    // in [0,1]). If exceeded, the term is not counted in the score. A lower value
    // results in smaller clusters; 30% is a good value to start with. Set it higher
    // if you want to cluster a small collection. A smaller value increases speed.
    public static final double CONFIGURATION_DOCUMENT_FREQUENCY_BORDER = 0.9d;

    // Defines the minimum term frequency; terms below it are not counted in the score.
    // Set it to a higher value to increase the minimum cluster size. For small
    // collections it should be set to 3; for bigger collections (e.g. more than 300
    // documents) 5 or more is appropriate. A higher value increases speed.
    public static final int CONFIGURATION_MINIMUM_TERM_FREQUENCY_BORDER = 3;

    // Defines where we stop counting words for the score: phrases with this many
    // words or more all score roughly the same.
    public static final int CONFIGURATION_MAXIMUM_PHRASE_LENGTH = 6;

    // Penalty applied to the score of single-word phrases.
    public static final double CONFIGURATION_SINGLE_WORD_PENALTY = .05;

    // Defines the maximum number of cluster similarity updates after inserting a new
    // document. Usually it should be set to 50-250, depending on the kind and size of
    // the initial collection. A smaller value increases speed.
    public static final int CONFIGURATION_MAX_SIMILARITY_UPDATES = 300;
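    /*
     * How these constants typically interact, as a sketch read off the comments
     * above (the exact scoring formula is defined elsewhere in the project, not in
     * this class): in suffix tree clustering a base cluster B with phrase P is
     * commonly scored as
     *
     *     score(B) = |B| * f(P)
     *
     * where |B| is the number of documents in the cluster and f(P) counts only the
     * "effective" words of P: words occurring in more than
     * CONFIGURATION_DOCUMENT_FREQUENCY_BORDER of all documents or with a term
     * frequency below CONFIGURATION_MINIMUM_TERM_FREQUENCY_BORDER contribute
     * nothing, at most CONFIGURATION_MAXIMUM_PHRASE_LENGTH words are counted, and
     * single-word phrases are damped by CONFIGURATION_SINGLE_WORD_PENALTY.
     */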
    // Defines the minimum length (in characters) for a sentence. Note that for path
    // clustering the minimum size should be set to 0.
    protected int MIN_SENTENCE_SIZE = 0;

    public AbstractSuffixTree() {
        rootNode = new StcNode();
        stopwords = new HashSet<String>();
        documents = new HashSet<StcDocument>();
        index = new WordIndex();
        touchedClusters = new HashSet<BaseCluster>();
        baseClusters = new TreeSet<BaseCluster>();
        nodeToClusterIndex = new HashMap<StcNode, BaseCluster>();
        similarityMatrix = new EdgeIndex();
    }

    public void insert(String phrase, StcDocument stcDocument) {
        // add the document to the set of documents
        documents.add(stcDocument);
        String[] sentences = getSentences(phrase);
        for (int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i].trim();
            if (sentence.length() >= MIN_SENTENCE_SIZE) {
                String[] tokens = getTokens(sentence);
                tokens = filterTokens(tokens);
                // add the tokens to the index
                index.addToIndex(tokens, stcDocument);
                // insert all suffixes into the tree:
                for (int j = 0; j < tokens.length; j++) {
                    LinkedList<String> myList = new LinkedList<String>();
                    for (int ii = j; ii < tokens.length; ii++) {
                        String token = tokens[ii];
                        assert (token != null);
                        myList.add(token);
                    }
                    rootNode.add(myList, stcDocument, j, "", this);
                }
            }
        }
        int maxSimilarityRecalc = baseClusters.size();
        maxSimilarityRecalc = maxSimilarityRecalc < CONFIGURATION_MAX_SIMILARITY_UPDATES ?
                maxSimilarityRecalc : CONFIGURATION_MAX_SIMILARITY_UPDATES;
        // recalculating the similarity, new clusters have already been added:
        for (Iterator<BaseCluster> iterator = touchedClusters.iterator(); iterator.hasNext();) {
            BaseCluster bc = iterator.next();
            Iterator<BaseCluster> recalcWith = baseClusters.iterator();
            // only update the base clusters with the highest scores:
            int countRecalcs = 0;
            while (recalcWith.hasNext() && countRecalcs < maxSimilarityRecalc) {
                BaseCluster bc2 = recalcWith.next();
                countRecalcs++;
                similarityMatrix.update(bc, bc2);
            }
        }
        touchedClusters.clear();
    }

    /**
     * Override this method if you want to change the way tokens (words in this
     * implementation) are created from a sentence.
     *
     * @param sentence the sentence to tokenize
     * @return an array of tokens
     */
    protected abstract String[] getTokens(String sentence);

    /**
     * Override this method if you want to use another method to create the sentences.
     *
     * @param phrase the input phrase to split
     * @return an array of sentences
     */
    protected abstract String[] getSentences(String phrase);

    /**
     * Override this method if you want to filter your tokens, e.g. to remove stopwords.
     *
     * @param tokens the tokens to filter
     * @return the filtered tokens
     */
    protected abstract String[] filterTokens(String[] tokens);

    public TreeSet<BaseCluster> getBaseClusters() {
        return baseClusters;
    }

    public WordIndex getIndex() {
        return index;
    }

    public HashSet<String> getStopwords() {
        return stopwords;
    }

    public HashSet<BaseCluster> getTouchedClusters() {
        return touchedClusters;
    }

    public HashMap<StcNode, BaseCluster> getNodeToClusterIndex() {
        return nodeToClusterIndex;
    }

    public EdgeIndex getSimilarityMatrix() {
        return similarityMatrix;
    }
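    /**
     * Merges the base clusters into final clusters. Two base clusters end up in the
     * same final cluster if they are transitively connected in the binary similarity
     * matrix, so each final cluster corresponds to one connected component of the
     * base cluster similarity graph.
     *
     * @return the set of final clusters
     */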
    public Set<FinalCluster> getFinalClusters() {
        // new linear version of base cluster merging:
        HashSet<BaseCluster> bcVisited = new HashSet<BaseCluster>();
        TreeSet<FinalCluster> set = new TreeSet<FinalCluster>();
        for (Iterator<BaseCluster> iterator = baseClusters.iterator(); iterator.hasNext();) {
            BaseCluster bc = iterator.next();
            if (bcVisited.add(bc)) { // not already checked ...
                FinalCluster fc = new FinalCluster(bc);
                Set<BaseCluster> allConnectedBaseClusters = similarityMatrix.getAllConnectedBaseClusters(bc);
                recursiveGraphTraversion(allConnectedBaseClusters, fc, bcVisited);
                set.add(fc);
            }
        }
        return set;
    }

    /**
     * Depth-first traversal of the similarity graph: every base cluster reachable
     * from the given set that has not been visited yet is added to the final cluster.
     */
    private void recursiveGraphTraversion(Set<BaseCluster> bcSet, FinalCluster fc, Set<BaseCluster> bcVisited) {
        if (bcSet != null) {
            for (Iterator<BaseCluster> iterator = bcSet.iterator(); iterator.hasNext();) {
                BaseCluster baseCluster = iterator.next();
                if (bcVisited.add(baseCluster)) {
                    fc.addBaseClusterWithoutCheck(baseCluster);
                    recursiveGraphTraversion(similarityMatrix.getAllConnectedBaseClusters(baseCluster), fc, bcVisited);
                }
            }
        } else {
            // System.err.println("Edge set is NULL!");
        }
    }
}
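/*
 * Minimal usage sketch (hypothetical, not part of the original Caliph & Emir
 * sources): a concrete subclass only has to supply sentence splitting,
 * tokenization and token filtering. The class below and its splitting rules
 * ('.' as sentence delimiter, whitespace-separated tokens, stopword removal)
 * are illustrative assumptions, not the project's actual implementation.
 */
class ExampleWordSuffixTree extends AbstractSuffixTree {

    // split the phrase into sentences at full stops
    protected String[] getSentences(String phrase) {
        return phrase.split("\\.");
    }

    // split a sentence into whitespace-separated tokens
    protected String[] getTokens(String sentence) {
        return sentence.trim().split("\\s+");
    }

    // lower-case all tokens and drop empty ones as well as registered stopwords
    protected String[] filterTokens(String[] tokens) {
        List<String> filtered = new ArrayList<String>();
        for (String token : tokens) {
            String t = token.toLowerCase();
            if (t.length() > 0 && !stopwords.contains(t)) {
                filtered.add(t);
            }
        }
        return filtered.toArray(new String[filtered.size()]);
    }
}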