/*
 * This file is part of Caliph & Emir.
 *
 * Caliph & Emir is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Caliph & Emir is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Caliph & Emir; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Copyright statement:
 * --------------------
 * (c) 2002-2005 by Mathias Lux (mathias@juggle.at)
 * http://www.juggle.at, http://caliph-emir.sourceforge.net
 */
package at.lux.retrieval.clustering;

import at.lux.retrieval.clustering.suffixtree.*;

import java.util.*;

/**
 * Abstract base class for suffix tree clustering: documents are inserted phrase by
 * phrase, all suffixes of each sentence are added to the tree, and base clusters
 * are maintained and merged through a binary similarity matrix.
 *
 * @author Mathias Lux, mathias@juggle.at
 * Date: 03.06.2004
 * Time: 13:42:41
 */
public abstract class AbstractSuffixTree {
    protected StcNode rootNode = null;
    protected HashSet<String> stopwords = new HashSet<String>();
    protected HashSet<StcDocument> documents = null;
    /**
     * Sorted set of base clusters, as we only need the 500 with the highest score.
     */
    protected TreeSet<BaseCluster> baseClusters = null;
    /**
     * Set of clusters that have been touched by the current insertion.
     */
    protected HashSet<BaseCluster> touchedClusters = null;
    /**
     * Map from nodes to base clusters, used as an index.
     */
    protected HashMap<StcNode, BaseCluster> nodeToClusterIndex = null;
    /**
     * Binary similarity matrix for merging base clusters.
     */
    protected EdgeIndex similarityMatrix = null;
    protected WordIndex index = null;

    // Defines in how many of the documents a term is allowed to occur (as a fraction
    // in [0,1]). If exceeded, the term is not counted in the score. A lower value
    // results in smaller clusters; 30% is a good value to start with. Set it higher
    // if you want to cluster a small collection. A smaller value increases speed.
    public static final double CONFIGURATION_DOCUMENT_FREQUENCY_BORDER = 0.9d;

    // Defines the minimum term frequency; terms below it are not counted in the score.
    // Set it to a higher value to increase the minimum cluster size. For small
    // collections it should be set to 3; for bigger collections (e.g. more than 300
    // documents) 5 or more is appropriate. A higher value increases speed.
    public static final int CONFIGURATION_MINIMUM_TERM_FREQUENCY_BORDER = 3;

    // Defines where we stop counting words for the score: phrases with this many
    // words or more all score roughly the same.
    public static final int CONFIGURATION_MAXIMUM_PHRASE_LENGTH = 6;

    // Penalty applied to the score of single-word phrases.
    public static final double CONFIGURATION_SINGLE_WORD_PENALTY = .05;

    // Defines the maximum number of cluster similarity updates after inserting a new
    // document. Usually it should be set to 50-250, depending on the kind and size of
    // the initial collection. A smaller value increases speed.
    public static final int CONFIGURATION_MAX_SIMILARITY_UPDATES = 300;
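    /*
     * How these constants typically interact, as a sketch read off the comments
     * above (the exact scoring formula is defined elsewhere in the project, not in
     * this class): in suffix tree clustering a base cluster B with phrase P is
     * commonly scored as
     *
     *     score(B) = |B| * f(P)
     *
     * where |B| is the number of documents in the cluster and f(P) counts only the
     * "effective" words of P: words occurring in more than
     * CONFIGURATION_DOCUMENT_FREQUENCY_BORDER of all documents or with a term
     * frequency below CONFIGURATION_MINIMUM_TERM_FREQUENCY_BORDER contribute
     * nothing, at most CONFIGURATION_MAXIMUM_PHRASE_LENGTH words are counted, and
     * single-word phrases are damped by CONFIGURATION_SINGLE_WORD_PENALTY.
     */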
    // Defines the minimum length (in characters) for a sentence. Note that for path
    // clustering the minimum size should be set to 0.
    protected int MIN_SENTENCE_SIZE = 0;

    public AbstractSuffixTree() {
        rootNode = new StcNode();
        stopwords = new HashSet<String>();
        documents = new HashSet<StcDocument>();
        index = new WordIndex();
        touchedClusters = new HashSet<BaseCluster>();
        baseClusters = new TreeSet<BaseCluster>();
        nodeToClusterIndex = new HashMap<StcNode, BaseCluster>();
        similarityMatrix = new EdgeIndex();
    }

    public void insert(String phrase, StcDocument stcDocument) {
        // add the document to the set of documents
        documents.add(stcDocument);
        String[] sentences = getSentences(phrase);
        for (int i = 0; i < sentences.length; i++) {
            String sentence = sentences[i].trim();
            if (sentence.length() >= MIN_SENTENCE_SIZE) {
                String[] tokens = getTokens(sentence);
                tokens = filterTokens(tokens);
                // add the tokens to the index
                index.addToIndex(tokens, stcDocument);
                // insert all suffixes into the tree:
                for (int j = 0; j < tokens.length; j++) {
                    LinkedList<String> myList = new LinkedList<String>();
                    for (int ii = j; ii < tokens.length; ii++) {
                        String token = tokens[ii];
                        assert (token != null);
                        myList.add(token);
                    }
                    rootNode.add(myList, stcDocument, j, "", this);
                }
            }
        }
        int maxSimilarityRecalc = baseClusters.size();
        maxSimilarityRecalc = maxSimilarityRecalc < CONFIGURATION_MAX_SIMILARITY_UPDATES ?
                maxSimilarityRecalc : CONFIGURATION_MAX_SIMILARITY_UPDATES;
        // recalculating the similarity, new clusters have already been added:
        for (Iterator<BaseCluster> iterator = touchedClusters.iterator(); iterator.hasNext();) {
            BaseCluster bc = iterator.next();
            Iterator<BaseCluster> recalcWith = baseClusters.iterator();
            // only update the base clusters with the highest scores:
            int countRecalcs = 0;
            while (recalcWith.hasNext() && countRecalcs < maxSimilarityRecalc) {
                BaseCluster bc2 = recalcWith.next();
                countRecalcs++;
                similarityMatrix.update(bc, bc2);
            }
        }
        touchedClusters.clear();
    }

    /**
     * Override this method if you want to change the way tokens (words in this
     * implementation) are created from a sentence.
     *
     * @param sentence the sentence to tokenize
     * @return an array of tokens
     */
    protected abstract String[] getTokens(String sentence);

    /**
     * Override this method if you want to use another method to create the sentences.
     *
     * @param phrase the input phrase to split
     * @return an array of sentences
     */
    protected abstract String[] getSentences(String phrase);

    /**
     * Override this method if you want to filter your tokens, e.g. to remove stopwords.
     *
     * @param tokens the tokens to filter
     * @return the filtered tokens
     */
    protected abstract String[] filterTokens(String[] tokens);

    public TreeSet<BaseCluster> getBaseClusters() {
        return baseClusters;
    }

    public WordIndex getIndex() {
        return index;
    }

    public HashSet<String> getStopwords() {
        return stopwords;
    }

    public HashSet<BaseCluster> getTouchedClusters() {
        return touchedClusters;
    }

    public HashMap<StcNode, BaseCluster> getNodeToClusterIndex() {
        return nodeToClusterIndex;
    }

    public EdgeIndex getSimilarityMatrix() {
        return similarityMatrix;
    }
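    /**
     * Merges the base clusters into final clusters. Two base clusters end up in the
     * same final cluster if they are transitively connected in the binary similarity
     * matrix, so each final cluster corresponds to one connected component of the
     * base cluster similarity graph.
     *
     * @return the set of final clusters
     */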
    public Set<FinalCluster> getFinalClusters() {
        // new linear version of base cluster merging:
        HashSet<BaseCluster> bcVisited = new HashSet<BaseCluster>();
        TreeSet<FinalCluster> set = new TreeSet<FinalCluster>();
        for (Iterator<BaseCluster> iterator = baseClusters.iterator(); iterator.hasNext();) {
            BaseCluster bc = iterator.next();
            if (bcVisited.add(bc)) { // not already checked ...
                FinalCluster fc = new FinalCluster(bc);
                Set<BaseCluster> allConnectedBaseClusters = similarityMatrix.getAllConnectedBaseClusters(bc);
                recursiveGraphTraversion(allConnectedBaseClusters, fc, bcVisited);
                set.add(fc);
            }
        }
        return set;
    }

    /**
     * Depth-first traversal of the similarity graph: every base cluster reachable
     * from the given set that has not been visited yet is added to the final cluster.
     */
    private void recursiveGraphTraversion(Set<BaseCluster> bcSet, FinalCluster fc, Set<BaseCluster> bcVisited) {
        if (bcSet != null) {
            for (Iterator<BaseCluster> iterator = bcSet.iterator(); iterator.hasNext();) {
                BaseCluster baseCluster = iterator.next();
                if (bcVisited.add(baseCluster)) {
                    fc.addBaseClusterWithoutCheck(baseCluster);
                    recursiveGraphTraversion(similarityMatrix.getAllConnectedBaseClusters(baseCluster), fc, bcVisited);
                }
            }
        } else {
            // System.err.println("Edge set is NULL!");
        }
    }
}
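/*
 * Minimal usage sketch (hypothetical, not part of the original Caliph & Emir
 * sources): a concrete subclass only has to supply sentence splitting,
 * tokenization and token filtering. The class below and its splitting rules
 * ('.' as sentence delimiter, whitespace-separated tokens, stopword removal)
 * are illustrative assumptions, not the project's actual implementation.
 */
class ExampleWordSuffixTree extends AbstractSuffixTree {

    // split the phrase into sentences at full stops
    protected String[] getSentences(String phrase) {
        return phrase.split("\\.");
    }

    // split a sentence into whitespace-separated tokens
    protected String[] getTokens(String sentence) {
        return sentence.trim().split("\\s+");
    }

    // lower-case all tokens and drop empty ones as well as registered stopwords
    protected String[] filterTokens(String[] tokens) {
        List<String> filtered = new ArrayList<String>();
        for (String token : tokens) {
            String t = token.toLowerCase();
            if (t.length() > 0 && !stopwords.contains(t)) {
                filtered.add(t);
            }
        }
        return filtered.toArray(new String[filtered.size()]);
    }
}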