package at.lux.retrieval.suffixtreemodel;

import java.util.*;
import java.util.logging.Logger;

/**
 * Node of a suffix tree used for document similarity calculation.
 * <p/>
 * Date: 15.02.2006 <br>
 * Time: 20:31:57 <br>
 * Know-Center Graz, Inffeldgasse 21a, 8010 Graz, AUSTRIA <br>
 *
 * @author Mathias Lux, mlux@know-center.at
 */
public class SuffixTreeNode {
    private Logger log = Logger.getLogger(SuffixTreeNode.class.getName());
    private String token;

    /**
     * Stores how often a single document has traversed a given edge.
     * Used for similarity calculation.
     */
    private HashMap<String, HashMap<Integer, Integer>> edge2document =
            new HashMap<String, HashMap<Integer, Integer>>();

    /**
     * Stores which documents have traversed the edge.
     * This is only created to allow calculation of the IDF value
     * based on a corpus.
     */
    private HashMap<String, HashSet<Integer>> idfCounter =
            new HashMap<String, HashSet<Integer>>();

    private HashMap<String, SuffixTreeNode> edges = new HashMap<String, SuffixTreeNode>();

    public SuffixTreeNode(String token) {
        this.token = token;
    }

    /**
     * Adds a suffix to the tree, creating edges and child nodes as needed.
     *
     * @param tokens the suffix
     * @param id     the document id
     */
    public void addSuffix(List<String> tokens, int id) {
        // System.out.println("tokens = " + tokens);
        String t = tokens.get(0);
        // add edge if not present yet:
        if (!edges.containsKey(t)) {
            edges.put(t, new SuffixTreeNode(t));
        }
        if (!edge2document.containsKey(t)) {
            HashMap<Integer, Integer> docs = new HashMap<Integer, Integer>(3);
            edge2document.put(t, docs);
        }
        if (!edge2document.get(t).containsKey(id)) {
            edge2document.get(t).put(id, 0);
        }
        // increment the traversal count of this edge for the given document:
        edge2document.get(t).put(id, edge2document.get(t).get(id) + 1);
        // add suffixes if there is more than one token left:
        if (tokens.size() > 1) {
            ArrayList<String> tokensLeft = new ArrayList<String>(tokens.size());
            tokensLeft.addAll(tokens);
            tokensLeft.remove(0);
            edges.get(t).addSuffix(tokensLeft, id);
        }
    }

    public boolean isLeaf() {
        return (edges.isEmpty());
    }

    /**
     * Calculates how many edges have been created or traversed by each document and
     * stores the number of edges traversed by more than one document under the key -1.
     *
     * @param document2edgeCount maps each document id to its edge count; the shared edge count is stored under key -1
     */
    public void getEdgesTraversed(HashMap<Integer, Integer> document2edgeCount) {
        // go through all local edges ...
        for (String edge : edge2document.keySet()) {
            Set<Integer> documents = edge2document.get(edge).keySet();
            for (Integer docID : documents) {
                // increment the count for this document (missing counts default to zero):
                Integer count = document2edgeCount.get(docID);
                if (count == null) count = 0;
                document2edgeCount.put(docID, count + 1);
            }
            if (documents.size() > 1) {
                // increment the count for the pseudo document -1 (the shared edge count):
                Integer shared = document2edgeCount.get(-1);
                if (shared == null) shared = 0;
                document2edgeCount.put(-1, shared + 1);
            }
        }
        // ask all children which are not leaves ...
        for (String token : edges.keySet()) {
            SuffixTreeNode node = edges.get(token);
            if (!node.isLeaf() && edge2document.containsKey(token)) {
                node.getEdgesTraversed(document2edgeCount);
            }
        }
    }
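
    /**
     * Walks the edges of this node and of all non-leaf children and adds a score
     * contribution per edge to the given walker, depending on the similarity type:
     * <ul>
     * <li>TermFrequency: min/max of the per-document traversal counts of the edge
     * (e.g. if one document traversed the edge once and another twice, the contribution is 1/2 = 0.5);</li>
     * <li>IDF: log(corpusDocumentCount / documentFrequency), added only for edges traversed by more than one document;</li>
     * <li>TFIDF: the product of the two factors above.</li>
     * </ul>
     * The walker's edge counter is incremented once per visited edge.
     *
     * @param walker         accumulates the score and the number of edges visited
     * @param similarityType selects which of the measures above is used
     */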
    public void traverseEdges(TermFrequencyWalker walker, SuffixTree.SimilarityType similarityType) {
        // go through all local edges ...
        for (String s : edge2document.keySet()) {
            Set<Integer> documents = edge2document.get(s).keySet();
            int min = Integer.MAX_VALUE;
            int max = 0;
            if (edge2document.get(s).size() > 1) {
                for (Integer docID : documents) {
                    Integer travCount = edge2document.get(s).get(docID);
                    min = Math.min(min, travCount);
                    max = Math.max(max, travCount);
                }
            } else {
                min = 0;
                max = 1;
            }
            if (similarityType == SuffixTree.SimilarityType.TermFrequency) {
                // it's the term frequency:
                walker.addToSum((double) min / (double) max);
            } else if (similarityType == SuffixTree.SimilarityType.IDF) {
                int documentFrequency;
                if (!idfCounter.containsKey(s)) {
                    // Note: the documents to check against were obviously not in the corpus!!
                    documentFrequency = edge2document.get(s).keySet().size();
                    log.warning("Note: the documents for similarity calculation were obviously not in the corpus!!");
                } else {
                    documentFrequency = idfCounter.get(s).size();
                }
                double idf = Math.log((double) walker.getCountCorpusDocuments() / (double) documentFrequency);
                // the IDF factor is only added for edges traversed by more than one document:
                if (min > 0) walker.addToSum(idf);
            } else if (similarityType == SuffixTree.SimilarityType.TFIDF) {
                // it's TF*IDF:
                double factor = (double) min / (double) max;
                int documentFrequency;
                if (!idfCounter.containsKey(s)) {
                    // Note: the documents to check against were obviously not in the corpus!!
                    documentFrequency = edge2document.get(s).keySet().size();
                    log.warning("Note: the documents for similarity calculation were obviously not in the corpus!!");
                } else {
                    documentFrequency = idfCounter.get(s).size();
                }
                // double maxIdf = Math.log((double) walker.getCountCorpusDocuments());
                double idf = Math.log((double) walker.getCountCorpusDocuments() / (double) documentFrequency);
                // assert(factor <= 1.0);
                // assert(idf / maxIdf <= 1.0);
                // normalization of the IDF factor to the [0,1] range is currently disabled (see the commented out line below):
                walker.addToSum(factor * idf);
                // walker.addToSum(factor * (idf / maxIdf));
            } else {
                throw new UnsupportedOperationException("Not implemented yet!");
            }
            walker.incrementCountEdges();
        }
        // ask all children which are not leaves and have edges for the given documents ...
        for (String token : edges.keySet()) {
            SuffixTreeNode node = edges.get(token);
            if (!node.isLeaf() && edge2document.containsKey(token)) {
                node.traverseEdges(walker, similarityType);
            }
        }
    }

    /**
     * Trains the tree on a specific document corpus by counting document frequencies
     * for later IDF calculation.
     *
     * @param tokens the suffix
     * @param id     the document id
     */
    public void prepareDocumentFrequency(List<String> tokens, int id) {
        String t = tokens.get(0);
        // add edge if not present yet:
        if (!edges.containsKey(t)) {
            edges.put(t, new SuffixTreeNode(t));
        }
        if (!idfCounter.containsKey(t)) {
            idfCounter.put(t, new HashSet<Integer>());
        }
        idfCounter.get(t).add(id);
        if (tokens.size() > 1) {
            ArrayList<String> tokensLeft = new ArrayList<String>(tokens.size());
            tokensLeft.addAll(tokens);
            tokensLeft.remove(0);
            edges.get(t).prepareDocumentFrequency(tokensLeft, id);
        }
    }

    public void resetSimilarity() {
        // reset edge2document
        edge2document = new HashMap<String, HashMap<Integer, Integer>>();
        // reset all children:
        for (String edge : edges.keySet()) {
            SuffixTreeNode node = edges.get(edge);
            if (!node.isLeaf()) node.resetSimilarity();
        }
    }
}
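
/*
 * The class below is a minimal usage sketch, not part of the original API: it assumes that a
 * suffix tree is built by adding every suffix of a document's token list to a shared root node,
 * and it only exercises methods defined in this file (addSuffix and getEdgesTraversed). The class
 * name SuffixTreeNodeUsageExample, the root token "root" and the sample token lists are made up
 * for illustration only.
 */
class SuffixTreeNodeUsageExample {

    public static void main(String[] args) {
        SuffixTreeNode root = new SuffixTreeNode("root");

        // two small example documents as token lists:
        List<String> doc0 = Arrays.asList("the", "quick", "brown", "fox");
        List<String> doc1 = Arrays.asList("the", "quick", "red", "fox");

        // add every suffix of each document to the tree:
        for (int i = 0; i < doc0.size(); i++) root.addSuffix(doc0.subList(i, doc0.size()), 0);
        for (int i = 0; i < doc1.size(); i++) root.addSuffix(doc1.subList(i, doc1.size()), 1);

        // count the edges traversed per document; edges traversed by more than one
        // document are counted under the pseudo document id -1:
        HashMap<Integer, Integer> counts = new HashMap<Integer, Integer>();
        root.getEdgesTraversed(counts);
        System.out.println("edge counts per document (shared edges under key -1): " + counts);
    }
}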