package at.lux.retrieval.vectorspace;

import at.lux.fotoretrieval.lucene.Graph;
import at.lux.fotoretrieval.lucene.Path;
import at.lux.fotoretrieval.lucene.GraphPathExtractor;

import java.util.HashSet;
import java.util.HashMap;
import java.util.ArrayList;

/**
 * A class for calculating the similarity based on a feature
 * vector having the paths of a graph as dimensions.<p/>
 * Date: 09.03.2006 <br>
 * Time: 13:44:07 <br>
 * Know-Center Graz, Inffeldgasse 21a, 8010 Graz, AUSTRIA <br>
 *
 * @author Mathias Lux, mlux@know-center.at
 */
public class GraphVectorSimilarity {

    private int maxLength = -1;
    private HashMap<String, Integer> idf = new HashMap<String, Integer>(128);
    private int numDocsInCorpus = 0;
    private Type type = Type.Unweighted;
    private int docLengthSum = 0;
    private boolean normalize = false;

    public enum Type {
        IdfWeighted,
        PlIdfWeighted, // term weight grows with path length
        BM25,
        Unweighted
    }

    /**
     * Default constructor, paths of all available lengths are used.
     */
    public GraphVectorSimilarity() {
    }

    /**
     * Use paths up to the specified length for calculations.
     *
     * @param maxLength maximum path length.
     */
    public GraphVectorSimilarity(int maxLength) {
        this.maxLength = maxLength;
    }

    public GraphVectorSimilarity(Type type, int maxLength) {
        this.type = type;
        this.maxLength = maxLength;
    }

    public GraphVectorSimilarity(Type type, int maxLength, boolean normalize) {
        this.type = type;
        this.maxLength = maxLength;
        this.normalize = normalize;
    }

    /**
     * Calculates and returns the similarity of the two graphs
     * using the configured weighting type.
     *
     * @param g1 the first graph to compare.
     * @param g2 the second graph to compare.
     * @return the similarity of the two graphs.
     */
    public double getSimilarity(Graph g1, Graph g2) {
        Path[] paths1 = getPaths(g1);
        Path[] paths2 = getPaths(g2);

        // Build one term vector per graph; each distinct path is a dimension.
        HashMap<String, Double> termVector1 = new HashMap<String, Double>(paths1.length);
        HashMap<String, Double> termVector2 = new HashMap<String, Double>(paths2.length);
        for (Path path : paths1) {
            if (type != Type.PlIdfWeighted) {
                termVector1.put(path.toString(), 1.0);
            } else {
                // PlIdfWeighted: longer paths get a higher weight.
                termVector1.put(path.toString(), (double) path.getLength() + 1);
            }
        }
        for (Path path : paths2) {
            if (type != Type.PlIdfWeighted) {
                termVector2.put(path.toString(), 1.0);
            } else {
                termVector2.put(path.toString(), (double) path.getLength() + 1);
            }
        }

        HashSet<String> terms = new HashSet<String>(Math.max(paths1.length, paths2.length));

        if (normalize) {
            normalizeTermVector(termVector1);
            normalizeTermVector(termVector2);
        }

        if (type != Type.BM25) {
            // Cosine similarity of the (optionally IDF-weighted) term vectors.
            terms.addAll(termVector1.keySet());
            terms.addAll(termVector2.keySet());
            double sum = 0;
            double sum1 = 0;
            double sum2 = 0;
            for (String dim : terms) {
                double factor1 = 0;
                double factor2 = 0;
                if (termVector1.containsKey(dim)) {
                    double entry = termVector1.get(dim);
                    if (type == Type.IdfWeighted) {
                        if (!idf.containsKey(dim))
                            throw new UnsupportedOperationException("Document has to be added to corpus first!");
                        entry = entry * Math.log((double) numDocsInCorpus / (double) idf.get(dim));
                    }
                    factor1 = entry;
                    sum1 += entry * entry;
                }
                if (termVector2.containsKey(dim)) {
                    double entry = termVector2.get(dim);
                    if (type == Type.IdfWeighted) {
                        if (!idf.containsKey(dim))
                            throw new UnsupportedOperationException("Document has to be added to corpus first!");
                        entry = entry * Math.log((double) numDocsInCorpus / (double) idf.get(dim));
                    }
                    factor2 = entry;
                    sum2 += entry * entry;
                }
                sum += factor1 * factor2;
            }
            return sum / Math.sqrt(sum1 * sum2);
        } else {
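            // BM25 weighting: each path is weighted with the standard Okapi BM25
            // term weight (a sketch of the formula implemented in getBm25Weight):
            //
            //   w(t, d) = ((k1 + 1) * tf) / (k1 * ((1 - b) + b * dl / avdl) + tf)
            //             * log((N - df + 0.5) / (df + 0.5))
            //
            // where tf is the weight of the path in the term vector, df the number
            // of corpus graphs containing the path, N the corpus size, dl the number
            // of distinct paths in the graph and avdl the average document length
            // over the corpus. The weighted vectors are then compared with the
            // cosine measure, as in the branch above.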
            double avdl = (double) docLengthSum / (double) numDocsInCorpus;
            double dl1 = termVector1.size();
            double dl2 = termVector2.size();
            double k1 = 1.5;
            double b = 0.5;
            double sum = 0.0;
            double sum1 = 0;
            double sum2 = 0;
            terms.addAll(termVector1.keySet());
            terms.addAll(termVector2.keySet());
            for (String dim : terms) {
                double factor1 = 0;
                double factor2 = 0;
                if (termVector1.containsKey(dim)) {
                    if (!idf.containsKey(dim))
                        throw new UnsupportedOperationException("Document has to be added to corpus first!");
                    double entry = getBm25Weight(k1, b, termVector1.get(dim), idf.get(dim), avdl, dl1);
                    factor1 = entry;
                    sum1 += entry * entry;
                }
                if (termVector2.containsKey(dim)) {
                    if (!idf.containsKey(dim))
                        throw new UnsupportedOperationException("Document has to be added to corpus first!");
                    double entry = getBm25Weight(k1, b, termVector2.get(dim), idf.get(dim), avdl, dl2);
                    factor2 = entry;
                    sum2 += entry * entry;
                }
                sum += factor1 * factor2;
            }
            return sum / Math.sqrt(sum1 * sum2);
        }
    }

    /**
     * Normalizes the given term vector to unit (Euclidean) length in place.
     */
    private void normalizeTermVector(HashMap<String, Double> termVector) {
        double vectorLength = 0;
        for (String path : termVector.keySet()) {
            vectorLength += termVector.get(path) * termVector.get(path);
        }
        vectorLength = Math.sqrt(vectorLength);
        for (String path : termVector.keySet()) {
            termVector.put(path, termVector.get(path) / vectorLength);
        }
    }

    private double getBm25Weight(double k1, double b, double termFreq, double docFreq,
                                 double avgDocLength, double docLength) {
        assert (numDocsInCorpus >= docFreq);
        return ((k1 + 1.0) * termFreq) / (k1 * ((1 - b) + b * docLength / avgDocLength) + termFreq)
                * Math.log((numDocsInCorpus - docFreq + 0.5) / (docFreq + 0.5));
    }

    /**
     * Adds a graph to the corpus. The corpus statistics (document frequency per
     * path and overall document length) are needed for the IdfWeighted and BM25
     * similarity types.
     *
     * @param g the graph to add to the corpus.
     */
    public void addToCorpus(Graph g) {
        numDocsInCorpus++;
        Path[] paths = getPaths(g);
        docLengthSum += paths.length;
        for (Path path : paths) {
            String key = path.toString();
            if (idf.containsKey(key)) {
                idf.put(key, idf.get(key) + 1);
            } else {
                idf.put(key, 1);
            }
        }
    }

    /**
     * Extracts all distinct paths of the given graph up to the configured
     * maximum length, or of all available lengths if no maximum was set.
     */
    private Path[] getPaths(Graph g) {
        ArrayList<Path> paths = new ArrayList<Path>();
        HashSet<String> seen = new HashSet<String>();
        if (maxLength < 0) {
            Path[] tmpPaths = GraphPathExtractor.extractPaths(g.toString(), -1);
            for (Path tmpPath : tmpPaths) {
                // Keep each distinct path only once.
                if (seen.add(tmpPath.toString())) {
                    paths.add(tmpPath);
                }
            }
        } else {
            for (int i = 0; i <= maxLength; i++) {
                Path[] tmpPaths = GraphPathExtractor.extractPaths(g.toString(), i);
                for (Path tmpPath : tmpPaths) {
                    if (seen.add(tmpPath.toString())) {
                        paths.add(tmpPath);
                    }
                }
            }
        }
        return paths.toArray(new Path[paths.size()]);
    }
}
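// A minimal usage sketch, not part of the original class: it assumes Graph
// instances are obtained elsewhere in the application (how a Graph is built is
// not shown here) and ranks a corpus of graphs against a query graph. Note
// that every graph handed to getSimilarity(..), including the query, has to be
// added to the corpus first, since paths unseen during addToCorpus(..) raise
// an UnsupportedOperationException for the IdfWeighted and BM25 types.
class GraphVectorSimilarityDemo {

    /** Returns the index of the corpus graph most similar to the query. */
    static int mostSimilar(Graph query, Graph[] corpus) {
        // BM25 weighting over paths up to length 3; both are arbitrary choices
        // for this sketch.
        GraphVectorSimilarity similarity =
                new GraphVectorSimilarity(GraphVectorSimilarity.Type.BM25, 3);
        similarity.addToCorpus(query);
        for (Graph g : corpus) {
            similarity.addToCorpus(g);
        }
        int best = -1;
        double bestScore = Double.NEGATIVE_INFINITY;
        for (int i = 0; i < corpus.length; i++) {
            double score = similarity.getSimilarity(query, corpus[i]);
            if (score > bestScore) {
                bestScore = score;
                best = i;
            }
        }
        return best;
    }
}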