/*
 * File:                TermVectorSimilarityNetworkCreator.java
 * Authors:             Justin Basilico
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 *
 * Copyright March 18, 2009, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
 * license for use of this work by or on behalf of the U.S. Government. Export
 * of this program may require a license from the United States Government.
 * See CopyrightHistory.txt for complete details.
 *
 */

package gov.sandia.cognition.text.term.relation;

import gov.sandia.cognition.math.matrix.Matrix;
import gov.sandia.cognition.math.matrix.MatrixFactory;
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.math.matrix.Vectorizable;
import gov.sandia.cognition.text.relation.SimilarityFunction;
import gov.sandia.cognition.text.term.TermIndex;
import gov.sandia.cognition.text.term.vector.CosineSimilarityFunction;
import gov.sandia.cognition.util.AbstractCloneableSerializable;
import java.util.Collection;

/**
 * Creates term similarity networks by comparing vectors representing the
 * terms.
 *
 * @author  Justin Basilico
 * @since   3.0
 */
public class TermVectorSimilarityNetworkCreator
    extends AbstractCloneableSerializable
{
    // TODO: This should probably implement some kind of interface...

    /** The default effective zero value is {@value}. */
    public static final double DEFAULT_EFFECTIVE_ZERO = 0.0;

    /** The similarity function between term vectors used to determine the
     *  similarity between two terms. */
    protected SimilarityFunction<? super Vector, ? super Vector>
        similarityFunction;

    /** The value to treat as zero. Used to increase the sparseness of a
     *  similarity network. Expected to be non-negative. */
    protected double effectiveZero;

    /** The matrix factory to create the matrix that backs the similarity
     *  network. */
    protected MatrixFactory<? extends Matrix> matrixFactory;

    /**
     * Creates a new {@code TermVectorSimilarityNetworkCreator} that uses the
     * shared {@link CosineSimilarityFunction} instance, the default effective
     * zero, and the default matrix factory.
     */
    public TermVectorSimilarityNetworkCreator()
    {
        this(CosineSimilarityFunction.getInstance());
    }

    /**
     * Creates a new {@code TermVectorSimilarityNetworkCreator} with the
     * default effective zero and the default matrix factory.
     *
     * @param   similarityFunction
     *      The similarity function between term vectors used to determine the
     *      term similarity.
     */
    public TermVectorSimilarityNetworkCreator(
        final SimilarityFunction<? super Vector, ? super Vector>
            similarityFunction)
    {
        this(similarityFunction, DEFAULT_EFFECTIVE_ZERO);
    }

    /**
     * Creates a new {@code TermVectorSimilarityNetworkCreator} with the
     * default matrix factory.
     *
     * @param   similarityFunction
     *      The similarity function between term vectors used to determine the
     *      term similarity.
     * @param   effectiveZero
     *      The effective value to treat as zero. Used to increase the
     *      sparseness of a similarity network. Must be non-negative.
     * @throws  IllegalArgumentException
     *      If {@code effectiveZero} is negative.
     */
    public TermVectorSimilarityNetworkCreator(
        final SimilarityFunction<? super Vector, ? super Vector>
            similarityFunction,
        final double effectiveZero)
    {
        this(similarityFunction, effectiveZero, MatrixFactory.getDefault());
    }

    /**
     * Creates a new {@code TermVectorSimilarityNetworkCreator}.
     *
     * @param   similarityFunction
     *      The similarity function between term vectors used to determine the
     *      term similarity.
     * @param   effectiveZero
     *      The effective value to treat as zero. Used to increase the
     *      sparseness of a similarity network. Must be non-negative.
     * @param   matrixFactory
     *      The matrix factory used to create the similarity matrix.
     * @throws  IllegalArgumentException
     *      If {@code effectiveZero} is negative.
     */
    public TermVectorSimilarityNetworkCreator(
        final SimilarityFunction<? super Vector, ? super Vector>
            similarityFunction,
        final double effectiveZero,
        final MatrixFactory<? extends Matrix> matrixFactory)
    {
        this.similarityFunction = similarityFunction;
        // Apply the same check as setEffectiveZero so that an invalid
        // threshold cannot be installed through a constructor. The check is
        // done by a private static helper rather than by calling the
        // overridable setter from the constructor.
        this.effectiveZero = requireNonNegativeEffectiveZero(effectiveZero);
        this.matrixFactory = matrixFactory;
    }

    /**
     * Creates a new similarity network between the terms in the given
     * documents. First the document vectors are turned into a term-by-document
     * matrix. Then the similarity function in this object is used to calculate
     * the similarity between the row vectors representing each term to
     * populate a term-by-term matrix. Only similarities whose absolute value
     * is strictly greater than the effective zero are written into the
     * matrix. The resulting matrix will be symmetric.
     *
     * @param   documents
     *      The term vectors for each document to calculate the similarity
     *      network from.
     * @param   termIndex
     *      The index of terms that was used to create the term vectors for
     *      each document.
     * @return
     *      A new similarity network for the terms in the given index
     *      calculated using the given vectors.
     */
    public MatrixBasedTermSimilarityNetwork create(
        final Collection<? extends Vectorizable> documents,
        final TermIndex termIndex)
    {
        final int termCount = termIndex.getTermCount();

        // Create a term-by-document matrix, since the given set of documents
        // are document-by-term.
        final Matrix termByDocumentMatrix =
            this.getMatrixFactory().copyColumnVectors(documents);

        // Create the matrix to hold the result.
        final Matrix similarities =
            this.getMatrixFactory().createMatrix(termCount, termCount);

        // Go through the terms to compute term-to-term similarity.
        for (int i = 0; i < termCount; i++)
        {
            final Vector termIVector = termByDocumentMatrix.getRow(i);

            // We assume that sim(a, b) = sim(b, a) and only loop over the
            // upper diagonal of the matrix.
            for (int j = i; j < termCount; j++)
            {
                final Vector termJVector = termByDocumentMatrix.getRow(j);
                final double similarity = this.similarityFunction.evaluate(
                    termIVector, termJVector);

                // Values at or below the threshold are skipped so that they
                // are never explicitly stored in the result matrix.
                if (Math.abs(similarity) > this.effectiveZero)
                {
                    similarities.setElement(i, j, similarity);

                    if (i != j)
                    {
                        // For non-diagonal elements we set the similarity for
                        // the lower diagonal portion, since we are only looping
                        // over the upper diagonal.
                        similarities.setElement(j, i, similarity);
                    }
                }
            }
        }

        // Return the resulting network backed by the similarity matrix.
        return new MatrixBasedTermSimilarityNetwork(termIndex, similarities);
    }

    /**
     * Gets the similarity function between term vectors used to determine the
     * similarity between two terms.
     *
     * @return
     *      The similarity function.
     */
    public SimilarityFunction<? super Vector, ? super Vector>
        getSimilarityFunction()
    {
        return this.similarityFunction;
    }

    /**
     * Sets the similarity function between term vectors used to determine the
     * similarity between two terms.
     *
     * @param   similarityFunction
     *      The similarity function.
     */
    public void setSimilarityFunction(
        final SimilarityFunction<? super Vector, ? super Vector>
            similarityFunction)
    {
        this.similarityFunction = similarityFunction;
    }

    /**
     * Gets the value to treat as zero. Used to increase the sparseness of a
     * similarity network.
     *
     * @return
     *      The threshold; similarities whose absolute value is not greater
     *      than it are treated as zero.
     */
    public double getEffectiveZero()
    {
        return this.effectiveZero;
    }

    /**
     * Sets the value to treat as zero. Used to increase the sparseness of a
     * similarity network.
     *
     * @param   effectiveZero
     *      The threshold; similarities whose absolute value is not greater
     *      than it are treated as zero. Must be non-negative.
     * @throws  IllegalArgumentException
     *      If {@code effectiveZero} is negative.
     */
    public void setEffectiveZero(
        final double effectiveZero)
    {
        this.effectiveZero = requireNonNegativeEffectiveZero(effectiveZero);
    }

    /**
     * Gets the matrix factory to create the matrix that backs the similarity
     * network.
     *
     * @return
     *      The matrix factory.
     */
    public MatrixFactory<? extends Matrix> getMatrixFactory()
    {
        return this.matrixFactory;
    }

    /**
     * Sets the matrix factory to create the matrix that backs the similarity
     * network.
     *
     * @param   matrixFactory
     *      The matrix factory.
     */
    public void setMatrixFactory(
        final MatrixFactory<? extends Matrix> matrixFactory)
    {
        this.matrixFactory = matrixFactory;
    }

    /**
     * Validates a candidate effective zero value. Shared by the constructor
     * and the setter so both enforce the same rule.
     *
     * @param   effectiveZero
     *      The candidate threshold.
     * @return
     *      The given value, unchanged, when it is valid.
     * @throws  IllegalArgumentException
     *      If {@code effectiveZero} is negative.
     */
    private static double requireNonNegativeEffectiveZero(
        final double effectiveZero)
    {
        if (effectiveZero < 0.0)
        {
            throw new IllegalArgumentException(
                "effectiveZero must be non-negative");
        }

        return effectiveZero;
    }

}