/*
* File: LatentSemanticAnalysisTest.java
* Authors: Justin Basilico
* Company: Sandia National Laboratories
* Project: Cognitive Foundry
*
* Copyright March 17, 2009, Sandia Corporation.
* Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
* license for use of this work by or on behalf of the U.S. Government. Export
* of this program may require a license from the United States Government.
* See CopyrightHistory.txt for complete details.
*
*/
package gov.sandia.cognition.text.topic;
import gov.sandia.cognition.math.matrix.mtj.Vector1;
import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.math.matrix.Matrix;
import gov.sandia.cognition.math.matrix.MatrixFactory;
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.math.matrix.VectorFactory;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.ArrayList;
import org.junit.Test;
import static org.junit.Assert.*;
/**
* Unit tests for class LatentSemanticAnalysis.
*
* @author Justin Basilico
* @since 3.0
*/
public class LatentSemanticAnalysisTest
{
/**
 * Creates a new instance of the LatentSemanticAnalysis unit tests. No
 * shared fixture is set up here; each test method builds what it needs.
 */
public LatentSemanticAnalysisTest()
{
    super();
}
/**
 * Test of constructors of class LatentSemanticAnalysis. Checks the rank
 * reported after the no-argument constructor and after the constructor
 * that takes an explicit requested rank.
 */
@Test
public void testConstructors()
{
    // The no-argument constructor is expected to request a rank of 10.
    final int defaultRank = 10;
    final LatentSemanticAnalysis byDefault = new LatentSemanticAnalysis();
    assertEquals(defaultRank, byDefault.getRequestedRank());

    // The rank passed to the constructor must be reported back unchanged.
    final int customRank = 2 * defaultRank + 1;
    final LatentSemanticAnalysis withRank =
        new LatentSemanticAnalysis(customRank);
    assertEquals(customRank, withRank.getRequestedRank());
}
/**
 * Test of learn method, of class LatentSemanticAnalysis.
 *
 * Learns from the nine-document, twelve-term example of the referenced
 * paper and checks the rank, dimensionalities, singular values and term
 * basis of the result. The same checks are then repeated after adding
 * all-zero documents and all-zero term dimensions, and for requested ranks
 * that are smaller and larger than the rank of the data.
 */
@Test
public void testLearn()
{
    @PublicationReference(
        author={"Thomas K. Landauer", "Peter W. Foltz", "Darrell Laham"},
        title="An Introduction to Latent Semantic Analysis",
        year=1998,
        type=PublicationType.Journal,
        publication="Discourse Processes",
        pages={259, 284},
        url="http://lsa.colorado.edu/papers/dp1.LSAintro.pdf",
        notes="This is the paper that had the following LSA example."
    )
    double[][] data = new double[][] {
        { 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        { 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0 },
        { 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0 },
        { 1, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0 },
        { 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1 }
    };

    // Note: See the comment at the end of the file for some MATLAB code
    // for this data.
    double EPSILON = 0.01;

    // The expected transform entries below are the products of the
    // two-decimal term basis and singular values (for example
    // 0.22 * 3.34 = 0.7348), so their rounding errors compound and they
    // need a looser tolerance than EPSILON.
    double TRANSFORM_EPSILON = 0.05;

    int dimensionality = data[0].length;

    double[] expectedSingularValues = new double[] {
        3.34, 2.54, 2.35, 1.64, 1.50, 1.31, 0.85, 0.56, 0.36
    };

    double[][] expectedTermBasisValues = new double[][] {
        {+0.22, -0.11, +0.29, -0.41, -0.11, -0.34, +0.52, -0.06, -0.41},
        {+0.20, -0.07, +0.14, -0.55, +0.28, +0.50, -0.07, -0.01, -0.11},
        {+0.24, +0.04, -0.16, -0.59, -0.11, -0.25, -0.30, +0.06, +0.49},
        {+0.40, +0.06, -0.34, +0.10, +0.33, +0.38, +0.00, +0.00, +0.01},
        {+0.64, -0.17, +0.36, +0.33, -0.16, -0.21, -0.17, +0.03, +0.27},
        {+0.27, +0.11, -0.43, +0.07, +0.08, -0.17, +0.28, -0.02, -0.05},
        {+0.27, +0.11, -0.43, +0.07, +0.08, -0.17, +0.28, -0.02, -0.05},
        {+0.30, -0.14, +0.33, +0.19, +0.11, +0.27, +0.03, -0.02, -0.17},
        {+0.21, +0.27, -0.18, -0.03, -0.54, +0.08, -0.47, -0.04, -0.58},
        {+0.01, +0.49, +0.23, +0.03, +0.59, -0.39, -0.29, +0.25, -0.23},
        {+0.04, +0.62, +0.22, +0.00, -0.07, +0.11, +0.16, -0.68, +0.23},
        {+0.03, +0.45, +0.14, -0.01, -0.30, +0.28, +0.34, +0.68, +0.18},
    };

    // Reference values only: nothing in this test asserts against the
    // document basis.
    double[][] expectedDocumentBasisValues = new double[][] {
        {+0.20, +0.61, +0.46, +0.54, +0.28, +0.00, +0.01, +0.02, +0.08},
        {-0.06, +0.17, -0.13, -0.23, +0.11, +0.19, +0.44, +0.62, +0.53},
        {+0.11, -0.50, +0.21, +0.57, -0.51, +0.10, +0.19, +0.25, +0.08},
        {-0.95, -0.03, +0.04, +0.27, +0.15, +0.02, +0.02, +0.01, -0.03},
        {+0.05, -0.21, +0.38, -0.21, +0.33, +0.39, +0.35, +0.15, -0.60},
        {-0.08, -0.26, +0.72, -0.37, +0.03, -0.30, -0.21, +0.00, +0.36},
        {+0.18, -0.43, -0.24, +0.26, +0.67, -0.34, -0.15, +0.25, +0.04},
        {-0.01, +0.05, +0.01, -0.02, -0.06, +0.45, -0.76, +0.45, -0.07},
        {-0.06, +0.24, +0.02, -0.08, -0.26, -0.62, +0.02, +0.52, -0.45}
    };

    double[][] expectedTransformValues = new double[][] {
        {+0.7348, -0.2794, +0.6815, -0.6724, -0.1650, -0.4454, +0.4420, -0.0336, -0.14760},
        {+0.6680, -0.1778, +0.3290, -0.9020, +0.4200, +0.6550, -0.0595, -0.0056, -0.03960},
        {+0.8016, +0.1016, -0.3760, -0.9676, -0.1650, -0.3275, -0.2550, +0.0336, +0.17640},
        {+1.3360, +0.1524, -0.7990, +0.1640, +0.4950, +0.4978, +0.0000, +0.0000, +0.00360},
        {+2.1376, -0.4318, +0.8460, +0.5412, -0.2400, -0.2751, -0.1445, +0.0168, +0.09720},
        {+0.9018, +0.2794, -1.0105, +0.1148, +0.1200, -0.2227, +0.2380, -0.0112, -0.01800},
        {+0.9018, +0.2794, -1.0105, +0.1148, +0.1200, -0.2227, +0.2380, -0.0112, -0.01800},
        {+1.0020, -0.3556, +0.7755, +0.3116, +0.1650, +0.3537, +0.0255, -0.0112, -0.06120},
        {+0.7014, +0.6858, -0.4230, -0.0492, -0.8100, +0.1048, -0.3995, -0.0224, -0.20880},
        {+0.0334, +1.2446, +0.5405, +0.0492, +0.8850, -0.5109, -0.2465, +0.1400, -0.08280},
        {+0.1336, +1.5748, +0.5170, +0.0000, -0.1050, +0.1441, +0.1360, -0.3808, +0.08280},
        {+0.1002, +1.1430, +0.3290, -0.0164, -0.4500, +0.3668, +0.2890, +0.3808, +0.06480}
    };

    Matrix expectedFullTermBasis = MatrixFactory.getDefault().copyArray(
        expectedTermBasisValues);

    ArrayList<Vector> documents = new ArrayList<Vector>(data.length);
    for (double[] d : data)
    {
        documents.add(VectorFactory.getDefault().copyArray(d));
    }

    // The example data supports nine singular values, so that is the
    // rank expected whenever the requested rank is at least that large.
    int fullRank = expectedSingularValues.length;

    LatentSemanticAnalysis instance = new LatentSemanticAnalysis();

    // Reduced rank LSA:
    int rank = 2;
    instance.setRequestedRank(rank);
    LatentSemanticAnalysis.Transform result = instance.learn(documents);
    assertLearnedTransform(result, rank, dimensionality, null,
        expectedSingularValues, expectedFullTermBasis,
        expectedTransformValues, EPSILON, TRANSFORM_EPSILON);

    // Full rank LSA: the rank is capped by the number of documents.
    instance.setRequestedRank(dimensionality);
    result = instance.learn(documents);
    rank = Math.min(dimensionality, documents.size());
    assertLearnedTransform(result, rank, dimensionality, null,
        expectedSingularValues, expectedFullTermBasis,
        expectedTransformValues, EPSILON, TRANSFORM_EPSILON);

    // Add two all-zero document vectors and learn. They must not change
    // the rank, the singular values or the term basis.
    documents.add(VectorFactory.getDefault().createVector(12));
    documents.add(VectorFactory.getDefault().createVector(12));
    instance.setRequestedRank(dimensionality);
    result = instance.learn(documents);
    assertLearnedTransform(result, fullRank, dimensionality, null,
        expectedSingularValues, expectedFullTermBasis,
        expectedTransformValues, EPSILON, TRANSFORM_EPSILON);

    // Create a second dataset with a zero column, by appending one zero
    // element to every document.
    ArrayList<Vector> documents2 = new ArrayList<Vector>();
    for (Vector document : documents)
    {
        documents2.add(document.stack(new Vector1()));
    }
    dimensionality++;

    // Learn from the documents with zeros.
    instance.setRequestedRank(dimensionality);
    result = instance.learn(documents2);
    assertLearnedTransform(result, fullRank, dimensionality, new Vector1(),
        expectedSingularValues, expectedFullTermBasis,
        expectedTransformValues, EPSILON, TRANSFORM_EPSILON);

    // Try a small rank and learn.
    rank = 2;
    instance.setRequestedRank(rank);
    result = instance.learn(documents2);
    assertLearnedTransform(result, rank, dimensionality, new Vector1(),
        expectedSingularValues, expectedFullTermBasis,
        expectedTransformValues, EPSILON, TRANSFORM_EPSILON);

    // Try a large rank and learn.
    instance.setRequestedRank(200);
    result = instance.learn(documents2);
    assertLearnedTransform(result, fullRank, dimensionality, new Vector1(),
        expectedSingularValues, expectedFullTermBasis,
        expectedTransformValues, EPSILON, TRANSFORM_EPSILON);

    // Add a whole lot of zeros and learn, this time with sparse vectors.
    documents2.clear();
    for (Vector document : documents)
    {
        documents2.add(VectorFactory.getSparseDefault().copyVector(
            document).stack(VectorFactory.getSparseDefault().createVector(
                100)));
    }

    // The single padding element is replaced by one hundred of them.
    dimensionality += 99;

    // Try a large rank and learn.
    instance.setRequestedRank(200);
    result = instance.learn(documents2);
    assertLearnedTransform(result, fullRank, dimensionality,
        VectorFactory.getSparseDefault().createVector(100),
        expectedSingularValues, expectedFullTermBasis,
        expectedTransformValues, EPSILON, TRANSFORM_EPSILON);
}

/**
 * Asserts that a learned transform has the expected rank and
 * dimensionalities, and that its singular values, term basis and scaled
 * term basis match the expected data for every retained column.
 *
 * The sign of a singular vector is arbitrary, so each column is accepted
 * if it matches either the expected column or its negation.
 *
 * NOTE(review): the scaled-basis check is computed here from the term
 * basis and singular values because no accessor for the transform matrix
 * itself is used anywhere in this file. If the transform exposes that
 * matrix, it should be asserted against the expected values as well.
 *
 * @param   result
 *      The learned transform to check.
 * @param   expectedRank
 *      The rank the transform must report. It is also the number of
 *      columns that are compared.
 * @param   expectedDimensionality
 *      The input dimensionality the transform must report.
 * @param   zeroPadding
 *      A zero vector to stack below each expected term basis column when
 *      the documents were padded with extra zero dimensions, or null when
 *      they were not.
 * @param   expectedSingularValues
 *      The expected singular values, in order.
 * @param   expectedFullTermBasis
 *      The expected term basis with all of its columns.
 * @param   expectedTransformValues
 *      The expected term basis scaled by the singular values, indexed by
 *      term and then by column.
 * @param   epsilon
 *      The tolerance for singular values and term basis columns.
 * @param   transformEpsilon
 *      The tolerance for the scaled term basis entries.
 */
private static void assertLearnedTransform(
    LatentSemanticAnalysis.Transform result,
    int expectedRank,
    int expectedDimensionality,
    Vector zeroPadding,
    double[] expectedSingularValues,
    Matrix expectedFullTermBasis,
    double[][] expectedTransformValues,
    double epsilon,
    double transformEpsilon)
{
    assertEquals(expectedRank, result.getRank());
    assertEquals(expectedDimensionality, result.getInputDimensionality());
    assertEquals(expectedRank, result.getOutputDimensionality());

    for (int i = 0; i < expectedRank; i++)
    {
        assertEquals(expectedSingularValues[i],
            result.getSingularValues().getElement(i, i), epsilon);
    }

    for (int i = 0; i < expectedRank; i++)
    {
        Vector expected = expectedFullTermBasis.getColumn(i);
        if (zeroPadding != null)
        {
            // Padded dimensions are all zero in every document, so the
            // basis is expected to be zero there as well.
            expected = expected.stack(zeroPadding);
        }

        Vector actual = result.getTermBasis().getColumn(i);
        assertTrue(actual.equals(expected, epsilon)
            || actual.equals(expected.negative(), epsilon));

        // This used to compare the expected transform column with itself,
        // which could never fail. Compare it with learned values instead.
        assertTrue(
            scaledBasisColumnMatches(result, i, expectedDimensionality,
                expectedTransformValues, +1.0, transformEpsilon)
            || scaledBasisColumnMatches(result, i, expectedDimensionality,
                expectedTransformValues, -1.0, transformEpsilon));
    }
}

/**
 * Determines if one column of the learned term basis, scaled by its
 * singular value, matches the expected transform values under the given
 * sign. Rows beyond the expected data are expected to be zero.
 *
 * @param   result
 *      The learned transform to read the term basis and singular values
 *      from.
 * @param   column
 *      The index of the column to compare.
 * @param   rowCount
 *      The number of rows to compare.
 * @param   expectedTransformValues
 *      The expected values, indexed by row and then by column.
 * @param   sign
 *      Either +1.0 or -1.0, applied to the expected values.
 * @param   tolerance
 *      The largest absolute difference allowed for any single entry.
 * @return
 *      True if every compared entry is within the tolerance.
 */
private static boolean scaledBasisColumnMatches(
    LatentSemanticAnalysis.Transform result,
    int column,
    int rowCount,
    double[][] expectedTransformValues,
    double sign,
    double tolerance)
{
    double singularValue =
        result.getSingularValues().getElement(column, column);

    for (int row = 0; row < rowCount; row++)
    {
        double expected = 0.0;
        if (row < expectedTransformValues.length)
        {
            expected = sign * expectedTransformValues[row][column];
        }

        double actual =
            result.getTermBasis().getElement(row, column) * singularValue;

        // Written as a negated <= so that a NaN entry counts as a mismatch.
        if (!(Math.abs(actual - expected) <= tolerance))
        {
            return false;
        }
    }

    return true;
}
/**
 * Test of getRequestedRank method, of class LatentSemanticAnalysis.
 *
 * The getter is exercised together with the setter, so this simply runs
 * the setter test.
 */
@Test
public void testGetRequestedRank()
{
    testSetRequestedRank();
}
/**
 * Test of setRequestedRank method, of class LatentSemanticAnalysis.
 *
 * Checks that valid ranks are stored and reported back, and that the
 * non-positive ranks 0 and -1 are rejected with an
 * IllegalArgumentException without changing the stored rank.
 */
@Test
public void testSetRequestedRank()
{
    // A new instance is expected to start with a requested rank of 10.
    int requestedRank = 10;
    LatentSemanticAnalysis instance = new LatentSemanticAnalysis();
    assertEquals(requestedRank, instance.getRequestedRank());

    // Two successive valid values must each be stored.
    requestedRank = 2 * requestedRank + 1;
    instance.setRequestedRank(requestedRank);
    assertEquals(requestedRank, instance.getRequestedRank());

    requestedRank = 2 * requestedRank + 1;
    instance.setRequestedRank(requestedRank);
    assertEquals(requestedRank, instance.getRequestedRank());

    int[] invalidRanks = { 0, -1 };
    for (int invalidRank : invalidRanks)
    {
        try
        {
            instance.setRequestedRank(invalidRank);

            // Failing here, rather than asserting on a flag in a finally
            // block, lets any unexpected exception type propagate with its
            // own stack trace instead of being replaced by an assertion
            // error.
            fail("Expected an IllegalArgumentException for requested rank "
                + invalidRank);
        }
        catch (IllegalArgumentException e)
        {
            // Expected: the invalid rank was rejected.
        }

        // A rejected value must leave the previous rank in place.
        assertEquals(requestedRank, instance.getRequestedRank());
    }
}
}
/*
* Here is some helpful Octave/MATLAB code used for creating the unit test data:
D = [
1 0 0 1 0 0 0 0 0;
1 0 1 0 0 0 0 0 0;
1 1 0 0 0 0 0 0 0;
0 1 1 0 1 0 0 0 0;
0 1 1 2 0 0 0 0 0;
0 1 0 0 1 0 0 0 0;
0 1 0 0 1 0 0 0 0;
0 0 1 1 0 0 0 0 0;
0 1 0 0 0 0 0 0 1;
0 0 0 0 0 1 1 1 0;
0 0 0 0 0 0 1 1 1;
0 0 0 0 0 0 0 1 1];
[U, S, V] = svd(D);
Z = U * S * V';
r = 2;
Ur = U(:, 1:r); Sr = S(1:r, 1:r); Vr = V(:, 1:r); Zr = Ur * Sr * Vr';
*/