/******************************************************************************* * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ package tml.test; import java.text.DecimalFormat; import org.junit.BeforeClass; import org.junit.Test; import tml.Configuration; import tml.corpus.Corpus; import tml.corpus.SearchResultsCorpus; import tml.corpus.CorpusParameters.DimensionalityReduction; import tml.corpus.CorpusParameters.TermSelection; import tml.utils.LuceneUtils; import tml.vectorspace.TermWeighting.GlobalWeight; import tml.vectorspace.TermWeighting.LocalWeight; import tml.vectorspace.operations.PassagesSimilarity; import tml.vectorspace.operations.results.PassageSimilarityResult; import static org.junit.Assert.*; import Jama.Matrix; public class ValidateBerryDumaisTest extends AbstractTmlIndexingTest { private static double[][] termDoc = { {0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0}, {0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0}, {1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0}, {0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1}, {0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0}, {0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1} }; private static String[] documents = { "B01","B02","B03","B04","B05","B06","B07","B08","B09","B10","B11","B12","B13","B14","B15","B16","B17" }; private static String[] terms = { "algorithms", "application", "delay", "differential", "equations", "implementation", "integral", "introduction", "methods", "nonlinear", "ordinary", "oscillation", "partial", "problem", "systems", "theory" }; private static double[][] Uk = { {0.0159, -0.4317}, {0.0266, -0.3756}, {0.1785, -0.1692}, {0.6014, 0.1187}, {0.6691, 0.1209}, {0.0148, -0.3603}, {0.052, -0.2248}, {0.0066, -0.112}, {0.1503, 0.1127}, {0.0813, 0.0672}, {0.1503, 0.1127}, {0.1785, -0.1692}, {0.1415, 0.0974}, {0.0105, -0.2363}, {0.0952, 0.0399}, {0.2051, -0.5448} }; private static double[][] Sk = { {4.5314, 0}, {0, 2.7582} }; private double[][] queryVector = { {0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1} }; private double[][] queryProjected = { {0.0511,-0.3337} }; private double[][] distances2Factors = { {17, 0.99}, {3, 0.99}, {6, 0.99}, {16, 0.99}, {5, 0.98}, {7, 0.98}, {12, 0.55}, {11, 0.55}, {1, 0.38} }; private double[][] distances4Factors = { {17, 0.87}, {3, 0.82}, {12, 0.57}, {11, 0.57}, {16, 0.38}, {7, 0.38}, {1, 0.35}, {5, 0.22} }; private double[][] distances8Factors = { {17, 0.88}, {3, 0.78}, {12, 0.37}, {11, 0.37} }; private static Corpus corpus = null; // private static Corpus corpusLanczos = null; private static Corpus queryCorpus = null; @BeforeClass public static void setUpBeforeClass() throws Exception { AbstractTmlIndexingTest.setUpBeforeClass(); repository.addDocumentsInFolder(Configuration.getTmlFolder() + "/corpora/BerryDumais"); corpus = new SearchResultsCorpus("type:document AND -externalid:Q01"); corpus.getParameters().setTermSelectionCriterion(TermSelection.DF); corpus.getParameters().setTermSelectionThreshold(2); corpus.getParameters().setTermWeightLocal(LocalWeight.TF); corpus.getParameters().setTermWeightGlobal(GlobalWeight.None); corpus.getParameters().setDimensionalityReduction(DimensionalityReduction.NUM); corpus.getParameters().setDimensionalityReductionThreshold(2); corpus.load(repository); // corpusLanczos = new SearchResultsCorpus("type:document AND -externalid:Q01"); // corpusLanczos.getParameters().setTermWeightLocal(LocalWeight.TF); // corpusLanczos.getParameters().setTermWeightGlobal(GlobalWeight.None); // corpusLanczos.getParameters().setDimensionalityReduction(DimensionalityReduction.NUM); // corpusLanczos.getParameters().setDimensionalityReductionThreshold(2); // corpusLanczos.getParameters().setLanczosSVD(true); // corpusLanczos.load(repository); queryCorpus = new SearchResultsCorpus("type:document AND externalid:Q01"); queryCorpus.getParameters().setTermSelectionThreshold(0); queryCorpus.getParameters().setTermWeightLocal(LocalWeight.TF); queryCorpus.getParameters().setTermWeightGlobal(GlobalWeight.None); queryCorpus.setProjection(true); queryCorpus.load(repository); } @Test public void validateTermsAndDocuments() throws Exception { assertEquals(terms.length, corpus.getTerms().length); for(int i=0;i<corpus.getTerms().length; i++) { String term = corpus.getTerms()[i]; assertEquals(term, LuceneUtils.stemWords(terms[i])); } for(int i=0;i<corpus.getPassages().length; i++) { String passage = corpus.getPassages()[i]; assertEquals(passage, LuceneUtils.stemWords(documents[i])); } } @Test public void validateTermDocMatrix() { Matrix actual = corpus.getTermDocMatrix(); Matrix expected = new Matrix(termDoc); actual.print(5, 2); expected.print(5, 2); assertEquals(expected.getRowDimension(), actual.getRowDimension()); assertEquals(expected.getColumnDimension(), actual.getColumnDimension()); for(int i=0; i<actual.getRowDimension(); i++) { for(int j=0; j<actual.getColumnDimension(); j++) { assertEquals(expected.get(i, j), actual.get(i, j), 0.01); } } } @Test public void validateUk() { Matrix expected = corpus.getSemanticSpace().getUk(); Matrix actual = new Matrix(Uk); for(int i=0; i<actual.getRowDimension(); i++) { for(int j=0; j<actual.getColumnDimension(); j++) { assertEquals(expected.get(i, j), actual.get(i, j), 0.01); } } } @Test public void validateSk() { Matrix expected = corpus.getSemanticSpace().getSk(); Matrix actual = new Matrix(Sk); for(int i=0; i<actual.getRowDimension(); i++) { for(int j=0; j<actual.getColumnDimension(); j++) { assertEquals(expected.get(i, j), actual.get(i, j), 0.01); } } } /* @Test public void validateUkLanczos() { Matrix expected = corpusLanczos.getSemanticSpace().getUk(); Matrix actual = new Matrix(Uk); for(int i=0; i<actual.getRowDimension(); i++) { for(int j=0; j<actual.getColumnDimension(); j++) { assertEquals(expected.get(i, j), actual.get(i, j), 0.01); } } } @Test public void validateSkLanczos() { Matrix expected = corpusLanczos.getSemanticSpace().getSk(); Matrix actual = new Matrix(Sk); for(int i=0; i<actual.getRowDimension(); i++) { for(int j=0; j<actual.getColumnDimension(); j++) { assertEquals(expected.get(i, j), actual.get(i, j), 0.01); } } }*/ @Test public void validateQuery() { Matrix mUk = new Matrix(Uk); Matrix mSk = new Matrix(Sk); Matrix q = new Matrix(queryVector).transpose(); Matrix actual = q.transpose().times(mUk).times(mSk.inverse()); Matrix expected = new Matrix(queryProjected); for(int i=0; i<actual.getRowDimension(); i++) { for(int j=0; j<actual.getColumnDimension(); j++) { assertEquals(expected.get(i, j), actual.get(i, j), 0.01); } } } @Test public void validateProjection2Factors() throws Exception { PassagesSimilarity similarity = new PassagesSimilarity(); similarity.setCorpus(queryCorpus); similarity.setBackgroundKnowledgeCorpus(corpus); similarity.setIncludeBackgroundInSimilarity(true); similarity.setSortBySimilarity(true); similarity.start(); double[] similarities = new double[distances2Factors.length]; int current = 0; for(PassageSimilarityResult result : similarity.getResults()) { logger.debug( result.getDocumentA() +"-"+ result.getDocumentB()+":"+ (new DecimalFormat("0.000")).format(result.getSimilarity())); if(!result.getDocumentA().equals(result.getDocumentB())) { similarities[current] = result.getSimilarity(); current++; if(current >= similarities.length) break; } } for(int i=0;i<similarities.length;i++) { logger.debug(distances2Factors[i][1] + "," + similarities[i]); assertEquals(distances2Factors[i][1], similarities[i], 0.1); } } @Test public void validateProjection4Factors() throws Exception { corpus.getParameters().setDimensionalityReductionThreshold(4); corpus.load(repository); corpus.getSemanticSpace().calculate(); PassagesSimilarity similarity = new PassagesSimilarity(); similarity.setCorpus(queryCorpus); similarity.setBackgroundKnowledgeCorpus(corpus); similarity.setIncludeBackgroundInSimilarity(true); similarity.setSortBySimilarity(true); similarity.start(); double[] similarities = new double[distances4Factors.length]; int current = 0; for(PassageSimilarityResult result : similarity.getResults()) { logger.debug( result.getDocumentA() +"-"+ result.getDocumentB()+":"+ (new DecimalFormat("0.000")).format(result.getSimilarity())); if(!result.getDocumentA().equals(result.getDocumentB())) { similarities[current] = result.getSimilarity(); current++; if(current >= similarities.length) break; } } for(int i=0;i<similarities.length;i++) { logger.debug(distances4Factors[i][1] + "," + similarities[i]); assertEquals(distances4Factors[i][1], similarities[i], 0.1); } } @Test public void validateProjection8Factors() throws Exception { corpus.getParameters().setDimensionalityReductionThreshold(8); corpus.load(repository); corpus.getSemanticSpace().calculate(); PassagesSimilarity similarity = new PassagesSimilarity(); similarity.setCorpus(queryCorpus); similarity.setBackgroundKnowledgeCorpus(corpus); similarity.setIncludeBackgroundInSimilarity(true); similarity.setSortBySimilarity(true); similarity.start(); double[] similarities = new double[distances8Factors.length]; int current = 0; for(PassageSimilarityResult result : similarity.getResults()) { logger.debug( result.getDocumentA() +"-"+ result.getDocumentB()+":"+ (new DecimalFormat("0.000")).format(result.getSimilarity())); if(!result.getDocumentA().equals(result.getDocumentB())) { similarities[current] = result.getSimilarity(); current++; if(current >= similarities.length) break; } } for(int i=0;i<similarities.length;i++) { logger.debug(distances8Factors[i][1] + "," + similarities[i]); assertEquals(distances8Factors[i][1], similarities[i], 0.1); } } /* @Test public void validateProjection2FactorsLanczos() throws Exception { PassagesSimilarity similarity = new PassagesSimilarity(); similarity.setCorpus(queryCorpus); similarity.setBackgroundKnowledgeCorpus(corpusLanczos); similarity.setIncludeBackgroundInSimilarity(true); similarity.setSortBySimilarity(true); similarity.start(); double[] similarities = new double[distances2Factors.length]; int current = 0; for(PassageSimilarityResult result : similarity.getResults()) { logger.debug( result.getDocumentA() +"-"+ result.getDocumentB()+":"+ (new DecimalFormat("0.000")).format(result.getSimilarity())); if(!result.getDocumentA().equals(result.getDocumentB())) { similarities[current] = result.getSimilarity(); current++; if(current >= similarities.length) break; } } for(int i=0;i<similarities.length;i++) { logger.debug(distances2Factors[i][1] + "," + similarities[i]); assertEquals(distances2Factors[i][1], similarities[i], 0.1); } } @Test public void validateProjection4FactorsLanczos() throws Exception { corpusLanczos.getParameters().setDimensionalityReductionThreshold(4); corpusLanczos.load(repository); corpusLanczos.getSemanticSpace().calculate(); PassagesSimilarity similarity = new PassagesSimilarity(); similarity.setCorpus(queryCorpus); similarity.setBackgroundKnowledgeCorpus(corpusLanczos); similarity.setIncludeBackgroundInSimilarity(true); similarity.setSortBySimilarity(true); similarity.start(); double[] similarities = new double[distances4Factors.length]; int current = 0; for(PassageSimilarityResult result : similarity.getResults()) { logger.debug( result.getDocumentA() +"-"+ result.getDocumentB()+":"+ (new DecimalFormat("0.000")).format(result.getSimilarity())); if(!result.getDocumentA().equals(result.getDocumentB())) { similarities[current] = result.getSimilarity(); current++; if(current >= similarities.length) break; } } for(int i=0;i<similarities.length;i++) { logger.debug(distances4Factors[i][1] + "," + similarities[i]); assertEquals(distances4Factors[i][1], similarities[i], 0.1); } } @Test public void validateProjection8FactorsLanczos() throws Exception { corpusLanczos.getParameters().setDimensionalityReductionThreshold(8); corpusLanczos.load(repository); corpusLanczos.getSemanticSpace().calculate(); PassagesSimilarity similarity = new PassagesSimilarity(); similarity.setCorpus(queryCorpus); similarity.setBackgroundKnowledgeCorpus(corpusLanczos); similarity.setIncludeBackgroundInSimilarity(true); similarity.setSortBySimilarity(true); similarity.start(); double[] similarities = new double[distances8Factors.length]; int current = 0; for(PassageSimilarityResult result : similarity.getResults()) { logger.debug( result.getDocumentA() +"-"+ result.getDocumentB()+":"+ (new DecimalFormat("0.000")).format(result.getSimilarity())); if(!result.getDocumentA().equals(result.getDocumentB())) { similarities[current] = result.getSimilarity(); current++; if(current >= similarities.length) break; } } for(int i=0;i<similarities.length;i++) { logger.debug(distances8Factors[i][1] + "," + similarities[i]); assertEquals(distances8Factors[i][1], similarities[i], 0.1); } }*/ }