/*******************************************************************************
 * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package tml.test;

import java.text.DecimalFormat;

import org.junit.BeforeClass;
import org.junit.Test;

import tml.Configuration;
import tml.corpus.Corpus;
import tml.corpus.CorpusParameters.DimensionalityReduction;
import tml.corpus.SearchResultsCorpus;
import tml.vectorspace.TermWeighting.GlobalWeight;
import tml.vectorspace.TermWeighting.LocalWeight;
import tml.vectorspace.operations.PassagesSimilarity;
import tml.vectorspace.operations.results.PassageSimilarityResult;

import static org.junit.Assert.*;

import Jama.Matrix;

public class ValidateHandbookOfLSATest extends AbstractTmlIndexingTest {

	// Expected document-to-document similarities for the nine documents of the
	// Handbook of LSA example corpus (b1-b4, m1-m5); simpleProjection() compares
	// TML's similarity matrix against this reference.
	private static double[][] docDistances = {
		{1.00, 0.81, 0.53, 0.4, 0.85, 0.91, 0.82, 0.8, 0.79},
		{0.81, 1, 0.93, 0.87, 0.37, 0.49, 0.32, 0.29, 0.27},
		{0.53, 0.93, 1, 0.99, 0, 0.13, -0.05, -0.09, -0.1},
		{0.40, 0.87, 0.99, 1, -0.14, -0.01, -0.19, -0.23, -0.24},
		{0.85, 0.37, 0, -0.14, 1, 0.99, 1, 1, 1},
		{0.91, 0.49, 0.13, -0.01, 0.99, 1, 0.98, 0.98, 0.97},
		{0.82, 0.33, -0.05, -0.19, 1, 0.98, 1, 1, 1},
		{0.8, 0.29, -0.09, -0.23, 1, 0.98, 1, 1, 1},
		{0.79, 0.27, -0.1, -0.24, 1, 0.97, 1, 1, 1}
	};

	// Results obtained from WinGTP with a query file containing "Recipe for White bread" and
	// the Handbook of LSA corpus, with no singular value scaling (no -S in qrun) and
	// 2 factors (-n in qrun).
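	// Each value is the similarity between the projected query and one of the nine
	// background documents; validateSimilaritiesJama() checks TML's output against
	// these values (sorted in descending order of similarity) with a 0.1 tolerance.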
	public static double[] queryResults2FactorsNoScaling = {
		0.99225, 0.960501, 0.952626, 0.550482, 0.181349,
		0.0642336, 0.0232894, -0.0109286, -0.0234396
	};

	private static String[] terms = {
		"bread", "composition", "demonstration", "dough", "drum",
		"ingredients", "music", "recipe", "rock", "roll"
	};

	private static String[] documents = {
		"b1", "b2", "b3", "b4", "m1", "m2", "m3", "m4", "m5"
	};

	private static Corpus hbkLSAcorpus;
	private static Corpus hbkLSAqueries;
	private Matrix m;

	/**
	 * Indexes the Handbook of LSA example documents and builds two corpora: the
	 * background corpus (2-factor LSA, log-TF local weight, entropy global weight,
	 * normalized documents) and the query corpus containing document q5.
	 */
	@BeforeClass
	public static void setupBeforeClass() throws Exception {
		//AbstractTmlIndexingTest.setUpBeforeClass();
		repository.addDocumentsInFolder(Configuration.getTmlFolder() + "/corpora/handbookOfLSA");

		hbkLSAcorpus = new SearchResultsCorpus("type:document AND (externalid:m* OR externalid:b*)");
		hbkLSAcorpus.getParameters().setDimensionalityReduction(DimensionalityReduction.NUM);
		hbkLSAcorpus.getParameters().setDimensionalityReductionThreshold(2);
		hbkLSAcorpus.getParameters().setTermWeightLocal(LocalWeight.LOGTF);
		hbkLSAcorpus.getParameters().setTermWeightGlobal(GlobalWeight.Entropy);
		hbkLSAcorpus.getParameters().setNormalizeDocuments(true);
		hbkLSAcorpus.load(repository);

		hbkLSAqueries = new SearchResultsCorpus("type:document AND (externalid:q5)");
		hbkLSAqueries.getParameters().setTermSelectionThreshold(0);
		hbkLSAqueries.setProjection(true);
		hbkLSAqueries.load(repository);
		hbkLSAqueries.setName("Handbook of LSA queries");
	}

	/**
	 * The corpus should contain exactly the expected (stemmed) terms and document ids.
	 */
	@Test
	public void validateTermsAndDocuments() {
		assertEquals(terms.length, hbkLSAcorpus.getTerms().length);
		assertEquals(documents.length, hbkLSAcorpus.getPassages().length);
		for (int i = 0; i < terms.length; i++) {
			assertEquals(tml.utils.LuceneUtils.stemWords(terms[i]), hbkLSAcorpus.getTerms()[i]);
		}
		for (int i = 0; i < documents.length; i++) {
			assertEquals(documents[i], hbkLSAcorpus.getPassages()[i]);
		}
	}

	/**
	 * The Frobenius norm of the computed document similarity matrix should match
	 * that of the reference matrix from the Handbook of LSA example.
	 */
	@Test
	public void simpleProjection() throws Exception {
		PassagesSimilarity operation = new PassagesSimilarity();
		operation.setCorpus(hbkLSAcorpus);
		operation.setBackgroundKnowledgeCorpus(null);
		operation.setIncludeBackgroundInSimilarity(false);
		operation.setIncludeBackgroundInResults(false);
		operation.start();

		m = new Matrix(docDistances);
		assertEquals(m.normF(), operation.getSimilarities().normF(), 0.01);
	}

	/**
	 * Projects the query corpus into the background semantic space and compares the
	 * query-to-document similarities against the WinGTP reference values.
	 */
	@Test
	public void validateSimilaritiesJama() throws Exception {
		PassagesSimilarity queryOperation = new PassagesSimilarity();
		queryOperation.setCorpus(hbkLSAqueries);
		queryOperation.setBackgroundKnowledgeCorpus(hbkLSAcorpus);
		queryOperation.setIncludeBackgroundInSimilarity(true);
		queryOperation.setIncludeBackgroundInResults(false);
		queryOperation.setSortBySimilarity(true);
		queryOperation.start();

		double[] similarities = new double[queryOperation.getResults().size() - 1];
		int current = 0;
		for (PassageSimilarityResult result : queryOperation.getResults()) {
			logger.debug(result.getDocumentA() + "-" + result.getDocumentB() + ":"
					+ (new DecimalFormat("0.000")).format(result.getSimilarity()));
			// Skip the result where a document is compared with itself.
			if (!result.getDocumentA().equals(result.getDocumentB())) {
				similarities[current] = result.getSimilarity();
				current++;
			}
		}
		for (int i = 0; i < similarities.length; i++) {
			logger.debug(queryResults2FactorsNoScaling[i] + "," + similarities[i]);
			assertEquals(queryResults2FactorsNoScaling[i], similarities[i], 0.1);
		}
	}

	/*
	@Test
	public void validateSimilaritiesLanczos() throws Exception {
		hbkLSAcorpus.getParameters().setLanczosSVD(true);
		hbkLSAcorpus.getSemanticSpace().calculate();

		PassagesSimilarity operation = new PassagesSimilarity();
		operation.setCorpus(hbkLSAcorpus);
		operation.setBackgroundKnowledgeCorpus(null);
		operation.setIncludeBackgroundInSimilarity(false);
		operation.setIncludeBackgroundInResults(false);
		operation.start();

		assertEquals(m.normF(), operation.getSimilarities().normF(), 0.01);

		PassagesSimilarity queryOperation = new PassagesSimilarity();
		queryOperation.setCorpus(hbkLSAqueries);
		queryOperation.setBackgroundKnowledgeCorpus(hbkLSAcorpus);
		queryOperation.setIncludeBackgroundInSimilarity(true);
		queryOperation.setIncludeBackgroundInResults(false);
		queryOperation.setSortBySimilarity(true);
		queryOperation.start();

		double[] similarities = new double[queryOperation.getResults().size() - 1];
		int current = 0;
		for (PassageSimilarityResult result : queryOperation.getResults()) {
			logger.debug(result.getDocumentA() + "-" + result.getDocumentB() + ":"
					+ (new DecimalFormat("0.000")).format(result.getSimilarity()));
			if (!result.getDocumentA().equals(result.getDocumentB())) {
				similarities[current] = result.getSimilarity();
				current++;
			}
		}
		for (int i = 0; i < similarities.length; i++) {
			logger.debug(queryResults2FactorsNoScaling[i] + "," + similarities[i]);
			assertEquals(queryResults2FactorsNoScaling[i], similarities[i], 0.1);
		}
	}
	*/
}