/******************************************************************************* * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ /** * */ package tml.vectorspace.operations; import java.util.Collections; import java.util.Comparator; import tml.utils.MatrixUtils; import tml.vectorspace.operations.results.PassageSimilarityResult; import Jama.Matrix; /** * This operation calculates the similarity between all documents within * a {@link Corpus}, it can calculate the similarities based on its own * {@link SemanticSpace}, or use another {@link Corpus} to project all documents * and the calculate similarities between all documents in both corpora. * * @author Jorge Villalon * */ public class PassagesSimilarity extends AbstractOperation<PassageSimilarityResult> { private boolean includeBackgroundInSimilarity = false; private boolean includeBackgroundInResults = false; private boolean sortBySimilarity = false; public PassagesSimilarity() { this.name = "Passages similarity"; } /** * @return the sortBySimilarity */ public boolean isSortBySimilarity() { return sortBySimilarity; } /** * @param sortBySimilarity the sortBySimilarity to set */ public void setSortBySimilarity(boolean sortBySimilarity) { this.sortBySimilarity = sortBySimilarity; } private Matrix similarities; /** * @return the similarities */ public Matrix getSimilarities() { return similarities; } /** * @param includeBackgroundInResults the includeBackgroundInResults to set */ public void setIncludeBackgroundInResults(boolean includeBackgroundInResults) { this.includeBackgroundInResults = includeBackgroundInResults; } /** * @param includeBackgroundInSimilarity the includeBackgroundInSimilarity to set */ public void setIncludeBackgroundInSimilarity( boolean includeBackgroundInSimilarity) { this.includeBackgroundInSimilarity = includeBackgroundInSimilarity; } @Override public void start() throws Exception { super.start(); if(!this.includeBackgroundInSimilarity) { fillResultsFromSpace( this.corpus.getSemanticSpace().getSk(), this.corpus.getSemanticSpace().getVk(), this.corpus.getPassages()); } else { Matrix v = this.corpus.getSemanticSpace().getVk(); Matrix vv = this.backgroundKnowledge.getSemanticSpace().getVk(); int docs = v.getRowDimension() + vv.getRowDimension(); int terms = v.getColumnDimension(); Matrix newV = new Matrix(docs, terms); newV.setMatrix(0,v.getRowDimension()-1,0,terms-1,v); newV.setMatrix(v.getRowDimension(),docs-1,0,terms-1,vv); // The new array of passages String[] passages = new String[docs]; for(int i=0;i<this.corpus.getPassages().length;i++) passages[i] = this.corpus.getPassages()[i]; for(int i=this.corpus.getPassages().length;i<docs;i++) passages[i] = backgroundKnowledge.getPassages()[i-this.corpus.getPassages().length]; fillResultsFromSpace( this.corpus.getSemanticSpace().getSk(), newV, passages); } if(this.isSortBySimilarity()) { Collections.sort(this.results, new Comparator<PassageSimilarityResult>() { @Override public int compare(PassageSimilarityResult o1, PassageSimilarityResult o2) { return (int) ((-o1.getSimilarity() * 10E9 + o2.getSimilarity() * 10E9)); } }); } super.end(); } private void fillResultsFromSpace(Matrix S, Matrix V, String[] passages) { // The distances between documents is calculated using V // First, V is scaled by S cause LSA works like that (check Deerwester 1990 // and Beery and Dumais 1994). similarities = V.times(S); // Second, normalize the distances otherwise we won't get 1 for exactly // the same documents. similarities = MatrixUtils.normalizeRows(similarities); // Finally, the all with all comparison is made. similarities = similarities.times(similarities.transpose()); int totalDocsA = similarities.getColumnDimension(); int totalDocsB = similarities.getColumnDimension(); if(!this.includeBackgroundInResults) { totalDocsA = this.corpus.getPassages().length; } for(int docA=0;docA<totalDocsA;docA++) for(int docB=docA;docB<totalDocsB;docB++) { PassageSimilarityResult result = new PassageSimilarityResult(); result.setDocumentA(passages[docA]); result.setDocumentB(passages[docB]); result.setSimilarity(similarities.get(docA, docB)); this.results.add(result); } } }