/******************************************************************************* * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ package tml.vectorspace; import java.io.File; import Jama.SingularValueDecomposition; import Jama.Matrix; import org.apache.log4j.Logger; import tml.corpus.Corpus; import tml.corpus.CorpusParameters.DimensionalityReduction; import tml.utils.LanczosSVDLIBCUtils; /** * <p> * This class is a Vector Space Model representation of a group of documents or * {@link Corpus} constructed using Latent Semantic Indexing, it contains a term * by document matrix for the {@link Corpus}. * </p> * <p> * Some of the LSI steps are performed by this class: * </p> * <ul> * <li>6. Term weighting, calculates a new term weight as the multiplication of * a local and a global weight for each term-doc pair.</li> * <li>7. Dimensionality reduction, performs the SVD and reconstruct the matrix * considering less dimensions.</li> * <li>8. Normalisation, performs a crude normalisation of the vectors in the * space.</li> * </ul> * <p> * Several {@link Operation}s can be performed on a {@link SemanticSpace}. Each * one contains a list of results, that can be read from the operation in * Object[][], HTML and Graphic format for human consumption. * </p> * * @author Jorge Villalon * */ public class SemanticSpace implements Cloneable { private static final int MAX_MATRIX_SIZE = 100000; /** The logger */ private static Logger logger = Logger.getLogger(SemanticSpace.class); /** The {@link Corpus} that was the source for the {@link TextPassage}s */ private Corpus corpus = null; /** Terms matrix in the semantic space */ private Matrix Uk = null; /** Singular values in the semantic space */ private Matrix Sk = null; /** Documents matrix in the semantic space */ private Matrix Vk = null; /** The number of dimensions that were kept */ private int dimensionsKept = -1; /** The time in milliseconds the {@link SemanticSpace} took to calculate the space */ private long processingTime = 0; /** * Creates a new {@link SemanticSpace} from a {@link Corpus}. * * @param sourceCorpus * the {@link Corpus} for the {@link SemanticSpace} */ public SemanticSpace(Corpus sourceCorpus) { assert (sourceCorpus != null); this.corpus = sourceCorpus; } public boolean isCalculated() { return this.Uk != null && this.Vk != null && this.Sk != null && this.dimensionsKept > 0; } /** * Applies the dimensionality reduction to the matrix */ private Matrix applyDimensionalityReduction(Matrix termDoc) { logger.debug("Applying dimensionality reduction"); dimensionsKept = this.corpus.getDimensions(); String svdFilename = "tml_" + this.corpus.getFilename() + "_" + this.corpus.getParameters() + "_DIM_" + this.dimensionsKept + ".svd"; File svdFile = new File(this.corpus.getRepository().getSvdStoragePath() + "/" + svdFilename); boolean readSVDFromFile = false; if(this.corpus.getPassages().length * this.corpus.getTerms().length > MAX_MATRIX_SIZE) { if(svdFile.exists()) { try { SVD svd = SVD.readSVD(svdFile); readSVDFromFile = true; this.Uk = new Matrix(svd.getUkdata()); this.Sk = new Matrix(svd.getSkdata()); this.Vk = new Matrix(svd.getVkdata()); logger.debug("Big corpus, SVD file exists, reading it."); } catch (Exception e) { logger.debug("Big corpus, SVD file exists, but there were problems reading it."); logger.error(e); } } else { logger.debug("Big corpus, but SVD file wasn't found."); } } if(this.corpus.getParameters().isLanczosSVD() && !readSVDFromFile && this.corpus.getParameters().getDimensionalityReduction() != DimensionalityReduction.VARPCT) { logger.debug("Using Lanczos"); LanczosSVDLIBCUtils utils = null; try { utils = new LanczosSVDLIBCUtils(); utils.runLanczos(this.corpus, svdFilename); } catch (Exception e) { e.printStackTrace(); logger.error(e); this.Uk = null; this.Sk = null; this.Vk = null; return null; } this.Uk = utils.getU(); this.Sk = utils.getS(); this.Vk = utils.getV(); } else if(!readSVDFromFile){ logger.debug("Using Jama SVD"); SingularValueDecomposition svd = termDoc.svd(); this.Uk = new Matrix(svd.getU().getArray()); this.Sk = new Matrix(svd.getS().getArray()); this.Vk = new Matrix(svd.getV().getArray()); boolean invert = false; for(int i=0;i<this.Uk.getRowDimension();i++) { if(this.Uk.get(i, 0) < 0) { invert = true; break; } } if(invert) { logger.warn("Matrix inverted because first dimensions caused negative singular vectors in Jama"); this.Uk = this.Uk.times(-1); this.Vk = this.Vk.times(-1); } } if(this.corpus.getParameters().getDimensionalityReduction() != DimensionalityReduction.NO && !readSVDFromFile) { // Really reducing the dimensions of the matrices Matrix nUk = new Matrix(this.Uk.getRowDimension(), dimensionsKept); nUk.setMatrix(0, nUk.getRowDimension()-1, 0, dimensionsKept-1, this.Uk); this.Uk = nUk; Matrix nVk = new Matrix(this.Vk.getRowDimension(), dimensionsKept); nVk.setMatrix(0, nVk.getRowDimension()-1, 0, dimensionsKept-1, this.Vk); this.Vk = nVk; Matrix nSk = new Matrix(dimensionsKept, dimensionsKept); for(int i=0;i<dimensionsKept;i++) nSk.set(i, i, this.Sk.get(i, i)); this.Sk = nSk; } if(this.corpus.getPassages().length * this.corpus.getTerms().length > 10000 && !readSVDFromFile) { SVD svd = new SVD(); svd.setUkdata(this.Uk.getArray()); svd.setSkdata(this.Sk.getArray()); svd.setVkdata(this.Vk.getArray()); try { svd.saveSVD(svdFile); } catch (Exception e) { logger.error(e); } } try { termDoc = this.Uk.times(this.Sk).times( this.Vk.transpose()); } catch (ArrayIndexOutOfBoundsException ex) { logger .error("Problem reconstructing with reconstructing the matrix after DR"); throw ex; } return termDoc; } /** * Calculates the term by doc matrix for the {@link SemanticSpace} based on * the documents in the {@link Corpus}. * * @throws NotEnoughTermsInCorpusException * @throws TermWeightingException * @throws NormalizationException */ public void calculate() throws NotEnoughTermsInCorpusException { // First of all we check that the corpus contains enough terms to // calculate a semantic space if (this.getCorpus().getTerms().length <= 0 || this.getCorpus().getTerms().length < this .getCorpus().getPassages().length - 1) { throw new NotEnoughTermsInCorpusException(); } this.processingTime = System.currentTimeMillis(); Matrix m = this.corpus.getTermDocMatrix(); if(!this.corpus.isProjection()) { // Apply the dimensionality reduction m = this.applyDimensionalityReduction(m); } this.processingTime = System.currentTimeMillis() - this.processingTime; logger.info("Semantic space calculated in " + this.processingTime + " ms. " + "Parameters:" + this.corpus.getParameters()); } @Override public Object clone() throws CloneNotSupportedException { SemanticSpace clone = (SemanticSpace) super.clone(); if(this.Uk != null) clone.Uk = this.Uk.copy(); if(this.Sk != null) clone.Sk = this.Sk.copy(); if(this.Vk != null) clone.Vk = this.Vk.copy(); return clone; } /** * @return the {@link Corpus} that a {@link SemanticSpace} uses */ public Corpus getCorpus() { return this.corpus; } /** * @return the number of dimensions that the space kept */ public int getDimensionsKept() { return dimensionsKept; } /** * Gets the name of the {@link SemanticSpace} * * @return a String with the name */ public String getName() { return "Semantic space for " + this.getCorpus().getName(); } /** * The time that the {@link SemanticSpace} took to calculate its basic * operations * * @return time in milliseconds */ public long getProcessingTime() { return this.processingTime; } /** * @return the sk */ public Matrix getSk() { return Sk; } /** * @return The Ak reduced term-documents matrix. */ public Matrix getTermsDocuments() { return this.Uk.times(this.Sk).times(this.Vk.transpose()); } /** * @return the time taken to calculate the semantic space */ public long getTimeToCalculate() { return processingTime; } /** * @return the uk */ public Matrix getUk() { return Uk; } /** * @return the vk */ public Matrix getVk() { return Vk; } public void setCorpus(Corpus corpus) { this.corpus = corpus; } /** * @param vk the vk to set */ public void setVk(Matrix vk) { Vk = vk; } /** * Overrides the default toString method and replaces it with the * {@link SemanticSpace} name */ @Override public String toString() { if (this.getName() == null) return super.toString(); return this.getName(); } }