/******************************************************************************* * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ package tml.vectorspace.operations; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import tml.vectorspace.operations.results.TermsExtractionSummarizationResult; import Jama.Matrix; /** * This operation extracts the key terms from the corpus, ranked by their * importance on explaining the variance. * * @author Jorge Villalon * */ public class TermExtractionSummarization extends AbstractOperation<TermsExtractionSummarizationResult> { /** * Creates a new instance of {@link TermExtractionSummarization} */ public TermExtractionSummarization() { this.name = "Terms extraction"; } @Override public void start() throws Exception { super.start(); this.results = new ArrayList<TermsExtractionSummarizationResult>(); Matrix eigenVectors = this.corpus.getSemanticSpace() .getUk(); double[] eigenValues = this.corpus.getSemanticSpace() .getSk().getColumnPackedCopy(); logger.debug("rows:" + eigenVectors.getRowDimension() + " columns:" + eigenVectors.getColumnDimension()); for (int i = 0; i < eigenVectors.getRowDimension(); i++) { String term = this.corpus.getTerms()[i]; double maxTermWeight = 0; int termIndex = 0; for (int j = 0; j < eigenVectors.getColumnDimension(); j++) { double eigenvalue = 0; if (j < eigenValues.length) eigenvalue = eigenValues[j]; double termWeight = Math.abs(eigenVectors.get(i, j) * eigenvalue); if (maxTermWeight < termWeight) { maxTermWeight = termWeight; termIndex = j; } } logger.debug("Inserting term " + term + " with key " + maxTermWeight); TermsExtractionSummarizationResult result = new TermsExtractionSummarizationResult(); result.setEigenVectorIndex(termIndex); result.setLoad(maxTermWeight); result.setTerm(term); this.results.add(result); } Collections.sort(this.results, new Comparator<TermsExtractionSummarizationResult>() { @Override public int compare(TermsExtractionSummarizationResult o1, TermsExtractionSummarizationResult o2) { if (o1.getLoad() == o2.getLoad()) { return 1; } return (int) (o2.getLoad() * 100 - o1.getLoad() * 100); } }); // If we have a maximum number of results, we delete others if(this.maxResults > 0) { int toRemove = this.results.size() - this.maxResults; while (toRemove > 0) { this.results.remove(0); toRemove--; } } super.end(); } }