/*******************************************************************************
* Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
import org.apache.log4j.Logger;
import tml.corpus.SimpleCorpus;
import tml.corpus.CorpusParameters.DimensionalityReduction;
import tml.corpus.CorpusParameters.TermSelection;
import tml.utils.LuceneUtils;
import tml.vectorspace.TermWeighting.GlobalWeight;
import tml.vectorspace.TermWeighting.LocalWeight;
/**
* This class implements the integration with Matlab for TML
*
* @author Jorge Villalon
*
*/
public class SemanticSpace {
private static Logger logger = Logger.getLogger(SemanticSpace.class);
private SimpleCorpus corpus;
/**
* Creates a new instance of a SemanticSpace, the space will be created with
* all text documents found in a particular folder.
*
* @param pathToRepository
* folder containing the documents to be processed
* @param pathToMatlab
* folder where the user's matlab folder is
* @throws Exception
*/
public SemanticSpace(String pathToRepository, String pathToMatlab)
throws Exception {
this.corpus = new SimpleCorpus(pathToRepository, pathToMatlab, false);
}
/**
* Stemming words with Lucene and making it available in Matlab
* @param phrase
* @return
*/
public String stemWords(String phrase) {
return LuceneUtils.stemWords(phrase);
}
/**
* Loads the semantic space
*
* @throws Exception
*/
public void load() throws Exception {
try {
this.corpus.load();
} catch (Exception e) {
throw new Exception(
"Couldn't load the semantic space from the documents!", e);
}
}
/**
* Sets the criteria to select which terms will be included in the LSA space
*
* @param selection
* the criteria
* @param threshold
* the threshold above which the criteria will be validated
*/
public void setTermSelectionCriteria(int selection, double threshold) {
switch (selection) {
case 1:
this.corpus.getCorpus().getParameters().setTermSelectionCriterion(
TermSelection.DF);
break;
case 2:
this.corpus.getCorpus().getParameters().setTermSelectionCriterion(
TermSelection.TF);
break;
case 3:
this.corpus.getCorpus().getParameters().setTermSelectionCriterion(
TermSelection.AVG_TF);
break;
}
this.corpus.getCorpus().getParameters().setTermSelectionThreshold(threshold);
logger.info("Term selection criteria:"
+ this.corpus.getCorpus().getParameters().getTermSelectionCriterion()
+ " Threshold:"
+ this.corpus.getCorpus().getParameters().getTermSelectionThreshold());
}
/**
* Sets the criteria to select how many dimension will be kept after SVD
*
* @param reduction
* the criteria
* @param threshold
* the threshold above which the criteria will be validated
*/
public void setDimensionalityReductionCriteria(int reduction,
double threshold) {
switch (reduction) {
case 1:
this.corpus.getCorpus().getParameters()
.setDimensionalityReduction(
DimensionalityReduction.NUM);
break;
case 2:
this.corpus.getCorpus().getParameters()
.setDimensionalityReduction(
DimensionalityReduction.PCT);
break;
case 3:
this.corpus.getCorpus().getParameters()
.setDimensionalityReduction(
DimensionalityReduction.VARPCT);
break;
case 4:
this.corpus.getCorpus().getParameters()
.setDimensionalityReduction(
DimensionalityReduction.NO);
break;
}
this.corpus.getCorpus().getParameters()
.setDimensionalityReductionThreshold(threshold);
logger.info("Dimensionality reduction criteria:"
+ this.corpus.getCorpus().getParameters()
.getDimensionalityReduction()
+ " Threshold:"
+ this.corpus.getCorpus().getParameters()
.getDimensionalityReductionThreshold());
}
/**
* Sets the term weighting scheme that will be used to calculate the LSA
* space
*
* @param local
* local weight criterion
* @param global
* global weight criterion
*/
public void setTermWeightingCriteria(int local, int global) {
switch (local) {
case 1:
this.corpus.getCorpus().getParameters().setTermWeightLocal(
LocalWeight.Binary);
break;
case 2:
this.corpus.getCorpus().getParameters().setTermWeightLocal(
LocalWeight.LOGTF);
break;
case 3:
this.corpus.getCorpus().getParameters().setTermWeightLocal(
LocalWeight.TF);
break;
case 4:
this.corpus.getCorpus().getParameters().setTermWeightLocal(
LocalWeight.TFn);
break;
}
switch (global) {
case 1:
this.corpus.getCorpus().getParameters().setTermWeightGlobal(
GlobalWeight.Entropy);
break;
case 2:
this.corpus.getCorpus().getParameters().setTermWeightGlobal(
GlobalWeight.GfIdf);
break;
case 3:
this.corpus.getCorpus().getParameters().setTermWeightGlobal(
GlobalWeight.Idf);
break;
case 4:
this.corpus.getCorpus().getParameters().setTermWeightGlobal(
GlobalWeight.None);
break;
case 5:
this.corpus.getCorpus().getParameters().setTermWeightGlobal(
GlobalWeight.Normal);
break;
}
logger.info("Term weighting. Local:"
+ this.corpus.getCorpus().getParameters()
.getTermWeightLocal()
+ " Global:"
+ this.corpus.getCorpus().getParameters()
.getTermWeightGlobal());
}
/**
* Returns the matrix that represents the semantic space
*
* @return a matrix of doubles
* @throws Exception
*/
public double[][] getTermDocMatrix() throws Exception {
return this.corpus.getMatrix();
}
/**
* @return all terms in the semantic space
* @throws Exception
*/
public String[] getTerms() throws Exception {
return this.corpus.getTerms();
}
/**
* @return all the documents in the semantic space
* @throws Exception
*/
public String[] getDocuments() throws Exception {
return this.corpus.getDocuments();
}
}