/******************************************************************************* * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ package tml.corpus; import java.io.File; import java.io.FileReader; import java.util.Properties; import org.apache.log4j.Logger; import tml.vectorspace.TermWeighting.GlobalWeight; import tml.vectorspace.TermWeighting.LocalWeight; /** * Class that encapsulates all the parameters required to create * a {@link Corpus} and its corresponding {@link SemanticSpace}. * * @author Jorge Villalon * */ public class CorpusParameters implements Cloneable { /** * Criteria by which a {@link SemanticSpace} will reduce (or not) the * dimensions of the space. * * @author Jorge Villalon * */ public enum DimensionalityReduction { /** * This method selects the number of dimensions based on how much * variance they cover. The threshold is a percentage of the total * variance. */ VARPCT, /** * This method selects the number of dimensions based on a fixed number. */ NUM, /** * This method selects the number of dimensions based on a percentage of * the total number of dimensions in the space. */ PCT, /** * No dimensionality reduction will be performed. */ NO } /** * The criteria to select the terms that will be kept in the corpus */ public enum TermSelection { /** * TF: Number of times a term appears in the corpus. MIN_TF: Terms with * a TF lower than a threshold are discarded */ TF, /** * TF: Number of times a term appears in the corpus. MIN_AVG_TF: The * mean of the TF is calculated, and terms with an AVG_TF lower than a * threshold are discarded */ AVG_TF, /** * DF: Number of documents where the term appears. MIN_DF: Terms with a * DF lower than a threshold are discarded */ DF } private static Logger logger = Logger.getLogger(CorpusParameters.class); public static CorpusParameters getParametersFromString(String paramString) { String[] parts = paramString.split("_"); if(parts.length < 7) return null; CorpusParameters params = new CorpusParameters(); params.setTermSelectionCriterion(TermSelection.valueOf(parts[0])); params.setTermSelectionThreshold(Double.parseDouble(parts[1])); params.setTermWeightLocal(LocalWeight.valueOf(parts[2])); params.setTermWeightGlobal(GlobalWeight.valueOf(parts[3])); params.setDimensionalityReduction(DimensionalityReduction.valueOf(parts[4])); params.setDimensionalityReductionThreshold(Double.parseDouble(parts[5])); params.setLanczosSVD(parts[6].equals("L")); if(parts.length > 7) { params.setNormalizeDocuments(parts[7].equals("Y")); // params.setCalculateSemanticSpace(parts[8].equals("Y")); } return params; } /** Term selection criteria */ private TermSelection termSelectionCriterion = TermSelection.DF; /** Term selection threshold */ private double termSelectionThreshold = 2; /** Max number of documents the corpus can manage */ protected int maxDocuments = 3000; /** The term weighting scheme for this {@link SemanticSpace} */ private LocalWeight termWeightLocal = LocalWeight.TF; private GlobalWeight termWeightGlobal = GlobalWeight.None; private boolean normalizeDocuments = false; /** The dimensionality reduction criterion */ private DimensionalityReduction dimensionalityReduction = DimensionalityReduction.PCT; /** The dimensionality reduction threshold */ private double dimensionalityReductionThreshold = 20; /** If the semantic space should use the Lanczos SVD */ private boolean lanczosSVD = false; @Override protected Object clone() throws CloneNotSupportedException { CorpusParameters clone = (CorpusParameters) super.clone(); return clone; } public DimensionalityReduction getDimensionalityReduction() { return dimensionalityReduction; } public double getDimensionalityReductionThreshold() { return dimensionalityReductionThreshold; } /** * @return the maxDocuments */ public int getMaxDocuments() { return maxDocuments; } /** * @return the termSelectionCriterion */ public TermSelection getTermSelectionCriterion() { return termSelectionCriterion; } /** * @return the termSelectionThreshold */ public double getTermSelectionThreshold() { return termSelectionThreshold; } public GlobalWeight getTermWeightGlobal() { return termWeightGlobal; } public LocalWeight getTermWeightLocal() { return termWeightLocal; } /** * @return the lanczosSVD */ public boolean isLanczosSVD() { return lanczosSVD; } /** * @return the normalizeDocuments */ public boolean isNormalizeDocuments() { return normalizeDocuments; } public void loadFromFile(File file) { Properties props = new Properties(); try { props.load(new FileReader(file)); } catch (Exception e) { logger.error("Couldn't load file with parameters, sticking to the defaults"); e.printStackTrace(); logger.error(e); return; } String termSelectionCriterion = props.getProperty("termselcrit", "MIN_DF"); String termSelectionThreshold = props.getProperty("termselthre", "2"); String dimensionalityReductionCriterion = props.getProperty("reduxcrit", "DIMENSIONS_MAX_PERCENTAGE"); String dimensionalityReductionThreshold = props.getProperty("reduxthre", "25"); String localTermWeight = props.getProperty("localtw", "TF"); String globalTermWeight = props.getProperty("globaltw", "Idf"); String maxdocuments = props.getProperty("maxdocs", "9999"); String useLanczos = props.getProperty("lanczos"); if(termSelectionCriterion.equals("MIN_DF")) { this.setTermSelectionCriterion(TermSelection.DF); } else if (termSelectionCriterion.equals("MIN_AVG_TF")) { this.setTermSelectionCriterion(TermSelection.AVG_TF); } else if (termSelectionCriterion.equals("MIN_TF")) { this.setTermSelectionCriterion(TermSelection.TF); } if(useLanczos != null && useLanczos.equals("true")) { this.setLanczosSVD(true); } else this.setLanczosSVD(false); this.setTermSelectionThreshold(Double.parseDouble(termSelectionThreshold)); if(dimensionalityReductionCriterion.equals("DIMENSIONS_MAX_NUMBER")) { this.setDimensionalityReduction(DimensionalityReduction.NUM); } else if (dimensionalityReductionCriterion.equals("DIMENSIONS_MAX_PERCENTAGE")) { this.setDimensionalityReduction(DimensionalityReduction.PCT); } else if (dimensionalityReductionCriterion.equals("NO_REDUCTION")) { this.setDimensionalityReduction(DimensionalityReduction.NO); } else if (dimensionalityReductionCriterion.equals("VARIANCE_COVERAGE")) { this.setDimensionalityReduction(DimensionalityReduction.VARPCT); } this.setDimensionalityReductionThreshold(Double.parseDouble(dimensionalityReductionThreshold)); if(localTermWeight.equals("Binary")) { this.setTermWeightLocal(LocalWeight.Binary); } else if (localTermWeight.equals("LOGTF")) { this.setTermWeightLocal(LocalWeight.LOGTF); } else if (localTermWeight.equals("TF")) { this.setTermWeightLocal(LocalWeight.TF); } else if (localTermWeight.equals("TFn")) { this.setTermWeightLocal(LocalWeight.TFn); } if(localTermWeight.equals("Binary")) { this.setTermWeightLocal(LocalWeight.Binary); } else if (localTermWeight.equals("LOGTF")) { this.setTermWeightLocal(LocalWeight.LOGTF); } else if (localTermWeight.equals("TF")) { this.setTermWeightLocal(LocalWeight.TF); } else if (localTermWeight.equals("TFn")) { this.setTermWeightLocal(LocalWeight.TFn); } if(globalTermWeight.equals("Entropy")) { this.setTermWeightGlobal(GlobalWeight.Entropy); } else if (globalTermWeight.equals("GfIdf")) { this.setTermWeightGlobal(GlobalWeight.GfIdf); } else if (globalTermWeight.equals("Idf")) { this.setTermWeightGlobal(GlobalWeight.Idf); } else if (globalTermWeight.equals("None")) { this.setTermWeightGlobal(GlobalWeight.None); } else if (globalTermWeight.equals("Normal")) { this.setTermWeightGlobal(GlobalWeight.Normal); } this.setMaxDocuments(Integer.parseInt(maxdocuments)); } public void setDimensionalityReduction( DimensionalityReduction dimensionalityReduction) { this.dimensionalityReduction = dimensionalityReduction; } public void setDimensionalityReductionThreshold( double dimensionalityReductionThreshold) { this.dimensionalityReductionThreshold = dimensionalityReductionThreshold; } /** * @param lanczosSVD the lanczosSVD to set */ public void setLanczosSVD(boolean lanczosSVD) { this.lanczosSVD = lanczosSVD; } /** * @param maxDocuments the maxDocuments to set */ public void setMaxDocuments(int maxDocuments) { this.maxDocuments = maxDocuments; } /** * @param normalizeDocuments the normalizeDocuments to set */ public void setNormalizeDocuments(boolean normalizeDocuments) { this.normalizeDocuments = normalizeDocuments; } /** * @param termSelectionCriterion the termSelectionCriterion to set */ public void setTermSelectionCriterion(TermSelection termSelectionCriterion) { this.termSelectionCriterion = termSelectionCriterion; } /** * @param termSelectionThreshold the termSelectionThreshold to set */ public void setTermSelectionThreshold(double termSelectionThreshold) { this.termSelectionThreshold = termSelectionThreshold; } public void setTermWeightGlobal(GlobalWeight termWeightGlobal) { this.termWeightGlobal = termWeightGlobal; } public void setTermWeightLocal(LocalWeight termWeightLocal) { this.termWeightLocal = termWeightLocal; } public static CorpusParameters getNoReductionParameters() { CorpusParameters params = new CorpusParameters(); // params.setCalculateSemanticSpace(false); params.setDimensionalityReduction(DimensionalityReduction.NO); params.setDimensionalityReductionThreshold(0); params.setLanczosSVD(false); params.setMaxDocuments(Integer.MAX_VALUE); params.setNormalizeDocuments(false); params.setTermSelectionCriterion(TermSelection.DF); params.setTermSelectionThreshold(0); params.setTermWeightGlobal(GlobalWeight.None); params.setTermWeightLocal(LocalWeight.TF); return params; } @Override public String toString() { String lanczos = null; if(this.isLanczosSVD()) lanczos = "L"; else lanczos = "J"; return this.termSelectionCriterion + "_" + (int) this.termSelectionThreshold + "_" + this.termWeightLocal + "_" + this.termWeightGlobal + "_" + this.dimensionalityReduction + "_" + (int) this.dimensionalityReductionThreshold + "_" + lanczos ; } }