package uk.ac.shef.dcs.jate.feature; import org.apache.commons.lang.exception.ExceptionUtils; import uk.ac.shef.dcs.jate.JATEException; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import org.apache.log4j.Logger; /** * A specific type of feature builder that builds an instance of * FeatureRefCorpusTermFrequency. This is a dummy class which reads the data * from a text file which stores information as: <br> * [freq_in_corpus] [term] * * @author <a href="mailto:ziqi.zhang@sheffield.ac.uk">Ziqi Zhang</a> */ public class TTFReferenceFeatureFileBuilder extends AbstractFeatureBuilder { protected final Logger LOG = Logger.getLogger(TTFReferenceFeatureFileBuilder.class.getName()); private final String _refStatsPath; /** * Default constructor * * @param refStatsPath * file path to the reference corpus statistics file. The file * should store one term on a line, and in the format of: <br> * [freq_in_ref_corpus] [term] <br> * Any terms with frequency < 2 will be ignored. */ public TTFReferenceFeatureFileBuilder(String refStatsPath) { super(null, null); _refStatsPath = refStatsPath; } /** * Dummy method which does nothing with the GlobalIndexMem instance but load * statistics and creates and instance of FeatureRefCorpusTermFrequency from * the file specified in the constructor * * @return uk.ac.shef.dcs.jate.feature.FrequencyTermBased * @throws uk.ac.shef.dcs.jate.JATEException */ public FrequencyTermBased build() throws JATEException { FrequencyTermBased feature = new FrequencyTermBased(); try { final BufferedReader reader = new BufferedReader(new FileReader(_refStatsPath)); try { String line; while ((line = reader.readLine()) != null) { line = line.trim(); String[] elements = line.split("\\s+"); if (Integer.valueOf(elements[0]) < 2) continue; feature.increment(elements[1].trim(), Integer.valueOf(elements[0])); } } finally { reader.close(); } } catch (IOException e) { StringBuilder sb = new StringBuilder("Failed to build features!"); sb.append("\n").append(ExceptionUtils.getFullStackTrace(e)); throw new JATEException(sb.toString()); } return feature; } }