package com.cse10.classifier;

import weka.core.Instances;
import weka.core.converters.ArffSaver;
import weka.core.stemmers.SnowballStemmer;
import weka.core.tokenizers.NGramTokenizer;
import weka.filters.unsupervised.attribute.StringToWordVector;

import org.apache.log4j.Logger;

import java.io.File;
import java.io.IOException;

/**
 * Transforms document features into TF-IDF word vectors using Weka's
 * StringToWordVector filter.
 * Created by Chamath on 12/16/2014.
 */
public class FeatureVectorTransformer {

    protected StringToWordVector filter;
    private Logger log;

    public FeatureVectorTransformer() {
        filter = new StringToWordVector();
        log = Logger.getLogger(this.getClass());
    }

    /**
     * Configures the filter: n-gram tokenization, stemming or lemmatization,
     * lower-casing, TF-IDF weighting, and stop-word removal.
     *
     * @param minNGramSize minimum n-gram size for the tokenizer
     * @param maxNGramSize maximum n-gram size for the tokenizer
     * @param useStemmer   if true, use the Snowball stemmer; otherwise use the
     *                     Stanford CoreNLP lemmatizer
     */
    public void configure(int minNGramSize, int maxNGramSize, boolean useStemmer) {
        log.info("\n Feature Vector Transformer -> Configuration Started");

        // set tokenizer - we can specify n-grams for classification
        NGramTokenizer tokenizer = new NGramTokenizer();
        tokenizer.setNGramMinSize(minNGramSize);
        tokenizer.setNGramMaxSize(maxNGramSize);
        tokenizer.setDelimiters("\\W");

        // set stemmer or lemmatizer
        if (useStemmer) {
            SnowballStemmer stemmer = new SnowballStemmer();
            stemmer.setStemmer("english");
            filter.setStemmer(stemmer);
        } else {
            StanfordCoreNLPLemmatizer lemmatizer = new StanfordCoreNLPLemmatizer();
            filter.setStemmer(lemmatizer);
        }

        // configure the filter for vector transformation
        // (forward slashes keep the resource path portable across platforms)
        filter.setLowerCaseTokens(true);
        filter.setOutputWordCounts(true);
        filter.setTFTransform(true);
        filter.setIDFTransform(true);
        filter.setStopwords(new File("Classifier/src/main/resources/StopWordsR4.txt"));
        filter.setTokenizer(tokenizer);

        log.info("\n Feature Vector Transformer -> Configuration Completed");
    }

    /**
     * Sets the input format of the filter.
     *
     * @param instances instances defining the input format
     */
    public void setInputFormat(Instances instances) {
        try {
            filter.setInputFormat(instances);
        } catch (Exception e) {
            log.error("Failed to set the filter's input format", e);
        }
    }

    /**
     * Transforms articles to feature vectors and saves them as an ARFF file.
     *
     * @param instances articles to transform
     * @param fileName  name of the ARFF file to write
     * @return the filtered instances, or the unfiltered input if filtering fails
     */
    public Instances getTransformedArticles(Instances instances, String fileName) {
        log.info("\n Feature Vector Transformer -> Start Getting Transformed Articles");
        Instances dataFiltered;

        // apply the StringToWordVector filter
        try {
            dataFiltered = weka.filters.Filter.useFilter(instances, filter);
        } catch (Exception e) {
            dataFiltered = instances;
            log.error("Failed to apply the StringToWordVector filter", e);
        }

        // save to file
        ArffSaver saver = new ArffSaver();
        saver.setInstances(dataFiltered);
        try {
            String path = "Classifier/src/main/resources/arffData/" + fileName;
            saver.setFile(new File(path));
            saver.writeBatch();
        } catch (IOException e) {
            log.error("Failed to save the transformed articles to " + fileName, e);
        }

        log.info("\n Feature Vector Transformer -> Finish Getting Transformed Articles");
        return dataFiltered;
    }

    /**
     * Transforms articles to feature vectors without saving them to a file.
     *
     * @param instances articles to transform
     * @return the filtered instances, or the unfiltered input if filtering fails
     */
    public Instances getTransformedArticles(Instances instances) {
        log.info("\n Feature Vector Transformer -> Start Getting Transformed Articles");
        Instances dataFiltered;
        try {
            dataFiltered = weka.filters.Filter.useFilter(instances, filter);
        } catch (Exception e) {
            dataFiltered = instances;
            log.error("Failed to apply the StringToWordVector filter", e);
        }
        log.info("\n Feature Vector Transformer -> Finish Getting Transformed Articles");
        return dataFiltered;
    }

    // exposed for functional testing
    public StringToWordVector getFilter() {
        return filter;
    }
}
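
/*
 * Minimal usage sketch (illustrative, not part of the original class): shows
 * how the transformer is driven end to end. The file names "articles.arff"
 * and "articles-tfidf.arff" are hypothetical placeholders, and the input is
 * assumed to be an ARFF file whose last attribute is the class label.
 */
class FeatureVectorTransformerDemo {

    public static void main(String[] args) throws Exception {
        // load raw articles and mark the class attribute (assumed to be last)
        Instances raw = weka.core.converters.ConverterUtils.DataSource.read("articles.arff");
        raw.setClassIndex(raw.numAttributes() - 1);

        // unigrams and bigrams, with Snowball stemming
        FeatureVectorTransformer transformer = new FeatureVectorTransformer();
        transformer.configure(1, 2, true);
        transformer.setInputFormat(raw);

        // transform the articles and persist the TF-IDF vectors
        Instances vectors = transformer.getTransformedArticles(raw, "articles-tfidf.arff");
        System.out.println("Transformed " + vectors.numInstances() + " articles into "
                + (vectors.numAttributes() - 1) + " features.");
    }
}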