package com.cse10.classifier;
import com.cse10.article.TrainingArticle;
import com.cse10.database.DatabaseHandler;
import com.cse10.gate.DocumentContentFilter;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import java.util.List;
import org.apache.log4j.Logger;
/**
 * Data handler that loads the training articles from the database and passes
 * each article's content through a {@link DocumentContentFilter} before adding
 * it to the Weka training set.
 * <p>
 * Created by Chamath on 12/20/2014.
 */
public class DataHandlerWithGate extends DataHandler {

    /** Name of the string attribute holding the filtered article text. */
    private static final String TEXT_ATTRIBUTE_NAME = "text";

    /** Name of the nominal class attribute. */
    private static final String CLASS_ATTRIBUTE_NAME = "@@class@@";

    /** Relation name given to the generated data set. */
    private static final String RELATION_NAME = "TrainingNews";

    /** Filter applied to every article's content before it is stored in an instance. */
    private final DocumentContentFilter documentContentFilter;

    /** Logger named after the runtime class of this handler. */
    private final Logger log;

    /**
     * Creates the handler, sets the inherited {@code fileName} to
     * {@code "dataWithGate"} and instantiates the content filter.
     */
    public DataHandlerWithGate() {
        log = Logger.getLogger(this.getClass());
        fileName = "dataWithGate";
        documentContentFilter = new DocumentContentFilter();
    }

    /**
     * Logs a short description of this data handler at INFO level.
     *
     * @return the description that was logged
     */
    @Override
    protected String printDescription() {
        String description = "This data handler will load training data and filter nouns,adjectives,verbs and adverbs from article content";
        log.info(description);
        return description;
    }

    /**
     * Fetches all training articles through {@link DatabaseHandler} and converts
     * them into a two-attribute data set: the filtered article text (string
     * attribute) and the class label (nominal attribute with the values
     * {@code "crime"} and {@code "other"}). The class attribute is the last one.
     *
     * @param featureVectorTransformer not used by this implementation; kept so
     *                                 the method signature stays unchanged
     * @return the training instances, with the class index set to the last attribute
     */
    public Instances loadTrainingData(FeatureVectorTransformer featureVectorTransformer) {
        printDescription();

        // A null value list makes Weka create a string (free text) attribute.
        Attribute content = new Attribute(TEXT_ATTRIBUTE_NAME, (FastVector) null);

        FastVector classVal = new FastVector();
        classVal.addElement("crime");
        classVal.addElement("other");
        Attribute classValue = new Attribute(CLASS_ATTRIBUTE_NAME, classVal);

        // News text first, class attribute last.
        FastVector attributeList = new FastVector(2);
        attributeList.addElement(content);
        attributeList.addElement(classValue);

        Instances trainingData = new Instances(RELATION_NAME, attributeList, 0);
        // The data set was just created, so its class index is still unset;
        // setting it once here is sufficient.
        trainingData.setClassIndex(trainingData.numAttributes() - 1);

        // Load training data using the database handler.
        List<TrainingArticle> trainingArticles = DatabaseHandler.fetchTrainingArticles();
        for (TrainingArticle trainingArticle : trainingArticles) {
            Instance inst = new Instance(trainingData.numAttributes());
            inst.setValue(content, documentContentFilter.getFilterdContent(trainingArticle.getContent()));
            // NOTE(review): presumably every stored label is "crime" or "other";
            // confirm against the database contents.
            inst.setValue(classValue, trainingArticle.getLabel());
            inst.setDataset(trainingData);
            trainingData.add(inst);
        }
        return trainingData;
    }
}