package com.cse10.classifier;
import com.cse10.gate.DocumentKeyWordFinder;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.tokenizers.NGramTokenizer;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
/**
* remove obvious non-crime articles
* this is working with raw article data
* Created by Chamath on 12/21/2014.
*/
/**
 * Keyword-based pre-filter that removes obvious non-crime articles.
 * <p>
 * Operates on raw article data: each {@link Instance} is expected to carry the
 * article text at attribute index 0 and (for labelled data) the class label
 * ("crime" / "other") at attribute index 1. Text is tokenized into n-grams,
 * each token is lemmatized, and the resulting word list is checked against a
 * crime keyword dictionary.
 * <p>
 * Created by Chamath on 12/21/2014.
 */
public class KeyWordClassifierHandler extends ClassifierHandler {

    private DocumentKeyWordFinder documentKeyWordFinder;
    private NGramTokenizer tokenizer;
    private StanfordCoreNLPLemmatizer lemmatizer;
    private Logger log;

    public KeyWordClassifierHandler() {
        this.documentKeyWordFinder = new DocumentKeyWordFinder();
        this.tokenizer = new NGramTokenizer();
        this.lemmatizer = new StanfordCoreNLPLemmatizer();
        log = Logger.getLogger(this.getClass());
    }

    /**
     * Configure the n-gram tokenizer used to split article text.
     *
     * @param minTokenSize minimum n-gram size
     * @param maxTokenSize maximum n-gram size
     * @param delimiter    delimiter characters handed to the tokenizer
     */
    public void configure(int minTokenSize, int maxTokenSize, String delimiter) {
        tokenizer.setNGramMinSize(minTokenSize);
        tokenizer.setNGramMaxSize(maxTokenSize);
        tokenizer.setDelimiters(delimiter);
    }

    /**
     * Evaluate the keyword classifier against labelled raw article data.
     * An instance is predicted "crime" iff a crime keyword exists in its
     * lemmatized token stream.
     *
     * @param trainingData labelled instances (text at attr 0, label at attr 1)
     * @return two accuracy percentages: [crime recall, other recall];
     *         0.0 for a class with no instances in the data
     */
    public List<Double> crossValidateClassifier(Instances trainingData) {
        List<Double> accuracyValues = new ArrayList<>();
        double crimeCorrectCount = 0;
        double crimeIncorrectCount = 0;
        double otherCorrectCount = 0;
        double otherIncorrectCount = 0;
        for (int i = 0; i < trainingData.numInstances(); i++) {
            String content = trainingData.instance(i).stringValue(0);
            String label = trainingData.instance(i).stringValue(1);
            boolean exist = documentKeyWordFinder.isKeyWordExist(lemmatizedWords(content));
            if (label.equals("crime") && exist) {
                log.info("crime correct");
                crimeCorrectCount++;
            } else if (label.equals("crime") && !exist) {
                log.info("crime incorrect");
                crimeIncorrectCount++;
            } else if (label.equals("other") && !exist) {
                log.info("non crime correct");
                otherCorrectCount++;
            } else {
                log.info("non crime incorrect");
                otherIncorrectCount++;
            }
        }
        // Compute each accuracy once; guard against division by zero (NaN)
        // when a class is absent from the data.
        double crimeAccuracy = percentage(crimeCorrectCount, crimeCorrectCount + crimeIncorrectCount);
        double otherAccuracy = percentage(otherCorrectCount, otherCorrectCount + otherIncorrectCount);
        accuracyValues.add(crimeAccuracy);
        accuracyValues.add(otherAccuracy);
        log.info("Crime Accuracy= " + crimeAccuracy + "%");
        log.info("Other Accuracy= " + otherAccuracy + "%");
        return accuracyValues;
    }

    /**
     * Classify a single raw article instance by keyword presence.
     *
     * @param testInstance raw article instance (text at attribute 0)
     * @return 0.0 if a crime keyword exists (predicted "crime"), else 1.0
     */
    public double classifyInstance(Instance testInstance) {
        String content = testInstance.stringValue(0);
        // primitive boolean — the original used boxed Boolean needlessly
        boolean exist = documentKeyWordFinder.isKeyWordExist(lemmatizedWords(content));
        if (exist)
            return 0.0;
        else
            return 1.0;
    }

    /**
     * Tokenize the content into n-grams and lemmatize each token.
     * Uses a StringBuilder instead of repeated String.concat, which was
     * O(n^2) in the number of tokens.
     *
     * @param content raw article text
     * @return space-separated lemmatized tokens (trailing space preserved,
     *         matching the original concat behavior)
     */
    private String lemmatizedWords(String content) {
        tokenizer.tokenize(content);
        StringBuilder words = new StringBuilder();
        while (tokenizer.hasMoreElements()) {
            String element = (String) tokenizer.nextElement();
            words.append(lemmatizer.stem(element)).append(' ');
        }
        return words.toString();
    }

    /**
     * Safe percentage: returns 0.0 instead of NaN when the total is zero.
     *
     * @param correct number of correct predictions
     * @param total   total predictions for the class
     * @return correct/total as a percentage, or 0.0 if total is zero
     */
    private static double percentage(double correct, double total) {
        return total == 0 ? 0.0 : (correct / total) * 100;
    }

    // exposed for functional testing only
    public NGramTokenizer getTokenizer() {
        return tokenizer;
    }
}