/** * */ package com.maalaang.omtwitter.uima.consumer; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.Map; import java.util.Set; import java.util.Map.Entry; import org.apache.uima.cas.CAS; import org.apache.uima.cas.CASException; import org.apache.uima.cas.CASRuntimeException; import org.apache.uima.cas.FSIterator; import org.apache.uima.collection.CasConsumer_ImplBase; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceProcessException; import org.apache.uima.util.Level; import org.apache.uima.util.Logger; import com.maalaang.omtwitter.io.OMTwitterCorpusFile; import com.maalaang.omtwitter.io.OMTwitterCorpusFileReader; import com.maalaang.omtwitter.model.OMTweet; import com.maalaang.omtwitter.uima.type.TokenAnnotation; import com.maalaang.omtwitter.uima.type.TweetAnnotation; /** * @author Sangwon Park * */ public class OMTwitterResultEvaluateConsumer extends CasConsumer_ImplBase { private final static String PARAM_EVALUATION_CORPUS_FILE = "evaluationCorpusFile"; private final static String PARAM_EVALUATION_CORPUS_DELIM = "evaluationCorpusDelim"; private final static String PARAM_EVALUATION_CORPUS_FIELDS = "evaluationCorpusFields"; private final static String PARAM_PRINT_RESULT = "printResult"; private final static String PARAM_NAMED_ENTITY_TAGS = "namedEntityTags"; private final static String PARAM_LABEL_NONE = "labelNone"; private OMTwitterCorpusFileReader evalCorpusReader = null; private Logger logger = null; private boolean printResult = false; private String labelNone = null; private int labelNoneIdx = 0; private int cnt = 0; private int senti[][] = null; private Map<String,Integer> map = null; private int[][] stat = null; private int nerCnt = 0; private int nerCorrrect = 0; private int nerClassified = 0; private int[] classifiedEntityCnt = null; private int[] answerEntityCnt = null; @Override public void initialize() throws ResourceInitializationException { super.initialize(); logger = getLogger(); try { evalCorpusReader = new OMTwitterCorpusFileReader((String)getConfigParameterValue(PARAM_EVALUATION_CORPUS_FILE), (String)getConfigParameterValue(PARAM_EVALUATION_CORPUS_DELIM), OMTwitterCorpusFile.fieldNameToId((String)getConfigParameterValue(PARAM_EVALUATION_CORPUS_FIELDS), " ")); } catch (Exception e) { logger.log(Level.SEVERE, e.getMessage()); throw new ResourceInitializationException(e); } printResult = (Boolean)getConfigParameterValue(PARAM_PRINT_RESULT); String neTagsStr = (String)getConfigParameterValue(PARAM_NAMED_ENTITY_TAGS); if (neTagsStr == null) { throw new ResourceInitializationException(); } labelNone = (String)getConfigParameterValue(PARAM_LABEL_NONE); String[] neTags = neTagsStr.split(" "); int idx = 0; map = new HashMap<String,Integer>(); for (String tag : neTags) { map.put(tag + "_B", idx++); map.put(tag + "_M", idx++); map.put(tag + "_E", idx++); } map.put(labelNone, idx++); labelNoneIdx = idx - 1; stat = new int[idx][3]; senti = new int[3][3]; classifiedEntityCnt = new int[idx/3]; answerEntityCnt = new int[idx/3]; } /* (non-Javadoc) * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS) */ public void processCas(CAS aCAS) throws ResourceProcessException { JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { logger.log(Level.SEVERE, e.getMessage()); throw new ResourceProcessException(e); } TweetAnnotation tweetAnn = (TweetAnnotation)jcas.getAnnotationIndex(TweetAnnotation.type).iterator().next(); OMTweet answerTweet = evalCorpusReader.next(); if(!answerTweet.getId().equals(tweetAnn.getId())) { logger.log(Level.SEVERE, "target corpus and evaluation corpus don't match to each other - " + answerTweet.getId() + ", " + tweetAnn.getId()); throw new ResourceProcessException(); } String[] entity = extractEntityTags(answerTweet.getText()); String classified = null; String prevClassified = null; StringBuffer sb = new StringBuffer(); try { sb.append("\n["); sb.append(answerTweet.getPolarityString()); sb.append("=>"); sb.append(tweetAnn.getPolarity()); sb.append("] "); sb.append(tweetAnn.getCoveredText()); sb.append('\n'); FSIterator<Annotation> tokenAnnIter = jcas.getAnnotationIndex(TokenAnnotation.type).iterator(); TokenAnnotation tokenAnn = null; int i = 0; int prevClassifiedIdx = labelNoneIdx; int prevAnswerIdx = labelNoneIdx; String classifiedEntityStr = ""; String answerEntityStr = ""; while (tokenAnnIter.hasNext()) { tokenAnn = (TokenAnnotation) tokenAnnIter.next(); classified = tokenAnn.getEntityLabel(); String answer = entity[i]; boolean correct = false; if (classified.equals(answer)) { correct = true; } int classifiedIdx = 0; int answerIdx = 0; try { answerIdx = map.get(answer); } catch (Exception e) { logger.log(Level.SEVERE, "wrong annotation on the evaluation corpus - tweet id: " + answerTweet.getId() + ", answerTag=" + answer); logger.log(Level.SEVERE, e.getMessage()); answerIdx = map.get(labelNone); } try { classifiedIdx = map.get(classified); } catch (Exception e) { logger.log(Level.SEVERE, "wrong annotation from the NER - tweet id: " + answerTweet.getId() + ", classifiedTag=" + classified); logger.log(Level.SEVERE, e.getMessage()); classifiedIdx = map.get(labelNone); } stat[classifiedIdx][0]++; stat[answerIdx][1]++; if (correct) { stat[classifiedIdx][2]++; } if (classifiedIdx != labelNoneIdx) { if (classifiedIdx/3 != prevClassifiedIdx/3) { classifiedEntityCnt[classifiedIdx/3]++; if (prevClassifiedIdx != labelNoneIdx) { sb.append('\t'); sb.append(classifiedEntityStr); sb.append(" -> "); sb.append(prevClassified.substring(0, prevClassified.lastIndexOf('_'))); sb.append('\n'); } classifiedEntityStr = tokenAnn.getCoveredText(); } else { classifiedEntityStr += " " + tokenAnn.getCoveredText(); } } else if (prevClassifiedIdx != labelNoneIdx) { sb.append('\t'); sb.append(classifiedEntityStr); sb.append(" -> "); sb.append(prevClassified.substring(0, prevClassified.lastIndexOf('_'))); sb.append('\n'); classifiedEntityStr = ""; } prevClassifiedIdx = classifiedIdx; if (answerIdx != labelNoneIdx) { if (answerIdx/3 != prevAnswerIdx/3) { answerEntityCnt[answerIdx/3]++; answerEntityStr = tokenAnn.getCoveredText(); } else { answerEntityStr += " " + tokenAnn.getCoveredText(); } } else if (prevAnswerIdx != labelNoneIdx) { answerEntityStr = ""; } prevAnswerIdx = answerIdx; prevClassified = classified; i++; } if (prevClassifiedIdx != labelNoneIdx) { sb.append('\t'); sb.append(classifiedEntityStr); sb.append(" -> "); sb.append(prevClassified.substring(0, prevClassified.lastIndexOf('_'))); sb.append('\n'); } // senti String answerSenti = answerTweet.getPolarityString(); boolean correct = false; String classifiedSenti = tweetAnn.getPolarity(); if (classifiedSenti.equals(senti)) { correct = true; } int classifiedIdx = sentiIdx(classifiedSenti); int answerIdx = sentiIdx(answerSenti); senti[classifiedIdx][0]++; senti[answerIdx][1]++; if (classifiedIdx == answerIdx) { correct = true; } if (correct) { senti[classifiedIdx][2]++; } cnt++; logger.log(Level.INFO, sb.toString()); } catch (CASRuntimeException e) { throw new ResourceProcessException(e); } } @Override public void destroy() { try { Set<Entry<String,Integer>> set = map.entrySet(); ArrayList<Entry<String,Integer>> list = new ArrayList<Entry<String,Integer>>(); list.addAll(set); Collections.sort(list, new Comparator<Entry<String,Integer>>(){ public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { return o1.getKey().compareTo(o2.getKey()); } }); ///////////////////////////////////////////// System.out.println("# NER"); for (Entry<String,Integer> e : list) { int idx = e.getValue(); double prec = (stat[idx][0] != 0) ? (double)stat[idx][2] / (double)stat[idx][0] : -1; double recall = (stat[idx][1] != 0) ? (double)stat[idx][2] / (double)stat[idx][1] : -1; double f = (prec + recall > 0) ? (2 * prec * recall / (prec + recall)) : -1; // System.out.format("%02d %15s %3d/%3d=%7.4f %3d/%3d=%7.4f %7.4f\n", idx, e.getKey(), stat[idx][2], stat[idx][0], prec, stat[idx][2], stat[idx][1], recall, f); System.out.format("%02d\t%s\t%d\t%.4f\t%.4f\t%.4f\n", idx, e.getKey(), stat[idx][1], prec, recall, f); } System.out.println(); ///////////////////////////////////////////// System.out.format("%10s\t%12s\t%12s\t%12s\n", "Index", "Type", "Answer", "Classified"); for (int i = 0; i < list.size(); i+=3) { Entry<String,Integer> e = list.get(i); String s = e.getKey(); int idx = e.getValue(); if (idx != labelNoneIdx) { s = s.substring(0, s.lastIndexOf('_')); System.out.format("%010d\t%12s\t%12d\t%12d\n", idx/3, s, answerEntityCnt[idx/3], classifiedEntityCnt[idx/3]); } } System.out.println(); ///////////////////////////////////////////// System.out.println("# senti"); for (int i = 0; i < 3; i++) { int idx = i; double prec = (senti[idx][0] != 0) ? (double)senti[idx][2] / (double)senti[idx][0] : -1; double recall = (senti[idx][1] != 0) ? (double)senti[idx][2] / (double)senti[idx][1] : -1; double f = (prec + recall > 0) ? (2 * prec * recall / (prec + recall)) : -1; // System.out.format("%s\t%d\t%.4f\t%.4f\t%.4f\n", sentiStr(idx), senti[idx][1], prec, recall, f); System.out.format("%02d %15s %3d/%3d =%6.4f %3d/%3d =%6.4f %7.4f\n", idx, sentiStr(idx), senti[idx][2], senti[idx][0], prec, senti[idx][2], senti[idx][1], recall, f); } ///////////// System.out.println("# senti: sbj & neu"); double prec = (senti[0][0]+senti[1][0] != 0) ? (double)(senti[0][2]+senti[1][2]) / (double)(senti[0][0]+senti[1][0]) : -1; double recall = (senti[0][1]+senti[1][1] != 0) ? (double)(senti[0][2]+senti[1][2]) / (double)(senti[0][1]+senti[1][1]) : -1; double f = (prec + recall > 0) ? (2 * prec * recall / (prec + recall)) : -1; System.out.format("%02d %15s %3d/%3d =%6.4f %3d/%3d =%6.4f %7.4f\n", 0, sentiStr(0)+"/"+sentiStr(1), senti[0][2]+senti[1][2], senti[0][0]+senti[1][0], prec, senti[0][2]+senti[1][2], senti[0][1]+senti[1][1], recall, f); int idx = 2; prec = (senti[idx][0] != 0) ? (double)senti[idx][2] / (double)senti[idx][0] : -1; recall = (senti[idx][1] != 0) ? (double)senti[idx][2] / (double)senti[idx][1] : -1; f = (prec + recall > 0) ? (2 * prec * recall / (prec + recall)) : -1; System.out.format("%02d %15s %3d/%3d =%6.4f %3d/%3d =%6.4f %7.4f\n", idx, sentiStr(idx), senti[idx][2], senti[idx][0], prec, senti[idx][2], senti[idx][1], recall, f); ////////////// // System.out.println("# senti : pos & neg"); // prec = (senti[0][0]+senti[2][0] != 0) ? (double)(senti[0][2]+senti[2][2]) / (double)(senti[0][0]+senti[2][0]) : -1; // recall = (senti[0][1]+senti[2][1] != 0) ? (double)(senti[0][2]+senti[2][2]) / (double)(senti[0][1]+senti[2][1]) : -1; // f = (prec + recall > 0) ? (2 * prec * recall / (prec + recall)) : -1; // System.out.format("%02d %15s %3d/%3d =%6.4f %3d/%3d =%6.4f %7.4f\n", 0, sentiStr(0)+"/"+sentiStr(2), senti[0][2]+senti[2][2], senti[0][0]+senti[2][0], prec, senti[0][2]+senti[2][2], senti[0][1]+senti[2][1], recall, f); // // idx = 1; // prec = (senti[idx][0] != 0) ? (double)senti[idx][2] / (double)senti[idx][0] : -1; // recall = (senti[idx][1] != 0) ? (double)senti[idx][2] / (double)senti[idx][1] : -1; // f = (prec + recall > 0) ? (2 * prec * recall / (prec + recall)) : -1; // System.out.format("%02d %15s %3d/%3d =%6.4f %3d/%3d =%6.4f %7.4f\n", idx, sentiStr(idx), senti[idx][2], senti[idx][0], prec, senti[idx][2], senti[idx][1], recall, f); evalCorpusReader.close(); } catch (Exception e) { e.printStackTrace(); } super.destroy(); } private int sentiIdx(String s) { if (s.equals(OMTweet.POLARITY_STR_POSITIVE)) { return 0; } if (s.equals(OMTweet.POLARITY_STR_NEGATIVE)) { return 1; } if (s.equals(OMTweet.POLARITY_STR_NEUTRAL)) { return 2; } return -1; } private String sentiStr(int i) { switch(i) { case 0: return OMTweet.POLARITY_STR_POSITIVE; case 1: return OMTweet.POLARITY_STR_NEGATIVE; case 2: return OMTweet.POLARITY_STR_NEUTRAL; default: return null; } } private String[] extractEntityTags(String text) { String[] tokens = text.split(" "); for (int i = 0; i < tokens.length; i++) { tokens[i] = tokens[i].substring(tokens[i].lastIndexOf('/') + 1); } return tokens; } }