/**
*
*/
package com.maalaang.omtwitter.uima.annotator;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import com.maalaang.omtwitter.resource.SentimentScore;
import com.maalaang.omtwitter.resource.SentimentScoreDictionary;
import com.maalaang.omtwitter.resource.SentimentScoreDictionaryFactory;
import com.maalaang.omtwitter.uima.type.SentenceAnnotation;
import com.maalaang.omtwitter.uima.type.TokenAnnotation;
/**
 * CAS annotator that looks up token sequences in a {@link SentimentScoreDictionary} and
 * creates one annotation of the configured type for every match.
 *
 * <p>For each sentence annotation the tokens it covers are collected. At every token position
 * the annotator tries the longest window first (at most {@code maxWindowSize} tokens, joined by
 * single spaces, lower-cased, with {@code '-'} and {@code '_'} replaced by spaces) and shrinks
 * the window one token at a time until the dictionary returns a score. A match produces an
 * annotation spanning the matched tokens and scanning resumes after the matched window.
 *
 * @author Sangwon Park
 */
public class SentimentScoreAnnotator extends CasAnnotator_ImplBase {

    // Names of the configuration parameters read in initialize().
    private final static String PARAM_MAX_WINDOW_SIZE = "maxWindowSize";
    private final static String PARAM_SENTI_SCORE_DIC_OBJ_FILE = "sentiScoreDicObjectFile";
    private final static String PARAM_USE_STEM_TO_FIND_DIC = "useStemToFindDic";
    private final static String PARAM_USE_POS_TO_FIND_DIC = "usePosToFindDic";
    private final static String PARAM_POS_TAGSET = "posTagset";
    private final static String PARAM_ANNOTATION_TYPE_NAME = "annotationTypeName";
    private final static String PARAM_FEATURE_NAME_ID = "featureNameId";
    private final static String PARAM_FEATURE_NAME_POSITIVE_SCORE = "featureNamePositiveScore";
    private final static String PARAM_FEATURE_NAME_NEGATIVE_SCORE = "featureNameNegativeScore";
    private final static String PARAM_FEATURE_NAME_SUBJECTIVE_SCORE = "featureNameSubjectiveScore";
    private final static String PARAM_FEATURE_NAME_OBJECTIVE_SCORE = "featureNameObjectiveScore";

    /** Number of features set on each created annotation (id + four scores). */
    private final static int ANNOTATION_FEATURE_NUM = 5;

    // Accepted values of the "posTagset" parameter; see posTagsetId(String).
    public final static String POS_TAGSET_PENN_TREE_BANK = "PENN_TREE_BANK";
    public final static String POS_TAGSET_BROWN_CORPUS = "BROWN_CORPUS";
    public final static String POS_TAGSET_WORDNET = "WORDNET";

    private final static String TYPE_NAME_SENTENCE_ANNOTATION = "com.maalaang.omtwitter.uima.type.SentenceAnnotation";
    private final static String TYPE_NAME_TOKEN_ANNOTATION = "com.maalaang.omtwitter.uima.type.TokenAnnotation";

    private Logger logger = null;
    private int maxWindowSize = 0;
    private SentimentScoreDictionary sentiScoreDic = null;
    private boolean useStemToFindDic = false;
    private boolean usePosToFindDic = false;
    private int posTagset = 0;
    private String annotationTypeName = null;
    /** Resolved features, index-aligned with {@link #featureNames}. */
    private Feature[] features = null;
    /** Configured feature base names: id, positive, negative, subjective, objective. */
    private String[] featureNames = null;
    private Type sentenceType = null;
    private Type tokenType = null;
    private Type sentiScoreType = null;

    /**
     * Reads the configuration parameters and loads the serialized sentiment dictionary.
     *
     * <p>The dictionary file is looked up on the class path first and, when it is not found
     * there, opened as a file system path.
     *
     * @param aContext the UIMA context supplying the configuration parameters and the logger
     * @throws ResourceInitializationException if a required parameter is missing, the dictionary
     *         cannot be loaded, or the {@code posTagset} value is not recognized
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);
        logger = aContext.getLogger();

        maxWindowSize = (Integer) requireParameter(aContext, PARAM_MAX_WINDOW_SIZE);
        sentiScoreDic = loadDictionary((String) requireParameter(aContext, PARAM_SENTI_SCORE_DIC_OBJ_FILE));
        useStemToFindDic = (Boolean) requireParameter(aContext, PARAM_USE_STEM_TO_FIND_DIC);
        usePosToFindDic = (Boolean) requireParameter(aContext, PARAM_USE_POS_TO_FIND_DIC);

        // The tag set is optional; when absent, posTagset keeps its default of 0.
        Object posTagsetValue = aContext.getConfigParameterValue(PARAM_POS_TAGSET);
        if (posTagsetValue != null) {
            try {
                posTagset = posTagsetId((String) posTagsetValue);
            } catch (IllegalArgumentException e) {
                throw initFailure(e);
            }
        }

        // The type and feature names are resolved (and validated) in typeSystemInit().
        annotationTypeName = (String) aContext.getConfigParameterValue(PARAM_ANNOTATION_TYPE_NAME);
        featureNames = new String[] {
            (String) aContext.getConfigParameterValue(PARAM_FEATURE_NAME_ID),
            (String) aContext.getConfigParameterValue(PARAM_FEATURE_NAME_POSITIVE_SCORE),
            (String) aContext.getConfigParameterValue(PARAM_FEATURE_NAME_NEGATIVE_SCORE),
            (String) aContext.getConfigParameterValue(PARAM_FEATURE_NAME_SUBJECTIVE_SCORE),
            (String) aContext.getConfigParameterValue(PARAM_FEATURE_NAME_OBJECTIVE_SCORE)
        };
        features = new Feature[ANNOTATION_FEATURE_NUM];

        logger.log(Level.INFO, "sentiment score annotator initialized");
    }

    /**
     * Maps a tag set name to the corresponding {@link SentimentScoreDictionary} constant.
     *
     * @param posTagset one of {@link #POS_TAGSET_BROWN_CORPUS},
     *        {@link #POS_TAGSET_PENN_TREE_BANK} or {@link #POS_TAGSET_WORDNET}
     * @return the dictionary's identifier for that tag set
     * @throws IllegalArgumentException if {@code posTagset} is none of the accepted names
     */
    public static int posTagsetId(String posTagset) {
        if (POS_TAGSET_BROWN_CORPUS.equals(posTagset)) {
            return SentimentScoreDictionary.POS_TAGSET_BROWN_CORPUS;
        } else if (POS_TAGSET_PENN_TREE_BANK.equals(posTagset)) {
            return SentimentScoreDictionary.POS_TAGSET_PENN_TREE_BANK;
        } else if (POS_TAGSET_WORDNET.equals(posTagset)) {
            return SentimentScoreDictionary.POS_TAGSET_WORD_NET;
        } else {
            throw new IllegalArgumentException("unknown POS tagset '" + posTagset + "', expected one of "
                    + POS_TAGSET_BROWN_CORPUS + ", " + POS_TAGSET_PENN_TREE_BANK + ", " + POS_TAGSET_WORDNET);
        }
    }

    /**
     * Creates an annotation of the configured type over {@code [begin, end)}, copies the id and
     * the four scores of {@code score} into its features and adds it to the CAS indexes.
     *
     * @param aCas the CAS to add the annotation to
     * @param score the dictionary entry whose values are stored on the annotation
     * @param begin begin offset of the annotation
     * @param end end offset of the annotation
     */
    public void createAnnotation(CAS aCas, SentimentScore score, int begin, int end) {
        AnnotationFS ann = aCas.createAnnotation(sentiScoreType, begin, end);
        ann.setIntValue(features[0], score.getId());
        ann.setDoubleValue(features[1], score.getPositiveScore());
        ann.setDoubleValue(features[2], score.getNegativeScore());
        ann.setDoubleValue(features[3], score.getSubjectiveScore());
        ann.setDoubleValue(features[4], score.getObjectiveScore());
        aCas.addFsToIndexes(ann);
    }

    /**
     * Resolves the sentence, token and output annotation types and the output features.
     *
     * @param aTypeSystem the type system to resolve the names against
     * @throws AnalysisEngineProcessException if a type or feature is not configured or is not
     *         defined in {@code aTypeSystem}
     */
    @Override
    public void typeSystemInit(TypeSystem aTypeSystem) throws AnalysisEngineProcessException {
        super.typeSystemInit(aTypeSystem);
        sentenceType = requireType(aTypeSystem, TYPE_NAME_SENTENCE_ANNOTATION);
        tokenType = requireType(aTypeSystem, TYPE_NAME_TOKEN_ANNOTATION);
        sentiScoreType = requireType(aTypeSystem, annotationTypeName);
        for (int i = 0; i < ANNOTATION_FEATURE_NUM; i++) {
            String featureName = featureNames[i];
            Feature feature = (featureName == null) ? null : sentiScoreType.getFeatureByBaseName(featureName);
            if (feature == null) {
                // Fail here rather than later, when the first match tries to set the feature.
                throw typeSystemFailure("feature '" + featureName + "' is not defined on type '"
                        + annotationTypeName + "'");
            }
            features[i] = feature;
        }
    }

    /**
     * Annotates every dictionary match found in the sentences of the given CAS.
     *
     * @param aCAS the CAS holding the sentence and token annotations to scan
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override
    public void process(CAS aCAS) throws AnalysisEngineProcessException {
        List<TokenAnnotation> tokens = new ArrayList<TokenAnnotation>();
        List<String> tokenTexts = new ArrayList<String>();

        FSIterator<AnnotationFS> sentenceAnnIt = aCAS.getAnnotationIndex(sentenceType).iterator();
        while (sentenceAnnIt.hasNext()) {
            SentenceAnnotation sentAnn = (SentenceAnnotation) sentenceAnnIt.next();

            tokens.clear();
            tokenTexts.clear();
            FSIterator<AnnotationFS> tokenAnnIt = aCAS.getAnnotationIndex(tokenType).subiterator(sentAnn);
            while (tokenAnnIt.hasNext()) {
                TokenAnnotation tokenAnn = (TokenAnnotation) tokenAnnIt.next();
                tokens.add(tokenAnn);
                // NOTE(review): if getStem() can return null, the key will contain the literal
                // text "null" for that token (StringBuilder.append semantics) - confirm intended.
                tokenTexts.add(useStemToFindDic ? tokenAnn.getStem() : tokenAnn.getCoveredText());
            }

            annotateSentence(aCAS, tokens, tokenTexts);
        }
    }

    /** Scans one sentence left to right and annotates the longest match at each position. */
    private void annotateSentence(CAS aCAS, List<TokenAnnotation> tokens, List<String> tokenTexts) {
        int size = tokens.size();
        for (int i = 0; i < size; i++) {
            TokenAnnotation first = tokens.get(i);
            // Only the POS tag of the window's first token is used for the lookup.
            String posTag = first.getPosTag();

            // Clamp the window before adding to i so a huge maxWindowSize cannot overflow.
            int lastIndex = i + Math.min(maxWindowSize, size - i) - 1;
            SentimentScore score = null;
            for ( ; lastIndex >= i; lastIndex--) {
                score = lookup(buildKey(tokenTexts, i, lastIndex), posTag);
                if (score != null) {
                    break;
                }
            }

            if (score != null) {
                createAnnotation(aCAS, score, first.getBegin(), tokens.get(lastIndex).getEnd());
                // Skip the tokens consumed by the match; the loop increment moves past lastIndex.
                i = lastIndex;
            }
        }
    }

    /** Builds the dictionary key for the tokens {@code first..last} (both inclusive). */
    private static String buildKey(List<String> tokenTexts, int first, int last) {
        StringBuilder expr = new StringBuilder();
        expr.append(tokenTexts.get(first));
        for (int j = first + 1; j <= last; j++) {
            expr.append(' ');
            expr.append(tokenTexts.get(j));
        }
        // A fixed locale keeps the key independent of the JVM default locale
        // (e.g. Turkish, where "I" would otherwise lower-case to a dotless i).
        return expr.toString().toLowerCase(Locale.ENGLISH).replace('-', ' ').replace('_', ' ');
    }

    /** Queries the dictionary, with or without the POS tag depending on the configuration. */
    private SentimentScore lookup(String key, String posTag) {
        if (usePosToFindDic) {
            return sentiScoreDic.find(key, posTag, posTagset);
        }
        return sentiScoreDic.find(key);
    }

    /** Loads the serialized dictionary from the class path or, failing that, the file system. */
    private SentimentScoreDictionary loadDictionary(String dicFile) throws ResourceInitializationException {
        InputStream is = null;
        try {
            is = getClass().getClassLoader().getResourceAsStream(dicFile);
            if (is == null) {
                is = new FileInputStream(dicFile);
            }
            return SentimentScoreDictionaryFactory.loadFromSerializedFile(is);
        } catch (FileNotFoundException e) {
            throw initFailure(e);
        } catch (ClassNotFoundException e) {
            throw initFailure(e);
        } catch (IOException e) {
            throw initFailure(e);
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    // The dictionary is already loaded (or loading already failed); a close
                    // failure must not mask that outcome.
                    logger.log(Level.WARNING, "failed to close dictionary stream: " + e.getMessage());
                }
            }
        }
    }

    /** Returns the value of a configuration parameter, failing if it is not set. */
    private Object requireParameter(UimaContext aContext, String name) throws ResourceInitializationException {
        Object value = aContext.getConfigParameterValue(name);
        if (value == null) {
            throw initFailure(new IllegalArgumentException("missing configuration parameter: " + name));
        }
        return value;
    }

    /** Resolves a type by name, failing if the name is null or unknown to the type system. */
    private Type requireType(TypeSystem aTypeSystem, String typeName) throws AnalysisEngineProcessException {
        Type type = (typeName == null) ? null : aTypeSystem.getType(typeName);
        if (type == null) {
            throw typeSystemFailure("type '" + typeName + "' is not defined in the type system");
        }
        return type;
    }

    /** Logs the cause at SEVERE level and wraps it for rethrowing from initialize(). */
    private ResourceInitializationException initFailure(Exception cause) {
        logger.log(Level.SEVERE, cause.getMessage());
        return new ResourceInitializationException(cause);
    }

    /** Logs the message at SEVERE level and wraps it for rethrowing from typeSystemInit(). */
    private AnalysisEngineProcessException typeSystemFailure(String message) {
        logger.log(Level.SEVERE, message);
        return new AnalysisEngineProcessException(new IllegalStateException(message));
    }
}