/* * Copyright (c) 2012 Sebastian Schaffert * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.enhancer.engines.sentiment.services; import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.SENTIMENT_ANNOTATION; import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText; import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage; import java.util.Collections; import java.util.Dictionary; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Set; import org.apache.felix.scr.annotations.Activate; import org.apache.felix.scr.annotations.Component; import org.apache.felix.scr.annotations.ConfigurationPolicy; import org.apache.felix.scr.annotations.Deactivate; import org.apache.felix.scr.annotations.Properties; import org.apache.felix.scr.annotations.Property; import org.apache.felix.scr.annotations.Reference; import org.apache.felix.scr.annotations.ReferenceCardinality; import org.apache.felix.scr.annotations.ReferencePolicy; import org.apache.felix.scr.annotations.ReferenceStrategy; import org.apache.felix.scr.annotations.Service; import org.apache.stanbol.enhancer.engines.sentiment.api.SentimentClassifier; import org.apache.stanbol.enhancer.nlp.NlpAnnotations; import org.apache.stanbol.enhancer.nlp.NlpProcessingRole; import org.apache.stanbol.enhancer.nlp.NlpServiceProperties; import org.apache.stanbol.enhancer.nlp.model.AnalysedText; import org.apache.stanbol.enhancer.nlp.model.Token; import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation; import org.apache.stanbol.enhancer.nlp.model.annotation.Value; import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory; import org.apache.stanbol.enhancer.nlp.pos.PosTag; import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration; import org.apache.stanbol.enhancer.servicesapi.ContentItem; import org.apache.stanbol.enhancer.servicesapi.EngineException; import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine; import org.osgi.framework.Constants; import org.osgi.service.cm.ConfigurationException; import org.osgi.service.component.ComponentContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A Stanbol engine that associates sentiment values with the tokens created by the POS tagging engine. * Sentiment values are added to the POSContentPart of the content item and can by further analysed by other * engines, e.g. to compute sentiment values for the whole content item or in relation to certain nouns. * <p/> * The configuration allows specifying whether to analyse all words or only adjectives and nouns (a typical case). * <p/> * Currently, sentiment analysis is available for English and for German language. It uses the following word lists: * <ul> * <li>English: SentiWordNet (http://wordnet.princeton.edu/), license allows commercial use</li> * <li>German: SentiWS (http://wortschatz.informatik.uni-leipzig.de/download/), license does NOT allow commercial use</li> * </ul> * <p/> * Author: Sebastian Schaffert */ @Component(immediate = true, metatype = true, configurationFactory = true, //allow multiple instances policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration @Service @Properties(value={ @Property(name= EnhancementEngine.PROPERTY_NAME,value="sentiment-wordclassifier"), @Property(name=SentimentEngine.CONFIG_LANGUAGES,value={SentimentEngine.DEFAULT_LANGUAGE_CONFIG}), @Property(name=SentimentEngine.CONFIG_ADJECTIVES, boolValue=SentimentEngine.DEFAULT_PROCESS_ADJECTIVES_ONLY), @Property(name=SentimentEngine.CONFIG_MIN_POS_CONFIDENCE, doubleValue = SentimentEngine.DEFAULT_MIN_POS_CONFIDNECE), @Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the default instance a ranking < 0 }) public class SentimentEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties { /** * Language configuration. Takes a list of ISO language codes of supported languages. Currently supported * are the languages given as default value. */ public static final String CONFIG_LANGUAGES = "org.apache.stanbol.enhancer.sentiment.languages"; /** * When set to true, only adjectives and nouns will be considered in sentiment analysis. */ public static final String CONFIG_ADJECTIVES = "org.apache.stanbol.enhancer.sentiment.adjectives"; /** * POS tags that are not selected by {@link SentimentClassifier#isAdjective(PosTag)} * or {@link SentimentClassifier#isNoun(PosTag)} are ignored if their confidence * is >= the configured values. If there are multiple POS tag suggestions, * that Words that do have a suitable TAG are still considered if the * confidence of the fitting tag is >= {min-pos-confidence}/2 */ public static final String CONFIG_MIN_POS_CONFIDENCE = "org.apache.stanbol.enhancer.sentiment.min-pos-confidence"; boolean debugSentiments; public static final String DEFAULT_LANGUAGE_CONFIG = "*"; private LanguageConfiguration langaugeConfig = new LanguageConfiguration(CONFIG_LANGUAGES, new String[]{DEFAULT_LANGUAGE_CONFIG}); /** * The minimum confidence of POS tags so that a token is NOT processed if * the {@link LexicalCategory} is NOT {@link LexicalCategory#Adjective} (or * {@link LexicalCategory#Noun Noun} if {@link #CONFIG_ADJECTIVES} is * deactivated) - default: 0.8<p> */ public static final double DEFAULT_MIN_POS_CONFIDNECE = 0.8; public static final boolean DEFAULT_PROCESS_ADJECTIVES_ONLY = false; /** * Service Properties used by this Engine */ private static final Map<String,Object> SERVICE_PROPERTIES; static { Map<String,Object> props = new HashMap<String,Object>(); props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING, ServiceProperties.ORDERING_NLP_POS - 1); //after POS tagging props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE, NlpProcessingRole.SentimentTagging); SERVICE_PROPERTIES = Collections.unmodifiableMap(props); } private static Logger log = LoggerFactory.getLogger(SentimentEngine.class); /** * {@link SentimentClassifier} are now OSGI services and injected via events * (calls to {@link #bindClassifier(SentimentClassifier)} and * {@link #unbindClassifier(SentimentClassifier)}) as soon as they become * (un)available. */ @Reference(referenceInterface=SentimentClassifier.class, cardinality=ReferenceCardinality.OPTIONAL_MULTIPLE, bind="bindClassifier", unbind="unbindClassifier", policy=ReferencePolicy.DYNAMIC, strategy=ReferenceStrategy.EVENT) private Map<String,SentimentClassifier> classifiers = Collections.synchronizedMap( new HashMap<String,SentimentClassifier>()); /** bind method for {@link #classifiers} */ protected void bindClassifier(SentimentClassifier classifier){ log.info(" ... bind Sentiment Classifier {} for language {}", classifier.getClass().getSimpleName(),classifier.getLanguage()); synchronized (classifiers) { SentimentClassifier old = classifiers.put(classifier.getLanguage(), classifier); if(old != null){ log.warn("Replaced Sentiment Classifier for language {} (old: {}, new: {}", new Object[]{old.getLanguage(),old,classifier}); } } } /** unbind method for {@link #classifiers} */ protected void unbindClassifier(SentimentClassifier classifier){ String lang = classifier.getLanguage(); synchronized (classifiers) { SentimentClassifier current = classifiers.remove(lang); if(!classifier.equals(current) //the current is not the parsed one && current != null){ classifiers.put(lang,current); //re-add the value } else { log.info(" ... unbind Sentiment Classifier {} for language {}", classifier.getClass().getSimpleName(),lang); } } } /** * The processed {@link LexicalCategory LexicalCategories}. */ boolean adjectivesOnly = DEFAULT_PROCESS_ADJECTIVES_ONLY; /** * The minimum {@link PosTag} value {@link Value#probability() confidence}.<p> * This means that if the {@link Value#probability() confidence} of a * {@link NlpAnnotations#POS_ANNOTATION}s (returned by * {@link Token#getAnnotations(Annotation)}) is greater than * {@link #minPOSConfidence} that the result of * {@link SentimentClassifier#isAdjective(PosTag)} (and * {@link SentimentClassifier#isNoun(PosTag)} - if #CONFIG_ADJECTIVES is * deactivated) is used to decide if a Token needs to be processed or not. * Otherwise further {@link NlpAnnotations#POS_ANNOTATION}s are analysed for * processable POS tags. Processable POS tags are accepted until * <code>{@link #minPOSConfidence}/2</code>. */ private double minPOSConfidence = DEFAULT_MIN_POS_CONFIDNECE; /** * Indicate if this engine can enhance supplied ContentItem, and if it * suggests enhancing it synchronously or asynchronously. The * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is * just a suggestion from the engine. * <p/> * Returns {@link EnhancementEngine}#ENHANCE_ASYNC if <ul> * <li> the {@link AnalysedText} content part is present * <li> the language of the content is known * <li> the language is active based on the language configuration and * <li> a sentiment classifier is available for the language * </ul> * * @throws org.apache.stanbol.enhancer.servicesapi.EngineException * if the introspecting process of the content item * fails */ @Override public int canEnhance(ContentItem ci) throws EngineException { if(getAnalysedText(this,ci, false) == null){ return CANNOT_ENHANCE; } String language = getLanguage(this, ci,false); if(language == null) { return CANNOT_ENHANCE; } if(classifiers.containsKey(language)){ return ENHANCE_ASYNC; } else { return CANNOT_ENHANCE; } } /** * Compute enhancements for supplied ContentItem. The results of the process * are expected to be stored in the metadata of the content item. * <p/> * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}. * * @throws org.apache.stanbol.enhancer.servicesapi.EngineException * if the underlying process failed to work as * expected */ @Override public void computeEnhancements(ContentItem ci) throws EngineException { AnalysedText analysedText = getAnalysedText(this,ci, true); String language = getLanguage(this, ci, true); SentimentClassifier classifier = classifiers.get(language); if(classifier == null){ throw new IllegalStateException("Sentiment Classifier for language '" + language +"' not available. As this is also checked in " + " canEnhance this may indicate an Bug in the used " + "EnhancementJobManager!"); } //TODO: locking for AnalysedText not yet defined // ci.getLock().writeLock().lock(); // try { Iterator<Token> tokens = analysedText.getTokens(); while(tokens.hasNext()){ Token token = tokens.next(); Set<LexicalCategory> cats = null; boolean process = false; if(!adjectivesOnly){ process = true; Value<PosTag> posTag = token.getAnnotation(NlpAnnotations.POS_ANNOTATION); if(posTag != null && posTag.probability() == Value.UNKNOWN_PROBABILITY || posTag.probability() >= (minPOSConfidence/2.0)){ cats = classifier.getCategories(posTag.value()); } else { //no POS tags or probability to low cats = Collections.emptySet(); } } else { //check PosTags if we need to lookup this word Iterator<Value<PosTag>> posTags = token.getAnnotations(NlpAnnotations.POS_ANNOTATION).iterator(); boolean ignore = false; while(!ignore && !process && posTags.hasNext()) { Value<PosTag> value = posTags.next(); PosTag tag = value.value(); cats = classifier.getCategories(tag); boolean state = cats.contains(LexicalCategory.Adjective) || cats.contains(LexicalCategory.Noun); ignore = !state && (value.probability() == Value.UNKNOWN_PROBABILITY || value.probability() >= minPOSConfidence); process = state && (value.probability() == Value.UNKNOWN_PROBABILITY || value.probability() >= (minPOSConfidence/2.0)); } } //else process all tokens ... no POS tag checking needed if(process){ String word = token.getSpan(); double sentiment = 0.0; if(cats.isEmpty()){ sentiment = classifier.classifyWord(null, word); } else { //in case of multiple Lexical Cats //we build the average over NOT NULL sentiments for the word int catSentNum = 0; for(LexicalCategory cat : cats){ double catSent = classifier.classifyWord(cat, word); if(catSent != 0.0){ catSentNum++; sentiment = sentiment + catSent; } } if(catSentNum > 0){ sentiment = sentiment / (double) catSentNum; } } if(sentiment != 0.0){ token.addAnnotation(SENTIMENT_ANNOTATION, new Value<Double>(sentiment)); } //else do not set sentiments with 0.0 } // else do not process } // } finally { // ci.getLock().writeLock().unlock(); // } } /** * Activate and read the properties. Configures and initialises a ChunkerHelper for each language configured in * CONFIG_LANGUAGES. * * @param ce the {@link org.osgi.service.component.ComponentContext} */ @Activate protected void activate(ComponentContext ce) throws ConfigurationException { log.info("activating POS tagging engine"); super.activate(ce); @SuppressWarnings("unchecked") Dictionary<String, Object> properties = ce.getProperties(); //parse the configured languages langaugeConfig.setConfiguration(properties); //set the processed lexical categories Object value = properties.get(CONFIG_ADJECTIVES); adjectivesOnly = value instanceof Boolean ? (Boolean)value : value != null ? Boolean.parseBoolean(value.toString()) : DEFAULT_PROCESS_ADJECTIVES_ONLY; //set minimum POS confidence value = properties.get(CONFIG_MIN_POS_CONFIDENCE); if(value instanceof Number){ minPOSConfidence = ((Number)value).doubleValue(); } else if(value != null){ try { minPOSConfidence = Double.parseDouble(value.toString()); } catch (NumberFormatException e) { throw new ConfigurationException(CONFIG_MIN_POS_CONFIDENCE, "Unable to parsed minimum POS confidence value from '" + value +"'!",e); } } else { minPOSConfidence = DEFAULT_MIN_POS_CONFIDNECE; } if(minPOSConfidence <= 0 || minPOSConfidence >= 1){ throw new ConfigurationException(CONFIG_MIN_POS_CONFIDENCE, "The configured minimum POS confidence value '" +minPOSConfidence+"' MUST BE > 0 and < 1!"); } } @Deactivate protected void deactivate(ComponentContext ctx){ //remove remaining classifiers this.classifiers.clear(); langaugeConfig.setDefault(); super.deactivate(ctx); } @Override public Map<String,Object> getServiceProperties() { return SERVICE_PROPERTIES; } }