/*
* Copyright (c) 2012 Sebastian Schaffert
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.poschunker.engine;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.isLangaugeConfigured;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Dictionary;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.enhancer.engines.poschunker.PhraseBuilder;
import org.apache.stanbol.enhancer.engines.poschunker.PhraseBuilder.ChunkFactory;
import org.apache.stanbol.enhancer.engines.poschunker.PhraseTypeDefinition;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A phrase detection engine (chunker) that builds phrase {@link Chunk}s based
 * on the POS annotations of the {@link Token}s in the {@link AnalysedText}
 * content part - no trained chunker model is used. The tokens of every
 * sentence are streamed through one {@link PhraseBuilder} per active
 * {@link PhraseTypeDefinition} (noun and/or verb phrases, see
 * {@link #NOUN_PHRASE_STATE} and {@link #VERB_PHRASE_STATE}); the builders
 * create the actual {@link Chunk}s through a {@link ChunkFactory}.
 * <p>
 * A POS tagging engine MUST run earlier in the enhancement chain, as this
 * engine requires an {@link AnalysedText} content part with POS annotations
 * to be present.
 *
 * @author Sebastian Schaffert
 */
@Component(immediate = true, metatype = true,
    configurationFactory = true, //allow multiple instances to be configured
    policy = ConfigurationPolicy.OPTIONAL) //create the default instance with the default config
@Service
@Properties(value={
    @Property(name=EnhancementEngine.PROPERTY_NAME,value="pos-chunker"),
    @Property(name=PosChunkerEngine.CONFIG_LANGUAGES,
        cardinality=Integer.MAX_VALUE, value = {"*"}),
    @Property(name=PosChunkerEngine.MIN_POS_SCORE,
        doubleValue=PosChunkerEngine.DEFAULT_MIN_POS_SCORE),
    @Property(name=PosChunkerEngine.NOUN_PHRASE_STATE,
        boolValue=PosChunkerEngine.DEFAULT_NOUN_PHRASE_STATE),
    @Property(name=PosChunkerEngine.VERB_PHRASE_STATE,
        boolValue=PosChunkerEngine.DEFAULT_VERB_PHRASE_STATE),
    @Property(name=Constants.SERVICE_RANKING,intValue=-100) //give the default instance a ranking < 0
})
public class PosChunkerEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties {

    private static final Logger log = LoggerFactory.getLogger(PosChunkerEngine.class);

    /**
     * Static engine metadata returned by {@link #getServiceProperties()}:
     * ordering within the chain (chunking phase) and the NLP processing role.
     */
    private static final Map<String,Object> SERVICE_PROPERTIES;
    static {
        Map<String,Object> props = new HashMap<String,Object>();
        props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
            ServiceProperties.ORDERING_NLP_CHUNK);
        props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
            NlpProcessingRole.Chunking);
        SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
    }

    /**
     * Language configuration. Takes a list of ISO language codes of supported
     * languages. The default value {@code "*"} activates all languages.
     */
    public static final String CONFIG_LANGUAGES = "enhancer.engine.poschunker.languages";
    /**
     * Minimum confidence of POS annotations considered by the chunker
     * (valid range {@code [0..1)}, default {@value #DEFAULT_MIN_POS_SCORE}).
     */
    public static final String MIN_POS_SCORE = "enhancer.engine.poschunker.minPosScore";
    public static final double DEFAULT_MIN_POS_SCORE = 0.5;
    /** Switch for noun phrase detection (default {@value #DEFAULT_NOUN_PHRASE_STATE}). */
    public static final String NOUN_PHRASE_STATE = "enhancer.engine.poschunker.nounPhrase";
    public static final boolean DEFAULT_NOUN_PHRASE_STATE = true;
    /** Switch for verb phrase detection (default {@value #DEFAULT_VERB_PHRASE_STATE}). */
    public static final String VERB_PHRASE_STATE = "enhancer.engine.poschunker.verbPhrase";
    public static final boolean DEFAULT_VERB_PHRASE_STATE = false;

    private static final PhraseTypeDefinition NOUN_PHRASE_TYPE;
    private static final PhraseTypeDefinition VERB_PHRASE_TYPE;
    //TODO: maybe move this to PhraseTypeDefinition
    //TODO: this might be language specific
    //TODO: make configurable
    static {
        PhraseTypeDefinition nounPD = new PhraseTypeDefinition(LexicalCategory.Noun);
        //NOTE: Pos.Acronym, Pos.Abbreviation, Pos.Foreign are also considered as
        //      nouns by this definition.
        nounPD.getRequiredType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
        //start types noun (automatically included) pronoun or determiners, adjectives
        nounPD.getStartType().addCategories(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
        nounPD.getStartType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
        //prefix types are the same as start types (e.g. "the nice trip")
        nounPD.getPrefixType().addCategories(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
        nounPD.getPrefixType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
        //continuation types are nouns and punctations.
        //NOTE: Adverbs are excluded to avoid phrases like "the nice trip last week"
        nounPD.getContinuationType().addCategories(LexicalCategory.Punctuation);
        nounPD.getContinuationType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
        //end types are the same as start terms
        nounPD.getEndType().addCategories(LexicalCategory.PronounOrDeterminer, LexicalCategory.Adjective);
        nounPD.getEndType().addPosTags(Pos.Acronym, Pos.Abbreviation, Pos.Foreign);
        //and required types do include a Noun (what is actually included by default)
        NOUN_PHRASE_TYPE = nounPD;

        PhraseTypeDefinition verbPD = new PhraseTypeDefinition(LexicalCategory.Verb);
        verbPD.getStartType().addCategories(LexicalCategory.Adverb);
        verbPD.getContinuationType().addCategories(LexicalCategory.Adverb,LexicalCategory.Punctuation);
        verbPD.getEndType().addCategories(LexicalCategory.Adverb);
        //and required types do include a Verbs (what is actually included by default)
        VERB_PHRASE_TYPE = verbPD;
    }

    /** Languages this engine is enabled for (set via {@link #CONFIG_LANGUAGES}). */
    private LanguageConfiguration languageConfiguration = new LanguageConfiguration(CONFIG_LANGUAGES,
        new String []{"*"});
    /** Active minimum POS score; -1 while the engine is not activated. */
    private double minPosScore = -1;
    /** The phrase types to detect; configured in {@link #activate(ComponentContext)}. */
    private List<PhraseTypeDefinition> phraseTypeDefinitions;

    /**
     * Indicate if this engine can enhance supplied ContentItem, and if it
     * suggests enhancing it synchronously or asynchronously. The
     * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired, it is
     * just a suggestion from the engine.
     * <p/>
     * Returns CANNOT_ENHANCE if no phrase type is enabled, the language of the
     * content is not available/configured or no {@link AnalysedText} content
     * part is present.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *         if the introspecting process of the content item fails
     */
    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        if(phraseTypeDefinitions.isEmpty()){
            return CANNOT_ENHANCE; //neither noun nor verb phrases enabled
        }
        String language = getLanguage(this, ci, false);
        if(language == null){
            return CANNOT_ENHANCE;
        }
        if(!isLangaugeConfigured(this, languageConfiguration, language, false)){
            return CANNOT_ENHANCE;
        }
        if(getAnalysedText(this, ci, false) == null) {
            return CANNOT_ENHANCE;
        }
        //suggest asynchronous enhancement as documents may contain many tokens
        return ENHANCE_ASYNC;
    }

    /**
     * Compute enhancements for supplied ContentItem: streams the tokens of
     * every sentence (or the whole text if no sentences are annotated) through
     * the configured {@link PhraseBuilder}s, which add the detected phrases as
     * {@link Chunk}s to the {@link AnalysedText}.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *         if the underlying process failed to work as expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        AnalysedText at = getAnalysedText(this, ci, true);
        String language = getLanguage(this, ci, true);
        isLangaugeConfigured(this, languageConfiguration, language, true);
        //init one PhraseBuilder per configured phrase type; they share the
        //ChunkFactory (and with it the write lock of the ContentItem)
        ChunkFactory chunkFactory = new ChunkFactoryImpl(at, ci.getLock());
        List<PhraseBuilder> phraseBuilders = new ArrayList<PhraseBuilder>(phraseTypeDefinitions.size());
        for(PhraseTypeDefinition ptd : phraseTypeDefinitions){
            phraseBuilders.add(new PhraseBuilder(ptd, chunkFactory, minPosScore));
        }
        Iterator<? extends Section> sentences = at.getSentences();
        if(!sentences.hasNext()){ //no sentences ... iterate over the whole text
            sentences = Collections.singleton(at).iterator();
        }
        while(sentences.hasNext()){
            //feed the tokens of the sentence to all PhraseBuilders
            Section sentence = sentences.next();
            for(PhraseBuilder pb : phraseBuilders){
                pb.nextSection(sentence);
            }
            Iterator<Token> tokens = sentence.getTokens();
            while(tokens.hasNext()){
                Token token = tokens.next();
                for(PhraseBuilder pb : phraseBuilders){
                    pb.nextToken(token);
                }
            }
        }
        //signal the end of the document so open phrases get closed
        for(PhraseBuilder pb : phraseBuilders){
            pb.nextSection(null);
        }
        //NOTE: trace logging of the created chunks is done by the PhraseBuilder
    }

    @Override
    public Map<String,Object> getServiceProperties() {
        return SERVICE_PROPERTIES;
    }

    /**
     * Activate and read the properties: the minimum POS score, the language
     * configuration and the enabled phrase types.
     *
     * @param ce the {@link org.osgi.service.component.ComponentContext}
     * @throws ConfigurationException if the configured minimum POS score is
     *         not a floating point number in the range {@code [0..1)}
     */
    @Activate
    protected void activate(ComponentContext ce) throws ConfigurationException {
        log.info("activating POS chunking engine");
        super.activate(ce);
        @SuppressWarnings("unchecked")
        Dictionary<String, Object> properties = ce.getProperties();
        //read and validate the minimum POS score
        Double configuredScore = parseMinPosScore(properties.get(MIN_POS_SCORE));
        if(configuredScore != null && (configuredScore.doubleValue() >= 1d ||
                configuredScore.doubleValue() < 0d)){
            throw new ConfigurationException(MIN_POS_SCORE,
                "The configured minimum POS score '"+configuredScore+"' MUST BE a "
                + "floating point number in the range [0..1).");
        } else if(configuredScore == null){
            this.minPosScore = DEFAULT_MIN_POS_SCORE; //set to default
        } else {
            this.minPosScore = configuredScore.doubleValue();
        }
        log.info(" > set minimum POS score to {} (Engine: {})",
            this.minPosScore, getName());
        //read the language configuration
        languageConfiguration.setConfiguration(properties);
        //configure the PhraseType definitions
        phraseTypeDefinitions = new ArrayList<PhraseTypeDefinition>(2);
        if(isEnabled(properties.get(NOUN_PHRASE_STATE), DEFAULT_NOUN_PHRASE_STATE)){
            phraseTypeDefinitions.add(NOUN_PHRASE_TYPE);
        }
        if(isEnabled(properties.get(VERB_PHRASE_STATE), DEFAULT_VERB_PHRASE_STATE)){
            phraseTypeDefinitions.add(VERB_PHRASE_TYPE);
        }
    }

    /**
     * Parses the minimum POS score from the raw configuration value.
     *
     * @param value the raw value ({@link Number}, {@link String} or <code>null</code>)
     * @return the parsed score or <code>null</code> if not configured
     * @throws ConfigurationException if the value can not be parsed as a double
     */
    private static Double parseMinPosScore(Object value) throws ConfigurationException {
        if(value instanceof Number){
            return ((Number)value).doubleValue();
        } else if(value != null && !value.toString().isEmpty()){
            try {
                return Double.parseDouble(value.toString());
            } catch (NumberFormatException e) {
                throw new ConfigurationException(MIN_POS_SCORE,
                    "The configured minimum POS score MUST BE a floating point "
                    + "number in the range [0..1).", e);
            }
        } else {
            return null;
        }
    }

    /**
     * Evaluates a boolean state property.
     *
     * @param value the raw configuration value or <code>null</code> if not set
     * @param defaultState the state to use if the property is not set
     * @return the parsed state (the default if <code>value</code> is <code>null</code>)
     */
    private static boolean isEnabled(Object value, boolean defaultState) {
        return value == null ? defaultState : Boolean.parseBoolean(value.toString());
    }

    /**
     * Resets the engine to its deactivated state (default language
     * configuration, unset minimum POS score).
     *
     * @param context the {@link org.osgi.service.component.ComponentContext}
     */
    @Deactivate
    protected void deactivate(ComponentContext context){
        this.languageConfiguration.setDefault();
        this.minPosScore = -1;
        super.deactivate(context);
    }
}