/*
 * Copyright (c) 2012 Sebastian Schaffert
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.stanbol.enhancer.engines.kuromoji.impl;

import static org.apache.stanbol.enhancer.engines.kuromoji.Constants.NER_TAG_SET;
import static org.apache.stanbol.enhancer.engines.kuromoji.Constants.POS_TAG_SET;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.MORPHO_ANNOTATION;
import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;
import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Language;
import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.commons.io.input.CharSequenceReader;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.ReferenceCardinality;
import org.apache.felix.scr.annotations.Service;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseBaseFormFilterFactory;
import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory;
import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory;
import org.apache.lucene.analysis.ja.JapaneseTokenizerFactory;
import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.Version;
import org.apache.stanbol.commons.solr.utils.StanbolResourceLoader;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.NlpProcessingRole;
import org.apache.stanbol.enhancer.nlp.NlpServiceProperties;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Sentence;
import org.apache.stanbol.enhancer.nlp.model.Token;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures;
import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Sentence detection and word tokenizer for Japanese based on the Solr/Lucene
 * Kuromoji analysers.
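 * <p>
 * A minimal usage sketch (hypothetical: engine instances are normally created,
 * configured and activated by the OSGi runtime rather than obtained directly):
 * <pre>{@code
 * KuromojiNlpEngine engine = ...; //an activated engine instance
 * if(engine.canEnhance(contentItem) == EnhancementEngine.ENHANCE_ASYNC){
 *     //adds Token, Sentence, POS, MorphoFeatures and NER annotations to the
 *     //AnalysedText content part and fise:TextAnnotations to the metadata
 *     engine.computeEnhancements(contentItem);
 * }
 * }</pre>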
 *
 * @author Rupert Westenthaler
 */
@Component(immediate = true, metatype = true, policy = ConfigurationPolicy.OPTIONAL) //create a default instance with the default configuration
@Service
@Properties(value={
    @Property(name=EnhancementEngine.PROPERTY_NAME, value="kuromoji-nlp"),
    @Property(name=Constants.SERVICE_RANKING, intValue=0) //give the default instance a low service ranking
})
public class KuromojiNlpEngine extends AbstractEnhancementEngine<IOException,RuntimeException> implements ServiceProperties {

    private static final Version LUCENE_VERSION = Version.LUCENE_44;
    private static final String TOKENIZER_MODE = "search"; //other modes: normal, extended
    private static final Map<String,Object> SERVICE_PROPERTIES;
    private static final Map<String,String> TOKENIZER_FACTORY_CONFIG = new HashMap<String,String>();
    private static final Map<String,String> BASE_FORM_FILTER_CONFIG = new HashMap<String,String>();
    private static final Map<String,String> POS_FILTER_CONFIG = new HashMap<String,String>();
    private static final Map<String,String> STEMM_FILTER_CONFIG = new HashMap<String,String>();

    static {
        Map<String,Object> props = new HashMap<String,Object>();
        props.put(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
            ServiceProperties.ORDERING_NLP_TOKENIZING);
        props.put(NlpServiceProperties.ENHANCEMENT_ENGINE_NLP_ROLE,
            NlpProcessingRole.Tokenizing);
        SERVICE_PROPERTIES = Collections.unmodifiableMap(props);
        TOKENIZER_FACTORY_CONFIG.put("luceneMatchVersion", LUCENE_VERSION.toString());
        TOKENIZER_FACTORY_CONFIG.put("mode", TOKENIZER_MODE);
        //we want to have tokens for punctuation
        TOKENIZER_FACTORY_CONFIG.put("discardPunctuation", "false");
        BASE_FORM_FILTER_CONFIG.put("luceneMatchVersion", LUCENE_VERSION.toString());
        POS_FILTER_CONFIG.put("luceneMatchVersion", LUCENE_VERSION.toString());
        POS_FILTER_CONFIG.put("tags", "nostoptags.txt");
        POS_FILTER_CONFIG.put("enablePositionIncrements", "true");
        STEMM_FILTER_CONFIG.put("luceneMatchVersion", LUCENE_VERSION.toString());
        STEMM_FILTER_CONFIG.put("minimumLength", "4");
    }

    private static Logger log = LoggerFactory.getLogger(KuromojiNlpEngine.class);

    @Reference(cardinality=ReferenceCardinality.OPTIONAL_UNARY)
    protected ResourceLoader parentResourceLoader;

    protected ResourceLoader resourceLoader;

    //private MappingCharFilterFactory charFilterFactory;
    private TokenizerFactory tokenizerFactory;
    private List<TokenFilterFactory> filterFactories = new ArrayList<TokenFilterFactory>();

    @Reference
    protected AnalysedTextFactory analysedTextFactory;

    protected LiteralFactory lf = LiteralFactory.getInstance();

    /**
     * holds {@link PosTag}s that are not contained in the
     * {@link org.apache.stanbol.enhancer.engines.kuromoji.Constants#POS_TAG_SET}
     */
    private Map<String,PosTag> adhocTags = new HashMap<String,PosTag>();

    /**
     * Indicate if this engine can enhance the supplied ContentItem, and if it
     * suggests enhancing it synchronously or asynchronously. The
     * {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager} can force sync/async mode if desired; it is
     * just a suggestion from the engine.
     * <p/>
     * Returns ENHANCE_ASYNC in case there is a text/plain content part and the
     * language of the content item was detected as Japanese, CANNOT_ENHANCE otherwise.
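     * <p>
     * For illustration (behaviour implied by the implementation below, not an
     * additional API guarantee):
     * <pre>{@code
     * // language "ja"    -> ENHANCE_ASYNC
     * // language "ja-JP" -> ENHANCE_ASYNC (any "ja-" prefixed tag matches)
     * // language "zh"    -> CANNOT_ENHANCE
     * // language null    -> CANNOT_ENHANCE
     * }</pre>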
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *             if the introspecting process of the content item fails
     */
    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        // check if content is present
        Map.Entry<IRI,Blob> entry = NlpEngineHelper.getPlainText(this, ci, false);
        if(entry == null || entry.getValue() == null) {
            return CANNOT_ENHANCE;
        }
        String language = getLanguage(this, ci, false);
        if("ja".equals(language) || (language != null && language.startsWith("ja-"))) {
            log.trace(" > can enhance ContentItem {} with language {}", ci, language);
            return ENHANCE_ASYNC;
        } else {
            return CANNOT_ENHANCE;
        }
    }

    /**
     * Compute enhancements for the supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     * <p/>
     * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
     * stores it as a new part in the content item. The metadata is not changed.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *             if the underlying process failed to work as expected
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
        String language = getLanguage(this, ci, false);
        if(!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
            throw new IllegalStateException("The detected language is NOT 'ja'! "
                + "As this is also checked within the #canEnhance(..) method this "
                + "indicates a bug in the used EnhancementJobManager implementation. "
                + "Please report this on the dev@apache.stanbol.org mailing list or "
                + "create a JIRA issue about this.");
        }
        //start with the Tokenizer
        TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
        //build the analyzing chain by adding all TokenFilters
        for(TokenFilterFactory filterFactory : filterFactories){
            tokenStream = filterFactory.create(tokenStream);
        }
        //Try to extract sentences based on POS tags ...
        int sentStartOffset = -1;
        //NER data
        List<NerData> nerList = new ArrayList<NerData>();
        int nerSentIndex = 0; //the next index where the NerData.context needs to be set
        NerData ner = null;
        OffsetAttribute offset = null;
        try {
            tokenStream.reset(); //required with Solr 4
            while(tokenStream.incrementToken()){
                offset = tokenStream.addAttribute(OffsetAttribute.class);
                Token token = at.addToken(offset.startOffset(), offset.endOffset());
                //Get the POS attribute and init the PosTag
                PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
                PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
                if(posTag == null){
                    posTag = adhocTags.get(posAttr.getPartOfSpeech());
                    if(posTag == null){
                        posTag = new PosTag(posAttr.getPartOfSpeech());
                        adhocTags.put(posAttr.getPartOfSpeech(), posTag);
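                        //the adhoc tag is cached above, so this warning is only
                        //logged on the first occurrence of an unmapped POS string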
                        log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
                    }
                }
                //Sentence detection by POS tag
                if(sentStartOffset < 0){ //the last token was a sentence ending
                    sentStartOffset = offset.startOffset();
                }
                if(posTag.hasPos(Pos.Point)) {
                    Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                    //add the sentence as context to the NerData instances
                    while(nerSentIndex < nerList.size()){
                        nerList.get(nerSentIndex).context = sent.getSpan();
                        nerSentIndex++;
                    }
                    sentStartOffset = -1;
                }
                //POS
                token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
                //NER
                NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
                if(ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))){
                    //write NER annotation
                    Chunk chunk = at.addChunk(ner.start, ner.end);
                    chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                    //NOTE that the fise:TextAnnotations are written later based on the nerList
                    //clean up
                    ner = null;
                }
                if(nerTag != null){
                    if(ner == null){
                        ner = new NerData(nerTag, offset.startOffset());
                        nerList.add(ner);
                    }
                    ner.end = offset.endOffset();
                }
                BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
                MorphoFeatures morpho = null;
                if(baseFormAttr != null && baseFormAttr.getBaseForm() != null){
                    morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                    morpho.addPos(posTag); //and add the posTag
                }
                InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
                //TODO: inflection form and type are currently read but not mapped
                //      to the MorphoFeatures
                inflectionAttr.getInflectionForm();
                inflectionAttr.getInflectionType();
                if(morpho != null){ //if present add the morpho annotation
                    token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
                }
            }
            //we still need to write the last sentence
            Sentence lastSent = null;
            if(offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset){
                lastSent = at.addSentence(sentStartOffset, offset.endOffset());
            }
            //and set the context of the remaining named entities
            while(nerSentIndex < nerList.size()){
                if(lastSent != null){
                    nerList.get(nerSentIndex).context = lastSent.getSpan();
                } else { //no sentence detected
                    nerList.get(nerSentIndex).context = at.getSpan();
                }
                nerSentIndex++;
            }
        } catch (IOException e) {
            throw new EngineException(this, ci, "Exception while reading from "
                + "AnalyzedText contentpart", e);
        } finally {
            try {
                tokenStream.close();
            } catch (IOException e) {/* ignore */}
        }
        //finally write the NER annotations to the metadata of the ContentItem
        final Graph metadata = ci.getMetadata();
        ci.getLock().writeLock().lock();
        try {
            Language lang = new Language("ja");
            for(NerData nerData : nerList){
                IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(
                    at.getSpan().substring(nerData.start, nerData.end), lang)));
                metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
                metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
                metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
                metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(
                    nerData.context, lang)));
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }

    @Override
    public Map<String,Object> getServiceProperties() {
        return SERVICE_PROPERTIES;
    }

    /**
     * Activate and read the properties. Initialises the Kuromoji tokenizer and
     * the token filter chain used to analyse Japanese text.
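     * <p>
     * Conceptually, {@link #computeEnhancements(ContentItem)} later combines the
     * factories configured here as follows (a sketch mirroring the code above,
     * not additional API):
     * <pre>{@code
     * TokenStream ts = tokenizerFactory.create(reader);
     * for(TokenFilterFactory filterFactory : filterFactories){
     *     //base form filter, POS stop filter, katakana stem filter
     *     ts = filterFactory.create(ts);
     * }
     * }</pre>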
     *
     * @param ce the {@link org.osgi.service.component.ComponentContext}
     */
    @Activate
    protected void activate(ComponentContext ce) throws ConfigurationException, IOException {
        log.info("activating Kuromoji tokenizing engine");
        super.activate(ce);
        //init the Solr ResourceLoader used for initialising the components:
        //first a ResourceLoader for this classloader, second one using the
        //commons.solr.core classloader and third the parentResourceLoader (if present).
        resourceLoader = new StanbolResourceLoader(KuromojiNlpEngine.class.getClassLoader(),
            new StanbolResourceLoader(parentResourceLoader));
        tokenizerFactory = new JapaneseTokenizerFactory(TOKENIZER_FACTORY_CONFIG);
        ((ResourceLoaderAware) tokenizerFactory).inform(resourceLoader);
        //base form filter
        TokenFilterFactory baseFormFilterFactory = new JapaneseBaseFormFilterFactory(BASE_FORM_FILTER_CONFIG);
        filterFactories.add(baseFormFilterFactory);
        //POS filter
        TokenFilterFactory posFilterFactory = new JapanesePartOfSpeechStopFilterFactory(POS_FILTER_CONFIG);
        ((ResourceLoaderAware) posFilterFactory).inform(resourceLoader);
        filterFactories.add(posFilterFactory);
        //Stemming
        TokenFilterFactory stemmFilterFactory = new JapaneseKatakanaStemFilterFactory(STEMM_FILTER_CONFIG);
        filterFactories.add(stemmFilterFactory);
    }

    @Deactivate
    protected void deactivate(ComponentContext context) {
        tokenizerFactory = null;
        filterFactories.clear();
        filterFactories = null;
        super.deactivate(context);
    }

    /**
     * Internal helper class that emits the {@link Sentence}s already detected
     * on an {@link AnalysedText} as tokens, so that sentence detection does
     * not need to be executed twice.
     * @author Rupert Westenthaler
     */
    protected final class AnalyzedTextSentenceTokenizer extends Tokenizer {

        private final AnalysedText at;
        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
        private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
        private Iterator<Sentence> sentences;
        private Sentence sentence = null;

        protected AnalyzedTextSentenceTokenizer(AnalysedText at) {
            super(new StringReader(at.getText().toString()));
            this.at = at;
            sentences = at.getSentences();
        }

        @Override
        public boolean incrementToken() throws IOException {
            if(sentences.hasNext()){
                sentence = sentences.next();
                termAtt.setEmpty().append(sentence.getSpan());
                offsetAtt.setOffset(sentence.getStart(), sentence.getEnd());
                typeAtt.setType("sentence");
                return true;
            } else {
                return false;
            }
        }

        @Override
        public void end() throws IOException {
            super.end(); //TokenStream contract: always call super.end() when overriding
            //set the final offset
            offsetAtt.setOffset(at.getEnd(), at.getEnd());
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            sentences = at.getSentences();
            termAtt.setEmpty();
            offsetAtt.setOffset(0, 0);
            typeAtt.setType(null);
        }
    }
}