/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.stanbol.enhancer.engines.entitycoreference;

import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Dictionary;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Deactivate;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase;
import org.apache.stanbol.enhancer.engines.entitycoreference.impl.CoreferenceFinder;
import org.apache.stanbol.enhancer.engines.entitycoreference.impl.NounPhraseFilterer;
import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Section;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.entityhub.servicesapi.Entityhub;
import org.apache.stanbol.entityhub.servicesapi.site.SiteManager;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This engine extracts noun phrases in the given text which refer to NERs. The coreference is performed by
 * matching several of the named entity's DBpedia/YAGO properties against the noun phrase tokens.
 *
 * TODO - Be able to detect possessive coreferences such as "Germany's prime minister".
 * TODO - Be able to detect products and their developers, such as "iPhone 7" and "Apple's new device".
 * TODO - Provide a configuration option which also allows coreferencing of one-word noun phrases based
 * solely on a comparison with the entity class type.
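 * <p>
 * Illustrative example (assumed, not taken from the engine documentation): given the text "Microsoft
 * released a new operating system. The software company also announced a new phone.", the noun phrase
 * "The software company" could be coreferenced to the NER "Microsoft" if its tokens match properties of
 * the corresponding entity in the referenced site (e.g. its class/type labels).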
 *
 * @author Cristian Petroaca
 *
 */
@Component(immediate = true, metatype = true)
@Service(value = EnhancementEngine.class)
@Properties(value = {
    @Property(name = EnhancementEngine.PROPERTY_NAME, value = "entity-coreference"),
    @Property(name = EntityCoReferenceEngine.CONFIG_LANGUAGES, value = "en"),
    @Property(name = EntityCoReferenceEngine.REFERENCED_SITE_ID, value = "entity-coref-dbpedia"),
    @Property(name = EntityCoReferenceEngine.ENTITY_URI_BASE, value = "http://dbpedia.org/resource/"),
    @Property(name = EntityCoReferenceEngine.MAX_DISTANCE, intValue = Constants.MAX_DISTANCE_DEFAULT_VALUE),
    @Property(name = EntityCoReferenceEngine.SPATIAL_ATTR_FOR_PERSON, value = Constants.DEFAULT_SPATIAL_ATTR_FOR_PERSON),
    @Property(name = EntityCoReferenceEngine.SPATIAL_ATTR_FOR_ORGANIZATION, value = Constants.DEFAULT_SPATIAL_ATTR_FOR_ORGANIZATION),
    @Property(name = EntityCoReferenceEngine.SPATIAL_ATTR_FOR_PLACE, value = Constants.DEFAULT_SPATIAL_ATTR_FOR_PLACE),
    @Property(name = EntityCoReferenceEngine.ORG_ATTR_FOR_PERSON, value = Constants.DEFAULT_ORG_ATTR_FOR_PERSON),
    @Property(name = EntityCoReferenceEngine.ENTITY_CLASSES_TO_EXCLUDE, value = Constants.DEFAULT_ENTITY_CLASSES_TO_EXCLUDE)})
public class EntityCoReferenceEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException>
        implements EnhancementEngine, ServiceProperties {
    private static final Integer ENGINE_ORDERING = ServiceProperties.ORDERING_POST_PROCESSING + 91;

    /**
     * Language configuration. Takes a list of ISO language codes of supported languages. Currently supported
     * are the languages given as default value.
     */
    protected static final String CONFIG_LANGUAGES = "enhancer.engine.entitycoreference.languages";

    /**
     * Referenced site configuration. Defaults to dbpedia.
     */
    protected static final String REFERENCED_SITE_ID = "enhancer.engine.entitycoreference.referencedSiteId";

    /**
     * The base URI of the entities in the referenced site (defaults to "http://dbpedia.org/resource/").
     */
    protected static final String ENTITY_URI_BASE = "enhancer.engine.entitycoreference.entity.uri.base";

    /**
     * Maximum sentence distance between the NER and the noun phrase which mentions it. -1 means no distance
     * constraint.
     */
    protected static final String MAX_DISTANCE = "enhancer.engine.entitycoreference.maxDistance";

    /**
     * Attributes used for spatial coreference when dealing with a person entity.
     */
    protected static final String SPATIAL_ATTR_FOR_PERSON = "enhancer.engine.entitycoreference.spatial.attr.person";

    /**
     * Attributes used for spatial coreference when dealing with an organization entity.
     */
    protected static final String SPATIAL_ATTR_FOR_ORGANIZATION = "enhancer.engine.entitycoreference.spatial.attr.org";

    /**
     * Attributes used for spatial coreference when dealing with a place entity.
     */
    protected static final String SPATIAL_ATTR_FOR_PLACE = "enhancer.engine.entitycoreference.spatial.attr.place";

    /**
     * Attributes used for organisational membership coreference when dealing with a person entity.
     */
    protected static final String ORG_ATTR_FOR_PERSON = "enhancer.engine.entitycoreference.org.attr.person";

    /**
     * Entity classes which will be excluded when doing the entity class type matching because they are too
     * general in nature.
     */
    protected static final String ENTITY_CLASSES_TO_EXCLUDE = "enhancer.engine.entitycoreference.entity.classes.excluded";

    /**
     * Logger
     */
    private final Logger log = LoggerFactory.getLogger(EntityCoReferenceEngine.class);

    /**
     * Service of the Entityhub that manages all the active referenced Sites. This Service is used to look up
     * the configured Referenced Site when we need to enhance a content item.
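     * If the {@link #REFERENCED_SITE_ID} property is set to "entityhub" or "local", lookups are performed
     * against the {@link Entityhub} instead of a referenced site.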
     */
    @Reference
    protected SiteManager siteManager;

    /**
     * Used to look up Entities if the {@link #REFERENCED_SITE_ID} property is set to "entityhub" or "local".
     */
    @Reference
    protected Entityhub entityhub;

    /**
     * Specialized class which filters out bad noun phrases based on the language.
     */
    private NounPhraseFilterer nounPhraseFilterer;

    /**
     * Performs the logic needed to find corefs based on the NERs and noun phrases in the text.
     */
    private CoreferenceFinder corefFinder;

    @SuppressWarnings("unchecked")
    @Activate
    protected void activate(ComponentContext ctx) throws ConfigurationException {
        super.activate(ctx);

        Dictionary<String,Object> config = ctx.getProperties();

        /* Step 1 - initialize the {@link NounPhraseFilterer} with the language config */
        String languages = (String) config.get(CONFIG_LANGUAGES);
        if (languages == null || languages.isEmpty()) {
            throw new ConfigurationException(CONFIG_LANGUAGES,
                    "The Languages Config is a required Parameter and MUST NOT be NULL or an empty String!");
        }
        nounPhraseFilterer = new NounPhraseFilterer(languages.split(","));

        /* Step 2 - initialize the {@link CoreferenceFinder} */
        String referencedSiteID = null;
        Object referencedSiteIDfromConfig = config.get(REFERENCED_SITE_ID);
        if (referencedSiteIDfromConfig == null) {
            throw new ConfigurationException(REFERENCED_SITE_ID,
                    "The ID of the Referenced Site is a required Parameter and MUST NOT be NULL!");
        }
        referencedSiteID = referencedSiteIDfromConfig.toString();
        if (referencedSiteID.isEmpty()) {
            throw new ConfigurationException(REFERENCED_SITE_ID,
                    "The ID of the Referenced Site is a required Parameter and MUST NOT be an empty String!");
        }
        if (Entityhub.ENTITYHUB_IDS.contains(referencedSiteID.toLowerCase())) {
            log.debug("Init EntityCoReferenceEngine instance for the Entityhub");
            referencedSiteID = null;
        }

        int maxDistance;
        Object maxDistanceFromConfig = config.get(MAX_DISTANCE);
        if (maxDistanceFromConfig == null) {
            maxDistance = Constants.MAX_DISTANCE_DEFAULT_VALUE;
        } else if (maxDistanceFromConfig instanceof Number) {
            maxDistance = ((Number) maxDistanceFromConfig).intValue();
        } else {
            try {
                maxDistance = Integer.parseInt(maxDistanceFromConfig.toString());
            } catch (NumberFormatException nfe) {
                throw new ConfigurationException(MAX_DISTANCE, "The Max Distance parameter must be a number");
            }
        }
        if (maxDistance < -1) {
            throw new ConfigurationException(MAX_DISTANCE,
                    "The Max Distance parameter must not be smaller than -1");
        }

        String entityUriBase = (String) config.get(ENTITY_URI_BASE);
        if (entityUriBase == null || entityUriBase.isEmpty()) {
            throw new ConfigurationException(ENTITY_URI_BASE, "The Entity Uri Base parameter cannot be empty");
        }

        String spatialAttrForPerson = (String) config.get(SPATIAL_ATTR_FOR_PERSON);
        String spatialAttrForOrg = (String) config.get(SPATIAL_ATTR_FOR_ORGANIZATION);
        String spatialAttrForPlace = (String) config.get(SPATIAL_ATTR_FOR_PLACE);
        String orgAttrForPerson = (String) config.get(ORG_ATTR_FOR_PERSON);
        String entityClassesToExclude = (String) config.get(ENTITY_CLASSES_TO_EXCLUDE);

        corefFinder = new CoreferenceFinder(languages.split(","), siteManager, entityhub, referencedSiteID,
                maxDistance, entityUriBase, spatialAttrForPerson, spatialAttrForOrg, spatialAttrForPlace,
                orgAttrForPerson, entityClassesToExclude);

        log.info("activate {}[name:{}]", getClass().getSimpleName(), getName());
    }

    @Override
    public Map<String,Object> getServiceProperties() {
        return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING,
            (Object) ENGINE_ORDERING));
    }
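    /**
     * Checks whether this engine can enhance the given {@link ContentItem}: the language of the text must
     * be detected and supported by the {@link NounPhraseFilterer}, otherwise
     * {@link EnhancementEngine#CANNOT_ENHANCE} is returned.
     */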
    @Override
    public int canEnhance(ContentItem ci) throws EngineException {
        String language = getLanguage(this, ci, false);

        if (language == null) {
            log.debug("Engine {} ignores ContentItem {} because the language could not be detected.",
                getName(), ci.getUri());
            return CANNOT_ENHANCE;
        }

        if (!nounPhraseFilterer.supportsLanguage(language)) {
            log.debug("Engine {} does not support language {}.", getName(), language);
            return CANNOT_ENHANCE;
        }

        return ENHANCE_SYNCHRONOUS;
    }

    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        /*
         * Step 1 - Build the NER list and the noun phrase list.
         *
         * TODO - the noun phrases need to be lemmatized.
         */
        Map<Integer,List<Span>> ners = new HashMap<Integer,List<Span>>();
        List<NounPhrase> nounPhrases = new ArrayList<NounPhrase>();
        extractNersAndNounPhrases(ci, ners, nounPhrases);

        /*
         * If there are no NERs to reference there's nothing to do but exit.
         */
        if (ners.size() == 0) {
            log.info("Did not find any NERs for which to do the coreferencing");
            return;
        }

        /*
         * Step 2 - Filter out bad noun phrases.
         */
        String language = getLanguage(this, ci, false);
        if (language == null) {
            log.info("Could not detect the language of the text");
            return;
        }
        nounPhraseFilterer.filter(nounPhrases, language);

        /*
         * If there are no good noun phrases there's nothing to do but exit.
         */
        if (nounPhrases.size() == 0) {
            log.info("Did not find any noun phrases with which to do the coreferencing");
            return;
        }

        /*
         * Step 3 - Extract corefs and write them as {@link NlpAnnotations#COREF_ANNOTATION}s in the
         * {@link Span}s.
         */
        corefFinder.extractCorefs(ners, nounPhrases, language);
    }

    @Deactivate
    protected void deactivate(ComponentContext ctx) {
        log.info("deactivate {}[name:{}]", getClass().getSimpleName(), getName());

        nounPhraseFilterer = null;
        corefFinder = null;

        super.deactivate(ctx);
    }

    /**
     * Extracts the NERs and the noun phrases from the given text and puts them into the given collections.
     *
     * @param ci
     *            the {@link ContentItem} providing the {@link AnalysedText}
     * @param ners
     *            map from the sentence number to the NER {@link Span}s found in that sentence
     * @param nounPhrases
     *            list which collects the {@link NounPhrase}s found in the text
     */
    private void extractNersAndNounPhrases(ContentItem ci,
                                           Map<Integer,List<Span>> ners,
                                           List<NounPhrase> nounPhrases) {
        AnalysedText at = NlpEngineHelper.getAnalysedText(this, ci, true);
        Iterator<? extends Section> sections = at.getSentences();
        if (!sections.hasNext()) { // process as a single sentence
            sections = Collections.singleton(at).iterator();
        }

        int sentenceCnt = 0;
        while (sections.hasNext()) {
            sentenceCnt++;
            Section section = sections.next();
            List<NounPhrase> sectionNounPhrases = new ArrayList<NounPhrase>();
            List<Span> sectionNers = new ArrayList<Span>();

            Iterator<Span> chunks = section.getEnclosed(EnumSet.of(SpanTypeEnum.Chunk));
            while (chunks.hasNext()) {
                Span chunk = chunks.next();

                Value<NerTag> ner = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION);
                if (ner != null) {
                    sectionNers.add(chunk);
                }

                Value<PhraseTag> phrase = chunk.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
                if (phrase != null && phrase.value().getCategory() == LexicalCategory.Noun) {
                    sectionNounPhrases.add(new NounPhrase(chunk, sentenceCnt));
                }
            }

            for (NounPhrase nounPhrase : sectionNounPhrases) {
                Iterator<Span> tokens = section.getEnclosed(EnumSet.of(SpanTypeEnum.Token));
                while (tokens.hasNext()) {
                    Span token = tokens.next();
                    if (nounPhrase.containsSpan(token)) {
                        nounPhrase.addToken(token);
                    }
                }

                for (Span sectionNer : sectionNers) {
                    if (nounPhrase.containsSpan(sectionNer)) {
                        nounPhrase.addNerChunk(sectionNer);
                    }
                }
            }

            nounPhrases.addAll(sectionNounPhrases);
            if (!sectionNers.isEmpty()) {
                ners.put(sentenceCnt, sectionNers);
            }
        }
    }
}