/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.stanbol.enhancer.engines.lucenefstlinking;

import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.NER_ANNOTATION;

import java.io.IOException;
import java.security.AccessController;
import java.security.PrivilegedAction;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NavigableMap;
import java.util.Set;

import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.iterators.FilterIterator;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.nlp.model.Chunk;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.opensextant.solrtexttagger.TagClusterReducer;
import org.opensextant.solrtexttagger.TagLL;
import org.opensextant.solrtagger.TaggingAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class that ensures that only Tokens within a {@link Chunk} with a
 * {@link NerTag} are processed.<p>
 * This is ensured on two places:<ol>
 * <li> Classifies Tokens in the Solr {@link TokenStream} with the
 * {@link TaggingAttribute} based on {@link NerTag}s present in the
 * {@link AnalysedText}.<p>
 * <li> Implements {@link TagClusterReducer} to ensure that all {@link TagLL tags}
 * that do not cover the whole Named Entity are removed from the Cluster.
 * </ol>
 * <b>Implementation Details</b><p>
 * The {@link TokenStream} implementation of this class does set
 * <code>{@link TaggingAttribute#isTaggable()} == true</code> if the tokens do
 * overlap with a {@link Chunk} having an {@link NerTag}.
 * <p>
 * The {@link TagClusterReducer} implementation keeps track of Chunks with
 * {@link NerTag} while iterating over the {@link TokenStream} and adds them to
 * the end of a List. When {@link TagClusterReducer#reduce(TagLL[])} is called
 * tags of the cluster are checked if they do cover Chunks with a {@link NerTag}.
 * If they do not they are removed from the cluster.
 * <p>
 * This implementation was derived from the {@link LinkableTokenFilter}
 *
 * @author Rupert Westenthaler
 */
public final class NamedEntityTokenFilter extends TokenFilter implements TagClusterReducer {

    private final Logger log = LoggerFactory.getLogger(NamedEntityTokenFilter.class);

    /**
     * The NLP processing results
     */
    private AnalysedText at;
    /**
     * Iterator over all {@link Chunk}s in the {@link AnalysedText} that do
     * have an {@link NerTag}
     */
    private Iterator<Chunk> neChunks;
    protected final CharTermAttribute termAtt;
    protected final OffsetAttribute offset;
    protected final TaggingAttribute taggable;
    /** Number of tokens marked taggable (FST lookups) — debug metric only. */
    private int lookupCount = 0;
    /** Number of tokens consumed from the parent stream — debug metric only. */
    private int incrementCount = 0;
    /**
     * List with {@link Chunk}s having {@link NerTag}s. This is used by
     * the {@link #reduce(TagLL[])} method to check if {@link TagLL tags}
     * do cover Named Entities detected in the text.
     */
    private List<Chunk> nePhrases;
    private final NavigableMap<int[],Set<String>> nePhrasesTypes;
    /** The current (most recently consumed) Named Entity chunk, or <code>null</code>. */
    private Chunk neChunk;
    /** <code>true</code> if all Named Entity types are accepted. */
    protected final boolean wildcardType;
    protected final Set<String> neTypes;

    /**
     * A Token Filter for Named Entities of the configured types. Also collects
     * '<code>span -&gt; type</code>' mappings for Named Entities.
     * @param input the input token stream for the parsed text
     * @param at the {@link AnalysedText} containing {@link NerTag} values
     * @param lang the language of the text (currently not used by this filter)
     * @param neTypes the string {@link NerTag#getType()} and {@link NerTag#getTag()}
     * values of enabled Named Entities. If <code>null</code> or containing the
     * <code>null</code> element all types will be accepted.
     * @param nePhrasesTypes The {@link NavigableMap} used to store the spans of
     * named entities as key and the set of their {@link NerTag#getTag()} and
     * {@link NerTag#getType()} as values. Those information are collected while
     * iterating over the text (by the {@link NamedEntityPredicate}) and are
     * used later for filtering {@link Match}es based on the type of the Entities.
     * Typically the {@link TaggingSession#entityMentionTypes} is parsed as this
     * parameter.
     */
    protected NamedEntityTokenFilter(TokenStream input, AnalysedText at, String lang,
            Set<String> neTypes, NavigableMap<int[],Set<String>> nePhrasesTypes) {
        super(input);
        //STANBOL-1177: add attributes in doPrivileged to avoid
        //AccessControlException: access denied ("java.lang.RuntimePermission" "getClassLoader")
        termAtt = AccessController.doPrivileged(new PrivilegedAction<CharTermAttribute>() {
            @Override
            public CharTermAttribute run() {
                return addAttribute(CharTermAttribute.class);
            }});
        offset = AccessController.doPrivileged(new PrivilegedAction<OffsetAttribute>() {
            @Override
            public OffsetAttribute run() {
                return addAttribute(OffsetAttribute.class);
            }});
        taggable = AccessController.doPrivileged(new PrivilegedAction<TaggingAttribute>() {
            @Override
            public TaggingAttribute run() {
                return addAttribute(TaggingAttribute.class);
            }});
        this.at = at;
        //a null set or a set containing null means "accept every type"
        this.wildcardType = neTypes == null || neTypes.contains(null);
        this.neTypes = neTypes;
        this.nePhrasesTypes = nePhrasesTypes;
    }

    @SuppressWarnings("unchecked")
    @Override
    public void reset() throws IOException {
        super.reset();
        nePhrases = new LinkedList<Chunk>();
        //only iterate over Chunks accepted by the NamedEntityPredicate
        neChunks = new FilterIterator(at.getChunks(), new NamedEntityPredicate());
    }

    /**
     * Consumes the next token of the parent stream and sets the
     * {@link TaggingAttribute} based on whether the token overlaps a Named
     * Entity {@link Chunk}. Returns <code>false</code> as soon as no further
     * Named Entities are present, as remaining tokens can not be tagged anyway.
     */
    @Override
    public boolean incrementToken() throws IOException {
        if(!input.incrementToken()){
            return false; //no more tokens in the parent token stream
        }
        incrementCount++; //count each consumed token exactly once
        if(log.isTraceEnabled()){
            log.trace("> solr:[{},{}] {}",new Object[]{
                    offset.startOffset(), offset.endOffset(), termAtt});
        }
        //advance to the first NE chunk that does not end before this token;
        //chunks passed on the way are queued for reduce(TagLL[])
        while((neChunk == null || neChunk.getEnd() < offset.startOffset())
                && neChunks.hasNext()){
            neChunk = neChunks.next();
            nePhrases.add(neChunk);
        }
        if(neChunk == null){ //no (more) Named Entities in the text
            taggable.setTaggable(false);
            log.debug("lookup percentage: {}",lookupCount*100/(float)incrementCount);
            return false;
        //NOTE: interval overlap requires BOTH conditions (the original '||' was
        //almost always true because the advance loop guarantees
        //neChunk.getEnd() >= offset.startOffset())
        } else if(offset.endOffset() > neChunk.getStart()
                && offset.startOffset() < neChunk.getEnd()){
            //set taggable to true as the token overlaps with the current chunk
            taggable.setTaggable(true);
            if(log.isTraceEnabled()){
                log.trace("lookup: token [{},{}]: {} | named Entity [{},{}]:{}",
                    new Object[]{ offset.startOffset(), offset.endOffset(), termAtt,
                            neChunk.getStart(), neChunk.getEnd(), neChunk.getSpan()});
            }
            lookupCount++;
        } else {
            taggable.setTaggable(false);
        }
        return true;
    }

    /**
     * Removes all {@link TagLL tags} from the cluster that do not fully cover
     * a Named Entity phrase collected while iterating over the token stream.
     */
    @Override
    public void reduce(TagLL[] head) {
        //(1) reduce Tags based on named entity phrases.
        for(TagLL tag = head[0]; tag != null; tag = tag.getNextTag()) {
            int start = tag.getStartOffset();
            int end = tag.getEndOffset();
            Chunk nePhrase = nePhrases.isEmpty() ? null : nePhrases.get(0);
            //drop phrases that end before the current tag starts (tags are
            //iterated in order, so those phrases can never be covered later)
            while(nePhrase != null && nePhrase.getEnd() <= start){
                nePhrases.remove(0);
                nePhrase = nePhrases.isEmpty() ? null : nePhrases.get(0);
            }
            if(nePhrase == null || !(start <= nePhrase.getStart() && end >= nePhrase.getEnd())){
                //does not cover any named entity phrase
                tag.removeLL(); //remove the tag from the cluster
                if(log.isTraceEnabled()){
                    log.trace(" > reduce tag {} - does not cover {}", tag, nePhrase);
                }
            } else if(log.isTraceEnabled()) { //the current Tag covers a named entity phrase
                log.trace(" > keep tag {} for {}", tag, nePhrase);
            }
        }
    }

    /**
     * {@link Predicate} used to select Named Entities based on matching
     * {@link NerTag#getTag()} and {@link NerTag#getType()} values against the
     * {@link NamedEntityTokenFilter#neTypes} configuration. As a side effect
     * this also collects the {@link NamedEntityTokenFilter#nePhrasesTypes}
     * information. This avoids a 2nd pass over the {@link AnalysedText} to
     * collect those information.
     * @author Rupert Westenthaler
     */
    final class NamedEntityPredicate implements Predicate {

        @Override
        public boolean evaluate(Object o) {
            if(o instanceof Chunk){
                Chunk chunk = (Chunk)o;
                Value<NerTag> nerValue = chunk.getAnnotation(NER_ANNOTATION);
                if(nerValue != null){
                    NerTag nerTag = nerValue.value();
                    String nerType = nerTag.getType() != null ?
                            nerTag.getType().getUnicodeString() : null;
                    if( wildcardType || neTypes.contains(nerTag.getTag())
                            || (nerType != null && neTypes.contains(nerType))){
                        //side effect: record the span -> {tag, type} mapping
                        int[] span = new int[]{chunk.getStart(), chunk.getEnd()};
                        Set<String> types = nePhrasesTypes.get(span);
                        if(types == null){
                            types = new HashSet<String>(4);
                            nePhrasesTypes.put(span, types);
                        }
                        types.add(nerType);
                        types.add(nerTag.getTag());
                        return true;
                    }
                }
            }
            return false;
        }
    }
}