/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.enhancer.engines.entitylinking.impl; import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION; import org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig; import org.apache.stanbol.enhancer.nlp.NlpAnnotations; import org.apache.stanbol.enhancer.nlp.model.AnalysedText; import org.apache.stanbol.enhancer.nlp.model.Chunk; import org.apache.stanbol.enhancer.nlp.model.Sentence; import org.apache.stanbol.enhancer.nlp.model.Token; import org.apache.stanbol.enhancer.nlp.model.annotation.Value; import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag; import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory; /** * Represents a Chunk (group of tokens) used as context for EntityLinking. * Typically a single {@link ChunkData#chunk} is used, but in case of * overlapping and {@link ChunkData#isProcessable processable} chunks * multiple {@link Chunk}s might be merged to a single {@link ChunkData} * instance. In such cases {@link ChunkData#chunk} represents the * first and {@link ChunkData#merged} the last of the merged chunks.<p> * {@link ChunkData#startToken} and {@link ChunkData#endToken} represent * the covered [start,end) {@link Token} indices relative to the current * sections (typically a {@link Sentence}). {@link ChunkData#getStartChar()} * and {@link ChunkData#getEndChar()} are the absolute [start,end) character * indices within the {@link AnalysedText#getSpan()} */ public class ChunkData { protected final static boolean DEFAULT_PROCESSABLE_STATE = true; /** if this Chunk represents a Named Entity **/ protected final boolean isNamedEntity; /** if the Chunk is processable */ public final boolean isProcessable; /** the Chunk */ public final Chunk chunk; /** the start token index relative to the current section (sentence) */ int startToken; /** the end token index relative to the current section (sentence) */ int endToken; /** * If this chunk has a linkable token */ boolean hasLinkable = false; /** * The number of matchable Tokens enclosed by this Chunk */ int matchableCount; /** * The start position of the first matchable {@link Token} within this * chunk */ int matchableStart = -1; /** * The start char offset of the first matchable {@link Token} within this chunk */ int matchableStartCharIndex = -1; /** * The end position of the last matchable {@link Token} within this chunk */ int matchableEnd = -1; /** * The end char offset of the last matchable {@link Token} within this chunk */ int matchableEndCharIndex = -1; /** * constructs and initializes the meta data for the parsed {@link Chunk} * @param chunk */ public ChunkData(LanguageProcessingConfig tpc, Chunk chunk){ this.chunk = chunk; Boolean process = null; for (Value<PhraseTag> phraseAnnotation : chunk.getAnnotations(PHRASE_ANNOTATION)) { if (tpc.getProcessedPhraseCategories().contains(phraseAnnotation.value().getCategory()) || tpc.getProcessedPhraseTags().contains(phraseAnnotation.value().getTag())) { if (phraseAnnotation.probability() == Value.UNKNOWN_PROBABILITY || phraseAnnotation.probability() >= tpc.getMinPhraseAnnotationProbability()) { process = true; break; } // else probability to low for inclusion } else if (phraseAnnotation.probability() == Value.UNKNOWN_PROBABILITY || phraseAnnotation.probability() >= tpc.getMinExcludePhraseAnnotationProbability()) { process = false; break; } // else probability to low for exclusion } //fallback for NER chunks in case Noun Phrases are processible and a NER //annotation is present for the parsed chunk. isNamedEntity = chunk.getAnnotation(NlpAnnotations.NER_ANNOTATION) != null; if(process == null && isNamedEntity && tpc.getProcessedPhraseCategories().contains(LexicalCategory.Noun)){ process = true; } isProcessable = process == null ? DEFAULT_PROCESSABLE_STATE : process; } /** * Getter for the start character position * @return the start character position of the selected text span. */ public int getStartChar(){ return chunk.getStart(); } /** * Getter for the end character position of the text * @return the end character position */ public int getEndChar(){ return chunk.getEnd(); } /** * If this chunk is processable * @return the state */ public boolean isProcessable() { return isProcessable; } public boolean isNamedEntity() { return isNamedEntity; } /** * If this chunk covers a linkable token * @return */ public boolean hasLinkable(){ return hasLinkable; } /** * Getter for the number of matchable tokens contained in this chunk * @return The number of matchable tokens contained in this chunk */ public int getMatchableCount() { return matchableCount; } public int getStartTokenIndex() { return startToken; } public int getEndTokenIndex() { return endToken; } /** * The index of the first matchable Token within the {@link Chunk} or * <code>-1</code> if none * @return */ public int getMatchableStart() { return matchableStart; } /** * The index of the last matchable Token within the {@link Chunk} or * <code>-1</code> if none * @return */ public int getMatchableEnd() { return matchableEnd; } /** * The char index of the start character of the first matchable {@link Token} * within the {@link Chunk} or <code>-1</code> if none. * @return */ public int getMatchableStartChar() { return matchableStartCharIndex; } /** * the char indes of the end character of the last matchable {@link Token} * within the {@link Chunk} or <code>-1</code> if none * @return */ public int getMatchableEndChar() { return matchableEndCharIndex; } }