/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 *
 */
package org.apache.stanbol.enhancer.engines.keywordextraction.linking;

import java.util.Iterator;

import org.apache.stanbol.commons.opennlp.TextAnalyzer;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;

/**
 * Content that has already been analysed with NLP tools and is to be linked
 * with Entities of an {@link EntitySearcher}.
 * <p>
 * The linking process only requires the text to be tokenized. Every other
 * feature (sentence detection, POS tags and Chunks) is optional, but does
 * improve the performance and, to a smaller extent, also the results of the
 * linking process.
 * <p>
 * TODO:
 * <ul>
 * <li> Find a better name
 * <li> The API is not optimal. In general the {@link TextAnalyzer} and the
 *      {@link AnalysedContent} interface do not play well together :(
 * </ul>
 *
 * @author Rupert Westenthaler
 */
public interface AnalysedContent {

    /**
     * Provides the Iterator over the analysed sentences. Implementations are
     * expected to always return the same Iterator instance.
     *
     * @return the iterator over the analysed sentences
     */
    Iterator<AnalysedText> getAnalysedText();

    /**
     * Checks, based on the POS tag of a {@link Token}, whether that Token
     * should be used to search for Concepts within the Taxonomy.
     *
     * @param posTag the POS tag to check
     * @param posProb the probability of the POS tag, or {@code 1.0} if not
     *        available
     * @return {@code true} if Tokens with this POS tag should be included in
     *         searches, otherwise {@code false}. If this information is not
     *         available (e.g. no set of Tags that need to be processed is
     *         defined) this method MUST return {@code null}
     */
    Boolean processPOS(String posTag, double posProb);

    /**
     * Checks whether a chunk should be used to search for Concepts.
     *
     * @param chunkTag the tag (type) of the chunk
     * @param chunkProb the probability of the chunk tag, or {@code 1.0} if
     *        not available
     * @return {@code true} if chunks with this tag (type) should be processed
     *         (used to search for matches of concepts) and {@code false} if
     *         not. If this information is not available (e.g. no set of Tags
     *         that need to be processed is defined) this method MUST return
     *         {@code null}
     */
    Boolean processChunk(String chunkTag, double chunkProb);

    /**
     * Tokenizes the given label.
     *
     * @param label the label to tokenize
     * @return the tokens of the label
     */
    String[] tokenize(String label);

}