/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.commons.opennlp;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import opennlp.tools.chunker.Chunker;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.Span;
import org.apache.felix.scr.annotations.Reference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @deprecated replaced by STANBOL-733 (Stanbol NLP processing module)
*/
public class TextAnalyzer {
/** Logger used by this class for model/tokenizer lookup messages. */
private static final Logger log = LoggerFactory.getLogger(TextAnalyzer.class);
/**
* The OpenNLP component queried for models and tokenizers. Assigned once
* by the constructor, which rejects <code>null</code>.
*/
@Reference
private final OpenNLP openNLP;
/**
* The configuration of this analyzer. The constructor creates a default
* {@link TextAnalyzerConfig} if <code>null</code> is parsed.
*/
private final TextAnalyzerConfig config;
/**
 * Mutable configuration of a {@link TextAnalyzer}. It holds the switches that
 * select the tokenizer and enable/disable the POS tagger, sentence detector
 * and chunker, plus the minimum POS tag probability handed to the
 * {@link PosTypeChunker}.
 * @deprecated replaced by STANBOL-733 (Stanbol NLP processing module)
 */
public static final class TextAnalyzerConfig {
    protected boolean forceSimpleTokenizer = false; //default to false
    protected boolean forceKeywordTokenizer = false; //default to false
    protected boolean enablePosTagger = true;
    protected boolean enableChunker = true;
    protected boolean enableSentenceDetector = true;
    protected boolean enablePosTypeChunker = true;
    protected boolean forcePosTypeChunker = true;
    /**
     * The minimum POS type probability used by the PosTypeChunker
     */
    private double minPosTagProbability = 0.75;

    /**
     * @return <code>true</code> if the {@link SimpleTokenizer} is forced
     */
    public final boolean isSimpleTokenizerForced() {
        return forceSimpleTokenizer;
    }

    /**
     * Forces (or stops forcing) the use of the {@link SimpleTokenizer}.
     * Forcing it resets the keyword tokenizer flag, as only one tokenizer
     * can be forced at a time.
     * @param useSimpleTokenizer the new state
     */
    public final void forceSimpleTokenizer(boolean useSimpleTokenizer) {
        this.forceSimpleTokenizer = useSimpleTokenizer;
        if (useSimpleTokenizer) {
            this.forceKeywordTokenizer = false;
        }
    }

    /**
     * @return <code>true</code> if the keyword tokenizer is forced
     */
    public final boolean isKeywordTokenizerForced() {
        return forceKeywordTokenizer;
    }

    /**
     * Forces (or stops forcing) the use of the keyword tokenizer.
     * Forcing it resets the simple tokenizer flag, as only one tokenizer
     * can be forced at a time.
     * @param useKeywordTokenizer the new state
     */
    public final void forceKeywordTokenizer(boolean useKeywordTokenizer) {
        this.forceKeywordTokenizer = useKeywordTokenizer;
        if (useKeywordTokenizer) {
            this.forceSimpleTokenizer = false;
        }
    }

    /**
     * @return <code>true</code> if POS tagging is enabled
     */
    public final boolean isPosTaggerEnable() {
        return enablePosTagger;
    }

    /**
     * @param enablePosTagger enables/disables POS tagging
     */
    public final void enablePosTagger(boolean enablePosTagger) {
        this.enablePosTagger = enablePosTagger;
    }

    /**
     * @return <code>true</code> if chunking is enabled
     */
    public final boolean isChunkerEnabled() {
        return enableChunker;
    }

    /**
     * @param enableChunker enables/disables chunking
     */
    public final void enableChunker(boolean enableChunker) {
        this.enableChunker = enableChunker;
    }

    /**
     * @return <code>true</code> if sentence detection is enabled
     */
    public final boolean isSentenceDetectorEnabled() {
        return enableSentenceDetector;
    }

    /**
     * @param enableSentenceDetector enables/disables sentence detection
     */
    public final void enableSentenceDetector(boolean enableSentenceDetector) {
        this.enableSentenceDetector = enableSentenceDetector;
    }

    /**
     * @return <code>true</code> if the {@link PosTypeChunker} is enabled
     */
    public final boolean isPosTypeChunkerEnabled() {
        return enablePosTypeChunker;
    }

    /**
     * Enables the use of the {@link PosTypeChunker} if no {@link Chunker} for
     * the current {@link TextAnalyzer#getLanguage() language} is available.
     * Disabling it also clears the {@link #isPosTypeChunkerForced() forced}
     * state.
     * @param enablePosTypeChunker the new state
     */
    public final void enablePosTypeChunker(boolean enablePosTypeChunker) {
        this.enablePosTypeChunker = enablePosTypeChunker;
        if (!enablePosTypeChunker) {
            forcePosTypeChunker(enablePosTypeChunker);
        }
    }

    /**
     * @return <code>true</code> if the {@link PosTypeChunker} is forced
     */
    public final boolean isPosTypeChunkerForced() {
        return forcePosTypeChunker;
    }

    /**
     * Forces the use of the {@link PosTypeChunker} even if a {@link Chunker}
     * for the current language would be available. Forcing it also
     * {@link #enablePosTypeChunker(boolean) enables} it.
     * @param forcePosTypeChunker the new state
     */
    public final void forcePosTypeChunker(boolean forcePosTypeChunker) {
        this.forcePosTypeChunker = forcePosTypeChunker;
        if (forcePosTypeChunker) {
            enablePosTypeChunker(true);
        }
    }

    /**
     * Getter for the minimum POS tag probability so that the
     * {@link PosTypeChunker} processes a POS tag.
     * @return the minimum POS tag probability
     */
    public final double getMinPosTypeProbability() {
        return minPosTagProbability;
    }

    /**
     * Setter for the minimum POS tag probability so that the
     * {@link PosTypeChunker} processes a POS tag.
     * @param probability The probability [0..1] or value &lt; 0 to
     * deactivate this feature
     * @throws IllegalArgumentException if the parsed value is &gt; 1 or
     * not a number (NaN); the current setting is left untouched
     */
    public final void setMinPosTagProbability(double probability) {
        // NaN fails every comparison, so it has to be rejected explicitly
        if (Double.isNaN(probability) || probability > 1) {
            // report the rejected argument, not the currently configured value
            throw new IllegalArgumentException("The minimum POS tag probability MUST be set to a value <= 1 (parsed:"+probability+")");
        }
        this.minPosTagProbability = probability;
    }
}
/**
* The POS tagger, lazily created by {@link #getPosTagger()}
*/
private POSTaggerME posTagger;
/**
* used to ensure that {@link #openNLP} is only asked once for the {@link POSTaggerME}
* of the parsed {@link #language}
*/
private boolean posTaggerNotAvailable;
/**
* The sentence detector, lazily created by {@link #getSentenceDetector()}
*/
private SentenceDetector sentenceDetector;
/**
* used to ensure that {@link #openNLP} is only asked once for the {@link SentenceDetector}
* of the parsed {@link #language}
*/
private boolean sentenceDetectorNotAvailable;
/**
* The chunker, lazily created by {@link #getChunker()}
*/
private ChunkerME chunker;
/**
* used to ensure that {@link #openNLP} is only asked once for the {@link ChunkerME}
* of the parsed {@link #language}
*/
private boolean chunkerNotAvailable;
/**
* The POS type based chunker, lazily created by {@link #getPosTypeChunker()}
*/
private PosTypeChunker posTypeChunker;
/**
* used to ensure only a single try to init a {@link PosTypeChunker} for
* the parsed {@link #language}
*/
private boolean posTypeChunkerNotAvailable;
/**
* The Tokenizer, lazily resolved by {@link #getTokenizer()}
*/
private Tokenizer tokenizer;
/**
* The language parsed to the constructor (may be <code>null</code>)
*/
private final String language;
/**
 * Convenience constructor that uses the default
 * {@link TextAnalyzerConfig configuration}; delegates to
 * {@link #TextAnalyzer(OpenNLP, String, TextAnalyzerConfig)} with a
 * <code>null</code> config.<p>
 * If <code>null</code> is parsed as language, than a minimal configuration
 * that tokenizes the text using the {@link SimpleTokenizer} is used.
 * @param openNLP The openNLP configuration to be used to analyze the text
 * @param language the language or <code>null</code> if not known.
 * @throws IllegalArgumentException if the parsed OpenNLP component is
 * <code>null</code>
 */
public TextAnalyzer(OpenNLP openNLP,String language){
    this(openNLP, language, (TextAnalyzerConfig) null);
}
/**
 * Creates a TextAnalyzer for the parsed OpenNLP component, language and
 * configuration.<p>
 * If <code>null</code> is parsed as language, than a minimal configuration
 * that tokenizes the text using the {@link SimpleTokenizer} is used.
 * @param openNLP The openNLP configuration to be used to analyze the text
 * @param language the language or <code>null</code> if not known.
 * @param config the configuration or <code>null</code> to use a new
 * {@link TextAnalyzerConfig} with its default values
 * @throws IllegalArgumentException if the parsed OpenNLP component is
 * <code>null</code>
 */
public TextAnalyzer(OpenNLP openNLP,String language, TextAnalyzerConfig config){
    if (openNLP == null) {
        throw new IllegalArgumentException("The OpenNLP component MUST NOT be NULL");
    }
    this.openNLP = openNLP;
    this.language = language;
    if (config != null) {
        this.config = config;
    } else {
        // no configuration parsed: fall back to the defaults
        this.config = new TextAnalyzerConfig();
    }
}
/**
 * Lazily creates the {@link POSTaggerME} for the {@link #language}.
 * The POS model is requested from {@link #openNLP} at most once; a
 * missing model or an {@link IOException} while loading it is remembered
 * so that later calls return <code>null</code> without a new lookup.
 * @return the POS tagger or <code>null</code> if POS tagging is disabled
 * in the configuration or no POS model could be loaded
 */
protected final POSTaggerME getPosTagger() {
    if (!config.enablePosTagger) {
        return null;
    }
    if (posTagger != null || posTaggerNotAvailable) {
        // already initialised or already known to be unavailable
        return posTagger;
    }
    try {
        POSModel model = openNLP.getPartOfSpeechModel(language);
        if (model == null) {
            log.debug("No POS Model for language '{}'",language);
            posTaggerNotAvailable = true;
        } else {
            posTagger = new POSTaggerME(model);
        }
    } catch (IOException e) {
        log.info("Unable to load POS Model for language '"+language+"'",e);
        posTaggerNotAvailable = true;
    }
    return posTagger;
}
/**
 * Getter for the Tokenizer used by this analyzer. The tokenizer is
 * resolved on the first call: a forced {@link SimpleTokenizer} wins over a
 * forced {@link KeywordTokenizer}; otherwise {@link #openNLP} is asked for
 * the tokenizer of the {@link #language} and the {@link SimpleTokenizer}
 * is used as fallback if none is returned.
 * @return the Tokenizer
 */
public final Tokenizer getTokenizer(){
    if (tokenizer != null) {
        return tokenizer;
    }
    Tokenizer resolved;
    if (config.forceSimpleTokenizer) {
        resolved = SimpleTokenizer.INSTANCE;
    } else if (config.forceKeywordTokenizer) {
        resolved = KeywordTokenizer.INSTANCE;
    } else {
        resolved = openNLP.getTokenizer(language);
        if (resolved == null) {
            log.debug("No Tokenizer for Language '{}': fall back to SimpleTokenizer!",language);
            resolved = SimpleTokenizer.INSTANCE;
        }
    }
    tokenizer = resolved;
    return resolved;
}
/**
 * Lazily creates the {@link ChunkerME} for the {@link #language}.
 * The chunker model is requested from {@link #openNLP} at most once; a
 * missing model or an {@link IOException} while loading it is remembered
 * so that later calls return <code>null</code> without a new lookup.
 * @return the chunker or <code>null</code> if chunking is disabled, the
 * {@link PosTypeChunker} is forced by the configuration or no chunker
 * model could be loaded
 */
protected final ChunkerME getChunker(){
    boolean modelChunkerAllowed = config.enableChunker && !config.forcePosTypeChunker;
    if (!modelChunkerAllowed) {
        return null;
    }
    if (chunker != null || chunkerNotAvailable) {
        // already initialised or already known to be unavailable
        return chunker;
    }
    try {
        ChunkerModel model = openNLP.getChunkerModel(language);
        if (model == null) {
            log.debug("No Chunker Model for language {}",language);
            chunkerNotAvailable = true;
        } else {
            chunker = new ChunkerME(model);
        }
    } catch (IOException e) {
        log.info("Unable to load Chunker Model for language "+language,e);
        chunkerNotAvailable = true;
    }
    return chunker;
}
/**
 * Lazily obtains the {@link PosTypeChunker} for the {@link #language}.
 * {@link PosTypeChunker#getInstance} is called at most once; if it returns
 * <code>null</code> this is remembered and later calls return
 * <code>null</code> without a new lookup.
 * @return the POS type based chunker or <code>null</code> if chunking,
 * POS tagging or the PosTypeChunker itself is disabled in the
 * configuration, or if no instance is available for the language
 */
protected final PosTypeChunker getPosTypeChunker(){
    // honour TextAnalyzerConfig#enablePosTypeChunker(boolean): without this
    // check the flag had no effect on the chunker selection
    if(!config.enableChunker || !config.enablePosTagger || !config.enablePosTypeChunker){
        return null;
    }
    if(posTypeChunker == null && !posTypeChunkerNotAvailable){
        posTypeChunker = PosTypeChunker.getInstance(language,config.minPosTagProbability);
        posTypeChunkerNotAvailable = posTypeChunker == null;
    }
    return posTypeChunker;
}
/**
 * Lazily creates the {@link SentenceDetector} for the {@link #language}.
 * The sentence model is requested from {@link #openNLP} at most once; a
 * missing model or an {@link IOException} while loading it is remembered
 * so that later calls return <code>null</code> without a new lookup.
 * @return the sentence detector or <code>null</code> if sentence
 * detection is disabled in the configuration or no sentence model could
 * be loaded
 */
protected final SentenceDetector getSentenceDetector() {
    if (!config.enableSentenceDetector) {
        return null;
    }
    if (sentenceDetector != null || sentenceDetectorNotAvailable) {
        // already initialised or already known to be unavailable
        return sentenceDetector;
    }
    try {
        SentenceModel model = openNLP.getSentenceModel(language);
        if (model == null) {
            log.debug("No Sentence Detection Model for language '{}'",language);
            sentenceDetectorNotAvailable = true;
        } else {
            sentenceDetector = new SentenceDetectorME(model);
        }
    } catch (IOException e) {
        log.info("Unable to load Sentence Detection Model for language '"+language+"'",e);
        sentenceDetectorNotAvailable = true;
    }
    return sentenceDetector;
}
/**
 * @return the OpenNLP component parsed to the constructor
 */
public final OpenNLP getOpenNLP() {
    return this.openNLP;
}
/**
 * Getter for the (mutable) configuration used by this analyzer.
 * @return the config (never <code>null</code>)
 */
public final TextAnalyzerConfig getConfig() {
    return this.config;
}
/**
 * Getter for the language parsed to the constructor.
 * @return the language (may be <code>null</code>)
 */
public final String getLanguage() {
    return this.language;
}
/**
 * Analyses the parsed text in a single chunk. No sentence detector is used
 * @param sentence the sentence (text) to analyse
 * @return the Analysed text
 * @throws IllegalArgumentException if the parsed sentence or the language
 * of this analyzer is <code>null</code> or empty
 */
public AnalysedText analyseSentence(String sentence){
    AnalysedText analysed = new AnalysedText(sentence, language);
    return analysed;
}
/**
 * Analyses sentence by sentence when {@link Iterator#next()} is called on
 * the returned Iterator. Changes to the configuration of this class will
 * have an effect on the analysis results of this iterator.<p>
 * If no sentence detector is available the whole text is parsed at once.
 * A <code>null</code> or empty text results in an empty iterator.
 * @param text The text to analyse
 * @return Iterator that analyses the parsed text sentence by sentence on
 * calls to {@link Iterator#next()}.
 */
public Iterator<AnalysedText> analyse(String text){
    Iterator<AnalysedText> sentences = new TextAnalysisIterator(text, language);
    return sentences;
}
/**
 * Iterator that analyses a text sentence by sentence. The sentence spans
 * are determined once in the constructor; the analysis of a sentence
 * happens on the call to {@link #next()}.
 * @deprecated replaced by STANBOL-733 (Stanbol NLP processing module)
 */
private final class TextAnalysisIterator implements Iterator<AnalysedText> {
    /** The whole text */
    private final String text;
    /** The spans of the sentences within {@link #text} */
    private final Span[] sentenceSpans;
    /** Index of the next sentence span to be analysed */
    private int current = 0;
    /** The language parsed to the created {@link AnalysedText} instances */
    private final String language;

    /**
     * @param text the text to analyse; <code>null</code> or empty results
     * in an iterator without elements
     * @param language the language of the text
     */
    private TextAnalysisIterator(String text,String language){
        this.text = text;
        this.language = language;
        if(text == null || text.isEmpty()){
            sentenceSpans = new Span[]{};
        } else {
            SentenceDetector sd = getSentenceDetector();
            if(sd != null){
                sentenceSpans = sd.sentPosDetect(text);
            } else {
                // no sentence detector: treat the whole text as one sentence
                sentenceSpans = new Span[]{new Span(0, text.length())};
            }
        }
    }

    @Override
    public boolean hasNext() {
        return sentenceSpans.length > current;
    }

    /**
     * Analyses and returns the next sentence.
     * @throws java.util.NoSuchElementException if all sentences were
     * already consumed
     */
    @Override
    public AnalysedText next() {
        if(!hasNext()){
            // required by the Iterator contract (was an
            // ArrayIndexOutOfBoundsException before)
            throw new java.util.NoSuchElementException(
                "All "+sentenceSpans.length+" sentences of the parsed text are already consumed");
        }
        Span sentenceSpan = sentenceSpans[current];
        String sentence = sentenceSpan.getCoveredText(text).toString();
        current++; //mark this as consumed and navigate to the next
        return new AnalysedText(sentence,language,sentenceSpan.getStart());
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException(
            "Removal of Sentences of the prsed Text is not supported!");
    }
}
/**
* @deprecated replaced by STANBOL-733 (Stanbol NLP processing module)
*/
public final class AnalysedText {
//NOTE: Members are protected to allow the JVM direct access
/**
* The analysed sentence (never <code>null</code> nor empty)
*/
protected final String sentence;
/**
* Final and {@link Collections#unmodifiableList(List) unmodifiable list}
* with the tokens of the analysed {@link #sentence}.
*/
protected final List<Token> tokens;
/**
* Final and {@link Collections#unmodifiableList(List) unmodifiable list}
* with the chunks of the analysed {@link #sentence} or <code>null</code>
* if no chunks are available
*/
protected final List<Chunk> chunks;
/**
* The offset of the sentence with respect to the whole text. Note that
* {@link AnalysedText this class} only holds the offset and no reference
* to the whole text. <code>0</code> indicates that this represents the
* start of the text (this may also indicate that the {@link #sentence}
* represents the whole analysed text).
*/
private final int offset;
/**
* The language of the analyzed text (never <code>null</code> nor empty)
*/
protected String language;
/**
 * Analyses the parsed sentence with an offset of <code>0</code>.
 * @param sentence the text to analyse
 * @param language the language of the text
 */
private AnalysedText(String sentence, String language){
    this(sentence, language, 0);
}
/**
* Analyses the parsed sentence: tokenizes it, assigns POS tags if a POS
* tagger is available and creates chunks if POS tags and a chunker (the
* {@link ChunkerME} is preferred over the {@link PosTypeChunker}) are
* available. The results are stored in the unmodifiable {@link #tokens}
* and {@link #chunks} lists.
* @param sentence the text to analyse
* @param language the language of the text
* @param offset the offset of the sentence within the whole text
* @throws IllegalArgumentException if the sentence or the language is
* <code>null</code> or empty or if the offset is negative
*/
private AnalysedText(String sentence,String language, int offset){
if(sentence == null || sentence.isEmpty()){
throw new IllegalArgumentException(
"The parsed Sentence MUST NOT be NULL nor empty!");
}
this.sentence = sentence;
//NOTE(review): the TextAnalyzer constructors document NULL as a valid
// language, but it is rejected here - confirm which one is intended
if(language == null || language.isEmpty()){
throw new IllegalArgumentException("The parsed language MUST NOT be NULL nor empty");
}
this.language = language;
if(offset < 0){
throw new IllegalArgumentException(
"The parsed offset MUST NOT be a negative number (offset="+offset+")");
}
this.offset = offset;
//obtain the (lazily initialised) components of the enclosing TextAnalyzer
Span[] tokenSpans = getTokenizer().tokenizePos(sentence);
POSTaggerME tagger = getPosTagger();
ChunkerME chunker = getChunker();
PosTypeChunker posTypeChunker = getPosTypeChunker();
//the covered text of each token span
String[] tokens = new String[tokenSpans.length];
for(int ti = 0; ti<tokenSpans.length;ti++) {
tokens[ti] = tokenSpans[ti].getCoveredText(sentence).toString();
}
//per token: the POS tags and their probabilities (null without a tagger)
String[][] posTags;
double[][] posProbs;
Span[] chunkSpans;
double[] chunkProps;
if(tagger != null){
posTags = new String[tokens.length][];
posProbs = new double[tokens.length][];
//get the topK POS tags and props and copy it over to the 2dim Arrays
Sequence[] posSequences = tagger.topKSequences(tokens);
//extract the POS tags and props for the current token from the
//posSequences.
//NOTE: Sequence includes always POS tags for all Tokens. If
// less then posSequences.length are available it adds the
// best match for all followings.
// We do not want such copies.
//buffers reused for every token; only the first j entries are copied
String[] actPos = new String[posSequences.length];
double[] actProp = new double[posSequences.length];
for(int i=0;i<tokenSpans.length;i++){
boolean done = false;
int j = 0;
while( j < posSequences.length && !done){
String p = posSequences[j].getOutcomes().get(i);
//stop as soon as a lower ranked sequence repeats the top ranked tag
done = j > 0 && p.equals(actPos[0]);
if(!done){
actPos[j] = p;
actProp[j] = posSequences[j].getProbs()[i];
j++;
}
}
posTags[i] = new String[j];
System.arraycopy(actPos, 0, posTags[i], 0, j);
posProbs[i] = new double[j];
System.arraycopy(actProp, 0, posProbs[i], 0, j);
}
//posProbs = tagger.probs();
if(chunker != null){
//we still need the Array of the best ranked POS tags for the chunker
String[] pos = posSequences[0].getOutcomes().toArray(new String[tokens.length]);
chunkSpans = chunker.chunkAsSpans(tokens, pos);
//NOTE(review): presumably one probability per token rather than per
// chunk - TODO confirm against ChunkerME#probs()
chunkProps = chunker.probs();
} else if(posTypeChunker != null){
chunkSpans = posTypeChunker.chunkAsSpans(tokens, posTags, posProbs);
//no probabilities are read from the PosTypeChunker: use 1.0 for all chunks
chunkProps = new double[chunkSpans.length];
Arrays.fill(chunkProps, 1.0);
} else {
chunkSpans = null;
chunkProps = null;
}
} else {
//without POS tags no chunking is performed
posTags = null;
posProbs = null;
chunkSpans = null;
chunkProps = null;
}
List<Token> tokenList = new ArrayList<Token>(tokenSpans.length);
for(int i=0;i<tokenSpans.length;i++){
tokenList.add(new Token(tokenSpans[i], tokens[i],
posTags == null ? null: posTags[i],
posProbs == null ? null : posProbs[i]));
}
//assign the list to the member var but make unmodifiable!
this.tokens = Collections.unmodifiableList(tokenList);
if(chunkSpans != null){
//NOTE: the Chunk constructor reads this.tokens, so the tokens MUST be
// assigned before the chunks are created
List<Chunk> chunkList = new ArrayList<Chunk>(chunkSpans.length);
for(int i=0;i<chunkSpans.length;i++){
chunkList.add(new Chunk(chunkSpans[i], chunkProps[i]));
}
this.chunks = Collections.unmodifiableList(chunkList);
} else {
chunks = null;
}
}
/**
 * @return the unmodifiable list with the tokens of the analysed text
 */
public List<Token> getTokens(){
    return this.tokens;
}
/**
 * @return the unmodifiable list with the chunks of the analysed text or
 * <code>null</code> if no chunks are available
 */
public List<Chunk> getChunks(){
    return this.chunks;
}
/**
 * @return the analysed sentence
 */
public String getText(){
    return this.sentence;
}
/**
 * @return the language of the analysed text
 */
public String getLanguage(){
    return this.language;
}
/**
 * Getter for the offset of this sentence relative to the whole analysed
 * text. <code>0</code> if this is the start of the text or if this
 * analysed text represents the whole content.
 * @return the offset
 */
public int getOffset() {
    return this.offset;
}
/**
 * A token of the analysed sentence with its (optional) POS tags and
 * the probabilities of those tags.
 * @deprecated replaced by STANBOL-733 (Stanbol NLP processing module)
 */
public final class Token {
    //NOTE: Members are protected to allow the JVM direct access
    /** The char span of this token within the analysed sentence */
    protected final Span span;
    /** The text of the token */
    protected String token;
    /** The POS tags (best ranked first) or <code>null</code> if none */
    protected final String[] posTags;
    /** The probabilities of the {@link #posTags} or <code>null</code> if none */
    protected final double[] posProbabilities;
    /** <code>true</code> if the token contains at least one letter or digit */
    protected final boolean hasAlphaNumeric;

    /**
     * Creates a token with a single POS tag.
     */
    private Token(Span span,String token,String pos,double posProbability){
        this(span,token,new String[]{pos},new double[] {posProbability});
    }

    /**
     * @param span the char span of the token within the sentence
     * @param token the text of the token; if <code>null</code> the text
     * covered by the span is used
     * @param posTags the POS tags; <code>null</code> or empty if none
     * @param posProbabilities the probabilities of the POS tags
     * @throws IllegalStateException if POS tags are parsed but the
     * probabilities are missing or of a different size
     */
    private Token(Span span,String token,String[] posTags, double[] posProbabilities){
        this.span = span;
        if(posTags == null || posTags.length < 1){
            this.posTags = null;
            this.posProbabilities = null;
        } else if(posProbabilities == null || posTags.length != posProbabilities.length){
            // a missing probability array is reported the same way as a
            // size mismatch (was a NullPointerException before)
            throw new IllegalStateException("POS Tag array and POS probability array MUST BE of the same size!");
        } else {
            this.posTags = posTags;
            this.posProbabilities = posProbabilities;
        }
        // getText() supports a NULL token by reading the covered text of the
        // span; do the same here instead of failing with a NullPointerException
        String text = token != null ? token : span.getCoveredText(sentence).toString();
        this.token = text;
        boolean foundAlphaNumericChar = false;
        for(int i = 0; !foundAlphaNumericChar && i < text.length(); i++){
            foundAlphaNumericChar = Character.isLetterOrDigit(text.charAt(i));
        }
        hasAlphaNumeric = foundAlphaNumericChar;
    }

    /**
     * @return the start char offset of this token
     */
    public int getStart(){
        return span.getStart();
    }

    /**
     * @return the end char offset of this token
     */
    public int getEnd(){
        return span.getEnd();
    }

    /**
     * Getter for the best ranked POS tag for this token
     * @return the best ranked POS tag or <code>null</code> if none
     */
    public String getPosTag(){
        return posTags == null ? null : posTags[0];
    }

    /**
     * Getter for all the POS tags of this Token. The one with the
     * highest probability is at index 0.
     * @return All POS tags assigned to this Token or <code>null</code>
     * if none
     */
    public String[] getPosTags(){
        return posTags;
    }

    /**
     * Getter for the probability of the top ranked POS tag
     * @return the POS probability or <code>-1</code> if no POS tags are
     * available
     */
    public double getPosProbability() {
        return posProbabilities == null ? -1 : posProbabilities[0];
    }

    /**
     * Getter for the probabilities of all {@link #getPosTags() POS tags}
     * @return the probabilities of the POS tags returned by
     * {@link #getPosTags()} or <code>null</code> if none
     */
    public double[] getPosProbabilities(){
        return posProbabilities;
    }

    /**
     * Getter for the value of this token
     * @return the text of the token
     */
    public String getText(){
        if(token == null){
            token = span.getCoveredText(sentence).toString();
        }
        return token;
    }

    /**
     * @return <code>true</code> if the token contains at least one
     * letter or digit
     */
    public boolean hasAplhaNumericChar(){
        return hasAlphaNumeric;
    }

    @Override
    public String toString() {
        return getText()+(posTags != null?
                '_'+(posTags.length == 1 ?
                        posTags[0] :
                            Arrays.toString(posTags)):"");
    }
}
/**
 * A chunk (sequence of tokens) of the analysed sentence.
 * @deprecated replaced by STANBOL-733 (Stanbol NLP processing module)
 */
public final class Chunk {
    //NOTE: Members are protected to allow the JVM direct access
    /**
     * The span over the char offset of this chunk within the
     * {@link AnalysedText#sentence}
     */
    protected final Span span;
    /**
     * Span over the {@link AnalysedText#tokens} as used by the
     * {@link #getStart()} and {@link #getEnd()} methods. The constructor
     * reads the token at the end index, so the end is treated as the
     * index of the last token of the chunk (inclusive).
     */
    protected final Span chunkSpan;
    protected final double probability;
    /**
     * DO NOT DIRECTLY ACCESS - lazy initialisation in {@link #getText()}
     */
    private String __text;
    /**
     * DO NOT DIRECTLY ACCESS - lazy initialisation in {@link #getTokens()}
     */
    private List<Token> __chunkTokens;

    /**
     * @param chunkSpan the span over the tokens of the sentence; both the
     * start and the end index MUST refer to an existing token
     * @param probability the probability of the chunk
     */
    private Chunk(Span chunkSpan,double probability){
        this.chunkSpan = chunkSpan;
        // char span: from the start of the first to the end of the last token
        //NOTE(review): this assumes an inclusive end index; OpenNLP Spans
        // (e.g. those of ChunkerME) presumably use an exclusive end - confirm
        this.span = new Span(tokens.get(chunkSpan.getStart()).getStart(),
            tokens.get(chunkSpan.getEnd()).getEnd());
        this.probability = probability;
    }

    /**
     * Getter for the tokens of this chunk. This is a view over the tokens
     * of the sentence that includes the token at {@link #getEnd()}, so
     * that it covers the same tokens as {@link #getText()}.
     * @return the tokens of this chunk
     */
    public List<Token> getTokens(){
        if(__chunkTokens == null){
            // the end index is inclusive (see constructor) while the upper
            // bound of subList is exclusive
            __chunkTokens = tokens.subList(chunkSpan.getStart(), chunkSpan.getEnd()+1);
        }
        return __chunkTokens;
    }

    /**
     * @return the index of the first token of this chunk
     */
    public int getStart() {
        return chunkSpan.getStart();
    }

    /**
     * @return the end index of the token span of this chunk
     */
    public int getEnd(){
        return chunkSpan.getEnd();
    }

    /**
     * @return the length of the token span as returned by
     * {@link Span#length()}
     */
    //NOTE(review): with an inclusive end index this may differ from
    // getTokens().size() - confirm what callers expect
    public int getSize(){
        return chunkSpan.length();
    }

    /**
     * @return the probability
     */
    public double getProbability() {
        return probability;
    }

    /**
     * The text of this chunk
     * @return the text of the sentence covered by the char span of this
     * chunk
     */
    public String getText(){
        if(__text == null){
            __text = span.getCoveredText(sentence).toString();
        }
        return __text;
    }

    @Override
    public String toString() {
        return getText();
    }
}
}
}