/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * */ package org.apache.stanbol.enhancer.engines.keywordextraction.impl; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText; import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Chunk; import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token; public class ProcessingState { private final Iterator<AnalysedText> sentences; /** * The sentence currently processed */ private AnalysedText sentence; /** * The index of the current token needed to be linked */ private int tokenIndex = -1; /** * The current token */ private Token token; /** * The iterator over the chunks of the current {@link #sentence} * or <code>null</code> if no {@link Chunk}s are available. */ private Iterator<Chunk> chunks; /** * The current {@link Chunk} */ private Chunk chunk; private static final int MAX_TEXT_CACHE_SIZE = 32; /** * This is a cache over the last {@link #MAX_TEXT_CACHE_SIZE} token texts * requested by {@link #getTokenText(int, int)} */ private Map<String,String> textCache = new LinkedHashMap<String,String>( MAX_TEXT_CACHE_SIZE, 0.75f, true){ private static final long serialVersionUID = 1L; protected boolean removeEldestEntry(Map.Entry<String,String> eldest) { return size() > MAX_TEXT_CACHE_SIZE; }; }; /** * The position for the next token */ private int nextToken = -1; /** * The position of the last consumed position */ private int consumedIndex = -1; public ProcessingState(Iterator<AnalysedText> sentences){ this.sentences = sentences; if(!sentences.hasNext()){ throw new IllegalArgumentException("The parsed AnalysedContent MUST NOT have an empty AnalysedText iterator!"); } } /** * Getter for the current Sentence * @return the sentence */ public final AnalysedText getSentence() { return sentence; } /** * Getter for the index of the current active token within the current * active {@link #getSentence() sentence} * @return the tokenPos the index of the token */ public final int getTokenIndex() { return tokenIndex; } /** * Getter for the last consumed index * @return the index of the last consumed token */ public final int getConsumedIndex() { return consumedIndex; } /** * The currently active token * @return the token */ public final Token getToken() { return token; } /** * Getter for the language of the current Token (based on the current * sentence) * @return the language */ public final String getLanguage() { return sentence.getLanguage(); } /** * The currently active chunk or <code>null</code> if no chunks are * available. If chunks are present this can not be <code>null</code> * because {@link Token}s outside of chunks are skiped. * @return the chunk the current {@link Chunk} or <code>null</code> if * no chunks are present. */ public final Chunk getChunk() { return chunk; } /** * Getter for the next {@link Token} to be processed. Calling {@link #next()} * is guaranteed to skip all tokens in between {@link #getTokenIndex()} * and {@link #getNextToken()}, but it might even skip more tokens (e.g. * in case that the token referenced by {@link #getNextToken()} is not * within a {@link Chunk} * @return the nextToken */ public final int getNextToken() { return nextToken; } // /** // * Allows to manually set to position of the next token to process. // * This can be used to skip some tokens within (e.g. if a Concept // * matching multiple Tokens where found.<p> // * The set token may be greater than the number of tokens in // * {@link #sentence}. This will simple cause the next sentence to be // * activated on the next call to {@link #next()} // * @param pos the position of the next token to process. // */ // public void setNextToken(int pos){ // if(pos > tokenIndex){ // this.nextToken = pos; // } else { // throw new IllegalArgumentException("The nextTokenPos "+pos+ // " MUST BE greater than the current "+tokenIndex); // } // } /** * The index of an consumed Token. The consumed index MUST BE equals or * greater as {@link #getTokenIndex()}. If the consumed index is set to a * value greater that {@link #getTokenIndex()} than consumed tokens are * skipped on the next call to {@link #next()} * @param pos the position of the last consumed token. */ public void setConsumed(int pos){ if(pos >= tokenIndex){ this.consumedIndex = pos; this.nextToken = pos+1; } else { throw new IllegalArgumentException("The lastConsumedPos "+pos+ " MUST BE equals or gerater than the current Pos "+tokenIndex); } } /** * Moves the state to #nextToken this may switch to the next Chunk or * sentence. * @return <code>true</code> if there are further elements to process or * <code>false</code> if there are no further elements to process. */ public boolean next() { //switch to the next token if(nextToken > tokenIndex){ tokenIndex = nextToken; } else { tokenIndex++; nextToken = tokenIndex; } //now init the next element final boolean hasNext; if(chunk != null){ //if chunks are present //get next chunk (may be the current if chunk.getEnd() > tokenPos for(;tokenIndex > chunk.getEnd() && chunks.hasNext();chunk = chunks.next()); if(tokenIndex <= chunk.getEnd()){ //found valid chunk if(chunk.getStart() > tokenIndex) { //skip tokens outside chunks tokenIndex = chunk.getStart(); } if(chunk.getStart() > consumedIndex){ consumedIndex = chunk.getStart()-1; } hasNext = true; } else { //no more valid chunks in this sentence hasNext = initNextSentence(); } } else { //no chunks ... use tokens only if(sentence == null){ //first sentence hasNext = initNextSentence(); } else if(tokenIndex >= sentence.getTokens().size()){ hasNext = initNextSentence(); } else { //more tokens in the sentence //set the token hasNext = true; } } if(hasNext){ //set the Token token = sentence.getTokens().get(tokenIndex); } return hasNext; } /** * Correctly initialise {@link #sentence}, {@link #chunks}, {@link #chunk} * and {@link #tokenIndex} for the next element of {@link #sentences}. If * no further sentences are to process it simple sets {@link #sentence}, * {@link #chunks}, {@link #chunk} and {@link #tokenIndex} to <code>null</code> */ private boolean initNextSentence() { textCache.clear(); sentence = null; while(sentence == null && sentences.hasNext()){ sentence = sentences.next(); if(sentence.getChunks() != null){ chunks = sentence.getChunks().iterator(); if(chunks.hasNext()){ chunk = chunks.next(); tokenIndex = chunk.getStart(); consumedIndex = tokenIndex-1; nextToken = tokenIndex; } else { //no chunks in this sentence sentence = null; //skip this sentence } } else { if(sentence.getTokens().isEmpty()){ //no tokens in this sentence sentence = null; //skip this one } else { chunks = null; chunk = null; tokenIndex = 0; consumedIndex = -1; nextToken = 0; } } } return sentence != null; } /** * Getter for the text covered by the next tokenCount tokens relative to * {@link #token}. It uses the {@link #textCache} to lookup/store such texts. * Given the Tokens * <pre> * [This, is, an, Example] * </pre> * and the parameter <code>3</code> this method will return * <pre> * This is an * </pre> * @param tokenCount the number of tokens to be included relative to * {@link #tokenIndex} * @return the text covered by the span start of {@link #token} to end of * token at <code>{@link #tokenIndex}+tokenCount</code>. */ public String getTokenText(int start, int tokenCount){ String pos = start+","+tokenCount; String text = textCache.get(pos); if(text == null){ text = sentence.getText().substring( sentence.getTokens().get(start).getStart(), sentence.getTokens().get(start+tokenCount-1).getEnd()); textCache.put(pos, text); } return text; } @Override public String toString() { return "["+tokenIndex+","+token+"] chunk: " + (chunk == null?null:chunk.getText())+"| sentence: "+ (sentence == null?null:sentence.getText()); } }