// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================

package com.twitter.common.text.combiner;

import java.util.Map;
import java.util.Queue;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import com.twitter.common.text.token.TokenProcessor;
import com.twitter.common.text.token.TwitterTokenStream;
import com.twitter.common.text.token.attribute.TokenType;

/**
 * Combines multiple tokens into a single one if they define an entity identified
 * by an extractor TwitterTokenStream.
 *
 * <p>On {@link #reset(CharSequence)} the extractor is run over the whole input and
 * every entity span it reports is recorded as a (startOffset -&gt; endOffset) pair.
 * During token iteration, whenever the current token's start offset matches a
 * recorded entity start, this processor tries to consume consecutive tokens until
 * their combined span exactly covers the entity; on success it emits a single
 * merged token. If token boundaries do not line up with the entity span, the
 * buffered tokens are replayed unchanged, one per {@code incrementToken()} call.
 *
 * <p>NOTE(review): this class is not thread-safe — it holds per-stream mutable
 * state ({@code offsetMap}, {@code nextStates}); presumably one instance is used
 * per tokenization pipeline, as is conventional for token streams.
 */
public class ExtractorBasedTokenCombiner extends TokenProcessor {
  // Stream that detects entity spans over the raw input. Must be supplied by a
  // subclass via setExtractor() before reset() is called (checked there).
  private TwitterTokenStream extractor = null;
  // Type assigned to a combined (or exactly-matching) token. If null, the token's
  // original type attribute is left untouched.
  private TokenType type = null;
  // Token states buffered while probing a candidate entity span. When the probe
  // fails (or only partially consumes the queue), these are replayed one per
  // incrementToken() call so no input token is ever lost.
  private Queue<State> nextStates = Lists.newLinkedList();
  // this map stores the start offsets and end offsets
  // of the tokens detected by extractor.
  // NOTE(review): if the extractor reports two spans with the same start offset,
  // the later one overwrites the earlier in this map — confirm that is intended.
  private Map<Integer, Integer> offsetMap = Maps.newHashMap();

  public ExtractorBasedTokenCombiner(TwitterTokenStream inputStream) {
    super(inputStream);
  }

  /**
   * Sets the stream used to detect entity spans. Must be called (typically from a
   * subclass constructor) before {@link #reset(CharSequence)}.
   */
  protected void setExtractor(TwitterTokenStream extractor) {
    this.extractor = extractor;
  }

  /**
   * Sets the token type assigned to combined tokens; {@code null} leaves the
   * original type unchanged.
   */
  protected void setType(TokenType type) {
    this.type = type;
  }

  @Override
  public void reset(CharSequence input) {
    super.reset(input);
    Preconditions.checkNotNull(extractor);
    // Pre-compute all entity spans for this input: start offset -> end offset.
    offsetMap.clear();
    extractor.reset(input);
    while (extractor.incrementToken()) {
      offsetMap.put(extractor.offset(), extractor.offset() + extractor.length());
    }
  }

  @Override
  public final boolean incrementToken() {
    // First drain any tokens buffered by a previously failed combination attempt,
    // replaying exactly one buffered token per call.
    if (!nextStates.isEmpty()) {
      restoreState(nextStates.poll());
      return true;
    }
    if (!incrementInputStream()) {
      return false;
    }
    if (offsetMap.containsKey(offset())) {
      int startOffset = offset();
      int endOffset = offsetMap.get(startOffset);
      // if the current token matches the given pattern,
      // simply update its TypeAttribute.
      if (endOffset == startOffset + length()) {
        if (type != null) {
          updateType(type);
        }
        return true;
      }
      // store the attributes of the current token
      nextStates.add(captureState());
      while (incrementInputStream()) {
        // store the next token's status
        nextStates.add(captureState());
        int currentEndOffset = offset() + length();
        if (currentEndOffset == endOffset) {
          //found it!
          // restore attributes of the first token.
          restoreState(nextStates.poll());
          // Stretch the first token to cover the whole entity span; the buffered
          // intermediate tokens are now subsumed by it, so discard them.
          updateOffsetAndLength(startOffset, endOffset - startOffset);
          if (type != null) {
            updateType(type);
          }
          nextStates.clear();
          break;
        } else if (currentEndOffset > endOffset) {
          // cannot find it and currentEndOffset.
          // grows beyond expected. (tokenization mismatch??)
          // Leave nextStates populated so the buffered tokens are replayed
          // unchanged on subsequent calls.
          break;
        }
      }
      // Combination failed (boundary mismatch or input exhausted): emit the first
      // buffered token now; the rest drain via the queue check at the top.
      if (!nextStates.isEmpty()) {
        restoreState(nextStates.poll());
      }
    }
    return true;
  }
}