// =================================================================================================
// Copyright 2011 Twitter, Inc.
// -------------------------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this work except in compliance with the License.
// You may obtain a copy of the License in the LICENSE file, or at:
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =================================================================================================
package com.twitter.common.text.combiner;
import java.util.Map;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.apache.lucene.util.AttributeSource;
import com.twitter.common.text.token.TokenProcessor;
import com.twitter.common.text.token.TokenStream;
import com.twitter.common.text.token.attribute.CharSequenceTermAttribute;
import com.twitter.common.text.token.attribute.TokenType;
import com.twitter.common.text.token.attribute.TokenTypeAttribute;
/**
* Combines multiple tokens into a single one if they define an entity identified
* by an extractor TokenStream.
*/
public class ExtractorBasedTokenCombiner extends TokenProcessor {
  // Term attribute of THIS stream (written via restoreState, mutated when combining).
  private final CharSequenceTermAttribute termAttr;
  // Term attribute of the underlying input stream (read during look-ahead).
  private final CharSequenceTermAttribute inputTermAttr;
  private final TokenTypeAttribute typeAttr;
  private TokenStream extractor = null;
  private CharSequenceTermAttribute extractorTermAttr;
  // Optional token type stamped onto combined tokens; null means "leave type unchanged".
  private TokenType type = null;
  // Look-ahead state of the input stream, captured during a combination attempt.
  // When non-null, the next incrementToken() call replays it instead of advancing
  // the input, so the buffered token is not lost.
  private AttributeSource.State state;
  // Maps start offset -> end offset of each span detected by the extractor.
  // Populated in reset(); values are always non-null.
  // NOTE(review): if the extractor emits two spans with the same start offset,
  // the later one silently overwrites the earlier — confirm this is intended.
  private final Map<Integer, Integer> offsetMap = Maps.newHashMap();

  /**
   * Constructs a combiner reading from {@code inputStream}.
   *
   * @param inputStream stream of tokens to combine; must expose a
   *        {@link CharSequenceTermAttribute}
   */
  public ExtractorBasedTokenCombiner(TokenStream inputStream) {
    super(inputStream);
    Preconditions.checkArgument(hasAttribute(CharSequenceTermAttribute.class));
    termAttr = getAttribute(CharSequenceTermAttribute.class);
    typeAttr = addAttribute(TokenTypeAttribute.class);
    inputTermAttr = inputStream.getAttribute(CharSequenceTermAttribute.class);
  }

  /**
   * Sets the stream used to detect entity spans. Must be called (typically by a
   * subclass constructor) before {@link #reset(CharSequence)}.
   */
  protected void setExtractor(TokenStream extractor) {
    this.extractor = extractor;
    extractorTermAttr = extractor.getAttribute(CharSequenceTermAttribute.class);
  }

  /** Sets the {@link TokenType} assigned to every combined token (null = keep original). */
  protected void setType(TokenType type) {
    this.type = type;
  }

  @Override
  public void reset(CharSequence input) {
    super.reset(input);
    Preconditions.checkNotNull(extractor);
    offsetMap.clear();
    // Run the extractor over the whole input up front, recording each detected
    // span as startOffset -> endOffset for constant-time lookup per token.
    extractor.reset(input);
    while (extractor.incrementToken()) {
      offsetMap.put(extractorTermAttr.getOffset(),
          extractorTermAttr.getOffset() + extractorTermAttr.getLength());
    }
  }

  @Override
  public boolean incrementToken() {
    clearAttributes();
    if (state != null) {
      // Replay the token buffered during a previous (failed) combination attempt.
      restoreState(state);
      state = null;
    } else {
      if (!getInputStream().incrementToken()) {
        return false;
      }
      restoreState(getInputStream().captureState());
    }

    int startOffset = termAttr.getOffset();
    // Single lookup replaces the original containsKey()/get() pair on the same
    // key; all values put in reset() are non-null, so null here simply means
    // "no extracted span starts at this offset".
    Integer expectedEndOffset = offsetMap.get(startOffset);
    if (expectedEndOffset != null) {
      int endOffset = expectedEndOffset;
      // if the current token matches the given pattern,
      // simply update its TypeAttribute.
      if (endOffset == inputTermAttr.getOffset() + inputTermAttr.getLength()) {
        if (type != null) {
          typeAttr.setType(type);
        }
        return true;
      }
      // Look ahead until the input reaches the expected end offset, then stretch
      // the current token to cover the whole span.
      while (getInputStream().incrementToken()) {
        state = getInputStream().captureState();
        int currentEndOffset = inputTermAttr.getOffset() + inputTermAttr.getLength();
        if (currentEndOffset == endOffset) {
          //found it!
          termAttr.setLength(endOffset - startOffset);
          if (type != null) {
            typeAttr.setType(type);
          }
          state = null;
          break;
        } else if (currentEndOffset > endOffset) {
          // cannot find it and currentEndOffset
          // grows beyond expected (tokenization mismatch??)
          // NOTE(review): only the overshooting token is buffered in 'state';
          // any tokens consumed between startOffset and it are dropped —
          // confirm this is acceptable for mismatched tokenizations.
          break;
        }
      }
    }
    return true;
  }
}