/*
* Copyright (c) 2009 Andrejs Jermakovics.
*
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Andrejs Jermakovics - initial implementation
*/
package it.unibz.instasearch.indexing.tokenizers;
import java.io.IOException;
import java.util.LinkedList;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
 * Token filter that splits each term of the wrapped stream into parts using
 * {@link #splitTerm(String)}.
 * <p>
 * When a term splits into more than one part, the parts are emitted one per
 * {@link #incrementToken()} call. If {@link #returnOriginalTerm()} is true the
 * original term is emitted first and every part is stacked on it with a
 * position increment of 0; otherwise the first part takes the place (and the
 * position increment) of the original term and the remaining parts are
 * stacked on it. Terms that do not split are passed through unchanged.
 */
public abstract class TermSplitTokenizer extends TokenFilter {

    /** Split parts of the current term that are still waiting to be emitted. */
    private final LinkedList<Token> tokens = new LinkedList<Token>();

    private final TermAttribute termAtt;
    private final OffsetAttribute offsetAtt;
    private final PositionIncrementAttribute posAtt;

    /**
     * @param in stream whose terms should be split; expected to provide term,
     *           offset and position increment attributes
     */
    public TermSplitTokenizer(TokenStream in) {
        super(in);

        assert( in.hasAttribute(TermAttribute.class) );
        assert( in.hasAttribute(OffsetAttribute.class) );
        assert( in.hasAttribute(PositionIncrementAttribute.class) );

        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
        posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Advances to the next token: a queued split part if there is one,
     * otherwise the next term of the wrapped stream (or its first part).
     *
     * @return false when the wrapped stream has no more tokens
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException
    {
        if( !tokens.isEmpty() )
        {
            applyToken(tokens.removeFirst());
            return true;
        }

        if( !input.incrementToken() )
        {
            return false; // does not have any more tokens
        }

        // Remember the increment of the original term before it is overwritten
        int originalIncrement = posAtt.getPositionIncrement();

        splitIntoTokens();

        if( !tokens.isEmpty() && !returnOriginalTerm() )
        {
            // The first part replaces the original term, so it must keep the
            // original's position increment; with an increment of 0 it would
            // be stacked on the previous token of the stream.
            Token firstPart = tokens.removeFirst();
            firstPart.setPositionIncrement(originalIncrement);
            applyToken(firstPart);
        }

        return true;
    }

    /**
     * Resets the wrapped stream and drops parts still queued from the term
     * that was being emitted, so they do not leak into the next use.
     *
     * @throws IOException if resetting the wrapped stream fails
     */
    @Override
    public void reset() throws IOException
    {
        super.reset();
        tokens.clear();
    }

    /**
     * Splits the current term and queues one token per part. Nothing is
     * queued when the term does not split into more than one part.
     */
    private void splitIntoTokens()
    {
        String term = termAtt.term();
        String[] termParts = splitTerm(term);

        if( termParts == null || termParts.length <= 1 )
            return;

        int termStart = offsetAtt.startOffset();
        int termEnd = offsetAtt.endOffset();
        int searchFrom = 0; // end of the previously located part within the term

        for (String termPart : termParts)
        {
            if( termPart == null )
                continue;

            // Search forward from the previous part so that a repeated part
            // (e.g. "a_b_a") gets the offsets of its own occurrence
            int index = term.indexOf(termPart, searchFrom);

            if( index >= 0 )
                searchFrom = index + termPart.length();
            else
                index = term.indexOf(termPart); // parts may be out of order or overlap

            int partStart;
            int partEnd;

            if( index >= 0 )
            {
                partStart = termStart + index;
                partEnd = partStart + termPart.length();
            }
            else
            {
                // Part text does not occur in the term (splitTerm altered it):
                // fall back to the offsets of the whole term
                partStart = termStart;
                partEnd = termEnd;
            }

            Token newToken = new Token(termPart, partStart, partEnd);
            newToken.setPositionIncrement(0); // in the same position

            tokens.add( newToken );
        }
    }

    /**
     * Copies term text, position increment and offsets of the given token
     * into this stream's attributes.
     */
    private void applyToken(Token token)
    {
        termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
        posAtt.setPositionIncrement(token.getPositionIncrement());
        offsetAtt.setOffset(token.startOffset(), token.endOffset());
    }

    /**
     * Return original term together with the parts
     *
     * @return true to emit the original term before its parts, false (the
     *         default) to emit only the parts
     */
    protected boolean returnOriginalTerm()
    {
        return false;
    }

    /**
     * Split term into an array of terms
     *
     * @param term term to split
     * @return split term; a result with fewer than two elements means the
     *         term is not split and is passed through unchanged
     */
    public abstract String[] splitTerm(String term);
}