package org.apache.lucene.analysis.compound;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList;
import java.util.Set;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
* Base class for decomposition token filters. <a name="version"></a>
* <p>
* You must specify the required {@link Version} compatibility when creating
* CompoundWordTokenFilterBase:
* <ul>
* <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
* supplementary characters in strings and char arrays provided as compound word
* dictionaries.
* </ul>
*/
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
  /**
   * The default for minimal word length that gets decomposed
   */
  public static final int DEFAULT_MIN_WORD_SIZE = 5;

  /**
   * The default for minimal length of subwords that get propagated to the output of this filter
   */
  public static final int DEFAULT_MIN_SUBWORD_SIZE = 2;

  /**
   * The default for maximal length of subwords that get propagated to the output of this filter
   */
  public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;

  /** Dictionary of subwords; entries are stored lower-cased. May be {@code null}. */
  protected final CharArraySet dictionary;
  /** FIFO of pending output tokens: the original token first, then its subwords. */
  protected final LinkedList<Token> tokens;
  protected final int minWordSize;
  protected final int minSubwordSize;
  protected final int maxSubwordSize;
  protected final boolean onlyLongestMatch;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);

  // Reusable Token used to snapshot the current attribute state for decompose().
  private final Token wrapper = new Token();

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], int, int, int, boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(Version.LUCENE_30, input, makeDictionary(dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
    this(Version.LUCENE_30, input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
    this(Version.LUCENE_30, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[])} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
    this(Version.LUCENE_30, input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary) {
    this(Version.LUCENE_30, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, int, int, int, boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(matchVersion, input, makeDictionary(dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
    this(matchVersion, input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
    this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
    this(matchVersion, input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
    this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   * Creates a new filter over {@code input}.
   *
   * @param matchVersion Lucene version to enable correct Unicode 4.0 behavior in the
   *        dictionaries if Version &gt; 3.0. See <a href="#version">above</a> for details.
   * @param input the {@link TokenStream} to decompose
   * @param dictionary the subword dictionary; either a {@link CharArraySet} (used as-is,
   *        may be {@code null}) or any other {@link Set} whose String elements are
   *        copied, lower-cased, into a new case-sensitive {@link CharArraySet}
   * @param minWordSize only words longer than this get decomposed
   * @param minSubwordSize only subwords at least this long are emitted
   * @param maxSubwordSize only subwords at most this long are emitted
   * @param onlyLongestMatch if true, only the longest matching subword is emitted
   */
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input);

    this.tokens = new LinkedList<Token>();
    this.minWordSize = minWordSize;
    this.minSubwordSize = minSubwordSize;
    this.maxSubwordSize = maxSubwordSize;
    this.onlyLongestMatch = onlyLongestMatch;

    if (dictionary == null || dictionary instanceof CharArraySet) {
      this.dictionary = (CharArraySet) dictionary;
    } else {
      this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
      // Elements are expected to be Strings; a wrong element type fails with a
      // ClassCastException during iteration, exactly as the former raw-typed call did.
      @SuppressWarnings("unchecked")
      Collection<String> dictionaryEntries = (Collection<String>) dictionary;
      addAllLowerCase(this.dictionary, dictionaryEntries);
    }
  }

  /**
   * Create a set of words from an array
   * The resulting Set does case insensitive matching
   * TODO We should look for a faster dictionary lookup approach.
   * @param dictionary
   * @return {@link Set} of lowercased terms
   */
  public static final Set<?> makeDictionary(final String[] dictionary) {
    return makeDictionary(Version.LUCENE_30, dictionary);
  }

  /**
   * Create a set of words from an array; see {@link #makeDictionary(String[])}.
   * Returns {@code null} for a {@code null} input.
   */
  public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
    if (dictionary == null) {
      return null;
    }
    // The CharArraySet itself is case-sensitive (ignoreCase=false); insensitivity is
    // approximated by lower-casing the entries here (and lookup keys in subclasses,
    // e.g. via makeLowerCaseCopy) — TODO confirm all lookup paths lower-case too.
    CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
    addAllLowerCase(dict, Arrays.asList(dictionary));
    return dict;
  }

  /** Copies {@code token}'s term text and all attributes into this stream's attributes. */
  private final void setToken(final Token token) throws IOException {
    clearAttributes();
    termAtt.copyBuffer(token.buffer(), 0, token.length());
    flagsAtt.setFlags(token.getFlags());
    typeAtt.setType(token.type());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    posIncAtt.setPositionIncrement(token.getPositionIncrement());
    payloadAtt.setPayload(token.getPayload());
  }

  @Override
  public final boolean incrementToken() throws IOException {
    // Loop until a pending token can be emitted. Looping (rather than returning false
    // when decompose() produced nothing) fixes premature stream truncation for
    // subclasses whose decompose() override may enqueue no token; the base-class
    // decompose() always enqueues the original token, so normal behavior is unchanged.
    while (tokens.isEmpty()) {
      if (!input.incrementToken()) {
        return false;
      }
      // Snapshot the current attribute state into a Token for decompose().
      wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      wrapper.setStartOffset(offsetAtt.startOffset());
      wrapper.setEndOffset(offsetAtt.endOffset());
      wrapper.setFlags(flagsAtt.getFlags());
      wrapper.setType(typeAtt.type());
      wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
      wrapper.setPayload(payloadAtt.getPayload());
      decompose(wrapper);
    }
    setToken(tokens.removeFirst());
    return true;
  }

  /**
   * Lower-cases each string in {@code col} and adds it to {@code target}.
   * NOTE(review): uses the default-Locale {@link String#toLowerCase()}, so results can
   * differ by JVM locale (e.g. Turkish dotless i) — preserved for backward compatibility.
   */
  protected static final void addAllLowerCase(Set<Object> target, Collection<String> col) {
    for (String string : col) {
      target.add(string.toLowerCase());
    }
  }

  /**
   * Returns a new array containing the lower-cased characters of {@code buffer}.
   * NOTE(review): {@link Character#toLowerCase(char)} does not handle Unicode
   * supplementary (surrogate-pair) characters; callers needing full Unicode 4.0
   * handling must lower-case by code point instead.
   */
  protected static char[] makeLowerCaseCopy(final char[] buffer) {
    // No need to pre-copy the buffer: every slot of result is assigned below.
    char[] result = new char[buffer.length];
    for (int i = 0; i < buffer.length; ++i) {
      result[i] = Character.toLowerCase(buffer[i]);
    }
    return result;
  }

  /**
   * Creates a subword token covering {@code length} chars of {@code prototype}'s term
   * starting at {@code offset}, with offsets shifted accordingly and a position
   * increment of 0 (the subword stacks on the original token's position).
   */
  protected final Token createToken(final int offset, final int length,
      final Token prototype) {
    int newStart = prototype.startOffset() + offset;
    Token t = prototype.clone(prototype.buffer(), offset, length, newStart, newStart + length);
    t.setPositionIncrement(0);
    return t;
  }

  /**
   * Enqueues the original token and, if it is at least {@link #minWordSize} chars
   * long, lets the subclass enqueue its subwords via {@link #decomposeInternal(Token)}.
   */
  protected void decompose(final Token token) {
    // In any case we give the original token back
    tokens.add((Token) token.clone());

    // Only words longer than minWordSize get processed
    if (token.length() < this.minWordSize) {
      return;
    }

    decomposeInternal(token);
  }

  /** Decomposes {@code token} and enqueues the resulting subword tokens onto {@link #tokens}. */
  protected abstract void decomposeInternal(final Token token);

  @Override
  public void reset() throws IOException {
    super.reset();
    // Drop any subwords still pending from the previous stream.
    tokens.clear();
  }
}