/* LanguageTool, a natural language style checker
* Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.index;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.AttributeFactory;
import java.io.IOException;
/**
 * A {@link Tokenizer} that accepts every character as part of a token, so the
 * whole input is rendered as one token.
 *
 * @author Tao Lin
 */
public final class AnyCharTokenizer extends Tokenizer {

  /** Token length limit; set to the largest int so the length check below effectively never triggers. */
  private static final int MAX_WORD_LEN = Integer.MAX_VALUE;

  /** Intermediate buffer that is refilled from the input reader in chunks of up to 4096 chars. */
  private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(4096);
  private final CharacterUtils charUtils = CharacterUtils.getInstance();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  /** Read position inside {@link #ioBuffer}. */
  private int bufferIndex;
  /** Number of chars delivered by the most recent fill of {@link #ioBuffer}. */
  private int dataLen;
  /** Number of input chars that were consumed before the current content of {@link #ioBuffer}. */
  private int offset;
  /** Corrected offset reported by {@link #end()}. */
  private int finalOffset;

  /**
   * Creates a new AnyCharTokenizer.
   */
  public AnyCharTokenizer() {
    super();
  }

  /**
   * Creates a new AnyCharTokenizer that uses the given
   * {@link org.apache.lucene.util.AttributeFactory}.
   *
   * @param factory the attribute factory to use for this {@link org.apache.lucene.analysis.Tokenizer}
   */
  public AnyCharTokenizer(AttributeFactory factory) {
    super(factory);
  }

  /**
   * Decides whether a code point belongs to a token; this tokenizer accepts all of them.
   *
   * @param c the code point to test
   * @return always {@code true}
   */
  protected boolean isTokenChar(int c) {
    return true;
  }

  /**
   * Normalization hook applied to each code point before it is stored in the term;
   * this implementation is the identity.
   *
   * @param c the code point to normalize
   * @return {@code c}, unchanged
   */
  protected int normalize(int c) {
    return c;
  }

  /**
   * Reads code points from the input into the term attribute until the input is
   * exhausted, a non-token code point follows a started token, or the length
   * limit is reached.
   *
   * @return {@code true} if a token was produced, {@code false} if the input was
   *         exhausted before any token character was collected
   * @throws IOException if filling the buffer from the input fails
   */
  @Override
  public boolean incrementToken() throws IOException {
    clearAttributes();
    char[] termBuffer = termAtt.buffer();
    int termLength = 0;
    int tokenStart = -1;
    int tokenEnd = -1;

    for (;;) {
      if (bufferIndex >= dataLen) {
        // The current chunk is used up: account for it and fetch the next one.
        offset += dataLen;
        charUtils.fill(ioBuffer, input);
        final int filled = ioBuffer.getLength();
        if (filled == 0) {
          // Zeroed so that a later "offset += dataLen" does not count this chunk twice.
          dataLen = 0;
          if (termLength > 0) {
            break;
          }
          finalOffset = correctOffset(offset);
          return false;
        }
        dataLen = filled;
        bufferIndex = 0;
      }

      final int codePoint =
          charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
      final int width = Character.charCount(codePoint);
      bufferIndex += width;

      if (!isTokenChar(codePoint)) {
        if (termLength > 0) {
          // A non-token code point terminates the token collected so far.
          break;
        }
        continue;
      }

      if (termLength == 0) {
        // First code point of the token: remember where it begins.
        assert tokenStart == -1;
        tokenStart = offset + bufferIndex - width;
        tokenEnd = tokenStart;
      } else if (termLength >= termBuffer.length - 1) {
        // Grow the term buffer so that a two-char code point still fits.
        termBuffer = termAtt.resizeBuffer(2 + termLength);
      }
      tokenEnd += width;
      termLength += Character.toChars(normalize(codePoint), termBuffer, termLength);
      if (termLength >= MAX_WORD_LEN) {
        break;
      }
    }

    termAtt.setLength(termLength);
    assert tokenStart != -1;
    final int correctedStart = correctOffset(tokenStart);
    finalOffset = correctOffset(tokenEnd);
    offsetAtt.setOffset(correctedStart, finalOffset);
    return true;
  }

  /**
   * Sets both offsets of the offset attribute to the final offset recorded by
   * {@link #incrementToken()}.
   *
   * @throws IOException if the superclass implementation fails
   */
  @Override
  public void end() throws IOException {
    super.end();
    final int last = finalOffset;
    offsetAtt.setOffset(last, last);
  }

  /**
   * Clears all read positions and the intermediate buffer so that tokenizing
   * starts from the beginning again.
   *
   * @throws IOException if the superclass implementation fails
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;
    finalOffset = 0;
    ioBuffer.reset();
  }
}