package com.github.btpka3.lucene.analysis;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import java.io.IOException;
import java.io.Reader;
public class PinyinTokenizer extends Tokenizer {
private static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done = false;
private int finalOffset;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
private String padding_char;
private String first_letter;
public PinyinTokenizer(Reader reader, String padding_char, String first_letter) {
this(reader, DEFAULT_BUFFER_SIZE);
this.padding_char = padding_char;
this.first_letter = first_letter;
}
public PinyinTokenizer(Reader input, int bufferSize) {
super(input);
termAtt.resizeBuffer(bufferSize);
format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
format.setVCharType(HanyuPinyinVCharType.WITH_V);
}
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
if (!done) {
done = true;
int upto = 0;
char[] buffer = termAtt.buffer();
while (true) {
final int length = input.read(buffer, upto, buffer.length - upto);
if (length == -1) break;
upto += length;
if (upto == buffer.length)
buffer = termAtt.resizeBuffer(1 + buffer.length);
}
termAtt.setLength(upto);
String str = termAtt.toString();
termAtt.setEmpty();
StringBuilder stringBuilder = new StringBuilder();
StringBuilder firstLetters = new StringBuilder();
for (int i = 0; i < str.length(); i++) {
char c = str.charAt(i);
if (c < 128) {
stringBuilder.append(c);
} else {
try {
String[] strs = PinyinHelper.toHanyuPinyinStringArray(c, format);
if (strs != null) {
//get first result by default
String first_value = strs[0];
//TODO more than one pinyin
stringBuilder.append(first_value);
if (this.padding_char.length() > 0) {
stringBuilder.append(this.padding_char);
}
firstLetters.append(first_value.charAt(0));
}
} catch (BadHanyuPinyinOutputFormatCombination badHanyuPinyinOutputFormatCombination) {
badHanyuPinyinOutputFormatCombination.printStackTrace();
}
}
}
//let's join them
if (first_letter.equals("prefix")) {
termAtt.append(firstLetters.toString());
if (this.padding_char.length() > 0) {
termAtt.append(this.padding_char); //TODO splitter
}
termAtt.append(stringBuilder.toString());
} else if (first_letter.equals("append")) {
termAtt.append(stringBuilder.toString());
if (this.padding_char.length() > 0) {
if (!stringBuilder.toString().endsWith(this.padding_char)) {
termAtt.append(this.padding_char);
}
}
termAtt.append(firstLetters.toString());
} else if (first_letter.equals("none")) {
termAtt.append(stringBuilder.toString());
} else if (first_letter.equals("only")) {
termAtt.append(firstLetters.toString());
}
finalOffset = correctOffset(upto);
offsetAtt.setOffset(correctOffset(0), finalOffset);
return true;
}
return false;
}
@Override
public final void end() {
// set final offset
offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset() throws IOException {
super.reset();
this.done = false;
}
}