package com.github.btpka3.lucene.analysis;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import java.io.IOException;
/**
 * Token filter that rewrites every incoming term as lower-case, tone-less Hanyu Pinyin.
 *
 * <p>Each term is processed character by character:
 * <ul>
 *   <li>characters below code point 128 are copied unchanged into the pinyin string and do
 *       not contribute to the first-letter string;</li>
 *   <li>other characters are replaced by the first reading pinyin4j returns, followed by the
 *       padding string (when non-empty); the first letter of that reading is collected
 *       separately;</li>
 *   <li>other characters for which pinyin4j returns no reading are dropped.</li>
 * </ul>
 *
 * <p>The final term is assembled from the pinyin string and the first-letter string according
 * to the {@code first_letter} mode given to the constructor:
 * <ul>
 *   <li>{@code "prefix"} - first letters, padding, then pinyin;</li>
 *   <li>{@code "append"} - pinyin, padding (unless the pinyin already ends with it), then
 *       first letters;</li>
 *   <li>{@code "none"} - pinyin only;</li>
 *   <li>{@code "only"} - first letters only;</li>
 *   <li>any other value - an empty term.</li>
 * </ul>
 */
public class PinyinTokenFilter extends TokenFilter {

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    // Not read by this filter; the addAttribute call is kept so the attribute is still
    // registered on the stream exactly as before.
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
    private final String paddingChar;
    private final String firstLetter;

    /**
     * Creates a filter over {@code in}.
     *
     * @param in           the upstream token stream
     * @param padding_char string inserted after each converted character; {@code null} is
     *                     treated as the empty string (no padding)
     * @param first_letter how first letters are combined with the full pinyin: one of
     *                     {@code "prefix"}, {@code "append"}, {@code "none"}, {@code "only"};
     *                     {@code null} is treated as {@code "none"}
     */
    public PinyinTokenFilter(TokenStream in, String padding_char, String first_letter) {
        super(in);
        // Normalise nulls here so incrementToken() cannot fail with a NullPointerException
        // long after the faulty construction site.
        this.paddingChar = padding_char == null ? "" : padding_char;
        this.firstLetter = first_letter == null ? "none" : first_letter;
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);
    }

    /**
     * Advances the wrapped stream by one token and replaces its term text with the pinyin
     * form described in the class documentation.
     *
     * @return {@code false} when the wrapped stream is exhausted, {@code true} otherwise
     * @throws IOException           if the wrapped stream fails
     * @throws IllegalStateException if pinyin4j rejects the configured output format
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        final char[] buffer = termAtt.buffer();
        final int length = termAtt.length();
        final boolean hasPadding = paddingChar.length() > 0;
        final StringBuilder pinyin = new StringBuilder();
        final StringBuilder initials = new StringBuilder();

        for (int i = 0; i < length; i++) {
            final char c = buffer[i];
            if (c < 128) {
                // ASCII passes through untouched and is not part of the first letters.
                pinyin.append(c);
                continue;
            }
            final String[] readings = toPinyin(c);
            if (readings == null || readings.length == 0) {
                // No pinyin known for this character: it is dropped from the output.
                continue;
            }
            // TODO more than one pinyin: only the first reading is used.
            final String reading = readings[0];
            pinyin.append(reading);
            if (hasPadding) {
                pinyin.append(paddingChar);
            }
            initials.append(reading.charAt(0));
        }

        // The source buffer has been fully consumed above, so it is safe to overwrite the
        // term now; append() grows the buffer and sets the length itself.
        termAtt.setEmpty().append(combine(pinyin, initials));
        return true;
    }

    /** Looks up the pinyin readings of {@code c}; returns {@code null} when there are none. */
    private String[] toPinyin(char c) {
        try {
            return PinyinHelper.toHanyuPinyinStringArray(c, format);
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            // The format is fixed in the constructor, so this signals a programming error
            // rather than bad input; fail loudly instead of silently dropping characters.
            throw new IllegalStateException("Invalid pinyin output format combination", e);
        }
    }

    /** Joins the full pinyin and the first letters according to the configured mode. */
    private StringBuilder combine(StringBuilder pinyin, StringBuilder initials) {
        final boolean hasPadding = paddingChar.length() > 0;
        final StringBuilder result = new StringBuilder();
        if (firstLetter.equals("prefix")) {
            result.append(initials);
            if (hasPadding) {
                result.append(paddingChar); //TODO splitter
            }
            result.append(pinyin);
        } else if (firstLetter.equals("append")) {
            result.append(pinyin);
            // Avoid a doubled separator when the pinyin part already ends with the padding.
            if (hasPadding && !pinyin.toString().endsWith(paddingChar)) {
                result.append(paddingChar);
            }
            result.append(initials);
        } else if (firstLetter.equals("none")) {
            result.append(pinyin);
        } else if (firstLetter.equals("only")) {
            result.append(initials);
        }
        // Any other mode leaves the result empty.
        return result;
    }

    /** Delegates to the superclass; this filter has no end-of-stream state of its own. */
    @Override
    public final void end() throws IOException {
        super.end();
    }

    /** Delegates to the superclass; this filter keeps no per-stream state to clear. */
    @Override
    public void reset() throws IOException {
        super.reset();
    }
}