package com.github.btpka3.lucene.analysis;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.CharsRef;
import java.io.IOException;
import java.util.Set;
import java.util.TreeSet;
/**
 * Token filter that emits, for each token consisting of a single Chinese character in the
 * range U+4E00..U+9FA5, every leading prefix of each of the character's pinyin readings
 * (lower case, no tone marks, "v" for "ü"), in addition to the character itself.
 *
 * <p>The work is done by an internal {@link SynonymFilter} that wraps the same upstream
 * {@link TokenStream}; this class only builds the synonym map and forwards the token
 * stream life-cycle calls to that inner filter.
 *
 * <p>Examples of the generated prefixes:
 * <pre>
 * 称 {chèn, chēng}   -> {c, ch, che, chen, cheng}
 * 重 {zhòng, chóng}  -> {c, ch, cho, chon, chong, z, zh, zho, zhon, zhong}
 * </pre>
 */
public class PinyinAbbrTokenFilter extends TokenFilter {

    /** First character (inclusive) of the supported Chinese character range. */
    private static final char FIRST_CHINESE_CHAR = '\u4e00';

    /** Last character (inclusive) of the supported Chinese character range. */
    private static final char LAST_CHINESE_CHAR = '\u9fa5';

    /**
     * Synonym map shared by all instances. Building it means visiting every character in
     * the supported range, so it is built lazily once instead of once per filter instance.
     * Guarded by the class lock.
     */
    private static SynonymMap sharedPinyinSynonymMap;

    private SynonymMap pinyinSynonymMap;
    private final HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat();
    private final SynonymFilter synonymFilter;

    // These attributes are not read here, but registering them guarantees they are present
    // on the attribute source used by the inner SynonymFilter.
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    /**
     * Creates the filter on top of {@code in}.
     *
     * @param in the upstream token stream
     * @throws RuntimeException if the pinyin synonym map cannot be built
     */
    public PinyinAbbrTokenFilter(TokenStream in) {
        super(in);
        defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        defaultFormat.setVCharType(HanyuPinyinVCharType.WITH_V);
        try {
            setupSynonymMap();
        } catch (IOException e) {
            throw new RuntimeException("Failed to build the pinyin synonym map", e);
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            throw new RuntimeException("Invalid pinyin output format combination", e);
        }
        synonymFilter = new SynonymFilter(input, pinyinSynonymMap, true);
    }

    /**
     * Assigns {@link #pinyinSynonymMap}, building the shared map on first use.
     *
     * @throws IOException if the synonym map builder fails
     * @throws BadHanyuPinyinOutputFormatCombination if pinyin4j rejects the output format
     */
    private void setupSynonymMap()
            throws IOException, BadHanyuPinyinOutputFormatCombination {
        synchronized (PinyinAbbrTokenFilter.class) {
            if (sharedPinyinSynonymMap == null) {
                sharedPinyinSynonymMap = buildSynonymMap();
            }
            pinyinSynonymMap = sharedPinyinSynonymMap;
        }
    }

    /**
     * Builds the synonym map for every character of the supported range that pinyin4j
     * knows a reading for.
     */
    private SynonymMap buildSynonymMap()
            throws IOException, BadHanyuPinyinOutputFormatCombination {
        char[][] chineseChars = {
                // {from, to}
                {FIRST_CHINESE_CHAR, LAST_CHINESE_CHAR}
        };
        SynonymMap.Builder builder = new SynonymMap.Builder(true);
        for (char[] charRange : chineseChars) {
            // An int counter cannot wrap around even if a range ends at Character.MAX_VALUE.
            for (int code = charRange[0]; code <= charRange[1]; code++) {
                char c = (char) code;
                String[] pinyinArr = pinyinSynonymArr(c);
                if (pinyinArr == null) {
                    // pinyin4j has no reading for this character.
                    continue;
                }
                // Flattening again de-duplicates and sorts the outputs; the character
                // itself (length 1) survives as its own output, so the original token is
                // kept even though the rule is added with includeOrig == false.
                addTo(builder,
                        new String[]{Character.toString(c)},
                        flattenPinyinArr(pinyinArr));
            }
        }
        return builder.build();
    }

    /**
     * Returns the character itself followed by all prefixes of its pinyin readings.
     *
     * @param c the character to look up
     * @return the array described above, or {@code null} if pinyin4j returns no reading
     * @throws BadHanyuPinyinOutputFormatCombination if pinyin4j rejects the output format
     */
    private String[] pinyinSynonymArr(char c)
            throws BadHanyuPinyinOutputFormatCombination {
        String[] pinyinArr = PinyinHelper.toHanyuPinyinStringArray(c, defaultFormat);
        if (pinyinArr == null) {
            return null;
        }
        String[] prefixes = flattenPinyinArr(pinyinArr);
        String[] withOrgChar = new String[prefixes.length + 1];
        withOrgChar[0] = Character.toString(c);
        System.arraycopy(prefixes, 0, withOrgChar, 1, prefixes.length);
        return withOrgChar;
    }

    /**
     * Expands every string of {@code pinyinArr} into all of its non-empty leading prefixes.
     *
     * @param pinyinArr strings to expand, may be {@code null}
     * @return the distinct prefixes in natural string order, or {@code null} for
     *         {@code null} input
     */
    private String[] flattenPinyinArr(String[] pinyinArr) {
        if (pinyinArr == null) {
            return null;
        }
        Set<String> pinyinAbbrSet = new TreeSet<String>();
        for (String pinyin : pinyinArr) {
            for (int end = 1; end <= pinyin.length(); end++) {
                pinyinAbbrSet.add(pinyin.substring(0, end));
            }
        }
        return pinyinAbbrSet.toArray(new String[0]);
    }

    /** Adds one rule {@code input -> output} for every combination of {@code from} and {@code to}. */
    private void addTo(SynonymMap.Builder builder, String[] from, String[] to) {
        for (String input : from) {
            for (String output : to) {
                builder.add(new CharsRef(input), new CharsRef(output), false);
            }
        }
    }

    /**
     * Tells whether {@code c} lies in the supported range U+4E00..U+9FA5.
     *
     * @param c the character to test
     * @return {@code true} if {@code c} is inside the range (both ends inclusive)
     */
    public static boolean isChineseChar(char c) {
        return FIRST_CHINESE_CHAR <= c && c <= LAST_CHINESE_CHAR;
    }

    /**
     * Tells whether {@code str} contains at least one character accepted by
     * {@link #isChineseChar(char)}.
     *
     * @param str the string to scan, may be {@code null}
     * @return {@code false} for {@code null} or empty input, otherwise whether a match exists
     */
    public static boolean containsChineseChar(String str) {
        if (str == null || str.length() == 0) {
            return false;
        }
        for (int i = 0; i < str.length(); i++) {
            if (isChineseChar(str.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Advances the inner synonym filter by one token.
     *
     * <p>The inner filter is built on {@code input}, whose attributes this filter shares,
     * so nothing has to be copied after it has produced a token.
     */
    @Override
    public final boolean incrementToken() throws IOException {
        return synonymFilter.incrementToken();
    }

    @Override
    public final void end() throws IOException {
        // Goes through the inner filter so the whole chain (inner filter, then the
        // upstream) sees end() exactly once and the final offset is set.
        synonymFilter.end();
    }

    @Override
    public void reset() throws IOException {
        // The inner filter buffers look-ahead state that must be cleared before the stream
        // is reused. Its reset() also resets the upstream, so super.reset() must NOT be
        // called in addition: that would reset the upstream twice.
        synonymFilter.reset();
    }
}