package com.github.btpka3.lucene.analysis; import net.sourceforge.pinyin4j.PinyinHelper; import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat; import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.util.CharsRef; import java.io.IOException; import java.io.Reader; import java.util.Set; import java.util.TreeSet; public class PinyinAbbrAnalyzer extends Analyzer { private SynonymMap pinyinSynonymMap; private HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat(); private SynonymFilter synonymFilter; private void setupSynonymMap() throws IOException, BadHanyuPinyinOutputFormatCombination { char[][] chineseChars = { // {from, to} {'\u4e00', '\u9fa5'} }; SynonymMap.Builder builder = new SynonymMap.Builder(true); for (int i = 0; i < chineseChars.length; i++) { char[] charRange = chineseChars[i]; for (char c = charRange[0]; c <= charRange[1]; c++) { String[] pinyinArr = pinyinSynonymArr(c); if (pinyinArr == null) { continue; } addTo(builder, new String[]{Character.toString(c)}, flattenPinyinArr(pinyinArr)); } } pinyinSynonymMap = builder.build(); } private String[] pinyinSynonymArr(char c) throws BadHanyuPinyinOutputFormatCombination { String[] pinyinArr = PinyinHelper.toHanyuPinyinStringArray(c, defaultFormat); if (pinyinArr == null) { return null; } String[] flattenPinyinArr = flattenPinyinArr(pinyinArr); String[] withOrgCharPinyinArr = new String[flattenPinyinArr.length + 1]; withOrgCharPinyinArr[0] = Character.toString(c); System.arraycopy(flattenPinyinArr, 0, withOrgCharPinyinArr, 1, flattenPinyinArr.length); return withOrgCharPinyinArr; } // 称 {chèn, chēng } -> {c,ch,che,cheng} // 重 {zhòng,chóng} -> {z,zh,zho,zhon,zhong, c,ch,cho,chon,chong} private String[] flattenPinyinArr(String[] pinyinArr) { if (pinyinArr == null) { return null; } Set<String> pinyinAbbrSet = new TreeSet<String>(); for (int i = 0; i < pinyinArr.length; i++) { String pinyin = pinyinArr[i]; for (int j = 1; j <= pinyin.length(); j++) { pinyinAbbrSet.add(pinyin.substring(0, 0 + j)); } } return pinyinAbbrSet.toArray(new String[0]); } private void addTo(SynonymMap.Builder builder, String[] from, String[] to) { for (String input : from) { for (String output : to) { builder.add(new CharsRef(input), new CharsRef(output), false); } } } public static boolean isChineseChar(char c) { return '\u4e00' <= c && c <= '\u9fa5'; } public static boolean containsChineseChar(String str) { if (str == null || str.length() == 0) { return false; } for (int i = 0; i < str.length(); i++) { if (isChineseChar(str.charAt(i))) { return true; } } return false; } @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { return new TokenStreamComponents(null); } }