package com.github.btpka3.lucene.analysis; import net.sourceforge.pinyin4j.PinyinHelper; import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType; import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat; import net.sourceforge.pinyin4j.format.HanyuPinyinToneType; import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType; import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.tokenattributes.*; import org.apache.lucene.util.CharsRef; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.Reader; import java.util.Set; import java.util.TreeSet; import java.util.regex.Pattern; public class PinyinAbbrTokenizer extends Tokenizer { private Logger logger = LoggerFactory.getLogger(PinyinAbbrTokenizer.class); private SynonymMap pinyinSynonymMap; private HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat(); private SynonymFilter synonymFilter; private void setupSynonymMap() throws IOException, BadHanyuPinyinOutputFormatCombination { char[][] chineseChars = { // {from, to} {'\u4e00', '\u9fa5'} }; SynonymMap.Builder builder = new SynonymMap.Builder(true); for (int i = 0; i < chineseChars.length; i++) { char[] charRange = chineseChars[i]; for (char c = charRange[0]; c <= charRange[1]; c++) { String[] pinyinArr = pinyinSynonymArr(c); if (pinyinArr == null) { continue; } addTo(builder, new String[]{Character.toString(c)}, flattenPinyinArr(pinyinArr)); } } pinyinSynonymMap = builder.build(); } private String[] pinyinSynonymArr(char c) throws BadHanyuPinyinOutputFormatCombination { String[] pinyinArr = PinyinHelper.toHanyuPinyinStringArray(c, defaultFormat); if (pinyinArr == null) { return null; } String[] flattenPinyinArr = flattenPinyinArr(pinyinArr); String[] withOrgCharPinyinArr = new String[flattenPinyinArr.length + 1]; withOrgCharPinyinArr[0] = Character.toString(c); System.arraycopy(flattenPinyinArr, 0, withOrgCharPinyinArr, 1, flattenPinyinArr.length); return withOrgCharPinyinArr; } // 称 {chèn, chēng } -> {c,ch,che,cheng} // 重 {zhòng,chóng} -> {z,zh,zho,zhon,zhong, c,ch,cho,chon,chong} private String[] flattenPinyinArr(String[] pinyinArr) { if (pinyinArr == null) { return null; } Set<String> pinyinAbbrSet = new TreeSet<String>(); for (int i = 0; i < pinyinArr.length; i++) { String pinyin = pinyinArr[i]; for (int j = 1; j <= pinyin.length(); j++) { pinyinAbbrSet.add(pinyin.substring(0, 0 + j)); } } return pinyinAbbrSet.toArray(new String[0]); } private void addTo(SynonymMap.Builder builder, String[] from, String[] to) { for (String input : from) { for (String output : to) { builder.add(new CharsRef(input), new CharsRef(output), false); } } } public static boolean isChineseChar(char c) { return '\u4e00' <= c && c <= '\u9fa5'; } public static boolean containsChineseChar(String str) { if (str == null || str.length() == 0) { return false; } for (int i = 0; i < str.length(); i++) { if (isChineseChar(str.charAt(i))) { return true; } } return false; } private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private static final int DEFAULT_BUFFER_SIZE = 256; private HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat(); private static Pattern pattern = Pattern.compile("^[\\u4e00-\\u9fa5]$"); // public PinyinAbbrTokenizer(AttributeFactory factory, Reader input) { // this(factory, input, DEFAULT_BUFFER_SIZE); // } public PinyinAbbrTokenizer(Reader input) { super(input); defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE); defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE); defaultFormat.setVCharType(HanyuPinyinVCharType.WITH_V); try { setupSynonymMap(); Analyzer analyzer = new StandardAnalyzer(); TokenStream ts0 = analyzer.tokenStream("name", input); synonymFilter = new SynonymFilter(ts0, pinyinSynonymMap, true); } catch (Exception e) { throw new RuntimeException(e); } } // public PinyinAbbrTokenizer(AttributeFactory factory, Reader input, int bufferSize) { // super(factory, input); // termAtt.resizeBuffer(bufferSize); // format.setCaseType(HanyuPinyinCaseType.LOWERCASE); // format.setToneType(HanyuPinyinToneType.WITHOUT_TONE); // format.setVCharType(HanyuPinyinVCharType.WITH_V); // } // public PinyinAbbrTokenizer(Reader input, int bufferSize) { // super(input); // } // @Override // public boolean isTokenChar(int c) { // if (Character.isLetterOrDigit(c)) { // return true; // } // // Matcher matcher = pattern.matcher(String.valueOf(c)); // return matcher.matches(); // } // // @Override // protected int normalize(int c) { // try { // String[] strs = PinyinHelper.toHanyuPinyinStringArray((char) c, format); // if (strs != null) { // termAtt.append(strs[0]); // return strs[0].codePointAt(0); // } // } catch (BadHanyuPinyinOutputFormatCombination e) { // logger.debug(e.getMessage(), e); // } // return c; // } @Override public final boolean incrementToken() throws IOException { boolean hasMore = synonymFilter.incrementToken(); CharTermAttribute termAtt0 = synonymFilter.getAttribute(CharTermAttribute.class); if (termAtt.length() < termAtt0.length()) { termAtt.resizeBuffer(termAtt0.length() * 2); } termAtt.setEmpty(); termAtt.copyBuffer(termAtt0.buffer(), 0, termAtt0.length()); PositionIncrementAttribute posIncrAtt0 = synonymFilter.getAttribute(PositionIncrementAttribute.class); posIncrAtt.setPositionIncrement(posIncrAtt0.getPositionIncrement()); PositionLengthAttribute posLenAtt0 = synonymFilter.getAttribute(PositionLengthAttribute.class); posLenAtt.setPositionLength(posLenAtt0.getPositionLength()); TypeAttribute typeAtt0 = synonymFilter.getAttribute(TypeAttribute.class); typeAtt.setType(typeAtt0.type()); OffsetAttribute offsetAtt0 = synonymFilter.getAttribute(OffsetAttribute.class); offsetAtt.setOffset(offsetAtt0.startOffset(), offsetAtt0.endOffset()); return hasMore; } @Override public void close() throws IOException { // TODO Auto-generated method stub synonymFilter.close(); } @Override public void reset() throws IOException { // TODO Auto-generated method stub synonymFilter.reset(); } }