package com.github.btpka3.lucene.analysis; import net.sourceforge.pinyin4j.PinyinHelper; import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType; import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat; import net.sourceforge.pinyin4j.format.HanyuPinyinToneType; import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType; import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymFilterFactory; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.tokenattributes.*; import org.apache.lucene.analysis.util.FilesystemResourceLoader; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.Version; import org.junit.Before; import org.junit.Test; import java.io.IOException; import java.io.StringReader; import java.util.*; // http://www.hankcs.com/program/java/lucene-synonymfilterfactory.html public class SynonymFilterTest { HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat(); @Before public void befor() { defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE); defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE); defaultFormat.setVCharType(HanyuPinyinVCharType.WITH_V); } @Test public void test1() throws Exception { StringReader strReader = new StringReader(" I love 中国 "); Analyzer analyzer = new StandardAnalyzer(); TokenStream ts0 = analyzer.tokenStream("name", strReader); SynonymMap.Builder builder = new SynonymMap.Builder(true); addEq(builder, new String[]{"love", "喜欢", "爱"}); addEq(builder, new String[]{"i", "我", "偶"}); addTo(builder, new String[]{"中"}, new String[]{"z", "zh", "zho", "zhon", "zhong"}); SynonymMap synonymMap = builder.build(); SynonymFilter ts = new SynonymFilter(ts0, synonymMap, true); print(ts); analyzer.close(); } private void addEq(SynonymMap.Builder builder, String[] eqSynoyms) { for (String input : eqSynoyms) { for (String output : eqSynoyms) { builder.add(new CharsRef(input), new CharsRef(output), false); } } } private void addTo(SynonymMap.Builder builder, String[] from, String[] to) { for (String input : from) { for (String output : to) { builder.add(new CharsRef(input), new CharsRef(output), false); } } } private void print(TokenStream ts) throws IOException { Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator(); while (it.hasNext()) { System.out.println(it.next()); } CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncrAtt = ts.getAttribute(PositionIncrementAttribute.class); PositionLengthAttribute posLenAtt = ts.getAttribute(PositionLengthAttribute.class); TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class); OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class); TermToBytesRefAttribute byteRefAtt = ts.getAttribute(TermToBytesRefAttribute.class); ts.reset(); while (ts.incrementToken()) { System.out.printf("%3d ~ %3d : %15s : %3d : %3d : '%s' - '%s' : %n", offsetAtt.startOffset(), offsetAtt.endOffset(), typeAtt.type(), posIncrAtt.getPositionIncrement(), posLenAtt.getPositionLength(), new String(byteRefAtt.getBytesRef().bytes), termAtt.toString() ); } } @Test public void test2() throws Exception { StringReader strReader = new StringReader(" I love 中国 "); Analyzer analyzer = new StandardAnalyzer(); TokenStream ts0 = analyzer.tokenStream("name", strReader); Map<String, String> filterArgs = new HashMap<String, String>(); filterArgs.put("luceneMatchVersion", Version.LATEST.toString()); filterArgs.put("synonyms", "me/test/SynonymFilterTest.txt"); filterArgs.put("expand", "true"); SynonymFilterFactory factory = new SynonymFilterFactory(filterArgs); factory.inform(new FilesystemResourceLoader()); TokenStream ts = factory.create(ts0); print(ts); analyzer.close(); } @Test public void test3() throws Exception { StringReader strReader = new StringReader(" zhu zhi 中"); Analyzer analyzer = new StandardAnalyzer(); TokenStream ts0 = analyzer.tokenStream("name", strReader); SynonymMap.Builder builder = new SynonymMap.Builder(true); addTo(builder, new String[]{"zhu"}, new String[]{"z", "zh", "zhu"}); addTo(builder, new String[]{"zhi"}, new String[]{"z", "zh", "zhi"}); addTo(builder, new String[]{"中"}, new String[]{"中", "z", "zh", "zho", "zhon", "zhong"}); SynonymMap synonymMap = builder.build(); SynonymFilter ts = new SynonymFilter(ts0, synonymMap, true); print(ts); analyzer.close(); } // 称 {chèn, chēng } -> {c,ch,che,cheng} // 重 {zhòng,chóng} -> {z,zh,zho,zhon,zhong, c,ch,cho,chon,chong} private String[] flattenPinyinArr(String[] pinyinArr) { if (pinyinArr == null) { return null; } Set<String> pinyinAbbrSet = new TreeSet<String>(); for (int i = 0; i < pinyinArr.length; i++) { String pinyin = pinyinArr[i]; for (int j = 1; j <= pinyin.length(); j++) { pinyinAbbrSet.add(pinyin.substring(0, 0 + j)); } } return pinyinAbbrSet.toArray(new String[0]); } private String[] pinyinSynonymArr(char c) throws BadHanyuPinyinOutputFormatCombination { String[] pinyinArr = PinyinHelper.toHanyuPinyinStringArray(c, defaultFormat); if (pinyinArr == null) { return null; } String[] flattenPinyinArr = flattenPinyinArr(pinyinArr); String[] withOrgCharPinyinArr = new String[flattenPinyinArr.length + 1]; withOrgCharPinyinArr[0] = Character.toString(c); System.arraycopy(flattenPinyinArr, 0, withOrgCharPinyinArr, 1, flattenPinyinArr.length); return withOrgCharPinyinArr; } // 测试一次性构造所有汉字的拼音缩写词 @Test public void test4() throws Exception { char[][] chineseChars = { // {from, to} {'\u4e00', '\u9fa5'} }; SynonymMap.Builder builder = new SynonymMap.Builder(true); for (int i = 0; i < chineseChars.length; i++) { char[] charRange = chineseChars[i]; for (char c = charRange[0]; c <= charRange[1]; c++) { String[] pinyinArr = pinyinSynonymArr(c); if (pinyinArr == null) { continue; } addTo(builder, new String[]{Character.toString(c)}, flattenPinyinArr(pinyinArr)); } } StringReader strReader = new StringReader("abc重阳节123"); Analyzer analyzer = new StandardAnalyzer(); TokenStream ts0 = analyzer.tokenStream("name", strReader); SynonymMap synonymMap = builder.build(); SynonymFilter ts = new SynonymFilter(ts0, synonymMap, true); print(ts0); analyzer.close(); } }