/* ================================================================== * Created [2009-4-27 锟斤拷锟斤拷11:32:55] by Jon.King * ================================================================== * TSS * ================================================================== * mailTo:jinpujun@hotmail.com * Copyright (c) Jon.King, 2009-2012 * ================================================================== */ package com.jinhe.tss.cms.lucene.analyzer; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.util.TreeMap; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; public class CJKTokenizer extends Tokenizer { // 这个TreeMap用来缓存词库 private static TreeMap<String, String> simWords = null; private int bufferIndex = 0; private int dataLen = 0; private final char[] ioBuffer = new char[256]; private String tokenType = "word"; public CJKTokenizer(Reader input) { this.input = input; } // 这里是lucene分词器实现的最关键的地方 public Token next() throws IOException { loadWords(); StringBuffer currentWord = new StringBuffer(); while (true) { char c; Character.UnicodeBlock ub; if (bufferIndex >= dataLen) { dataLen = input.read(ioBuffer); bufferIndex = 0; } if (dataLen == -1) { if (currentWord.length() == 0) { return null; } else { break; } } else { c = ioBuffer[bufferIndex++]; ub = Character.UnicodeBlock.of(c); } // 通过这个条件不难看出这里只处理了CJK_UNIFIED_IDEOGRAPHS, // 因此会丢掉其它的字符,如它会丢掉LATIN字符和数字 // 这也是该lucene分词器的一个限制,您可以在此基础之上完善它,也很欢迎把您完善的结果反馈给我 if (Character.isLetter(c) && ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) { tokenType = "double"; if (currentWord.length() == 0) { currentWord.append(c); } else { // 这里实现了正向最大匹配法 String temp = (currentWord.toString() + c).intern(); if (simWords.containsKey(temp)) { currentWord.append(c); } else { bufferIndex--; break; } } } } Token token = new Token(currentWord.toString(), bufferIndex - currentWord.length(), bufferIndex, tokenType); currentWord.setLength(0); return token; } // 装载词库,您必须明白它的逻辑和之所以这样做的目的,这样您才能理解正向最大匹配法是如何实现的 public void loadWords() { if (simWords != null) return; simWords = new TreeMap<String, String>(); try { InputStream words = new FileInputStream("d:/Temp/simchinese.txt"); BufferedReader in = new BufferedReader(new InputStreamReader(words, "UTF-8")); String word = null; while ((word = in.readLine()) != null) { // #使得我们可以在词库中进行必要的注释 if ((word.indexOf("#") == -1) && (word.length() < 5) && (word.length() > 0)) { simWords.put(word.intern(), "1"); if (word.length() == 3) { if (!simWords.containsKey(word.substring(0, 2).intern())) { simWords.put(word.substring(0, 2).intern(), "2"); } } if (word.length() == 4) { if (!simWords.containsKey(word.substring(0, 2).intern())) { simWords.put(word.substring(0, 2).intern(), "2"); } if (!simWords.containsKey(word.substring(0, 3).intern())) { simWords.put(word.substring(0, 3).intern(), "2"); } } } } in.close(); } catch (IOException e) { e.printStackTrace(); } } }