/* ==================================================================
* Created [2009-4-27 11:32:55] by Jon.King
* ==================================================================
* TSS
* ==================================================================
* mailTo:jinpujun@hotmail.com
* Copyright (c) Jon.King, 2009-2012
* ==================================================================
*/
package com.jinhe.tss.cms.lucene.analyzer;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.TreeMap;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
/**
 * Lucene tokenizer that segments Chinese text by forward maximum matching
 * against a word list loaded from a dictionary file.
 *
 * <p>Only characters in the {@code CJK_UNIFIED_IDEOGRAPHS} Unicode block are
 * turned into tokens. Every other character (Latin letters, digits,
 * punctuation, whitespace, ...) is dropped and acts as a word separator.
 *
 * <p>The dictionary is loaded lazily on the first call to {@link #next()} and is
 * shared by all instances. Its location defaults to
 * {@code d:/Temp/simchinese.txt} and can be overridden with the
 * {@code cjk.tokenizer.dict} system property.
 */
public class CJKTokenizer extends Tokenizer {

    /** System property that overrides the dictionary file location. */
    private static final String DICT_PATH_PROPERTY = "cjk.tokenizer.dict";

    /** Dictionary location used when the system property is not set. */
    private static final String DEFAULT_DICT_PATH = "d:/Temp/simchinese.txt";

    /** Type assigned to every token this tokenizer emits. */
    private static final String TOKEN_TYPE = "double";

    /**
     * Cached dictionary: words and the proper prefixes of words, shared by all
     * instances. Volatile and only ever assigned a fully built map, so a reader
     * never observes a half-loaded dictionary.
     */
    private static volatile TreeMap<String, String> simWords = null;

    /** Index of the next unread character in {@link #ioBuffer}. */
    private int bufferIndex = 0;

    /** Number of valid characters in {@link #ioBuffer}; -1 once the input is exhausted. */
    private int dataLen = 0;

    /** Number of characters consumed from the input so far (absolute offset of the next character). */
    private int position = 0;

    private final char[] ioBuffer = new char[256];

    /**
     * Creates a tokenizer reading from the given input.
     *
     * @param input the text to tokenize
     */
    public CJKTokenizer(Reader input) {
        this.input = input;
    }

    /**
     * Returns the next token, or {@code null} when the input is exhausted.
     *
     * <p>A token starts at the next CJK ideograph and is extended one character
     * at a time for as long as the extended string is present in the dictionary
     * (forward maximum matching). A non-CJK character or the end of input ends
     * the current token. Token offsets are absolute character offsets into the
     * input, not positions inside the internal read buffer.
     *
     * @return the next token, or {@code null} at end of input
     * @throws IOException if reading the input fails
     */
    public Token next() throws IOException {
        loadWords();
        TreeMap<String, String> dictionary = simWords;
        StringBuilder currentWord = new StringBuilder();
        int start = 0;

        while (true) {
            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }
            if (dataLen == -1) {
                if (currentWord.length() == 0) {
                    return null;
                }
                break; // flush the word collected before end of input
            }

            char c = ioBuffer[bufferIndex++];
            position++;

            boolean cjk = Character.isLetter(c)
                    && Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
            if (!cjk) {
                if (currentWord.length() > 0) {
                    break; // a separator ends the current word
                }
                continue; // skip separators in front of a word
            }

            if (currentWord.length() == 0) {
                start = position - 1;
                currentWord.append(c);
            } else if (dictionary.containsKey(currentWord.toString() + c)) {
                // Still a known word or a prefix of one: keep extending.
                currentWord.append(c);
            } else {
                // Push the character back so it starts the next token. Safe even
                // right after a refill: the character is still in ioBuffer and
                // bufferIndex is at least 1 here.
                bufferIndex--;
                position--;
                break;
            }
        }

        return new Token(currentWord.toString(), start, start + currentWord.length(), TOKEN_TYPE);
    }

    /**
     * Loads the shared dictionary if it has not been loaded yet.
     *
     * <p>The file is read as UTF-8, one word per line. Lines containing
     * {@code #} are treated as comments, and only words of 1 to 4 characters are
     * kept. Each accepted word is stored with value {@code "1"}; each of its
     * proper prefixes of length 2 or more is stored with value {@code "2"}
     * unless already present. {@link #next()} extends a token one character at
     * a time, so these prefixes are what let it reach the longer words.
     *
     * <p>An I/O failure is reported on standard error and leaves the dictionary
     * with whatever was read up to that point (possibly nothing); loading is
     * not retried.
     */
    public void loadWords() {
        if (simWords != null) {
            return;
        }

        TreeMap<String, String> loaded = new TreeMap<String, String>();
        String path = System.getProperty(DICT_PATH_PROPERTY, DEFAULT_DICT_PATH);
        InputStream words = null;
        try {
            words = new FileInputStream(path);
            BufferedReader in = new BufferedReader(new InputStreamReader(words, "UTF-8"));
            boolean firstLine = true;
            String word;
            while ((word = in.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // Drop a UTF-8 byte order mark so it does not corrupt the first entry.
                    if (word.length() > 0 && word.charAt(0) == '\uFEFF') {
                        word = word.substring(1);
                    }
                }
                word = word.trim();
                // '#' allows comments inside the dictionary file.
                if (word.length() == 0 || word.length() >= 5 || word.indexOf("#") != -1) {
                    continue;
                }
                loaded.put(word, "1");
                for (int end = 2; end < word.length(); end++) {
                    String prefix = word.substring(0, end);
                    if (!loaded.containsKey(prefix)) {
                        loaded.put(prefix, "2");
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("CJKTokenizer: failed to load dictionary from " + path);
            e.printStackTrace();
        } finally {
            if (words != null) {
                try {
                    words.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
        // Publish only after the map is fully built.
        simWords = loaded;
    }
}