/**
* Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved.
* EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
* http://www.ewcms.com
*/
package com.ewcms.content.document.util.analyzer.dic;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.List;
import com.ewcms.content.document.util.analyzer.cfg.Configuration;
/**
 * Dictionary manager for the analyzer, implemented as an eagerly-created singleton.
 *
 * <p>On class initialization the built-in dictionaries (main, surname, quantifier,
 * suffix, preposition/adverb and stop-word) are read from the classpath, followed by
 * any extension dictionaries listed by {@link Configuration}. All lookups are exposed
 * as static methods that delegate to the singleton's {@link DictSegment} roots.
 *
 * <p>Dictionary files are read as UTF-8 text with one entry per line; blank lines are
 * ignored and entries are trimmed before being stored.
 *
 * @author 吴智俊
 */
public class Dictionary {

    /*
     * Default classpath locations of the analyzer's built-in dictionaries.
     */
    public static final String PATH_DIC_MAIN = "/com/ewcms/content/document/util/analyzer/dic/main.dic";
    public static final String PATH_DIC_SURNAME = "/com/ewcms/content/document/util/analyzer/dic/surname.dic";
    public static final String PATH_DIC_QUANTIFIER = "/com/ewcms/content/document/util/analyzer/dic/quantifier.dic";
    public static final String PATH_DIC_SUFFIX = "/com/ewcms/content/document/util/analyzer/dic/suffix.dic";
    public static final String PATH_DIC_PREP = "/com/ewcms/content/document/util/analyzer/dic/preposition.dic";
    public static final String PATH_DIC_STOP = "/com/ewcms/content/document/util/analyzer/dic/stopword.dic";

    /** Character encoding of every dictionary file. */
    private static final String DICT_CHARSET = "UTF-8";

    /** Buffer size, in chars, used when reading dictionary files. */
    private static final int READ_BUFFER_SIZE = 512;

    /** Byte-order mark as decoded from a UTF-8 file; the JDK reader does not strip it. */
    private static final char BYTE_ORDER_MARK = '\uFEFF';

    /*
     * The single Dictionary instance.
     */
    private static final Dictionary singleton;

    /*
     * Eager initialization: all dictionaries are loaded when this class is initialized.
     */
    static {
        singleton = new Dictionary();
    }

    /** Main dictionary (built-in entries plus configured and runtime extensions). */
    private final DictSegment mainDict;

    /** Surname dictionary. */
    private final DictSegment surnameDict;

    /** Quantifier (measure word) dictionary. */
    private final DictSegment quantifierDict;

    /** Suffix dictionary. */
    private final DictSegment suffixDict;

    /** Preposition / adverb dictionary. */
    private final DictSegment prepDict;

    /** Stop-word dictionary (built-in entries plus configured and runtime extensions). */
    private final DictSegment stopWords;

    /**
     * Loads every built-in dictionary, in the same order as the fields are declared.
     *
     * @throws RuntimeException if any built-in dictionary resource is missing
     */
    private Dictionary() {
        mainDict = loadMainDict();
        surnameDict = loadRequiredDict(PATH_DIC_SURNAME, "Surname");
        quantifierDict = loadRequiredDict(PATH_DIC_QUANTIFIER, "Quantifier");
        suffixDict = loadRequiredDict(PATH_DIC_SUFFIX, "Suffix");
        prepDict = loadRequiredDict(PATH_DIC_PREP, "Preposition");
        stopWords = loadStopWordDict();
    }

    /**
     * Loads the main dictionary and then merges in the extension dictionaries
     * returned by {@link Configuration#getExtDictionarys()}.
     *
     * @return the populated main dictionary root
     */
    private static DictSegment loadMainDict() {
        DictSegment dict = loadRequiredDict(PATH_DIC_MAIN, "Main");
        loadExtensionDicts(dict, Configuration.getExtDictionarys(), "Extension");
        return dict;
    }

    /**
     * Loads the stop-word dictionary and then merges in the extension stop-word
     * dictionaries returned by {@link Configuration#getExtStopWordDictionarys()}.
     *
     * @return the populated stop-word dictionary root
     */
    private static DictSegment loadStopWordDict() {
        DictSegment dict = loadRequiredDict(PATH_DIC_STOP, "Stopword");
        loadExtensionDicts(dict, Configuration.getExtStopWordDictionarys(), "Extension Stop word");
        return dict;
    }

    /**
     * Creates a new dictionary root and fills it from a mandatory classpath resource.
     *
     * @param path  classpath location of the dictionary file
     * @param label human-readable dictionary name used in error messages
     * @return the populated dictionary root
     * @throws RuntimeException if the resource cannot be found
     */
    private static DictSegment loadRequiredDict(String path, String label) {
        DictSegment dict = new DictSegment((char) 0);
        InputStream is = Dictionary.class.getResourceAsStream(path);
        if (is == null) {
            throw new RuntimeException(label + " Dictionary not found!!!");
        }
        fillFromStream(dict, is, label + " Dictionary loading exception.");
        return dict;
    }

    /**
     * Merges optional extension dictionaries into an existing dictionary root.
     * Extensions that cannot be located on the classpath are silently skipped.
     *
     * @param dict  dictionary root receiving the entries
     * @param paths classpath locations of the extension files; may be {@code null}
     * @param label human-readable dictionary name used in error messages
     */
    private static void loadExtensionDicts(DictSegment dict, List<String> paths, String label) {
        if (paths == null) {
            return;
        }
        for (String path : paths) {
            // A null path cannot be resolved; treat it like any other missing extension.
            if (path == null) {
                continue;
            }
            InputStream is = Dictionary.class.getResourceAsStream(path);
            if (is == null) {
                continue;
            }
            fillFromStream(dict, is, label + " Dictionary loading exception.");
        }
    }

    /**
     * Reads a dictionary stream line by line and adds every non-blank entry to
     * {@code dict}. The stream is always closed before this method returns.
     *
     * <p>Loading is best-effort: an {@link IOException} is reported on
     * {@code System.err} and the entries read so far are kept.
     *
     * @param dict         dictionary root receiving the entries
     * @param is           open stream positioned at the start of the dictionary file
     * @param errorMessage message printed to {@code System.err} if reading fails
     */
    private static void fillFromStream(DictSegment dict, InputStream is, String errorMessage) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(is, DICT_CHARSET), READ_BUFFER_SIZE);
            boolean firstLine = true;
            String line;
            while ((line = reader.readLine()) != null) {
                if (firstLine) {
                    firstLine = false;
                    // trim() does not remove U+FEFF, so a BOM would otherwise become
                    // part of the first dictionary entry.
                    if (line.length() > 0 && line.charAt(0) == BYTE_ORDER_MARK) {
                        line = line.substring(1);
                    }
                }
                addWord(dict, line);
            }
        } catch (IOException ioe) {
            System.err.println(errorMessage);
            ioe.printStackTrace();
        } finally {
            try {
                // Closing the reader also closes the wrapped stream; fall back to the
                // raw stream if the reader could not be constructed.
                if (reader != null) {
                    reader.close();
                } else {
                    is.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Adds a single entry to a dictionary after trimming it. {@code null} and
     * blank entries are ignored so that an empty char array is never passed to
     * {@link DictSegment#fillSegment(char[])}.
     *
     * @param dict dictionary root receiving the entry
     * @param word raw entry text; may be {@code null}
     */
    private static void addWord(DictSegment dict, String word) {
        if (word == null) {
            return;
        }
        String trimmed = word.trim();
        if (trimmed.length() > 0) {
            dict.fillSegment(trimmed.toCharArray());
        }
    }

    /**
     * Returns the singleton instance.
     *
     * <p>The dictionaries are loaded in this class's static initializer, i.e. only
     * once the class is first used, which would otherwise delay the first
     * tokenization. Calling this method during application start-up forces that
     * initialization early.
     *
     * @return the Dictionary singleton
     */
    public static Dictionary getInstance() {
        return Dictionary.singleton;
    }

    /**
     * Adds extra entries to the in-memory main dictionary.
     *
     * @param extWords entries to add; {@code null} collections and {@code null} or
     *                 blank elements are ignored
     */
    public static void loadExtendWords(Collection<String> extWords) {
        if (extWords == null) {
            return;
        }
        for (String extWord : extWords) {
            addWord(singleton.mainDict, extWord);
        }
    }

    /**
     * Adds extra entries to the in-memory stop-word dictionary.
     *
     * @param extStopWords entries to add; {@code null} collections and {@code null}
     *                     or blank elements are ignored
     */
    public static void loadExtendStopWords(Collection<String> extStopWords) {
        if (extStopWords == null) {
            return;
        }
        for (String extStopWord : extStopWords) {
            addWord(singleton.stopWords, extStopWord);
        }
    }

    /**
     * Matches the whole char array against the main dictionary.
     *
     * @param charArray characters to match
     * @return Hit describing the match result
     */
    public static Hit matchInMainDict(char[] charArray) {
        return singleton.mainDict.match(charArray);
    }

    /**
     * Matches a slice of the char array against the main dictionary.
     *
     * @param charArray characters to match
     * @param begin     index of the first character of the slice
     * @param length    number of characters in the slice
     * @return Hit describing the match result
     */
    public static Hit matchInMainDict(char[] charArray, int begin, int length) {
        return singleton.mainDict.match(charArray, begin, length);
    }

    /**
     * Continues a previous match: takes the DictSegment stored in {@code matchedHit}
     * and matches one more character, at {@code currentIndex}, from there.
     *
     * @param charArray    characters to match
     * @param currentIndex index of the character to match next
     * @param matchedHit   result of the preceding match; must not be {@code null}
     * @return Hit describing the match result
     */
    public static Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
        DictSegment ds = matchedHit.getMatchedDictSegment();
        return ds.match(charArray, currentIndex, 1, matchedHit);
    }

    /**
     * Matches a slice of the char array against the surname dictionary.
     *
     * @param charArray characters to match
     * @param begin     index of the first character of the slice
     * @param length    number of characters in the slice
     * @return Hit describing the match result
     */
    public static Hit matchInSurnameDict(char[] charArray, int begin, int length) {
        return singleton.surnameDict.match(charArray, begin, length);
    }

    /**
     * Matches a slice of the char array against the quantifier dictionary.
     *
     * @param charArray characters to match
     * @param begin     index of the first character of the slice
     * @param length    number of characters in the slice
     * @return Hit describing the match result
     */
    public static Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
        return singleton.quantifierDict.match(charArray, begin, length);
    }

    /**
     * Matches a slice of the char array against the suffix dictionary.
     *
     * @param charArray characters to match
     * @param begin     index of the first character of the slice
     * @param length    number of characters in the slice
     * @return Hit describing the match result
     */
    public static Hit matchInSuffixDict(char[] charArray, int begin, int length) {
        return singleton.suffixDict.match(charArray, begin, length);
    }

    /**
     * Matches a slice of the char array against the preposition/adverb dictionary.
     *
     * @param charArray characters to match
     * @param begin     index of the first character of the slice
     * @param length    number of characters in the slice
     * @return Hit describing the match result
     */
    public static Hit matchInPrepDict(char[] charArray, int begin, int length) {
        return singleton.prepDict.match(charArray, begin, length);
    }

    /**
     * Tells whether a slice of the char array is a stop word.
     *
     * @param charArray characters to test
     * @param begin     index of the first character of the slice
     * @param length    number of characters in the slice
     * @return {@code true} if the slice is a complete match in the stop-word dictionary
     */
    public static boolean isStopWord(char[] charArray, int begin, int length) {
        return singleton.stopWords.match(charArray, begin, length).isMatch();
    }
}