/*
* 文件名:WordMap.java
* 版权:Copyright 2008-20012 复旦大学 All Rights Reserved.
* 描述:程序总入口
* 修改人:xpqiu
* 修改时间:2008-12-26
* 修改内容:新增
*
* 修改人:〈修改人〉
* 修改时间:YYYY-MM-DD
* 跟踪单号:〈跟踪单号〉
* 修改单号:〈修改单号〉
* 修改内容:〈修改内容〉
*/
package edu.fudan.nlp.corpus;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import edu.fudan.nlp.similarity.EditDistance;
import edu.fudan.nlp.similarity.EditDistanceWithSemantic;
import edu.fudan.nlp.similarity.ISimilarity;
/**
* 进行单词、短语的映射,将相同意思的词都标准化成同一的词
* @author Administrator
* @version 1.0
* @since 1.0
*/
public class WordMap {
private Map<String, String> nameMap;
private String fileName;
private String serFileName;
ISimilarity is;
/**
* 初始化实例
* @param filename 保存单词匹配的文件名
*/
public WordMap(String filename){
fileName = filename;
buildNameMap();
}
/**
* @param filename
* @return
*/
private void buildNameMap() {
nameMap = Collections.synchronizedMap(new HashMap<String, String>());
try {
InputStreamReader read = new InputStreamReader (new FileInputStream(fileName),"utf-8");
BufferedReader bin = new BufferedReader(read);
String info = bin.readLine();
while(info!=null&&info.length()>0){
String[] toks = info.split("\\s+");
for(int i=0;i<toks.length;i++){
nameMap.put(toks[i], toks[0]);
}
info = bin.readLine();
}
}catch(Exception e){
}
}
/**
* 查找输入词有没有对应的标准词
* @param word
* @return 词
*/
public String getMap(String word){
if(nameMap==null||!nameMap.containsKey(word))
return word;
else
return nameMap.get(word);
}
/**
* 查找输入词有没有对应的标准词,进行宽松的匹配方法
* @param str
* @return 词
* @throws Exception
*/
public String getLooseMap (String str) throws Exception {
if(is==null)
is = new EditDistanceWithSemantic();
String resName = str;
if(str==null||str.trim().length()==0)
return resName;
for(Iterator it = nameMap.keySet().iterator();it.hasNext();){
str = (String) it.next();
if(is.calc(str,resName)==0){
resName = nameMap.get(str);
System.out.println("匹配:"+str+"<"+resName);
break;
}
}
return resName;
}
}