package ruc.irm.similarity.word.cilin;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ruc.irm.similarity.util.FileUtils;
import ruc.irm.similarity.util.TraverseEvent;
/**
* 词林数据库
*
* @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
* @organization 中国人民大学信息资源管理学院 知识工程实验室
*/
public class CilinDb {
/** the logger */
protected static Logger LOG = LoggerFactory.getLogger(CilinDb.class);
/** 以词语为主键的索引表 */
private Map<String, Set<String>> wordIndex = new HashMap<String, Set<String>>();
/** 以编码为主键的索引表 */
private Map<String, Set<String>> codeIndex = new HashMap<String, Set<String>>();
private static CilinDb instance = null;
public static CilinDb getInstance(){
if(instance == null){
try {
instance = new CilinDb();
} catch (IOException e) {
LOG.error(e.toString());
}
}
return instance;
}
private CilinDb() throws IOException{
String cilinFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/cilin.db.gz";
InputStream input = new GZIPInputStream(this.getClass().getClassLoader().getResourceAsStream(cilinFile));
TraverseEvent<String> event = new TraverseEvent<String>(){
@Override
public boolean visit(String line) {
String[] items = line.split(" ");
Set<String> set = new HashSet<String>();
for(int i=2; i<items.length; i++){
String code = items[i].trim();
if(!code.equals("")){
set.add(code);
//加入codeIndex编码
Set<String> codeWords = codeIndex.get(code);
if(codeWords==null){
codeWords = new HashSet<String>();
}
codeWords.add(items[0]);
codeIndex.put(code, codeWords);
}
}
wordIndex.put(items[0], set);
items = null;
return false;
}};
LOG.info("loading cilin dictionary...");
long time = System.currentTimeMillis();
FileUtils.traverseLines(input, "UTF8", event);
time = System.currentTimeMillis() - time;
LOG.info("loading cilin dictionary completely. time elapsed: " + time);
}
/**
* 获取某个词语的词林编码,一个词语可以有多个编码,通过Set给出
* @param word
* @return
*/
public Set<String> getCilinCoding(String word){
return wordIndex.get(word);
}
public Set<String> getCilinWords(String code){
return codeIndex.get(code);
}
public static void main(String[] args) {
CilinDb db = CilinDb.getInstance();
String code = db.getCilinCoding("中国").iterator().next();
System.out.println(CilinCoding.printCoding(code));
System.out.println(db.getCilinWords(code));
}
}