package org.ansj.library; import java.io.IOException; import java.util.HashMap; import java.util.Map.Entry; import java.util.Set; import org.ansj.dic.DicReader; import org.ansj.domain.AnsjItem; import org.ansj.domain.PersonNatureAttr; import org.ansj.domain.TermNature; import org.ansj.domain.TermNatures; import org.ansj.library.name.PersonAttrLibrary; import org.nlpcn.commons.lang.dat.DoubleArrayTire; import org.nlpcn.commons.lang.dat.Item; import org.nlpcn.commons.lang.util.logging.Log; import org.nlpcn.commons.lang.util.logging.LogFactory; public class DATDictionary { private static final Log LOG = LogFactory.getLog(DATDictionary.class); /** * 核心词典 */ private static final DoubleArrayTire DAT = loadDAT(); /** * 数组长度 */ public static int arrayLength = DAT.arrayLength; /** * 加载词典 * * @return */ private static DoubleArrayTire loadDAT() { long start = System.currentTimeMillis(); try { DoubleArrayTire dat = DoubleArrayTire.loadText(DicReader.getInputStream("core.dic"), AnsjItem.class); // 人名识别必备的 personNameFull(dat); // 记录词典中的词语,并且清除部分数据 for (Item item : dat.getDAT()) { if (item == null || item.getName() == null) { continue; } if (item.getStatus() < 2) { item.setName(null); continue; } } LOG.info("init core library ok use time : " + (System.currentTimeMillis() - start)); return dat; } catch (InstantiationException e) { LOG.warn("无法实例化", e); } catch (IllegalAccessException e) { LOG.warn("非法访问", e); } catch (NumberFormatException e) { LOG.warn("数字格式异常", e); } catch (IOException e) { LOG.warn("IO异常", e); } return null; } private static void personNameFull(DoubleArrayTire dat) throws NumberFormatException, IOException { HashMap<String, PersonNatureAttr> personMap = new PersonAttrLibrary().getPersonMap(); AnsjItem ansjItem = null; // 人名词性补录 Set<Entry<String, PersonNatureAttr>> entrySet = personMap.entrySet(); char c = 0; String temp = null; for (Entry<String, PersonNatureAttr> entry : entrySet) { temp = entry.getKey(); if (temp.length() == 1 && (ansjItem = (AnsjItem) dat.getDAT()[temp.charAt(0)]) == null) { ansjItem = new AnsjItem(); ansjItem.setBase(c); ansjItem.setCheck(-1); ansjItem.setStatus((byte) 3); ansjItem.setName(temp); dat.getDAT()[temp.charAt(0)] = ansjItem; } else { ansjItem = dat.getItem(temp); } if (ansjItem == null) { continue; } if ((ansjItem.termNatures) == null) { if (temp.length() == 1 && temp.charAt(0) < 256) { ansjItem.termNatures = TermNatures.NULL; } else { ansjItem.termNatures = new TermNatures(TermNature.NR); } } ansjItem.termNatures.setPersonNatureAttr(entry.getValue()); } } public static int status(char c) { Item item = DAT.getDAT()[c]; if (item == null) { return 0; } return item.getStatus(); } /** * 判断一个词语是否在词典中 * * @param word * @return */ public static boolean isInSystemDic(String word) { Item item = DAT.getItem(word); return item != null && item.getStatus() > 1; } public static AnsjItem getItem(int index) { AnsjItem item = DAT.getItem(index); if (item == null) { return AnsjItem.NULL; } return item; } public static AnsjItem getItem(String str) { AnsjItem item = DAT.getItem(str); if (item == null || item.getStatus() < 2) { return AnsjItem.NULL; } return item; } public static int getId(String str) { return DAT.getId(str); } }