package edu.fudan.ml.types; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Map.Entry; import java.util.Scanner; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import edu.fudan.util.MultiValueMap; import edu.fudan.util.MyCollection; import edu.fudan.util.exception.LoadModelException; public class Dictionary { private int MAX_LEN = 7; private int MIN_LEN = 2; /** * 词典,词和相应的词性 */ private MultiValueMap<String,String> dp; private TreeMap<String, int[]> index = new TreeMap<String, int[]>(); private int indexLen = 2; private boolean isAmbiguity = false; public Dictionary(){ MAX_LEN = Integer.MIN_VALUE; MIN_LEN = Integer.MAX_VALUE; dp = new MultiValueMap<String, String>(); } /** * * @param b 是否模糊处理 */ public Dictionary(boolean b) { this(); this.setAmbiguity(b); } /** * * @param path * @throws IOException */ public Dictionary(String path) throws IOException { this(path,false); } /** * * @param path * @param b 使用模糊处理 * @throws IOException */ public Dictionary(String path, boolean b) throws IOException { this(); this.setAmbiguity(b); ArrayList<String[]> al = loadDict(path); add(al); createIndex(); } /** * 加入不带词性的词典 * @param al 词的数组 */ public void addSegDict(Collection<String> al) { for(String s: al){ addDict(s); } createIndex(); } /** * * @param word 词 * @param poses 词性数组 */ public void add(String word, String... poses) { addDict(word,poses); indexLen = MIN_LEN; createIndex(); } /** * * @param al 词典 ArrayList<String[]> * 每一个元素为一个单元String[]. * String[] 第一个元素为单词,后面为对应的词性 * @return */ public void add(ArrayList<String[]> al) { for(String[] pos: al) { addDict(pos[0], Arrays.copyOfRange(pos, 1, pos.length)); } indexLen = MIN_LEN; createIndex(); } /** * 在目前词典中增加新的词典信息 * @param path * @throws FileNotFoundException */ public void addFile(String path) throws LoadModelException{ try { ArrayList<String[]> al = loadDict(path); add(al); indexLen = MIN_LEN; createIndex(); } catch (IOException e) { throw new LoadModelException("加载词典错误"+e.toString()); } } /** * 通过词典文件建立词典 * @param path * @return * @throws FileNotFoundException */ private ArrayList<String[]> loadDict(String path) throws IOException { Scanner scanner = new Scanner(new FileInputStream(path), "utf-8"); ArrayList<String[]> al = new ArrayList<String[]>(); while(scanner.hasNext()) { String line = scanner.nextLine().trim(); if(line.length() > 0) { String[] s = line.split("\\s"); al.add(s); } } scanner.close(); return al; } /** * 增加词典信息 * @param word * @param poses */ private void addDict(String word, String... poses){ if(word.length() > MAX_LEN) MAX_LEN = word.length(); if(word.length() < MIN_LEN) MIN_LEN = word.length(); if(poses==null||poses.length==0){ if(!dp.containsKey(word)) dp.put(word, null); return; } for(int j = 0; j < poses.length; j++) { dp.put(word, poses[j]); } } /** * 建立词的索引 */ private void createIndex() { indexLen = MIN_LEN; TreeMap<String, TreeSet<Integer>> indexT = new TreeMap<String, TreeSet<Integer>>(); for(String s: dp.keySet()) { if(s.length() < indexLen) continue; String temp = s.substring(0, indexLen); //System.out.println(temp); if(indexT.containsKey(temp) == false) { TreeSet<Integer> set = new TreeSet<Integer>(); set.add(s.length()); indexT.put(temp, set); } else { indexT.get(temp).add(s.length()); } } for(Entry<String, TreeSet<Integer>> entry: indexT.entrySet()) { String key = entry.getKey(); TreeSet<Integer> set = entry.getValue(); int[] ia = new int[set.size()]; int i = set.size(); // System.out.println(key); for(Integer integer: set) { ia[--i] = integer; } // for(int j = 0; j < ia.length; j++) // System.out.println(ia[j]); index.put(key, ia); } // System.out.println(indexT); } public int getMaxLen() { return MAX_LEN; } public int getMinLen() { return MIN_LEN; } public boolean contains(String s) { return dp.containsKey(s); } public int[] getIndex(String s) { return index.get(s); } public TreeSet<String> getPOS(String s) { return dp.getSet(s); } public int getDictSize() { return dp.size(); } public int getIndexLen() { return indexLen; } public boolean isAmbiguity() { return isAmbiguity; } private void setAmbiguity(boolean isAmbiguity) { this.isAmbiguity = isAmbiguity; } public Set<String> getDict() { return dp.keySet(); } public MultiValueMap<String, String> getPOSDict() { return dp; } public TreeMap<String, int[]> getIndex() { return index; } public int size(){ return dp.size(); } public void save(String path) { MyCollection.writeMultiValueMap(dp, path); } }