/* * 文件名:WordList.java * 版权:Copyright 2008-20012 复旦大学 All Rights Reserved. * 描述:程序总入口 * 修改人:xpqiu * 修改时间:Nov 30, 2008 * 修改内容:新增 * * 修改人:〈修改人〉 * 修改时间:YYYY-MM-DD * 跟踪单号:〈跟踪单号〉 * 修改单号:〈修改单号〉 * 修改内容:〈修改内容〉 */ package edu.fudan.nlp.corpus; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FilenameFilter; import java.io.InputStreamReader; import java.io.Serializable; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; /** * @author xpqiu * @version 1.0 * WordList */ public class WordList { class WordTree implements Serializable{ private static final long serialVersionUID = 523580450351093949L; HashSet<String> wordSet; int depth; String tag; ArrayList<WordTree> childs; public WordTree(){ wordSet = new HashSet<String>(); childs = new ArrayList<WordTree>(); } } public static WordTree dicts=null; //字典,用来生成字典特征 public String dicDir = "../Data/wordlist"; public WordList(int depth){ if(dicts ==null){ loaddict(depth); } } public String getFeatures(String word,String tag){ return getFeaturesFromNodes(dicts,word, tag, 1); } public String getFeaturesFromNodes(WordTree node, String word,String tag,int depth){ if(node.depth>depth) return " "; String res = ""; String newfeature =tag; newfeature = newfeature+node.tag+"."; if(node.wordSet.size()>0) { if(node.wordSet.contains(word)) newfeature = newfeature +"1 "; else newfeature = newfeature +"0 "; res = res + newfeature; } Iterator<WordTree> it = node.childs.iterator(); while(it.hasNext()){ WordTree subnode = it.next(); if(subnode==null) continue; res = res + getFeaturesFromNodes(subnode,word,newfeature,depth); } return res; } /** * */ private void loaddict(int depth) { dicts = new WordTree(); dicts.depth=-1; dicts.tag = ""; File f = new File(dicDir); if(!f.exists()) return; else loadDir(f,dicts,depth); } /** * @param dicts * @param file */ private void loadDir(File f, WordTree parent,int depth) { WordTree current; if(parent.depth>=depth) current = parent; else{ current= new WordTree(); parent.childs.add(current); current.depth = parent.depth+1; current.tag=f.getName().replace(".dic", ""); } if(f.isDirectory()){ File[] flist = f.listFiles(new FilenameFilter(){ public boolean accept(File dir, String name) { if(name.endsWith(".dic")) return true; else return false; } }); for(int i=0;i<flist.length;i++) loadDir(flist[i],current,depth); }else{ if(!f.toString().endsWith(".dic")) return; try { InputStreamReader read = new InputStreamReader (new FileInputStream(f.toString()),"utf-8"); BufferedReader bin = new BufferedReader(read); String w; while((w=bin.readLine())!=null){ current.wordSet.add(w.trim()); } }catch(Exception e){ } } } /** * @param args */ public static void main(String[] args) { WordList wl = new WordList(2); } }