package ruc.irm.similarity.word.hownet.sememe; import java.io.FileOutputStream; import java.io.PrintWriter; import java.util.ArrayList; import java.util.List; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.w3c.dom.Document; import org.w3c.dom.Element; import ruc.irm.similarity.util.TraverseEvent; /** * 实现遍历加载义原信息到义原表中, 义原词典的组织以知网导出的格式为标准,如:<br/> * - entity|实体 <br/> * ├ thing|万物 [#time|时间,#space|空间] <br/> * │ ├ physical|物质 [!appearance|外观] <br/> * │ │ ├ animate|生物 [*alive|活着,!age|年龄,*die|死,*metabolize|代谢] <br/> * │ │ │ ├ AnimalHuman|动物 [!sex|性别,*AlterLocation|变空间位置,*StateMental|精神状态] <br/> * │ │ │ │<br/> * 等等 <br> * * @author <a href="mailto:iamxiatian@gmail.com">夏天</a> * @organization 中国人民大学信息资源管理学院 知识工程实验室 * @deprecated */ public class SememeDictTraverseEvent implements TraverseEvent<String>{ /** 义原存放的列表, 按照顺序设置ID,存放到线性表中 */ private List<Sememe> sememeList = null; public SememeDictTraverseEvent(){ this.sememeList = new ArrayList<Sememe>(); } /** * 获取加载后的义原信息,按照下标顺序存放,树的层次关系通过数组下标表示 * @return */ public Sememe[] getSememes(){ return sememeList.toArray(new Sememe[sememeList.size()]); } private void processXML(Document document, Element root, int parentId, String fullParentId){ int position = 1; for(int i=0; i<sememeList.size(); i++){ Sememe sememe = sememeList.get(i); if(sememe.getParentId()==parentId && sememe.getId()!=parentId){ Element sememeNode = document.createElement("sememe"); String fullId = fullParentId + "-" + (position++); sememeNode.setAttribute("id", fullId); sememeNode.setAttribute("cn", sememe.getCnWord()); sememeNode.setAttribute("en", sememe.getEnWord()); if(sememe.getDefine()!=null && !sememe.getDefine().equals("")){ sememeNode.setAttribute("define", sememe.getDefine()); } root.appendChild(sememeNode); processXML(document, root, sememe.getId(), fullId); } } } /** * 保存到XML文件中, 新版本的xsimilarity采用xml格式存储义原,其格式为 * <sememes> * <sememe cn="事件" en="event" id="1"/> * <sememe cn="静态" en="static" id="1-1"/> * ... * </sememes> * @param xmlFile * @throws Exception */ public void saveToXML(String xmlFile) throws Exception{ DocumentBuilderFactory factory=DocumentBuilderFactory.newInstance(); DocumentBuilder builder=factory.newDocumentBuilder(); Document document=builder.newDocument(); Element root=document.createElement("sememes"); document.appendChild(root); int position = 1; for(Sememe sememe:sememeList){ if(sememe.getId()!=sememe.getParentId()){ continue; } Element sememeNode = document.createElement("sememe"); String fullId = Integer.toString(position++); sememeNode.setAttribute("id", fullId); sememeNode.setAttribute("cn", sememe.getCnWord()); sememeNode.setAttribute("en", sememe.getEnWord()); if(sememe.getDefine()!=null && !sememe.getDefine().equals("")){ sememeNode.setAttribute("define", sememe.getDefine()); } root.appendChild(sememeNode); processXML(document, root, sememe.getId(), fullId); } TransformerFactory tf=TransformerFactory.newInstance(); Transformer transformer=tf.newTransformer(); DOMSource source=new DOMSource(document); transformer.setOutputProperty(OutputKeys.ENCODING,"utf8"); transformer.setOutputProperty(OutputKeys.INDENT,"yes"); PrintWriter pw=new PrintWriter(new FileOutputStream(xmlFile)); StreamResult result=new StreamResult(pw); transformer.transform(source,result); } /** * 解析当前义原信息文本行<br/> * 判断读入的一行文本是义元树中的第几层,读入的格式形如:<br> * - entity|实体 <br> * ├ thing|万物 [#time|时间,#space|空间] <br> * │ ├ physical|物质 [!appearance|外观] <br> * │ │ ├ animate|生物 [*alive|活着,!age|年龄,*die|死,*metabolize|代谢] <br> * * @param item * @return 如果是义原,则info[0]返回层次深度(info[0]>=0); info[1]返回具体的义元内容起始位置;否则info[0]返回-1 */ private int[] parseSememeLine(String item) { int[] info = new int[2]; info[0] = -1; int prefixLen = 0; // 前缀的数目,包括空格和"-,│,├"等符号,其中空格和"-"符号算一个长度,其他算2个 for (int i = 0; i < item.length(); i++) { char ch = item.charAt(i); if ((ch == ' ') || (ch == '-')) { prefixLen++; } else if ((ch == '├') || (ch == '│') || (ch == '└')) { prefixLen += 2; } else { // 遇到非前缀字符,求解,根据前缀深度,如果为2,返回0,即第一级,否则,每增加3,深度加1 if (prefixLen >= 2) { info[0] = (prefixLen - 2) / 3; info[1] = i; } break; } } return info; } /** * 根据字符串判断义元的类型 * * @param item * @return */ private int parseSememeType(String item) { String myItem = item.toLowerCase().trim(); if (myItem.indexOf("event|") == 0) return SememeType.Event; else if (myItem.indexOf("entity|") == 0) return SememeType.Entity; else if (myItem.indexOf("attribute|") == 0) return SememeType.Attribute; else if (myItem.indexOf("quantity|") == 0) return SememeType.Quantity; else if (myItem.indexOf("avalue|") == 0) return SememeType.AValue; else if (myItem.indexOf("qvalue|") == 0) return SememeType.QValue; else if (myItem.indexOf("secondary feature") == 0) return SememeType.SecondaryFeature; else if (myItem.indexOf("syntax") == 0) return SememeType.Syntax; else if (myItem.indexOf("eventrole and features") == 0) return SememeType.EventRoleAndFeature; else return 0; } /** * 实现TraverseEvent<String>的实际访问接口, 返回值没有使用 * @see ke.commons.util.TraverseEvent */ public boolean visit(String line) { //判断是否为注释行 if(line.trim().equals("")||line.trim().charAt(0)=='#') return true; //当前义原在整个义原列表中的位置 int position = sememeList.size(); //解析当前义原信息文本行, info[0]表示当前义原的层次, info[1]表示当前义原的实际信息在文本行中的开始位置 int[] info = parseSememeLine(line); int curDepth = info[0]; //如果深度<0,继续 if(info[0]<0) return false; //取出真正的义原字符串 String sememeString = line.substring(info[1]); //深度为0,表示为根节点 if(info[0]==0){ Sememe sememe = new Sememe(position, position, 0, sememeString); int sememeType = parseSememeType(sememeString); sememe.setType(sememeType); sememeList.add(sememe); }else{ Sememe parentSememe = sememeList.get(position-1); //最近一个深度比当前深度大1的义原即为该义原的父节点 while((parentSememe.getDepth()-curDepth)!=-1){ parentSememe = sememeList.get(parentSememe.getParentId()); } Sememe sememe = new Sememe(position, parentSememe.getId(), curDepth, sememeString); sememe.setType(parentSememe.getType()); sememeList.add(sememe); } return true; } }