package ruc.irm.similarity.word.hownet2.concept;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import ruc.irm.similarity.util.TraverseEvent;
import ruc.irm.similarity.word.hownet2.concept.Concept;
/**
 * Traversal callback that loads concept entries into an in-memory concept table.
 * The dictionary is expected in the format exported by HowNet, one entry per line:<br/>
 * 阿斗 N human|人,ProperName|专,past|昔<br/>
 * 阿爸 N human|人,family|家,male|男<br/>
 * i.e. {@code <concept> <space or tab> <part of speech> <space or tab> <definition>}.
 * <br/>
 * Parsed concepts are collected in a list and handed out as a sorted array instead
 * of being stored in a Map; the original author notes this reduces memory usage.
 *
 * @author <a href="mailto:iamxiatian@gmail.com">夏天</a>
 * @organization 中国人民大学信息资源管理学院 知识工程实验室
 */
public class ConceptDictTraverseEvent implements TraverseEvent<String> {

    /** Output path used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_XML_PATH = "/home/xiatian/Desktop/concept.xml";

    /** Concepts collected so far, in the order their lines were visited. */
    private final List<Concept> conceptList = new ArrayList<Concept>();

    public ConceptDictTraverseEvent() {
    }

    /**
     * Returns all concepts collected so far as a newly allocated array, sorted with
     * {@link Arrays#sort(Object[])} (i.e. by the natural ordering of {@code Concept}).
     *
     * @return a sorted snapshot of the collected concepts; never {@code null}
     */
    public Concept[] getConcepts() {
        Concept[] concepts = conceptList.toArray(new Concept[conceptList.size()]);
        Arrays.sort(concepts);
        return concepts;
    }

    /**
     * Parses one line of the concept dictionary and, if it holds an entry, appends
     * the resulting concept to the internal list.
     * <p>
     * Lines starting with {@code //} are comments. Comment lines, blank lines and
     * {@code null} are skipped without adding anything.
     *
     * @param line a raw dictionary line
     * @return always {@code true}
     */
    public boolean visit(String line) {
        String[] fields = parseLine(line);
        if (fields != null) {
            conceptList.add(new Concept(fields[0], fields[1], fields[2]));
        }
        return true;
    }

    /**
     * Converts the bundled {@code concept.dat} dictionary (looked up on the classpath
     * inside this class's package directory) into an XML file. The document has a
     * {@code concepts} root element with one {@code c} child per entry.
     *
     * @param xmlFile destination file; it is created or overwritten
     * @throws java.io.FileNotFoundException if {@code concept.dat} is not on the classpath
     * @throws Exception on any I/O, parser-configuration or transformer failure
     */
    public void saveToXML(File xmlFile) throws Exception {
        String conceptFile = getClass().getPackage().getName().replaceAll("\\.", "/") + "/concept.dat";
        InputStream input = this.getClass().getClassLoader().getResourceAsStream(conceptFile);
        if (input == null) {
            // getResourceAsStream signals "not found" with null; fail with a clear message
            // instead of a NullPointerException deep inside InputStreamReader.
            throw new java.io.FileNotFoundException(
                    "Concept dictionary not found on classpath: " + conceptFile);
        }

        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        Document document = builder.newDocument();
        Element root = document.createElement("concepts");
        document.appendChild(root);

        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(input, "UTF-8"));
            String line;
            while ((line = in.readLine()) != null) {
                saveLineToXML(document, root, line);
            }
        } finally {
            // Release the resource stream even when reading or parsing fails.
            input.close();
        }

        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer transformer = tf.newTransformer();
        transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");

        // Give the transformer the raw byte stream so it encodes the output itself,
        // keeping the bytes consistent with the encoding named in the XML declaration
        // (a Writer built on the platform default charset could disagree with it).
        FileOutputStream out = new FileOutputStream(xmlFile);
        try {
            transformer.transform(new DOMSource(document), new StreamResult(out));
        } finally {
            out.close();
        }
    }

    /**
     * Parses one dictionary line and, if it holds an entry, appends a {@code c}
     * element to {@code root} with attributes {@code w} (word), {@code p} (part of
     * speech) and {@code d} (definition). Comment and blank lines add nothing.
     *
     * @param document owner document used to create the element
     * @param root     parent element that receives the new child
     * @param line     a raw dictionary line
     * @return always {@code true}
     */
    private boolean saveLineToXML(Document document, Element root, String line) {
        String[] fields = parseLine(line);
        if (fields == null) {
            return true;
        }
        Element e = document.createElement("c");
        e.setAttribute("w", fields[0]);
        // A line may lack a part of speech; write an empty attribute rather than null.
        e.setAttribute("p", (fields[1] == null) ? "" : fields[1]);
        e.setAttribute("d", fields[2]);
        root.appendChild(e);
        return true;
    }

    /**
     * Splits a dictionary line into its three fields. Fields are separated by runs
     * of spaces and/or tabs; the definition is everything after the part of speech,
     * trimmed, and may itself contain blanks.
     *
     * @param line a raw dictionary line, may be {@code null}
     * @return {@code {word, pos, define}}, where {@code pos} is {@code null} and
     *         {@code define} is {@code ""} when the line does not contain them; or
     *         {@code null} when the line is {@code null}, a {@code //} comment, or
     *         consists only of spaces and tabs
     */
    private static String[] parseLine(String line) {
        if (line == null || line.startsWith("//")) {
            return null;
        }
        final int length = line.length();

        int start = skipBlanks(line, 0);
        if (start == length) {
            return null; // nothing but blanks: no entry on this line
        }
        int end = skipToken(line, start);
        String word = line.substring(start, end);

        String pos = null;
        String define = "";
        start = skipBlanks(line, end);
        if (start < length) {
            end = skipToken(line, start);
            pos = line.substring(start, end);

            start = skipBlanks(line, end);
            if (start < length) {
                define = line.substring(start).trim();
            }
        }
        return new String[] {word, pos, define};
    }

    /** Returns the index of the first non-blank character at or after {@code from}, or the line length. */
    private static int skipBlanks(String line, int from) {
        int i = from;
        while (i < line.length() && isBlank(line.charAt(i))) {
            i++;
        }
        return i;
    }

    /** Returns the index of the first blank character at or after {@code from}, or the line length. */
    private static int skipToken(String line, int from) {
        int i = from;
        while (i < line.length() && !isBlank(line.charAt(i))) {
            i++;
        }
        return i;
    }

    /** Field separators of the dictionary format: space and tab only. */
    private static boolean isBlank(char ch) {
        return ch == ' ' || ch == '\t';
    }

    /**
     * Command-line entry point: exports the bundled dictionary to XML.
     *
     * @param args optional; {@code args[0]} is the output file path. When omitted,
     *             the historical default path is used.
     * @throws Exception if the export fails
     */
    public static void main(String[] args) throws Exception {
        String target = (args != null && args.length > 0) ? args[0] : DEFAULT_XML_PATH;
        new ConceptDictTraverseEvent().saveToXML(new File(target));
    }
}