/* $Id: Tree2XML.java 971 2011-09-13 18:32:55Z hong1.cui $ */
package fna.charactermarkup;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import org.jdom.Document;
import org.jdom.input.SAXBuilder;
/**
 * Converts a bracketed parse tree, e.g. {@code (NP (JJ subulate) (NNS enations))},
 * into a JDOM {@link Document}.
 *
 * <p>Every non-leaf bracket becomes an element named after its tag, and every leaf
 * {@code (TAG word)} becomes an empty element {@code <TAG id='n' text='word'/>},
 * where {@code n} counts the leaves from 0 in left-to-right order.
 *
 * @author hongcui
 */
public class Tree2XML {
    /** The bracketed parse with empty "()" groups removed; null if constructed with null. */
    private String test = null;

    /**
     * Shared adverb list. Nothing in this class reads or writes it; it stays
     * public and mutable so that existing external references keep compiling.
     */
    public static ArrayList<String> adverbs = new ArrayList<String>();

    /**
     * @param test the bracketed parse to convert; may be null, in which case
     *             {@link #xml()} returns null
     */
    public Tree2XML(String test) {
        // xml() treats a null parse as "nothing to convert", so the constructor
        // must not fail on null before xml() gets the chance to check it.
        this.test = (test == null) ? null : test.replaceAll("\\(\\s*\\)", "");
    }

    /**
     * Builds the XML document for the parse given to the constructor.
     *
     * @return the parsed document, or null when the parse is null or blank
     * @throws Exception if the parse is malformed or the generated XML cannot be
     *         parsed; the stack trace and the generated XML are printed before
     *         the exception is rethrown
     */
    public Document xml() throws Exception {
        if (test == null || test.trim().length() == 0) {
            return null;
        }
        String xml = "";
        try {
            xml = format(test);
            SAXBuilder builder = new SAXBuilder();
            // Parse from characters rather than from platform-encoded bytes, so that
            // non-ASCII text does not depend on the JVM's default charset.
            return builder.build(new StringReader(xml));
        } catch (Exception e) {
            e.printStackTrace();
            System.out.print("Problem parsing the xml: \n" + xml + "\n" + e.toString());
            throw e;
        }
    }

    /**
     * Rewrites a bracketed parse as an XML string.
     *
     * @param parsed e.g. {@code (NP (JJ subulate) (NNS enations))}
     * @return e.g. {@code <NP><JJ id='0' text='subulate'/><NNS id='1' text='enations'/></NP>}
     * @throws Exception ({@link IllegalArgumentException}) if the brackets are
     *         unbalanced or a leaf has no tag/text separator
     */
    private String format(String parsed) throws Exception {
        int count = 0; // running id assigned to leaf elements
        StringBuilder xml = new StringBuilder();

        // Put every ")" in a position where it ends a token:
        // (NP (JJ subulate) (NNS enations) )
        parsed = parsed.replaceAll("\\)", ") ").replaceAll("\\s+", " ").trim();
        // Map tags that are not legal element names onto JJ / PUNCT.
        parsed = parsed.replaceAll("``", "JJ").replaceAll("\\(-LRB-", "(PUNCT").replaceAll("\\([^A-Z/ ]+", "(PUNCT");
        // Glue a leaf's tag to its word: (NP (JJ_subulate) (NNS_enations) )
        parsed = parsed.replaceAll("(?<=\\([A-Z]{1,8}) (?!\\()", "_");

        // Acts as a stack: open tags stay in the list until their ")" arrives,
        // leaves are removed as soon as they are emitted.
        ArrayList<String> tokens = new ArrayList<String>(Arrays.asList(parsed.split("\\s+")));
        for (int i = 0; i < tokens.size(); i++) {
            String token = tokens.get(i);
            if (token.equals(")")) {
                // The token just before this ")" is the open tag it closes.
                if (i == 0) {
                    throw new IllegalArgumentException(
                            "error reading xml: ')' without a matching '(' in: " + parsed);
                }
                xml.append(tokens.get(i - 1).replaceFirst("\\(", "</")).append(">");
                tokens.remove(i - 1); // the open tag
                tokens.remove(i - 1); // the ")" itself, now shifted down by one
                i = i - 2;
            } else if (token.endsWith(")")) {
                // Leaf, e.g. (JJ_subulate)
                String leaf = token.replaceAll("[()]", "");
                int sep = leaf.indexOf('_');
                if (sep < 0) {
                    throw new IllegalArgumentException(
                            "error reading xml: leaf '" + token + "' has no tag/text separator in: " + parsed);
                }
                String tag = leaf.substring(0, sep);
                String text = leaf.substring(sep + 1);
                xml.append("<").append(tag)
                   .append(" id='").append(count)
                   .append("' text='").append(escapeAttribute(text)).append("'/>");
                count++;
                tokens.remove(i);
                i--;
            } else {
                // Open tag, e.g. (NP -> <NP>; the token is kept until its ")" is seen.
                xml.append(token.replaceFirst("\\(", "<").trim()).append(">");
            }
        }
        if (tokens.size() > 0) {
            // Open tags were never closed. Report this to the caller instead of
            // terminating the whole JVM from inside a library method.
            throw new IllegalArgumentException(
                    "error reading xml: unclosed bracket(s) " + tokens + " in: " + parsed);
        }
        return xml.toString();
    }

    /**
     * Escapes the characters that may not appear literally in a quoted XML
     * attribute value, so that words such as {@code 's} or {@code &} do not
     * break the generated document.
     */
    private static String escapeAttribute(String value) {
        StringBuilder out = new StringBuilder(value.length() + 8);
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            switch (c) {
                case '&':
                    out.append("&amp;");
                    break;
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case '\'':
                    out.append("&apos;");
                    break;
                case '"':
                    out.append("&quot;");
                    break;
                default:
                    out.append(c);
            }
        }
        return out.toString();
    }

    /**
     * Intentionally empty entry point.
     *
     * @param args unused
     */
    public static void main(String[] args) {
    }
}