package org.docx4j.utils.sax; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.net.URLEncoder; import java.util.HashMap; import java.util.HashSet; import java.util.Set; import javax.xml.bind.Marshaller; import org.apache.log4j.Logger; import org.docx4j.jaxb.Context; import org.docx4j.jaxb.NamespacePrefixMapperUtils; import org.docx4j.openpackaging.packages.WordprocessingMLPackage; import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart; import org.jvnet.jaxb2_commons.xpath_tracker.Histgram; import org.xml.sax.Attributes; import org.xml.sax.SAXException; /** * Convert OpenXML to hyperlinked HTML * * The hyperlinks are XPaths to each element * * Could have used XSLT to produce these XPaths. * See for example http://stackoverflow.com/questions/953197/how-do-you-output-the-current-element-path-in-xslt * * But that would've been inefficient. * * I could have written the XPath as an extra attribute, * then used XSLT to transform the XML to HTML, but it * seems simpler to create the HTML here as well. * */ public class SAXHandlerToCodeString extends SAXIdentityHandler { protected static Logger log = Logger.getLogger(SAXHandlerToCodeString.class); final static boolean IS_PRETTY_PRINTED = true; boolean firstElement=true; boolean initialised=false; Set<String> requiredNamespaceDecs = new HashSet<String>(); private void registerNamespaceDecs(String uri, String qName) { int index = qName.indexOf(":"); if (index>0) { String prefix = qName.substring(0, index); String dec = "xmlns:"+prefix+"=\""+ uri+ "\""; requiredNamespaceDecs.add(dec); } else if (uri.equals("http://schemas.openxmlformats.org/spreadsheetml/2006/main")) { String dec = "xmlns:s=\""+ uri+ "\""; } else { log.warn("TODO: handle default namespace " + uri); } } public String getNamespaceDecs() { StringBuilder sb = new StringBuilder(); for (String s : requiredNamespaceDecs) { sb.append( " " + s.replace("\"", "\\\"") ); } return sb.toString(); } private void init() { histgrams.clear(); histgrams.push(new Histgram()); initialised=true; } public SAXHandlerToCodeString(Writer out) { super(out); TAG_OPEN="<"; TAG_CLOSE=">"; QUOTE="\\\""; } // @Override public void startDocument() throws SAXException { init(); } // @Override public void endDocument() throws SAXException { write("\"", false); } // @Override public void startPrefixMapping(String prefix, String uri) throws SAXException { } // @Override public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { if (!initialised) init(); if (needToClose) { write(TAG_CLOSE+"\"\n", false); needToClose = false; } histgrams.peek().update(uri, localName, qName); histgrams.push(new Histgram()); registerNamespaceDecs(uri, qName); if (firstElement) { write("String openXML = \"" + TAG_OPEN + qName, false); firstElement = false; } else { write("+ \"" + TAG_OPEN + qName, true); } int length = atts.getLength(); for (int i = 0; i < length; i++) { if (atts.getLocalName(i).equals("rsidRDefault") || atts.getLocalName(i).equals("rsidP") || atts.getLocalName(i).equals("rsidR") || atts.getLocalName(i).equals("rsidRPr") || atts.getLocalName(i).equals("rsidSect") || atts.getLocalName(i).equals("rsidTr") || atts.getLocalName(i).equals("paraId") || atts.getLocalName(i).equals("textId") ) { // Ignore these } else { write(" " + atts.getQName(i)+"=" + QUOTE +atts.getValue(i)+ QUOTE, false); registerNamespaceDecs(atts.getURI(i), atts.getQName(i) ); } } needToClose = true; } // @Override public void endElement(String uri, String localName, String qName) throws SAXException { if (IS_PRETTY_PRINTED) { if (needToClose) { write("/"+TAG_CLOSE+"\"\n", false); needToClose = false; } else if (charactersIncludeNewline) { // Workaround for case where source is pretty printed // and so has whitespace write("+ \"" + TAG_OPEN+"/" + qName + TAG_CLOSE+"\"\n", true); charactersIncludeNewline= false; } else if (encounteredText) { write(TAG_OPEN+"/" + qName + TAG_CLOSE+"\"\n", false); } else { write("+ \"" + TAG_OPEN+"/" + qName + TAG_CLOSE+"\"\n", true); } } else { if (needToClose) { write("/"+TAG_CLOSE+"\"", false); needToClose = false; } else if (charactersIncludeNewline) { // Workaround for case where source is pretty printed // and so has whitespace write("+ \"" + TAG_OPEN+"/" + qName + TAG_CLOSE+"\"", true); charactersIncludeNewline= false; } else if (encounteredText) { write(TAG_OPEN+"/" + qName + TAG_CLOSE+"\"", false); } else { write("+ \"" + TAG_OPEN+"/" + qName + TAG_CLOSE+"\"", true); } } histgrams.pop(); encounteredText = false; } public void characters(char[] ch, int start, int length) throws SAXException { // Close opening tag? if (needToClose) { write(TAG_CLOSE, false); // System.out.println(String.valueOf(ch) + ": " + start + ", " + length); // Write a quote, if there is no text content if (String.valueOf(ch).trim().length()==0) { if (IS_PRETTY_PRINTED) { write("\"", false); } else { write("\"\n", false); } } needToClose = false; } write((new String(ch, start, length)).replace("\\", "\\\\"), false); encounteredText = true; } /** * @param args * @throws Exception */ public static void main(String[] args) throws Exception { String inputfilepath = System.getProperty("user.dir") + "/test.docx"; WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage .load(new java.io.File(inputfilepath)); MainDocumentPart documentPart = wordMLPackage.getMainDocumentPart(); Marshaller marshaller=Context.jc.createMarshaller(); NamespacePrefixMapperUtils.setProperty(marshaller, NamespacePrefixMapperUtils.getPrefixMapper()); Writer out = new OutputStreamWriter(System.out); marshaller.marshal(documentPart.getJaxbElement(), new SAXHandlerToCodeString(out)); out.close(); } }