/** * */ package org.docx4j.convert.in.word2003xml; import java.io.File; import java.io.IOException; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; import javax.xml.bind.util.JAXBResult; import javax.xml.transform.Source; import javax.xml.transform.Templates; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.stream.StreamSource; import org.apache.commons.io.FileUtils; import org.docx4j.XmlUtils; import org.docx4j.openpackaging.exceptions.Docx4JException; import org.docx4j.openpackaging.exceptions.InvalidFormatException; import org.docx4j.openpackaging.packages.WordprocessingMLPackage; import org.docx4j.openpackaging.parts.WordprocessingML.FontTablePart; import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart; import org.docx4j.openpackaging.parts.WordprocessingML.NumberingDefinitionsPart; import org.docx4j.wml.Numbering.AbstractNum; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * This is a simple proof of concept of * converting Word 2003 XML to ECMA 376 docx. * * @author jharrop * @since 3.0.0 */ public class Word2003XmlConverter { private static Logger log = LoggerFactory.getLogger(Word2003XmlConverter.class); static Templates xslt; private Transition03To06 transitionContainer; static { try { // XmlUtils.getTransformerFactory().setURIResolver(new OutHtmlURIResolver()); // TODO FIXME - not thread safe, which would be an issue Source xsltSource = new StreamSource(org.docx4j.utils.ResourceUtils.getResource( "org/docx4j/convert/in/word2003xml/2003-import.xslt")); xslt = XmlUtils.getTransformerTemplate(xsltSource); } catch (IOException e) { e.printStackTrace(); log.error("Couldn't setup 2003-import.xslt", e); } catch (TransformerConfigurationException e) { e.printStackTrace(); log.error("Couldn't setup 2003-import.xslt", e); } } public Word2003XmlConverter(Source source) throws JAXBException, Docx4JException { // Use 2003-import.xsl to convert to a Transition03To06 object java.lang.ClassLoader classLoader = Word2003XmlConverter.class.getClassLoader(); JAXBResult result = new JAXBResult( JAXBContext.newInstance("org.docx4j.convert.in.word2003xml", classLoader) ); XmlUtils.transform(source, xslt, null, result); // set the unmarshalled content tree transitionContainer = (Transition03To06)result.getResult(); } /** * Get the new docx. Will be made public if/when this code is mature enough. * @return */ private WordprocessingMLPackage getWordprocessingMLPackage() { return getWordprocessingMLPackage(false); } private WordprocessingMLPackage getWordprocessingMLPackage(boolean mainDocOnly) { WordprocessingMLPackage wordMLPackage=null; try { wordMLPackage = WordprocessingMLPackage.createPackage(); } catch (InvalidFormatException e) {} MainDocumentPart mdp = wordMLPackage.getMainDocumentPart(); // Main Document Part mdp.getJaxbElement().setBody(transitionContainer.getBody()); // DEBUGGING: if Word can't open the resulting docx, // a process for working out why is to // make sure it works with just the main document part, // then each of the following 3 parts, one by one. // What you need to do is to compare the XSLT output for the part // (XmlUtils.marshaltoString for the relevant part is usually // enough) to what ECMA 376 requires. if (!mainDocOnly) { // Styles mdp.getStyleDefinitionsPart(true).setJaxbElement(transitionContainer.getStyles()); // Numbering try { NumberingDefinitionsPart ndp = new NumberingDefinitionsPart(); ndp.setJaxbElement(transitionContainer.getNumbering()); mdp.addTargetPart(ndp); // fix attributes // <w:multiLevelType w:val="Multilevel"/> should start with lower case for (AbstractNum anum : ndp.getJaxbElement().getAbstractNum()) { if (anum.getMultiLevelType()==null) continue; String multiLevelType = anum.getMultiLevelType().getVal(); multiLevelType = multiLevelType.substring(0, 1).toLowerCase() + multiLevelType.substring(1); anum.getMultiLevelType().setVal(multiLevelType); } } catch (InvalidFormatException e) {} // Fonts try { FontTablePart fontsPart = new FontTablePart(); fontsPart.setJaxbElement(transitionContainer.getFonts()); mdp.addTargetPart(fontsPart); } catch (InvalidFormatException e) {} } return wordMLPackage; } /** * Example of usage * * @param args * @throws IOException * @throws Docx4JException * @throws JAXBException */ public static void main(String[] args) throws IOException, JAXBException, Docx4JException { boolean save = true; File file = new File(System.getProperty("user.dir") + "/sample-docs/word/2003/word2003xml.xml"); // It works for this document, but that's the only one tested so far. // This is currently just a proof of concept, but contributed improvements are welcome. Source source = new StreamSource(FileUtils.openInputStream(file)); Word2003XmlConverter conv = new Word2003XmlConverter(source); WordprocessingMLPackage wordMLPackage = conv.getWordprocessingMLPackage(); // Pretty print the main document part // System.out.println( // XmlUtils.marshaltoString(wordMLPackage.getMainDocumentPart().getJaxbElement(), true, true) ); // Optionally save it if (save) { String filename = System.getProperty("user.dir") + "/OUT_FromWord2003XML.docx"; wordMLPackage.save(new java.io.File(filename) ); System.out.println("Saved " + filename); } } }