package org.krakenapps.docxcod; import static org.krakenapps.docxcod.util.XMLDocHelper.evaluateXPath; import static org.krakenapps.docxcod.util.XMLDocHelper.newDocumentBuilder; import static org.krakenapps.docxcod.util.XMLDocHelper.newXPath; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.Map; import javax.xml.transform.TransformerFactoryConfigurationError; import javax.xml.xpath.XPath; import org.krakenapps.docxcod.util.XMLDocHelper.NodeListWrapper; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; public class DirectiveExtractor implements OOXMLProcessor { @Override public void process(OOXMLPackage pkg, Map<String, Object> rootMap) { extractField(pkg); } private List<Directive> directives = new ArrayList<Directive>(); public List<Directive> getDirectives() { return directives; } private void extractField(OOXMLPackage pkg) throws TransformerFactoryConfigurationError { InputStream f = null; try { f = new FileInputStream(new File(pkg.getDataDir(), "word/document.xml")); Document doc = newDocumentBuilder().parse(f); XPath xpath = newXPath(doc); NodeList nodeList = evaluateXPath(xpath, "//*[name()='w:fldChar' or name()='w:instrText' or name()='w:fldSimple']", doc); directives = parseNodeList(nodeList); } catch (Exception e) { e.printStackTrace(); } finally { safeClose(f); } } public static List<Directive> parseNodeList(NodeList nodeList) { /* example: <w:fldSimple w:instr="MERGEFIELD "@before-row#list .vars[\"disk-usage-summary\"] as u" \* MERGEFORMAT"> <w:r w:rsidR="00C47145"> <w:t>«@before-row#list .vars["disk-usage-summa»</w:t> </w:r> </w:fldSimple> <w:r> <w:fldChar w:fldCharType="begin" /> </w:r> <w:r> <w:instrText xml:space="preserve">MERGEFIELD "@before-row#list .vars[\"disk-usage-summary\"] as u" \* MERGEFORMAT</w:instrText> </w:r> <w:r> <w:fldChar w:fldCharType="separate" /> </w:r> ... w:r screen representation of directive <w:r> <w:rPr> <w:noProof /> </w:rPr> <w:fldChar w:fldCharType="end" /> </w:r> contents of 'w:instrText' can be splitted into many runs, so contents of the elements between w:fldCharType="begin" and w:fldCharType="end" should be merged */ ArrayList<Directive> directives = new ArrayList<Directive>(); String mergedDirective = null; Node directivePosition = null; for (Node n : new NodeListWrapper(nodeList)) { if (n.getNodeName().equals("w:fldChar")) { String fldCharType = n.getAttributes().getNamedItem("w:fldCharType").getNodeValue(); if (fldCharType.equals("begin")) { mergedDirective = ""; directivePosition = n; } else if (fldCharType.equals("end") || fldCharType.equals("separate")) { if (directivePosition != null) { directives.add(new Directive(directivePosition, Directive.extractDirective(mergedDirective))); } mergedDirective = null; directivePosition = null; } else { mergedDirective = null; directivePosition = null; } } else if (n.getNodeName().equals("w:instrText")) { if (directivePosition != null) mergedDirective += n.getTextContent(); } else if (n.getNodeName().equals("w:fldSimple")) { String nodeValue = n.getAttributes().getNamedItem("w:instr").getNodeValue(); directives.add(new Directive(n, Directive.extractDirective(nodeValue))); } else { } } return directives; } private void safeClose(InputStream f) { if (f == null) return; try { f.close(); } catch (Exception e) { // ignore } } }