package org.docx4j.model.fields;
import org.docx4j.XmlUtils;
import org.docx4j.jaxb.Context;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.parts.JaxbXmlPart;
import org.docx4j.wml.ContentAccessor;
import org.docx4j.wml.FldChar;
import org.docx4j.wml.P;
import org.docx4j.wml.ProofErr;
import org.docx4j.wml.R;
import org.docx4j.wml.STFldCharType;
import org.docx4j.wml.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;
import javax.xml.namespace.QName;
import javax.xml.transform.Source;
import javax.xml.transform.Templates;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.stream.StreamSource;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
/**
* This class puts fields into a "canonical" representation
* (see FieldRef for description).
*
* It does this in 2 steps:
* - step 1: use XSLT to convert simple fields into complex ones
* - step 2: put all the instructions into a single run
*
* Currently the canonicalisation is done at the paragraph level,
* so it is not suitable for fields (such as TOC) which extend across paragraphs.
* TOC will need to be regenerated (using Word) if touched by canonicalisation.
*
* @author jharrop
*
*/
public class FieldsPreprocessor {
// Class-wide SLF4J logger.
private static Logger log = LoggerFactory.getLogger(FieldsPreprocessor.class);
// Qualified name of w:instrText in the WordprocessingML main namespace.
// Not referenced by any active code in this class.
private final static QName _RInstrText_QNAME = new QName("http://schemas.openxmlformats.org/wordprocessingml/2006/main",
"instrText");
// Qualified name of w:hyperlink in the WordprocessingML main namespace.
// Only mentioned in the commented-out hyperlink handling in handleContent.
private final static QName _PHyperlink_QNAME = new QName("http://schemas.openxmlformats.org/wordprocessingml/2006/main",
"hyperlink");
/**
 * Compiled form of FieldsSimpleToComplex.xslt, used by
 * {@link #complexifyFields(JaxbXmlPart)}.
 *
 * Stays null if the stylesheet could not be loaded or compiled when this
 * class was initialised; the failure is logged rather than thrown, so the
 * rest of the class (eg canonicalise) remains usable.
 */
static Templates xslt;
static {
    final String resource = "org/docx4j/model/fields/FieldsSimpleToComplex.xslt";
    try {
        Source xsltSource = new StreamSource(
                org.docx4j.utils.ResourceUtils.getResource(resource));
        xslt = XmlUtils.getTransformerTemplate(xsltSource);
    } catch (IOException e) {
        // Deliberately best-effort: report through the logger (not stderr)
        // and leave xslt null.
        log.error("Couldn't load " + resource, e);
    } catch (TransformerConfigurationException e) {
        log.error("Couldn't compile " + resource, e);
    }
}
/**
 * Creates a preprocessor which records the top level fields it
 * encounters in the supplied list.
 *
 * @param fieldRefs list to which a FieldRef is added for each top level field
 */
private FieldsPreprocessor(final List<FieldRef> fieldRefs) {
    this.fieldRefs = fieldRefs;
}
/**
 * Convert any w:fldSimple in this part to complex field.
 *
 * The part's JAXB content is marshalled to a DOM document, transformed
 * with the FieldsSimpleToComplex stylesheet, and the unmarshalled result
 * is set back on the part.
 *
 * @param part the part whose content is to be transformed
 * @throws Docx4JException if the stylesheet is not available (it failed to
 *         load when this class was initialised), or if the transformation
 *         fails
 */
public static void complexifyFields(JaxbXmlPart part) throws Docx4JException {
    if (xslt == null) {
        // The static initialiser only logs a load/compile failure, so check
        // here and fail with a message which points at the real cause.
        throw new Docx4JException("Problems transforming fields",
                new IllegalStateException(
                        "FieldsSimpleToComplex.xslt could not be loaded when FieldsPreprocessor was initialised"));
    }
    org.w3c.dom.Document doc = XmlUtils.marshaltoW3CDomDocument(
            part.getJaxbElement());
    JAXBContext jc = Context.jc;
    try {
        // Use constructor which takes Unmarshaller, rather than JAXBContext,
        // so we can set JaxbValidationEventHandler
        Unmarshaller u = jc.createUnmarshaller();
        u.setEventHandler(new org.docx4j.jaxb.JaxbValidationEventHandler());
        javax.xml.bind.util.JAXBResult result = new javax.xml.bind.util.JAXBResult(u);
        org.docx4j.XmlUtils.transform(doc, xslt, null, result);
        part.setJaxbElement(result);
    } catch (Exception e) {
        throw new Docx4JException("Problems transforming fields", e);
    }
}
/**
 * Rewrite the field(s) found in the given paragraph into a predictable
 * layout, adding one FieldRef to the supplied list for every top level
 * field encountered.
 *
 * WARNING: do not use this where a field which starts in this P
 * continues into a following P.
 *
 * @param p the paragraph to canonicalise
 * @param fieldRefs receives a FieldRef for each top level field
 * @return the canonicalised paragraph (a newly created P which shares
 *         the pPr of the input)
 */
public static P canonicalise(P p, List<FieldRef> fieldRefs) {
    /*
     * The output looks something like:
     *
        <w:p>
            <w:r>
                <w:fldChar w:fldCharType="begin"/>
                <w:instrText xml:space="preserve"> DATE </w:instrText>
                <w:fldChar w:fldCharType="separate"/>
            </w:r>
            <w:r>
                <w:t>4/12/2011</w:t>
            </w:r>
            <w:r>
                <w:fldChar w:fldCharType="end"/>
            </w:r>
        </w:p>
     *
     * What sits between begin and separate may be more involved than
     * shown here, and can include nested fields.
     */
    return new FieldsPreprocessor(fieldRefs).canonicaliseInstance(p);
}
/**
 * Builds the canonical replacement for the given paragraph.
 *
 * Working state (the open-field stack and the run under construction)
 * is reset first; the content of p is then walked by handleContent,
 * which attaches the canonicalised runs to a new P carrying the same pPr.
 *
 * @param p the paragraph to process
 * @return the newly built paragraph
 */
private P canonicaliseInstance(P p) {
    // fresh working state for this paragraph
    stack = new LinkedList<FieldRef>();
    newR = Context.getWmlObjectFactory().createR();

    P canonicalP = Context.getWmlObjectFactory().createP();
    canonicalP.setPPr(p.getPPr());

    if (log.isDebugEnabled()) {
        log.debug(XmlUtils.marshaltoString(p));
    }

    handleContent(p.getContent(), canonicalP);
    return canonicalP;
}
/**
* A list of FieldRef objects representing outermost fields
* only.
*/
private List<FieldRef> fieldRefs;
// Fields which are currently open, innermost first: handleRun pushes a
// FieldRef when it sees a "begin" fldChar and pops it at the "end".
// Re-created by canonicaliseInstance.
private LinkedList<FieldRef> stack;
// The innermost open field (top of the stack); null when no field is open.
private FieldRef currentField=null;
// The run currently being assembled. It persists across handleRun
// invocations, and is replaced with a fresh R once it has been attached.
private R newR;
/**
 * Walks the given content list, canonicalising runs and carrying other
 * content across to the attachment point.
 *
 * Covers the case where the run(s) containing the field sit directly in
 * a P, or inside a P.Hyperlink (eg a PAGEREF in a table of contents).
 *
 * @param objects the content to process
 * @param attachmentPoint where the resulting content is attached
 */
private void handleContent(List<Object> objects, ContentAccessor attachmentPoint) {
    for (Object child : objects) {

        // A hyperlink can occur in a field result, and might itself contain
        // a nested field. Since the field processing here is currently for
        // MERGEFIELD and DOCPROPERTY fields, a hyperlink gets no special
        // treatment: it is retained as-is, like any other non-run content.

        if (child instanceof R) {
            handleRun((R) child, attachmentPoint);
            continue;
        }

        if (child instanceof ProofErr) {
            // Dropped. If eg a grammarStart is dropped here while its
            // matching grammarEnd is retained elsewhere, that should be
            // harmless: a stray spellStart doesn't bother Word 2010, so
            // the others are assumed to be ok as well.
            continue;
        }

        // Anything else is not something we're interested in: keep it
        log.debug("Retaining" + XmlUtils.unwrap(child).getClass().getName());
        attachmentPoint.getContent().add(child);

        // whatever comes next starts in a new run
        newR = Context.getWmlObjectFactory().createR();
    }
}
/**
 * @return true if exactly one field is open, ie the current field
 *         is not nested inside another
 */
private boolean fieldIsTopLevel() {
    final int openFields = stack.size();
    return openFields == 1;
}
/**
 * Whether the current field sits in the result part of its parent
 * field, ie the parent has already seen its "separate" fldChar.
 *
 * The stack is only inspected, never modified.
 *
 * @return true if there is a parent field and it has seen its separate;
 *         false if there is no parent field
 */
private boolean inParentResult() {
    // push() inserts at the head of the list, so index 0 is the current
    // field and index 1 (if present) is its parent.
    if (stack.size() < 2) {
        // No parent
        return false;
    }
    return stack.get(1).haveSeenSeparate();
}
/**
 * Decides whether a field's existing result is to be kept.
 *
 * A locked field always keeps its result. An unlocked field keeps it
 * unless it is a MERGEFIELD or a DOCPROPERTY field; a field whose name
 * is not known (null) keeps its result as well.
 *
 * @param fieldRef the field in question
 * @return true if the result is to be preserved
 */
private boolean preserveResult(FieldRef fieldRef) {
    if (fieldRef.isLock()) {
        return true;
    }
    final String name = fieldRef.getFldName();
    if (name == null) {
        return true;
    }
    final boolean replaceable = "MERGEFIELD".equals(name) || "DOCPROPERTY".equals(name);
    return !replaceable;
}
/**
 * Applies preserveResult to the parent of the current field.
 *
 * Expects at least two fields on the stack (the current field and its
 * parent); the stack is left as it was found.
 *
 * @return true if the parent field's result is to be preserved
 */
private boolean preserveParentResult() {
    // lift the current field off, to get at its parent
    final FieldRef current = stack.pop();
    final FieldRef parent = stack.pop();

    final boolean keep = preserveResult(parent);

    // put both back, parent first
    stack.push(parent);
    stack.push(current);

    return keep;
}
/**
 * Processes the content of one existing run, moving it into the
 * canonical run structure and tracking field state as begin / separate /
 * end fldChars are encountered.
 *
 * For each item in the run:
 * - a "begin" creates a FieldRef and pushes it on the stack (top level
 *   fields are also added to fieldRefs);
 * - a "separate" marks the current field, and for a top level field
 *   sets up its results slot;
 * - an "end" closes the current field (adding a missing "separate" for
 *   a top level field) and pops it off the stack;
 * - content outside any field is kept, each item in its own run;
 * - content before "separate" is recorded as field instructions;
 * - content after "separate" is kept only if preserveResult says so.
 *
 * @param existingRun the run from the source paragraph
 * @param newAttachPoint where newly built runs are attached
 */
private void handleRun(R existingRun, ContentAccessor newAttachPoint) {
    // note that the newR object persists between invocations of this method,
    // so you have to be careful to actually add it to the docx
    // before re-creating it
    if (log.isDebugEnabled()) {
        log.debug("\nInput run: \n " + XmlUtils.marshaltoString(existingRun, true, true));
    }
    for (Object o2 : existingRun.getContent()) {
        // done on each iteration, since newR may have been replaced by the previous one
        newR.setRPr(existingRun.getRPr());
        if (isCharType(o2, STFldCharType.BEGIN)) {
            log.debug("\n\n begin.. ");
            // Setup a FieldRef object
            currentField = new FieldRef((FldChar) XmlUtils.unwrap(o2));
            currentField.setParent(newAttachPoint);
            currentField.setBeginRun(newR); // may as well do this
            stack.push(currentField);
            if (inParentResult()) {
                if (preserveParentResult()) {
                    newR.getContent().add(o2);
                } else {
                    log.debug(".. but in result, so don't add to run");
                }
            } else {
                if (fieldIsTopLevel()) {
                    newR = Context.getWmlObjectFactory().createR();
                    newR.getContent().add(o2);
                    currentField.setBeginRun(newR); // IMPORTANT, so we can delete it when we perform mail merge
                    fieldRefs.add(currentField);
                } else {
                    newR.getContent().add(o2);
                    // NOTE(review): currentField was pushed just above, so
                    // stack.peek() is currentField itself and the field is
                    // added to its own instruction list. Possibly the parent
                    // (the next entry on the stack) was intended - confirm
                    // against how FieldRef consumers read getInstructions().
                    stack.peek().getInstructions().add(currentField);
                }
            }
        } else if (isCharType(o2, STFldCharType.SEPARATE)) {
            currentField.setSeenSeparate(true);
            if (inParentResult()) {
                if (preserveParentResult()) {
                    newR.getContent().add(o2);
                } else {
                    log.debug(".. but in result, so don't add to run");
                }
            } else {
                newR.getContent().add(o2);
                attachRunIfAbsent(newAttachPoint, newR);
                if (fieldIsTopLevel()) {
                    // Top level field separator
                    // Create result slot
                    newR = Context.getWmlObjectFactory().createR();
                    currentField.setResultsSlot(newR);
                }
            }
        } else if (isCharType(o2, STFldCharType.END)) {
            log.debug("\n\n .. end ");
            if (inParentResult()) {
                if (preserveParentResult()) {
                    newR.getContent().add(o2);
                    // Constant-first comparison: the field name can be null
                    // (preserveResult allows for that), and must not cause an NPE here.
                    if ("FORMTEXT".equals(currentField.getFldName())) {
                        /*
                         * Workaround for a bug in Word 2010.
                         *
                         * If you have multiple FORMTEXT in a single run,
                         * for example:
                         *
                         *  <w:fldChar w:fldCharType="begin">
                                <w:ffData>
                                    <w:name w:val="Text12"/>
                                    <w:enabled/>
                                    <w:calcOnExit w:val="false"/>
                                    <w:textInput/>
                                </w:ffData>
                            </w:fldChar>
                            <w:instrText xml:space="preserve"> FORMTEXT </w:instrText>
                            <w:fldChar w:fldCharType="separate"/>
                            <w:t> </w:t>
                            <w:fldChar w:fldCharType="end"/>
                            <w:fldChar w:fldCharType="begin">
                                <w:ffData>
                                    <w:name w:val="Text12"/>
                                    <w:enabled/>
                                    <w:calcOnExit w:val="false"/>
                                    <w:textInput/>
                                </w:ffData>
                            </w:fldChar>
                            <w:instrText xml:space="preserve"> FORMTEXT </w:instrText>
                            <w:fldChar w:fldCharType="separate"/>
                            <w:t> </w:t>
                            <w:fldChar w:fldCharType="end"/>
                         *
                         * Word 2010 does not display all the w:t elements (ie spaces appear to
                         * be missing).
                         *
                         * Adding w:t/@xml:space="preserve" doesn't help.
                         *
                         * So the workaround here is to start a new run after each END tag.
                         */
                        attachRunIfAbsent(newAttachPoint, newR);
                        newR = Context.getWmlObjectFactory().createR();
                    }
                } else {
                    log.debug(".. but in result, so don't add to run");
                }
            } else { // still in END processing
                if (fieldIsTopLevel()) {
                    if (!currentField.haveSeenSeparate()) {
                        // Word 2010 can produce a docx where:
                        //   <w:r>
                        //     <w:fldChar w:fldCharType="separate"/>
                        //   </w:r>
                        // is missing (valid per spec).
                        // For top level fields only, we add this
                        log.debug(".. ADDING SEP .. ");
                        FldChar fldChar = Context.getWmlObjectFactory().createFldChar();
                        fldChar.setFldCharType(STFldCharType.SEPARATE);
                        newR.getContent().add(fldChar);
                        attachRunIfAbsent(newAttachPoint, newR);
                    }
                    // set up results slot - only for top-level fields
                    newR = currentField.getResultsSlot(); // MERGEFORMAT processing below may have set this already
                    if (newR == null) {
                        newR = Context.getWmlObjectFactory().createR();
                        currentField.setResultsSlot(newR);
                    }
                    // test, since this is also done immediately before each loop ends
                    attachRunIfAbsent(newAttachPoint, newR);
                    // create a run specifically for end char
                    newR = Context.getWmlObjectFactory().createR();
                    newAttachPoint.getContent().add(newR);
                    newR.getContent().add(o2);
                    currentField.setEndRun(newR);
                    // for whatever follows the field
                    newR = Context.getWmlObjectFactory().createR();
                } else {
                    newR.getContent().add(o2);
                }
            }
            // this field is finished; its parent (if any) becomes current again
            stack.pop();
            currentField = stack.peek();
        } else if (currentField == null) {
            // run content before or after the field
            // - preserve this content
            newR.getContent().add(o2);
            attachRunIfAbsent(newAttachPoint, newR);
            newR = Context.getWmlObjectFactory().createR();
        } else if (!currentField.haveSeenSeparate()) {
            // Handles problems with empty w:instrText elements within complex field "begin" section
            Object o = XmlUtils.unwrap(o2);
            if (o instanceof Text && ((Text) o).getValue().trim().isEmpty()) {
                log.debug("Empty w:instrText found. Ignore it!");
                continue;
            }
            currentField.getInstructions().add(o2);
            if (inParentResult()) {
                if (preserveParentResult()) {
                    newR.getContent().add(o2);
                } else {
                    log.debug(".. but in result, so don't add to run");
                }
            } else {
                newR.getContent().add(o2);
            }
        } else if (preserveResult(currentField)) {
            // ie locked, or not MERGEFIELD, or DOCPROPERTY
            log.debug("preserveResult-> adding");
            newR.getContent().add(o2);
            if (currentField.getResultsSlot() == null) {
                currentField.setResultsSlot(newR); // no harm in doing this - same as in SEPARATE processing?
            } else if (currentField.getResultsSlot() != newR) {
                log.warn("Multiple runs in results slot?");
            }
        } else {
            // result content .. can ignore unless it has \* MERGEFORMAT
            // if \* MERGEFORMAT, attach the rPr of first run in the result
            if (o2 instanceof R
                    && currentField.isMergeFormat()
                    && currentField.getResultsSlot() == null) {
                R resultR = Context.getWmlObjectFactory().createR();
                currentField.setResultsSlot(resultR);
                resultR.setRPr(((R) o2).getRPr()); // could be null, but that's ok
                log.debug("MERGEFORMAT Set rPr");
            }
            // TODO: a TOC field usually has a PAGEREF wrapped in a hyperlink in its
            // result part. We should either keep the entire result, or empty it.
            // only do this if the field has no nested field; we need a way to look ahead
            // to see whether a nested field is coming up)
            // we only want a single run between SEPARATOR and END,
            // and we added that in the SEPARATE stuff above
            if (log.isDebugEnabled()) {
                log.debug("IGNORING " + XmlUtils.marshaltoString(o2, true, true));
            }
        }
        // (Setting xml:space="preserve" on w:t here was tried; it doesn't solve
        // the problem of Word failing to display some spaces.)
        if (newR.getContent().size() > 0 && !newAttachPoint.getContent().contains(newR)) {
            newAttachPoint.getContent().add(newR);
        }
    } // end for (Object o2 : existingRun.getContent() )
}

/**
 * Adds the run to the attachment point's content unless it is already
 * there, logging the attached run at debug level.
 *
 * @param attachPoint the content accessor to attach to
 * @param run the run to attach
 */
private static void attachRunIfAbsent(ContentAccessor attachPoint, R run) {
    if (!attachPoint.getContent().contains(run)) {
        attachPoint.getContent().add(run);
        if (log.isDebugEnabled()) {
            log.debug("-- attaching -->" + XmlUtils.marshaltoString(run, true, true));
        }
    }
}
// Qualified name of w:t in the WordprocessingML main namespace.
// Only referenced from commented-out code in handleRun.
private final static QName _RT_QNAME = new QName("http://schemas.openxmlformats.org/wordprocessingml/2006/main", "t");
// public static boolean containsCharType(Object o, STFldCharType charType) {
//
// if (o instanceof R) {
// for (Object o2 : ((R)o).getContent() ) {
//
// if (isCharType(o2, charType)) {
// return true;
// }
// }
// }
// return false;
// }
/**
 * Tests whether the given object is (or wraps) a FldChar of the
 * specified type.
 *
 * @param o2 the object to test; unwrapped with XmlUtils.unwrap first
 * @param charType the field char type to look for
 * @return true if o2 is a FldChar whose fldCharType equals charType;
 *         false otherwise, including where the FldChar has no
 *         fldCharType set
 */
public static boolean isCharType(Object o2, STFldCharType charType) {
    Object unwrapped = XmlUtils.unwrap(o2);
    if (!(unwrapped instanceof FldChar)) {
        return false;
    }
    // May be null if the attribute was never set; treat that as "no match"
    // rather than failing with a NullPointerException.
    STFldCharType actualType = ((FldChar) unwrapped).getFldCharType();
    if (actualType != null && actualType.equals(charType)) {
        return true;
    }
    log.debug(String.valueOf(actualType));
    return false;
}
}