// FieldsPreprocessor.java example
//
// Explorer
// docx4j-master
// - src
package org.docx4j.model.fields;

import org.docx4j.XmlUtils;
import org.docx4j.jaxb.Context;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.parts.JaxbXmlPart;
import org.docx4j.wml.ContentAccessor;
import org.docx4j.wml.FldChar;
import org.docx4j.wml.P;
import org.docx4j.wml.ProofErr;
import org.docx4j.wml.R;
import org.docx4j.wml.STFldCharType;
import org.docx4j.wml.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;
import javax.xml.namespace.QName;
import javax.xml.transform.Source;
import javax.xml.transform.Templates;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.stream.StreamSource;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;

/**
 * This class puts fields into a "canonical" representation
 * (see FieldRef for description).
 * 
 * It does this in 2 steps:
 * - step 1: use XSLT to convert simple fields into complex ones
 * - step 2: put all the instructions into a single run
 * 
 * Currently the canonicalisation is done at the paragraph level,
 * so it is not suitable for fields (such as TOC) which extend across paragraphs.
 * TOC will need to be regenerated (using Word) if touched by canonicalisation.
 * 
 * @author jharrop
 *
 */
public class FieldsPreprocessor {
	
	private static final Logger log = LoggerFactory.getLogger(FieldsPreprocessor.class);

	private final static QName _RInstrText_QNAME = new QName("http://schemas.openxmlformats.org/wordprocessingml/2006/main", 
			"instrText");
	private final static QName _PHyperlink_QNAME = new QName("http://schemas.openxmlformats.org/wordprocessingml/2006/main", 
			"hyperlink");

	/**
	 * Compiled form of the FieldsSimpleToComplex stylesheet, used by
	 * complexifyFields. It stays null if the stylesheet could not be
	 * loaded or compiled; in that case the failure is logged below.
	 */
	static Templates xslt;
	static {
		// 'log' is declared above this block, so it is already initialised here.
		try {
			Source xsltSource = new StreamSource(
						org.docx4j.utils.ResourceUtils.getResource(
								"org/docx4j/model/fields/FieldsSimpleToComplex.xslt"));
			xslt = XmlUtils.getTransformerTemplate(xsltSource);
		} catch (IOException e) {
			// Report through the logger rather than printStackTrace(), so the
			// failure ends up wherever the application sends its logs.
			log.error("Could not load org/docx4j/model/fields/FieldsSimpleToComplex.xslt", e);
		} catch (TransformerConfigurationException e) {
			log.error("Could not compile org/docx4j/model/fields/FieldsSimpleToComplex.xslt", e);
		}

	}
	
	/**
	 * Private: within this class, instances are only created by
	 * {@link #canonicalise(P, List)}, one per paragraph processed.
	 *
	 * @param topLevelFieldRefs the caller's list, to which a FieldRef is
	 *        appended for each top level field encountered
	 */
	private FieldsPreprocessor(List<FieldRef> topLevelFieldRefs) {
		this.fieldRefs = topLevelFieldRefs;
	}

	/**
	 * Convert any w:fldSimple in this part to complex field, by running the
	 * part's content through the FieldsSimpleToComplex stylesheet and
	 * replacing the part's content with the result.
	 *
	 * @param part the part to transform; its JAXB content is replaced
	 * @throws Docx4JException if anything goes wrong while transforming or
	 *         while storing the result in the part
	 */
	public static void complexifyFields(JaxbXmlPart part) throws Docx4JException {

		org.w3c.dom.Document sourceDom =
				XmlUtils.marshaltoW3CDomDocument(part.getJaxbElement());

		try {
			// Build the JAXBResult from an Unmarshaller, rather than from the
			// JAXBContext, so that JaxbValidationEventHandler can be installed.
			Unmarshaller unmarshaller = Context.jc.createUnmarshaller();
			unmarshaller.setEventHandler(new org.docx4j.jaxb.JaxbValidationEventHandler());
			javax.xml.bind.util.JAXBResult transformed =
					new javax.xml.bind.util.JAXBResult(unmarshaller);

			XmlUtils.transform(sourceDom, xslt, null, transformed);

			part.setJaxbElement(transformed);
		} catch (Exception e) {
			throw new Docx4JException("Problems transforming fields", e);
		}
	}
	
	
	/**
	 * Rewrite the field(s) found in the given paragraph into a predictable
	 * layout, recording a FieldRef in the supplied list for every top level
	 * field encountered.
	 * 
	 * WARNING: do not use this method where a field in the P
	 * continues into a following P.
	 * 
	 * @param p the paragraph to process
	 * @param fieldRefs list to which the top level FieldRef objects are added
	 * @return the modified P (a newly created paragraph object)
	 */
	public static P canonicalise(P p, List<FieldRef> fieldRefs) {
		/*
		 * The outcome looks something like:
		 * 
		        <w:p>
		            <w:r>
		                <w:fldChar w:fldCharType="begin"/>
		                <w:instrText xml:space="preserve"> DATE  </w:instrText>
		                <w:fldChar w:fldCharType="separate"/>
		            </w:r>
		            <w:r>
		                <w:t>4/12/2011</w:t>
		            </w:r>
		            <w:r>
		                <w:fldChar w:fldCharType="end"/>
		            </w:r>
		        </w:p>
		 
		 * What sits between begin and separate may be more involved than
		 * shown here, nested fields included.
		 */

		// The per-paragraph working state lives in a throwaway instance.
		return new FieldsPreprocessor(fieldRefs).canonicaliseInstance(p);
	}
	
	/**
	 * Instance-side worker for canonicalise: creates a fresh paragraph that
	 * shares the input's paragraph properties, resets the working run and the
	 * field stack, then feeds the input's content through handleContent.
	 *
	 * @param p the paragraph to process
	 * @return the newly built paragraph
	 */
	private P canonicaliseInstance(P p) {

		P canonicalP = Context.getWmlObjectFactory().createP();
		canonicalP.setPPr(p.getPPr());

		// working state used by handleContent / handleRun
		newR = Context.getWmlObjectFactory().createR();
		stack = new LinkedList<FieldRef>();

		if (log.isDebugEnabled()) {
			log.debug(XmlUtils.marshaltoString(p));
		}

		handleContent(p.getContent(), canonicalP);
		return canonicalP;
	}
	
	/**
	 * A list of FieldRef objects representing outermost fields
	 * only.
	 */
	private List<FieldRef> fieldRefs;
	
	
	/**
	 * The fields which are currently open, innermost first: handleRun pushes
	 * a FieldRef when it meets a "begin" fldChar and pops it at the matching "end".
	 */
	private LinkedList<FieldRef> stack;
	/**
	 * The field currently being processed; null before the first field, and
	 * again whenever the stack becomes empty after an "end".
	 */
	private FieldRef currentField=null;
	
	/**
	 * The output run currently being filled. It persists between handleRun
	 * invocations and is replaced by a fresh run at various points.
	 */
	private R newR;
	
	/**
	 * Copies a list of paragraph-level content objects to the attachment
	 * point, routing runs through handleRun so that any fields they contain
	 * are canonicalised.
	 *
	 * Handles the case where the run(s) containing the field are inside a P,
	 * or inside a P.Hyperlink (eg a PAGEREF in a table of contents).
	 *
	 * @param objects the content to process
	 * @param attachmentPoint where the resulting content is added
	 */
	private void handleContent(List<Object> objects, ContentAccessor attachmentPoint) {

		for (Object o : objects) {

			// A hyperlink can occur in a field result, and might contain another
			// nested field.  Since at present the field processing here is for
			// MERGEFIELD and DOCPROPERTY fields, a hyperlink gets no special
			// treatment: it falls through to the final else below.

			if (o instanceof R) {

				handleRun((R) o, attachmentPoint);

			} else if (o instanceof ProofErr) {
				// Ignore.
				// What happens if we ignore eg grammarStart, but its matching
				// grammarEnd is outside and retained?
				// Well, a stray spellStart doesn't matter to Word 2010, so
				// assume others would be ok as well.
			} else {
				// it's not something we're interested in: keep it as it is

				// Guarded, like the other debug output in this class, so the
				// unwrap/reflection work is only done when it will be logged.
				if (log.isDebugEnabled()) {
					log.debug("Retaining" + XmlUtils.unwrap(o).getClass().getName());
				}

				attachmentPoint.getContent().add(o);

				// prepare new run, so later run content is placed after
				// the object just retained
				newR = Context.getWmlObjectFactory().createR();

			}

		}

	}
	
	/**
	 * @return true if exactly one field is on the stack, ie the field being
	 *         processed is not nested inside another one
	 */
	private boolean fieldIsTopLevel() {
		final int openFields = stack.size();
		return openFields == 1;
	}
	
	/**
	 * Tells whether the field on top of the stack sits in the result part of
	 * its parent field, ie the parent has already seen its "separate".
	 *
	 * The stack is only inspected, never modified.
	 *
	 * @return true if there is a parent field and it has seen its separate;
	 *         false otherwise, including when there is no parent (or the
	 *         stack is empty)
	 */
	private boolean inParentResult() {

		// push() inserts at the head of the LinkedList, so index 0 is the
		// current field and index 1, where present, is its parent.
		if (stack.size() < 2) {
			// No parent
			return false;
		}
		return stack.get(1).haveSeenSeparate();
	}
	
	/**
	 * Decides whether the existing result of a field is to be kept.
	 * 
	 * A locked field always keeps its result.  An unlocked field keeps it
	 * unless it is a MERGEFIELD or a DOCPROPERTY field; a field with no name
	 * keeps it too.
	 * 
	 * @param fieldRef the field to test
	 * @return true if the result is to be preserved, false if it is to be dropped
	 */
	private boolean preserveResult(FieldRef fieldRef) {

		if (fieldRef.isLock()) {
			return true;
		}

		// Constant-first equals() is false for a null name, so an unnamed
		// field is preserved.
		String name = fieldRef.getFldName();
		boolean replaceable = "MERGEFIELD".equals(name) || "DOCPROPERTY".equals(name);
		return !replaceable;
	}

	/**
	 * Applies preserveResult to the parent of the field on top of the stack.
	 *
	 * The stack is only inspected, never modified, so it is left intact even
	 * when there is no parent.
	 *
	 * @return whether the parent field's result is to be preserved
	 * @throws NoSuchElementException if the stack holds no parent field
	 */
	private boolean preserveParentResult() {

		// push() inserts at the head of the LinkedList, so index 0 is the
		// current field and index 1 is its parent.
		if (stack.size() < 2) {
			throw new NoSuchElementException("No parent field on the stack");
		}
		return preserveResult(stack.get(1));
	}
	
	/**
	 * Processes the content of one input run, distributing it over the runs
	 * of the canonical output.
	 *
	 * Each child of the run falls into one of these cases, tested in order:
	 * a "begin", "separate" or "end" fldChar; content outside any field;
	 * content in the instruction part of the current field (before its
	 * separate); result content which is preserved; result content which is
	 * dropped.
	 *
	 * The method works on the shared newR, currentField and stack fields, and
	 * adds a FieldRef to fieldRefs for each top level field it begins.
	 *
	 * @param existingRun the input run
	 * @param newAttachPoint where output runs are attached
	 */
	private void handleRun(R existingRun, ContentAccessor newAttachPoint) {

		// note that the newR object persists between invocations of this method,
		// so you have to be careful to actually add it to the docx
		// before re-creating it

		if (log.isDebugEnabled()) {
			log.debug("\nInput run: \n " + XmlUtils.marshaltoString(existingRun, true, true));
		}

		for (Object o2 : existingRun.getContent() ) {

			// newR may have been replaced during the previous iteration,
			// so the run properties are applied on every pass
			newR.setRPr(existingRun.getRPr());

			if (isCharType(o2, STFldCharType.BEGIN)) {

				log.debug("\n\n begin.. ");

				// Setup a FieldRef object
				currentField = new FieldRef((FldChar)XmlUtils.unwrap(o2));
				currentField.setParent(newAttachPoint);
				currentField.setBeginRun(newR); // may as well do this

				stack.push(currentField);

				if (inParentResult()) {

					if (preserveParentResult()) {
						newR.getContent().add(o2);
					} else {
						log.debug(".. but in result, so don't add to run");
					}

				} else {

					if ( fieldIsTopLevel() ) {

						newR = Context.getWmlObjectFactory().createR();
						newR.getContent().add(o2);

						currentField.setBeginRun(newR); // IMPORTANT, so we can delete it when we perform mail merge

						fieldRefs.add(currentField);
					} else {
						newR.getContent().add(o2);

						// NOTE(review): currentField was pushed just above, so
						// stack.peek() is currentField itself here - confirm
						// whether the parent field was intended.
						stack.peek().getInstructions().add(currentField);
					}
				}

			} else if (isCharType(o2, STFldCharType.SEPARATE)) {

				currentField.setSeenSeparate(true);

				if (inParentResult()) {

					if (preserveParentResult()) {
						newR.getContent().add(o2);
					} else {
						log.debug(".. but in result, so don't add to run");
					}

				} else {

					newR.getContent().add(o2);
					attachNewRunIfAbsent(newAttachPoint);

					if ( fieldIsTopLevel() ) {
						// Top level field separator

						// Create result slot
						newR = Context.getWmlObjectFactory().createR();
						currentField.setResultsSlot(newR);
					}
				}

			} else if (isCharType(o2, STFldCharType.END)) {

				log.debug("\n\n .. end ");

				if (inParentResult()) {

					if (preserveParentResult()) {

						newR.getContent().add(o2);

						// Constant-first equals(): the field name may be null
						// (preserveResult allows for that as well).
						if ("FORMTEXT".equals(currentField.getFldName())) {
							/*
							 * Workaround for a bug in Word 2010.
							 *
							 * If you have multiple FORMTEXT in a single run, ie
							 * the sequence
							 *
							 *     fldChar begin (with ffData), instrText " FORMTEXT ",
							 *     fldChar separate, w:t, fldChar end
							 *
							 * occurring more than once in the same w:r, Word 2010
							 * does not display all the w:t elements (ie spaces
							 * appear to be missing).
							 *
							 * Adding w:t/@xml:space="preserve" doesn't help.
							 *
							 * So the workaround here is to start a new run after each END tag.
							 */
							attachNewRunIfAbsent(newAttachPoint);
							newR = Context.getWmlObjectFactory().createR();
						}
					} else {
						log.debug(".. but in result, so don't add to run");
					}

				} else {  // still in END processing

					if ( fieldIsTopLevel() ) {

						if (!currentField.haveSeenSeparate()) {
							// Word 2010 can produce a docx where:
							//  <w:r>
							//    <w:fldChar w:fldCharType="separate"/>
							//  </w:r>
							// is missing (valid per spec).

							// For top level fields only, we add this
							log.debug(".. ADDING SEP ..  ");

							FldChar fldChar = Context.getWmlObjectFactory().createFldChar();
							fldChar.setFldCharType(STFldCharType.SEPARATE);
							newR.getContent().add(fldChar);

							attachNewRunIfAbsent(newAttachPoint);

						}

						// set up results slot - only for top-level fields
						newR = currentField.getResultsSlot(); // MERGEFORMAT processing below may have set this already
						if (newR==null) {
							newR = Context.getWmlObjectFactory().createR();
							currentField.setResultsSlot(newR);
						}
						// test-and-attach, since attaching is also done immediately before each loop ends
						attachNewRunIfAbsent(newAttachPoint);

						// create a run specifically for end char
						newR = Context.getWmlObjectFactory().createR();
						newAttachPoint.getContent().add(newR);
						newR.getContent().add(o2);
						currentField.setEndRun(newR);

						//for whatever follows the field
						newR = Context.getWmlObjectFactory().createR();

					} else {
						newR.getContent().add(o2);
					}

				}

				// this field is finished; continue with its parent, if any
				// (peek() gives null once the stack is empty)
				stack.pop();
				currentField = stack.peek();

			} else if (currentField==null) {
				// run content before or after the field
				// - preserve this content

				newR.getContent().add(o2);

				attachNewRunIfAbsent(newAttachPoint);

				newR = Context.getWmlObjectFactory().createR();

			} else if ( !currentField.haveSeenSeparate() ) {

				// Handles problems with empty w:instrText elements within complex field "begin" section
				Object o = XmlUtils.unwrap(o2);
				if (o instanceof Text && ((Text) o).getValue().trim().isEmpty()) {
					log.debug("Empty w:instrText found. Ignore it!");
					// note: this also skips the attach step at the end of the loop body
					continue;
				}

				currentField.getInstructions().add(o2);
				if (inParentResult()) {

					if (preserveParentResult()) {
						newR.getContent().add(o2);
					} else {
						log.debug(".. but in result, so don't add to run");
					}

				} else {
					newR.getContent().add(o2);
				}

			} else if (preserveResult(currentField)) {
				// ie locked, or not MERGEFIELD, or DOCPROPERTY
				log.debug("preserveResult-> adding");
				newR.getContent().add(o2);

				if (currentField.getResultsSlot()==null) {
					currentField.setResultsSlot(newR);  // no harm in doing this - same as in SEPARATE processing?
				} else if (currentField.getResultsSlot()!=newR) {
					log.warn("Multiple runs in results slot?");
				}

			} else {
				// result content .. can ignore unless it has \* MERGEFORMAT

				// if \* MERGEFORMAT, attach the rPr of first run in the result
				// NOTE(review): o2 is a child of existingRun's content - confirm
				// that it can actually be an R here.
				if (o2 instanceof R
						&& currentField.isMergeFormat()
						&& currentField.getResultsSlot()==null) {

					R resultR = Context.getWmlObjectFactory().createR();
					currentField.setResultsSlot(resultR);
					resultR.setRPr(((R)o2).getRPr()); // could be null, but that's ok
					log.debug("MERGEFORMAT Set rPr");
				}

				// TODO: a TOC field usually has a PAGEREF wrapped in a hyperlink in its
				// result part.  We should either keep the entire result, or empty it.
				// only do this if the field has no nested field; we need a way to look ahead
				// to see whether a nested field is coming up)

				// we only want a single run between SEPARATOR and END,
				// and we added that in the SEPARATE stuff above
				if (log.isDebugEnabled()) {
					log.debug("IGNORING " + XmlUtils.marshaltoString(o2, true, true));
				}

			}

			// make sure a run which has received content is part of the output
			if (newR.getContent().size() > 0 && !newAttachPoint.getContent().contains(newR)) {
				newAttachPoint.getContent().add(newR);
			}

		} // end for (Object o2 : existingRun.getContent() )

	}

	/**
	 * Adds newR to the attachment point unless it is already among its
	 * content, logging the run (at debug level) when it is added.
	 *
	 * @param attachPoint where the run is attached
	 */
	private void attachNewRunIfAbsent(ContentAccessor attachPoint) {
		if (!attachPoint.getContent().contains(newR)) {
			attachPoint.getContent().add(newR);
			if (log.isDebugEnabled()) {
				log.debug("-- attaching -->" + XmlUtils.marshaltoString(newR, true, true));
			}
		}
	}
		
	    // Qualified name of the w:t element. Within this class it is only
	    // mentioned in commented-out code at the end of handleRun.
	    private final static QName _RT_QNAME = new QName("http://schemas.openxmlformats.org/wordprocessingml/2006/main", "t");
		
	
//	public static boolean containsCharType(Object o, STFldCharType charType) {
//		
//		if (o instanceof R) {
//			for (Object o2 : ((R)o).getContent() ) {
//				
//				if (isCharType(o2, charType)) {
//						return true;
//				}				
//			}
//		} 
//		return false;
//	}	

	/**
	 * Tests whether an object is a w:fldChar of the given type.
	 *
	 * The object is unwrapped first (XmlUtils.unwrap), so it may be passed
	 * as found in a run's content.
	 *
	 * @param o2 the object to test
	 * @param charType the field character type to look for
	 * @return true if o2 is a FldChar whose type equals charType; false if it
	 *         is not a FldChar, has a different type, or has no type set
	 */
	public static boolean isCharType(Object o2, STFldCharType charType) {

		Object candidate = XmlUtils.unwrap(o2);
		if (!(candidate instanceof FldChar)) {
			return false;
		}

		// The type may be absent; test for null rather than fail with a
		// NullPointerException.
		STFldCharType actual = ((FldChar) candidate).getFldCharType();
		if (actual != null && actual.equals(charType)) {
			return true;
		}

		// a fldChar, but not of the type asked for
		log.debug(String.valueOf(actual));
		return false;
	}
	
}