/**
* Copyright (C) 2011-2015 The XDocReport Team <xdocreport@googlegroups.com>
*
* All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package fr.opensagres.poi.xwpf.converter.core;
import java.io.IOException;
import java.lang.reflect.Field;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlObject;
import org.apache.xmlbeans.XmlTokenSource;
import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObject;
import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData;
import org.openxmlformats.schemas.drawingml.x2006.picture.CTPicture;
import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTAnchor;
import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline;
import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTPosH;
import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTPosV;
import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTWrapSquare;
import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.STRelFromH;
import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.STRelFromV;
import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.STWrapText;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDrawing;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtrRef;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTLvl;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRunTrackChange;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtCell;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSimpleField;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSmartTagRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyle;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTabs;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.FtrDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.HdrDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STBrType;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STFldCharType;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STMerge;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STOnOff;
import fr.opensagres.poi.xwpf.converter.core.styles.XWPFStylesDocument;
import fr.opensagres.poi.xwpf.converter.core.utils.DxaUtil;
import fr.opensagres.poi.xwpf.converter.core.utils.StringUtils;
import fr.opensagres.poi.xwpf.converter.core.utils.XWPFRunHelper;
import fr.opensagres.poi.xwpf.converter.core.utils.XWPFTableUtil;
import org.xml.sax.SAXException;
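/**
* Visitor to visit elements from entry word/document.xml, word/header*.xml, word/footer*.xml
*
* @param <T> type of the container which receives the result of the visit of each element
* @param <O> type of the conversion options
* @param <E> type of the master page handled by this visitor
*/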
public abstract class XWPFDocumentVisitor<T, O extends Options, E extends IXWPFMasterPage>
implements IMasterPageHandler<E>
{
/** Logger of this class. */
private static final Logger LOGGER = Logger.getLogger( XWPFDocumentVisitor.class.getName() );
/** Path prefix "word/media/"; not referenced in this part of the class (presumably used for image entries -- TODO confirm). */
protected static final String WORD_MEDIA = "word/media/";
/** The document being visited, as given to the constructor. */
protected final XWPFDocument document;
/** Created in the constructor; initialized on demand by visitBodyElements and updated by visitParagraph. */
private final MasterPageManager masterPageManager;
// NOTE(review): currentHeader/currentFooter are not touched in this part of the class;
// presumably the header/footer currently being visited -- confirm.
private XWPFHeader currentHeader;
private XWPFFooter currentFooter;
/** Styles document built by createStylesDocument in the constructor. */
protected final XWPFStylesDocument stylesDocument;
/** Options given to the constructor. */
protected final O options;
/** Set by visitBR when a page break is met; tested and reset by visitParagraph, which then calls pageBreak(). */
private boolean pageBreakOnNextParagraph;
/** Result of the last NUMPAGES test done by visitRuns on a field instruction; reset to false when the field ends. */
protected boolean processingTotalPageCountField = false;
/** Set to true by visitRuns while parsing field instructions; never reset in the code shown here. */
protected boolean totalPageFieldUsed = false;
/**
* Map of w:numId to ListContext, created lazily by getListContext.
*/
private Map<Integer, ListContext> listContextMap;
/**
 * Creates a visitor for the given document.
 * <p>
 * The styles document is built through {@link #createStylesDocument(XWPFDocument)} and the master page manager is
 * created from {@code document.getDocument()} with this visitor as handler. Note that
 * {@code createStylesDocument} is overridable and is called here, i.e. before the fields of a subclass are
 * initialized.
 *
 * @param document the document to visit
 * @param options the conversion options
 * @throws Exception if the styles document or the master page manager cannot be created
 */
public XWPFDocumentVisitor( XWPFDocument document, O options )
throws Exception
{
this.document = document;
this.options = options;
// the styles document must exist before the master page manager receives 'this'
this.stylesDocument = createStylesDocument( document );
this.masterPageManager = new MasterPageManager( document.getDocument(), this );
}
/**
 * Factory of the styles document; called once from the constructor. Subclasses may override it to supply another
 * {@link XWPFStylesDocument}.
 *
 * @param document the document being visited
 * @return a new styles document built from the given document
 * @throws XmlException declared by the {@link XWPFStylesDocument} constructor
 * @throws IOException declared by the {@link XWPFStylesDocument} constructor
 */
protected XWPFStylesDocument createStylesDocument( XWPFDocument document )
throws XmlException, IOException
{
return new XWPFStylesDocument( document );
}
/**
 * Returns the styles document created in the constructor.
 *
 * @return the styles document used by this visitor
 */
public XWPFStylesDocument getStylesDocument()
{
return stylesDocument;
}
/**
 * Returns the options given to the constructor.
 *
 * @return the options (same reference as received by the constructor)
 */
public O getOptions()
{
return options;
}
/**
 * Returns the master page manager created in the constructor.
 *
 * @return the master page manager of this visitor
 */
public MasterPageManager getMasterPageManager()
{
return masterPageManager;
}
// ------------------------------ Start/End document visitor -----------
/**
 * Entry point of the visit: opens the document, walks the body elements of the XWPF document in order and then
 * closes the document.
 *
 * @throws Exception if one of the visit callbacks fails
 */
public void start()
    throws Exception
{
    // open the target document and get the container which receives the body
    T documentContainer = startVisitDocument();
    // create the IText, XHTML, ... element for each XWPF element of the w:body
    visitBodyElements( document.getBodyElements(), documentContainer );
    // close the target document
    endVisitDocument();
}
/**
* Start of visit document. Called once by {@link #start()} before any body element is visited.
*
* @return the container which is passed to the visit of the body elements
* @throws Exception if the start of the document cannot be generated
*/
protected abstract T startVisitDocument()
throws Exception;
/**
* End of visit document. Called once by {@link #start()} after all body elements were visited.
*
* @throws Exception if the end of the document cannot be generated
*/
protected abstract void endVisitDocument()
throws Exception;
// ------------------------------ XWPF Elements visitor -----------
/**
 * Visits the given body elements in order and dispatches each one according to its element type: paragraph, table
 * or content control (SDT). Elements of any other type are ignored.
 * <p>
 * The master page manager is initialized on the first call if it is not initialized yet.
 *
 * @param bodyElements the elements to visit
 * @param container the container which receives the result of the visit
 * @throws Exception if the visit of an element fails
 */
protected void visitBodyElements( List<IBodyElement> bodyElements, T container )
    throws Exception
{
    if ( !masterPageManager.isInitialized() )
    {
        // master page manager which hosts each <w:sectPr declared in the word/document.xml
        // must be initialized. The initialization loops for each
        // <w:p paragraph to compute a list of <w:sectPr which contains information
        // about header/footer declared in the <w:headerReference/<w:footerReference
        masterPageManager.initialize();
    }
    for ( int i = 0; i < bodyElements.size(); i++ )
    {
        IBodyElement bodyElement = bodyElements.get( i );
        switch ( bodyElement.getElementType() )
        {
            case PARAGRAPH:
                visitParagraph( (XWPFParagraph) bodyElement, i, container );
                break;
            case TABLE:
                visitTable( (XWPFTable) bodyElement, i, container );
                break;
            case CONTENTCONTROL:
                visitSDT( (XWPFSDT) bodyElement, i, container );
                break;
            default:
                // other element types are not rendered
                break;
        }
    }
}
/**
 * Visits a content control (w:sdt): start callback, visit of its content, end callback.
 *
 * @param contents the content control to visit
 * @param index index of the content control in its parent list (not used by this implementation)
 * @param container the parent container
 * @throws Exception if one of the callbacks or the visit of the content fails
 */
protected void visitSDT(XWPFSDT contents, int index, T container) throws Exception {
    final T contentControlContainer = startVisitSDT( contents, container );
    visitSDTBody( contents, contentControlContainer );
    endVisitSDT( contents, container, contentControlContainer );
}
/**
 * Called by visitSDT before the content of the content control is visited.
 *
 * @param contents the content control being visited
 * @param container the parent container
 * @return the container which is passed to the visit of the content and to endVisitSDT
 * @throws SAXException declared for implementations which generate SAX events
 */
protected abstract T startVisitSDT(XWPFSDT contents, T container) throws SAXException;
/**
 * Called by visitSDT after the content of the content control was visited.
 *
 * @param contents the content control being visited
 * @param container the parent container
 * @param sdtContainer the container returned by startVisitSDT
 * @throws SAXException declared for implementations which generate SAX events
 */
protected abstract void endVisitSDT(XWPFSDT contents, T container, T sdtContainer) throws SAXException;
/**
 * Visits the children (paragraphs, tables, runs, nested content controls) of a content control.
 * <p>
 * The children are read by reflection from the private {@code bodyElements} field of the content object. When
 * that field cannot be read, a warning is logged and the content is skipped.
 *
 * @param contents the content control whose content must be visited
 * @param sdtContainer the container returned by startVisitSDT
 * @throws Exception if the visit of a child fails
 */
protected void visitSDTBody(XWPFSDT contents, T sdtContainer) throws Exception {
    ISDTContent content = contents.getContent();
    if (content == null) {
        return;
    }
    List<ISDTContents> children;
    // Only the reflective access is guarded: a reflection exception raised by the visit of a child
    // must not be swallowed here.
    try {
        Field bodyElementsField = content.getClass().getDeclaredField("bodyElements");
        bodyElementsField.setAccessible(true);
        @SuppressWarnings("unchecked")
        List<ISDTContents> value = (List<ISDTContents>) bodyElementsField.get(content);
        children = value;
    } catch (NoSuchFieldException e) {
        LOGGER.log(Level.WARNING, "Cannot read the content of the content control, it is ignored", e);
        return;
    } catch (IllegalAccessException e) {
        LOGGER.log(Level.WARNING, "Cannot read the content of the content control, it is ignored", e);
        return;
    }
    if (children == null) {
        return;
    }
    for (int i = 0; i < children.size(); i++) {
        ISDTContents child = children.get(i);
        if (child instanceof XWPFParagraph) {
            visitParagraph((XWPFParagraph) child, i, sdtContainer);
        } else if (child instanceof XWPFTable) {
            visitTable((XWPFTable) child, i, sdtContainer);
        } else if (child instanceof XWPFRun) {
            // a XWPFRun is not an XmlObject: visit it as a plain run (no page number, no hyperlink)
            visitRun((XWPFRun) child, false, null, sdtContainer);
        } else if (child instanceof XWPFSDT) {
            visitSDT((XWPFSDT) child, i, sdtContainer);
        }
    }
}
/**
 * Visit the given paragraph.
 * <p>
 * Updates the master page manager when word/document.xml is parsed, emits the page break requested by a previous
 * run, resolves the list item context of the paragraph (numbered/bullet list) and then visits the paragraph
 * between the start/end callbacks. A numbering which cannot be resolved (missing w:num, missing w:abstractNum or
 * level out of range) is ignored: the paragraph is then visited as a normal paragraph.
 *
 * @param paragraph the paragraph to visit
 * @param index index of the paragraph in the list of elements of its body
 * @param container the parent container
 * @throws Exception if a callback or the visit of the paragraph body fails
 */
protected void visitParagraph( XWPFParagraph paragraph, int index, T container )
    throws Exception
{
    if ( isWordDocumentPartParsing() )
    {
        // header/footer is not parsing.
        // It's the word/document.xml which is parsing
        // test if the current paragraph define a <w:sectPr
        // to update the header/footer declared in the <w:headerReference/<w:footerReference
        masterPageManager.update( paragraph.getCTP() );
    }
    if ( pageBreakOnNextParagraph )
    {
        pageBreak();
    }
    this.pageBreakOnNextParagraph = false;

    ListItemContext itemContext = null;
    CTNumPr originalNumPr = stylesDocument.getParagraphNumPr( paragraph );
    CTNumPr numPr = getNumPr( originalNumPr );
    if ( numPr != null )
    {
        // paragraph is a numbered/bullet list
        // see http://msdn.microsoft.com/en-us/library/office/ee922775%28v=office.14%29.aspx
        // <w:p>
        //   <w:pPr>
        //     <w:pStyle w:val="style0" />
        //     <w:numPr>
        //       <w:ilvl w:val="0" />
        //       <w:numId w:val="2" />
        //     </w:numPr>
        // get numbering.xml/w:num : <w:num w:numId="2"> <w:abstractNumId w:val="1" /> </w:num>
        XWPFNum num = getXWPFNum( numPr );
        if ( num != null )
        {
            // get the abstractNum by using abstractNumId :
            // <w:abstractNum w:abstractNumId="1"> ... <w:lvl w:ilvl="0" ...> ... </w:lvl>
            XWPFAbstractNum abstractNum = getXWPFAbstractNum( num );
            if ( abstractNum != null && abstractNum.getAbstractNum() != null )
            {
                // get the <w:lvl by using abstractNum and numPr level (level 0 when w:ilvl is not declared)
                CTDecimalNumber ilvl = numPr.getIlvl();
                int level = ( ilvl != null && ilvl.getVal() != null ) ? ilvl.getVal().intValue() : 0;
                // getLvlArray fails for an index which does not exist: check the bounds before
                if ( level >= 0 && level < abstractNum.getAbstractNum().sizeOfLvlArray() )
                {
                    CTLvl lvl = abstractNum.getAbstractNum().getLvlArray( level );
                    if ( lvl != null )
                    {
                        // the list context is keyed by the w:numId declared for the paragraph
                        ListContext listContext = getListContext( originalNumPr.getNumId().getVal().intValue() );
                        itemContext = listContext.addItem( lvl );
                    }
                }
            }
        }
    }
    T paragraphContainer = startVisitParagraph( paragraph, itemContext, container );
    visitParagraphBody( paragraph, index, paragraphContainer );
    endVisitParagraph( paragraph, container, paragraphContainer );
}
/**
 * Resolves the numbering properties to use for a paragraph.
 * <p>
 * When the abstract numbering referenced by the given numPr declares a w:numStyleLink, the numbering properties
 * of the linked style are resolved (recursively) and returned instead. Otherwise the given numPr is returned.
 *
 * @param numPr the numbering properties to resolve, may be null
 * @return the resolved numbering properties, or null when numPr is null or when the linked style is missing or
 *         declares no paragraph properties
 */
private CTNumPr getNumPr( CTNumPr numPr )
{
    if ( numPr == null )
    {
        return null;
    }
    XWPFNum num = getXWPFNum( numPr );
    if ( num == null )
    {
        return numPr;
    }
    // get the abstractNum by using abstractNumId
    XWPFAbstractNum abstractNum = getXWPFAbstractNum( num );
    if ( abstractNum == null || abstractNum.getAbstractNum() == null )
    {
        // dangling w:abstractNumId: nothing to follow
        return numPr;
    }
    CTString numStyleLink = abstractNum.getAbstractNum().getNumStyleLink();
    String styleId = numStyleLink != null ? numStyleLink.getVal() : null;
    if ( styleId == null )
    {
        return numPr;
    }
    // has w:numStyleLink which references another style :
    // <w:abstractNum w:abstractNumId="0"> <w:nsid w:val="03916EF0"/> <w:multiLevelType w:val="multilevel"/>
    // <w:tmpl w:val="0409001D"/> <w:numStyleLink w:val="EricsListStyle"/> </w:abstractNum>
    CTStyle style = stylesDocument.getStyle( styleId );
    CTPPr ppr = style != null ? style.getPPr() : null;
    if ( ppr == null )
    {
        return null;
    }
    return getNumPr( ppr.getNumPr() );
}
/**
 * Returns the list context registered for the given w:numId, creating (and registering) it on first use. The
 * backing map is itself created on first use.
 *
 * @param numId the w:numId of the list
 * @return the list context of the list, never null
 */
private ListContext getListContext( int numId )
{
    if ( listContextMap == null )
    {
        listContextMap = new HashMap<Integer, ListContext>();
    }
    ListContext existing = listContextMap.get( numId );
    if ( existing != null )
    {
        return existing;
    }
    ListContext created = new ListContext();
    listContextMap.put( numId, created );
    return created;
}
/**
 * Called by visitParagraph before the body of the paragraph is visited.
 *
 * @param paragraph the paragraph being visited
 * @param itemContext the list item context, or null when no list item was resolved for the paragraph
 * @param parentContainer the parent container
 * @return the container which is passed to the visit of the paragraph body and to endVisitParagraph
 * @throws Exception if the start of the paragraph cannot be generated
 */
protected abstract T startVisitParagraph( XWPFParagraph paragraph, ListItemContext itemContext, T parentContainer )
throws Exception;
/**
 * Called by visitParagraph after the body of the paragraph was visited.
 *
 * @param paragraph the paragraph being visited
 * @param parentContainer the parent container
 * @param paragraphContainer the container returned by startVisitParagraph
 * @throws Exception if the end of the paragraph cannot be generated
 */
protected abstract void endVisitParagraph( XWPFParagraph paragraph, T parentContainer, T paragraphContainer )
throws Exception;
/**
 * Visits the content of the given paragraph and then applies its w:pageBreakBefore property.
 * <p>
 * A paragraph without run generates an empty run when {@code isAddNewLine} tells so; otherwise the children of
 * the paragraph are visited in document order.
 *
 * @param paragraph the paragraph being visited
 * @param index index of the paragraph in the list of elements of its body
 * @param paragraphContainer the container returned by startVisitParagraph
 * @throws Exception if the visit of a run or the page break fails
 */
protected void visitParagraphBody( XWPFParagraph paragraph, int index, T paragraphContainer )
    throws Exception
{
    if ( paragraph.getRuns().isEmpty() )
    {
        // a new line must be generated if :
        // - there is next paragraph/table
        // - if the body is a cell (with none vMerge) and contains just this paragraph
        // Note: POI sometimes reports no run although a w:r exists inside the w:pPr
        // (see the header1.xml of DocxBig.docx); such runs are not visited.
        if ( isAddNewLine( paragraph, index ) )
        {
            visitEmptyRun( paragraphContainer );
        }
    }
    else
    {
        // loop for each element of <w:r, w:fldSimple to keep the order of those elements
        visitRuns( paragraph, paragraphContainer );
    }

    // Page break: paragraph.isPageBreak() cannot be used because it throws NPE
    // when pageBreak.getVal() is null.
    CTPPr paragraphProperties = paragraph.getCTP().getPPr();
    if ( paragraphProperties == null || !paragraphProperties.isSetPageBreakBefore() )
    {
        return;
    }
    CTOnOff breakBefore = paragraphProperties.getPageBreakBefore();
    if ( breakBefore == null )
    {
        return;
    }
    // a missing w:val counts as "true"
    if ( breakBefore.getVal() == null || breakBefore.getVal().intValue() == STOnOff.INT_TRUE )
    {
        pageBreak();
    }
}
// ------------------------ Numbering --------------
/**
 * Returns the w:num of numbering.xml referenced by the given numbering properties.
 *
 * @param numPr the numbering properties of a paragraph or style, must not be null
 * @return the referenced numbering, or null when numPr declares no w:numId, when the document has no numbering
 *         part or when no w:num matches
 */
protected XWPFNum getXWPFNum( CTNumPr numPr )
{
    CTDecimalNumber numID = numPr.getNumId();
    if ( numID == null || numID.getVal() == null )
    {
        // numID can be null, ignore the numbering
        // see https://code.google.com/p/xdocreport/issues/detail?id=239
        return null;
    }
    XWPFNumbering numbering = document.getNumbering();
    if ( numbering == null )
    {
        // the document has no numbering part: the numbering cannot be resolved
        return null;
    }
    return numbering.getNum( numID.getVal() );
}
/**
 * Returns the w:abstractNum of numbering.xml referenced by the w:abstractNumId of the given w:num.
 *
 * @param num the numbering whose abstract numbering is looked up
 * @return the result of the lookup in the numbering of the document
 */
protected XWPFAbstractNum getXWPFAbstractNum( XWPFNum num )
{
    final CTDecimalNumber abstractNumberId = num.getCTNum().getAbstractNumId();
    return document.getNumbering().getAbstractNum( abstractNumberId.getVal() );
}
/**
 * Returns true if the given paragraph which is empty (none &lt;w:r&gt; run) must generate new line and false
 * otherwise.
 * <p>
 * Rules:
 * <ul>
 * <li>the paragraph is the single element of a table cell: no new line when the cell continues a vertical merge;
 * otherwise a new line is generated only when every cell of the row contains a single paragraph without run
 * (i.e. the whole row is empty);</li>
 * <li>any other case: a new line is generated when the paragraph is followed by another element in its body.</li>
 * </ul>
 *
 * @param paragraph the empty paragraph
 * @param index index of the paragraph in the list of elements of its body
 * @return true if a new line must be generated for the paragraph
 */
private boolean isAddNewLine( XWPFParagraph paragraph, int index )
{
    IBody body = paragraph.getBody();
    List<IBodyElement> bodyElements = body.getBodyElements();
    if ( body.getPartType() == BodyType.TABLECELL && bodyElements.size() == 1 )
    {
        XWPFTableCell cell = (XWPFTableCell) body;
        STMerge.Enum vMerge = stylesDocument.getTableCellVMerge( cell );
        if ( vMerge != null && vMerge.equals( STMerge.CONTINUE ) )
        {
            // the cell continues a vertical merge: its content belongs to the merged cell, no new line
            return false;
        }
        // Loop for each cell of the row : the new line is generated only if ALL cells are empty.
        // (Every cell must be tested, not only the first one.)
        for ( XWPFTableCell c : cell.getTableRow().getTableCells() )
        {
            List<IBodyElement> cellElements = c.getBodyElements();
            if ( cellElements.size() != 1 )
            {
                return false;
            }
            IBodyElement element = cellElements.get( 0 );
            if ( element.getElementType() != BodyElementType.PARAGRAPH )
            {
                return false;
            }
            if ( !( (XWPFParagraph) element ).getRuns().isEmpty() )
            {
                return false;
            }
        }
        return true;
    }
    // here a new line must be generated if there is next paragraph/table
    return bodyElements.size() > index + 1;
}
/**
 * Visits the children of the given paragraph (w:r, w:hyperlink, w:fldSimple, ...) in document order.
 * <p>
 * Complex fields (w:fldChar begin ... end) are handled as follows: while a field is parsed, the field instructions
 * (w:instrText) are analysed to detect a PAGE field, a NUMPAGES field or a HYPERLINK field, and the other children
 * are collected. The collected children are visited, with the detected page number flag and url, when the field
 * ends, when a new field begins or at the end of the paragraph.
 *
 * @param paragraph the paragraph being visited
 * @param paragraphContainer the container returned by startVisitParagraph
 * @throws Exception if the visit of a child fails
 */
private void visitRuns( XWPFParagraph paragraph, T paragraphContainer )
    throws Exception
{
    boolean fldCharTypeParsing = false;
    boolean pageNumber = false;
    String url = null;
    List<XmlObject> rListAfterSeparate = null;
    CTP ctp = paragraph.getCTP();
    XmlCursor c = ctp.newCursor();
    try
    {
        c.selectPath( "child::*" );
        while ( c.toNextSelection() )
        {
            XmlObject o = c.getObject();
            if ( o instanceof CTR )
            {
                /*
                 * Test if it's : <w:r> <w:rPr /> <w:fldChar w:fldCharType="begin" /> </w:r>
                 */
                CTR r = (CTR) o;
                STFldCharType.Enum fldCharType = XWPFRunHelper.getFldCharType( r );
                if ( fldCharType != null )
                {
                    if ( fldCharType.equals( STFldCharType.BEGIN ) )
                    {
                        // flush what was collected for a field which is still open
                        process( paragraph, paragraphContainer, pageNumber, url, rListAfterSeparate );
                        fldCharTypeParsing = true;
                        rListAfterSeparate = new ArrayList<XmlObject>();
                        pageNumber = false;
                        url = null;
                    }
                    else if ( fldCharType.equals( STFldCharType.END ) )
                    {
                        process( paragraph, paragraphContainer, pageNumber, url, rListAfterSeparate );
                        fldCharTypeParsing = false;
                        rListAfterSeparate = null;
                        pageNumber = false;
                        processingTotalPageCountField = false;
                        url = null;
                    }
                }
                else
                {
                    if ( fldCharTypeParsing )
                    {
                        String instrText = XWPFRunHelper.getInstrText( r );
                        if ( instrText != null )
                        {
                            if ( StringUtils.isNotEmpty( instrText ) )
                            {
                                // test if it's <w:r><w:instrText>PAGE</w:instrText></w:r>
                                boolean instrTextPage = XWPFRunHelper.isInstrTextPage( instrText );
                                if ( !instrTextPage )
                                {
                                    // test if it's <w:r><w:instrText>NUMPAGES</w:instrText></w:r>
                                    processingTotalPageCountField = XWPFRunHelper.isInstrTextNumpages( instrText );
                                    if ( processingTotalPageCountField )
                                    {
                                        // only a NUMPAGES field means that the total page count is used
                                        totalPageFieldUsed = true;
                                    }
                                    // test if it's <w:instrText>HYPERLINK
                                    // "http://code.google.com/p/xdocrepor"</w:instrText>
                                    String instrTextHyperlink = XWPFRunHelper.getInstrTextHyperlink( instrText );
                                    if ( instrTextHyperlink != null )
                                    {
                                        // test if it's <w:instrText>HYPERLINK \l _Toc29586</w:instrText>
                                        if ( instrTextHyperlink.startsWith( "\\l " ) )
                                        {
                                            url = "#" + instrTextHyperlink.substring( 3 );
                                        }
                                        else
                                        {
                                            url = instrTextHyperlink;
                                        }
                                    }
                                }
                                else
                                {
                                    pageNumber = true;
                                }
                            }
                        }
                        else
                        {
                            // run of the field which is not an instruction: visited when the field is processed
                            rListAfterSeparate.add( r );
                        }
                    }
                    else
                    {
                        XWPFRun run = new XWPFRun( r, paragraph );
                        visitRun( run, false, null, paragraphContainer );
                    }
                }
            }
            else
            {
                if ( fldCharTypeParsing )
                {
                    rListAfterSeparate.add( o );
                }
                else
                {
                    visitRun( paragraph, o, paragraphContainer );
                }
            }
        }
    }
    finally
    {
        // release the cursor even if the visit of a child fails
        c.dispose();
    }
    // flush the children of a field which was not closed in this paragraph
    process( paragraph, paragraphContainer, pageNumber, url, rListAfterSeparate );
}
/**
 * Visits the children collected while a complex field was parsed. A w:r child is visited as a run with the given
 * page number flag and url; any other child is dispatched through {@code visitRun(paragraph, child, container)}.
 *
 * @param paragraph the paragraph which owns the children
 * @param paragraphContainer the container returned by startVisitParagraph
 * @param pageNumber true if the field is a page number field
 * @param url the url of the field if it is an hyperlink field, null otherwise
 * @param rListAfterSeparate the collected children, may be null (nothing is done in this case)
 * @throws Exception if the visit of a child fails
 */
private void process( XWPFParagraph paragraph, T paragraphContainer, boolean pageNumber, String url,
                      List<XmlObject> rListAfterSeparate )
    throws Exception
{
    if ( rListAfterSeparate == null )
    {
        return;
    }
    for ( XmlObject child : rListAfterSeparate )
    {
        if ( !( child instanceof CTR ) )
        {
            visitRun( paragraph, child, paragraphContainer );
            continue;
        }
        visitRun( new XWPFRun( (CTR) child, paragraph ), pageNumber, url, paragraphContainer );
    }
}
/**
 * Visits a child of a paragraph which is not a plain w:r: hyperlink, inline content control, tracked change,
 * simple field or bookmark. Each run contained in such a child is visited with
 * {@code visitRun(run, pageNumber, url, container)}. Smart tags and any other kind of child are ignored.
 *
 * @param paragraph the paragraph which owns the child
 * @param o the child to visit
 * @param paragraphContainer the container returned by startVisitParagraph
 * @throws Exception if the visit of a run fails
 */
private void visitRun( XWPFParagraph paragraph, XmlObject o, T paragraphContainer )
    throws Exception
{
    if ( o instanceof CTHyperlink )
    {
        CTHyperlink link = (CTHyperlink) o;
        String anchor = link.getAnchor();
        String href = null;
        // Test if there is an id for hyperlink
        String hyperlinkId = link.getId();
        if ( StringUtils.isNotEmpty( hyperlinkId ) )
        {
            XWPFHyperlink hyperlink = document.getHyperlinkByID( hyperlinkId );
            href = hyperlink != null ? hyperlink.getURL() : null;
        }
        // external url first, internal anchor otherwise; no url at all (instead of "#null")
        // when the hyperlink declares neither of them
        String url = href != null ? href : ( anchor != null ? "#" + anchor : null );
        for ( CTR r : link.getRList() )
        {
            XWPFRun run = new XWPFHyperlinkRun( link, r, paragraph );
            visitRun( run, false, url, paragraphContainer );
        }
    }
    else if ( o instanceof CTSdtRun )
    {
        CTSdtContentRun sdtContent = ( (CTSdtRun) o ).getSdtContent();
        if ( sdtContent != null )
        {
            for ( CTR r : sdtContent.getRList() )
            {
                XWPFRun run = new XWPFRun( r, paragraph );
                visitRun( run, false, null, paragraphContainer );
            }
        }
    }
    else if ( o instanceof CTRunTrackChange )
    {
        for ( CTR r : ( (CTRunTrackChange) o ).getRList() )
        {
            XWPFRun run = new XWPFRun( r, paragraph );
            visitRun( run, false, null, paragraphContainer );
        }
    }
    else if ( o instanceof CTSimpleField )
    {
        CTSimpleField simpleField = (CTSimpleField) o;
        String instr = simpleField.getInstr();
        // 1) test if it's page number
        // <w:fldSimple w:instr=" PAGE \* MERGEFORMAT "> <w:r> <w:rPr> <w:noProof/>
        // </w:rPr> <w:t>- 1 -</w:t> </w:r> </w:fldSimple>
        boolean fieldPageNumber = XWPFRunHelper.isInstrTextPage( instr );
        String fieldHref = null;
        if ( !fieldPageNumber )
        {
            // not page number, test if it's hyperlink :
            // <w:instrText>HYPERLINK "http://code.google.com/p/xdocrepor"</w:instrText>
            fieldHref = XWPFRunHelper.getInstrTextHyperlink( instr );
        }
        for ( CTR r : simpleField.getRList() )
        {
            XWPFRun run = new XWPFRun( r, paragraph );
            visitRun( run, fieldPageNumber, fieldHref, paragraphContainer );
        }
    }
    else if ( o instanceof CTSmartTagRun )
    {
        // Smart Tags can be nested many times.
        // This implementation does not visit them: the tagging information and the runs
        // they contain are ignored.
    }
    else if ( o instanceof CTBookmark )
    {
        CTBookmark bookmark = (CTBookmark) o;
        visitBookmark( bookmark, paragraph, paragraphContainer );
    }
}
/**
 * Called by visitParagraphBody for a paragraph without run which must generate a new line.
 *
 * @param paragraphContainer the container returned by startVisitParagraph
 * @throws Exception if the empty run cannot be generated
 */
protected abstract void visitEmptyRun( T paragraphContainer )
throws Exception;
/**
 * Visits the children of the given run (text, tab, break, drawing) in document order.
 * <p>
 * When the run declares an highlight, strike, double strike or vertical alignment property, its texts are not
 * visited one by one: they are concatenated and given once to {@link #visitStyleText(XWPFRun, String)} after the
 * other children were visited. Field instructions (w:instrText) are never rendered.
 *
 * @param run the run to visit
 * @param pageNumber true if the run belongs to a page number field
 * @param url the url of the hyperlink which owns the run, null if none (not used by this implementation)
 * @param paragraphContainer the container returned by startVisitParagraph
 * @throws Exception if the visit of a child fails
 */
protected void visitRun( XWPFRun run, boolean pageNumber, String url, T paragraphContainer )
    throws Exception
{
    CTR ctr = run.getCTR();
    CTRPr rPr = ctr.getRPr();
    boolean hasTexStyles = rPr != null && ( rPr.getHighlight() != null || rPr.getStrike() != null
        || rPr.getDstrike() != null || rPr.getVertAlign() != null );
    StringBuilder text = new StringBuilder();
    // Loop for each element of <w:run text, tab, image etc
    // to keep the order of those elements.
    XmlCursor c = ctr.newCursor();
    try
    {
        c.selectPath( "./*" );
        while ( c.toNextSelection() )
        {
            XmlObject o = c.getObject();
            if ( o instanceof CTText )
            {
                CTText ctText = (CTText) o;
                String tagName = o.getDomNode().getNodeName();
                // Field Codes (w:instrText, defined in spec sec. 17.16.23)
                // come up as instances of CTText, but we don't want them
                // in the normal text output
                if ( !"w:instrText".equals( tagName ) )
                {
                    if ( hasTexStyles )
                    {
                        text.append( ctText.getStringValue() );
                    }
                    else
                    {
                        visitText( ctText, pageNumber, paragraphContainer );
                    }
                }
            }
            else if ( o instanceof CTPTab )
            {
                visitTab( (CTPTab) o, paragraphContainer );
            }
            else if ( o instanceof CTBr )
            {
                visitBR( (CTBr) o, paragraphContainer );
            }
            else if ( o instanceof CTEmpty )
            {
                // Some inline text elements get returned not as
                // themselves, but as CTEmpty, owing to some odd
                // definitions around line 5642 of the XSDs
                // This bit works around it, and replicates the above
                // rules for that case
                String tagName = o.getDomNode().getNodeName();
                if ( "w:tab".equals( tagName ) )
                {
                    CTTabs tabs = stylesDocument.getParagraphTabs( run.getParagraph() );
                    visitTabs( tabs, paragraphContainer );
                }
                else if ( "w:br".equals( tagName ) || "w:cr".equals( tagName ) )
                {
                    visitBR( null, paragraphContainer );
                }
            }
            else if ( o instanceof CTDrawing )
            {
                visitDrawing( (CTDrawing) o, paragraphContainer );
            }
        }
        String styledText = text.toString();
        if ( hasTexStyles && StringUtils.isNotEmpty( styledText ) )
        {
            visitStyleText( run, styledText );
        }
    }
    finally
    {
        // release the cursor even if the visit of a child fails
        c.dispose();
    }
}
/**
* Hook for the texts of a run which declares an highlight, strike, double strike or vertical alignment property.
* It is called by visitRun with the concatenated text of the run, only when that text is not empty. This default
* implementation does nothing; subclasses override it to render the styled text.
*
* @param run the run which owns the text
* @param text the concatenated text of the run
* @throws Exception if the text cannot be generated
*/
protected void visitStyleText(XWPFRun run, String text) throws Exception
{
//child should implement
}
/**
 * Called by visitRun for each text of a run, except field instructions (w:instrText) and the texts of runs which
 * are handled by visitStyleText.
 *
 * @param ctText the text to render
 * @param pageNumber true if the text belongs to a page number field
 * @param paragraphContainer the container returned by startVisitParagraph
 * @throws Exception if the text cannot be generated
 */
protected abstract void visitText( CTText ctText, boolean pageNumber, T paragraphContainer )
throws Exception;
/**
 * Called by visitRun for each child of a run which is a CTPTab.
 *
 * @param o the tab to render
 * @param paragraphContainer the container returned by startVisitParagraph
 * @throws Exception if the tab cannot be generated
 */
protected abstract void visitTab( CTPTab o, T paragraphContainer )
throws Exception;
/**
 * Called by visitRun for each w:tab child of a run which comes up as a CTEmpty.
 *
 * @param tabs the tabs returned by the styles document for the paragraph of the run
 *            (NOTE(review): may presumably be null when the paragraph declares no tabs -- confirm)
 * @param paragraphContainer the container returned by startVisitParagraph
 * @throws Exception if the tab cannot be generated
 */
protected abstract void visitTabs( CTTabs tabs, T paragraphContainer )
throws Exception;
/**
 * Visits a break of a run. A page break is not emitted immediately: it is recorded and emitted when the next
 * paragraph is visited. Any other break generates a new line.
 *
 * @param br the break, null for the w:br / w:cr elements which come up as CTEmpty
 * @param paragraphContainer the container returned by startVisitParagraph
 * @throws Exception if the new line cannot be generated
 */
protected void visitBR( CTBr br, T paragraphContainer )
    throws Exception
{
    final boolean isPageBreak = XWPFRunHelper.getBrType( br ).equals( STBrType.PAGE );
    if ( !isPageBreak )
    {
        addNewLine( br, paragraphContainer );
        return;
    }
    // the page break is done at the start of the next paragraph
    pageBreakOnNextParagraph = true;
}
/**
 * Called for each bookmark (CTBookmark) met among the children of a paragraph.
 *
 * @param bookmark the bookmark
 * @param paragraph the paragraph which owns the bookmark
 * @param paragraphContainer the container returned by startVisitParagraph
 * @throws Exception if the bookmark cannot be generated
 */
protected abstract void visitBookmark( CTBookmark bookmark, XWPFParagraph paragraph, T paragraphContainer )
throws Exception;
/**
 * Called by visitBR for a break which is not a page break.
 *
 * @param br the break; visitRun passes null for w:br / w:cr elements which come up as CTEmpty
 * @param paragraphContainer the container returned by startVisitParagraph
 * @throws Exception if the new line cannot be generated
 */
protected abstract void addNewLine( CTBr br, T paragraphContainer )
throws Exception;
/**
 * Generates a page break. Called by visitParagraph when a previous run requested a page break and by
 * visitParagraphBody for a paragraph which declares w:pageBreakBefore.
 *
 * @throws Exception if the page break cannot be generated
 */
protected abstract void pageBreak()
throws Exception;
/**
 * Visits the given table: computes the widths of its columns, then visits the rows between the start/end
 * callbacks.
 *
 * @param table the table to visit
 * @param index index of the table in the list of elements of its body (not used by this implementation)
 * @param container the parent container
 * @throws Exception if a callback or the visit of the rows fails
 */
protected void visitTable( XWPFTable table, int index, T container )
    throws Exception
{
    // the column widths are needed by the start callback and by each row
    final float[] columnWidths = XWPFTableUtil.computeColWidths( table );
    final T tableContainer = startVisitTable( table, columnWidths, container );
    visitTableBody( table, columnWidths, tableContainer );
    endVisitTable( table, container, tableContainer );
}
/**
 * Visits each row of the given table in order.
 *
 * @param table the table being visited
 * @param colWidths the widths of the columns of the table
 * @param tableContainer the container returned by startVisitTable
 * @throws Exception if the visit of a row fails
 */
protected void visitTableBody( XWPFTable table, float[] colWidths, T tableContainer )
    throws Exception
{
    final List<XWPFTableRow> tableRows = table.getRows();
    final int rowCount = tableRows.size();
    for ( int rowIndex = 0; rowIndex < rowCount; rowIndex++ )
    {
        visitTableRow( tableRows.get( rowIndex ), colWidths, tableContainer, rowIndex == 0,
                       isLastRow( rowIndex, rowCount ), rowIndex, rowCount );
    }
}
/**
 * Tells if the given zero-based row index is the index of the last row.
 *
 * @param rowIndex zero-based index of the row
 * @param rowsSize number of rows of the table
 * @return true if rowIndex designates the last row
 */
private boolean isLastRow( int rowIndex, int rowsSize )
{
    return rowIndex + 1 == rowsSize;
}
/**
 * Called by visitTable before the rows of the table are visited.
 *
 * @param table the table being visited
 * @param colWidths the column widths computed for the table
 * @param tableContainer the parent container (visitTable passes the container which hosts the table)
 * @return the container which is passed to the visit of the rows and to endVisitTable
 * @throws Exception if the start of the table cannot be generated
 */
protected abstract T startVisitTable( XWPFTable table, float[] colWidths, T tableContainer )
throws Exception;
/**
 * Called by visitTable after the rows of the table were visited.
 *
 * @param table the table being visited
 * @param parentContainer the parent container
 * @param tableContainer the container returned by startVisitTable
 * @throws Exception if the end of the table cannot be generated
 */
protected abstract void endVisitTable( XWPFTable table, T parentContainer, T tableContainer )
throws Exception;
/**
 * Visits the cells of the given row between the startVisitTableRow/endVisitTableRow hooks.
 * <p>
 * Cells which continue a vertical merge are skipped (they are rendered with the cell which starts the merge).
 * When the row exposes fewer cells than the table has columns, the XML of the row is walked directly in order to
 * also find the cells wrapped in a w:sdt, which POI does not return.
 *
 * @param row the row to visit
 * @param colWidths the widths of the columns of the table
 * @param tableContainer the container returned by startVisitTable
 * @param firstRow true if the row is the first row of the table
 * @param lastRowIfNoneVMerge true if the row is the last row of the table (used for cells without vertical merge)
 * @param rowIndex zero-based index of the row
 * @param rowsSize number of rows of the table
 * @throws Exception if a hook or the visit of a cell fails
 */
protected void visitTableRow( XWPFTableRow row, float[] colWidths, T tableContainer, boolean firstRow,
                              boolean lastRowIfNoneVMerge, int rowIndex, int rowsSize )
    throws Exception
{
    boolean headerRow = stylesDocument.isTableRowHeader( row );
    startVisitTableRow( row, tableContainer, rowIndex, headerRow );
    int nbColumns = colWidths.length;
    // Process cell
    boolean firstCol = true;
    boolean lastCol = false;
    boolean lastRow = false;
    List<XWPFTableCell> vMergedCells = null;
    List<XWPFTableCell> cells = row.getTableCells();
    if ( nbColumns > cells.size() )
    {
        // Columns number is not equal to cells number.
        // POI have a bug with
        // <w:tr w:rsidR="00C55C20">
        // <w:tc>
        // <w:tc>...
        // <w:sdt>
        // <w:sdtContent>
        // <w:tc> <= this tc which is a XWPFTableCell is not included in the row.getTableCells();
        // cellIndex is the zero-based index of the last grid column covered by the current cell
        int cellIndex = -1;
        int cellPtr = 0;
        CTRow ctRow = row.getCtRow();
        XmlCursor c = ctRow.newCursor();
        try
        {
            c.selectPath( "./*" );
            while ( c.toNextSelection() )
            {
                XmlObject o = c.getObject();
                if ( o instanceof CTTc )
                {
                    CTTc tc = (CTTc) o;
                    XWPFTableCell cell = row.getTableCell( tc );
                    cellIndex = getCellIndex( cellIndex, cell );
                    // the last column is reached when the cell covers the last grid column
                    lastCol = ( cellIndex >= nbColumns - 1 );
                    vMergedCells = getVMergedCells( cell, rowIndex, cellPtr );
                    if ( vMergedCells == null || vMergedCells.size() > 0 )
                    {
                        lastRow = isLastRow( lastRowIfNoneVMerge, rowIndex, rowsSize, vMergedCells );
                        visitCell( cell, tableContainer, firstRow, lastRow, firstCol, lastCol, rowIndex, cellPtr,
                                   vMergedCells );
                    }
                    cellPtr++;
                    firstCol = false;
                }
                else if ( o instanceof CTSdtCell )
                {
                    // Fix bug of POI
                    CTSdtCell sdtCell = (CTSdtCell) o;
                    List<CTTc> tcList = sdtCell.getSdtContent().getTcList();
                    for ( CTTc ctTc : tcList )
                    {
                        XWPFTableCell cell = new XWPFTableCell( ctTc, row, row.getTable().getBody() );
                        cellIndex = getCellIndex( cellIndex, cell );
                        lastCol = ( cellIndex >= nbColumns - 1 );
                        // register the cell in the row so that the lookups by cell index find it
                        List<XWPFTableCell> rowCells = row.getTableCells();
                        if ( !rowCells.contains( cell ) )
                        {
                            rowCells.add( cell );
                        }
                        vMergedCells = getVMergedCells( cell, rowIndex, cellPtr );
                        if ( vMergedCells == null || vMergedCells.size() > 0 )
                        {
                            lastRow = isLastRow( lastRowIfNoneVMerge, rowIndex, rowsSize, vMergedCells );
                            visitCell( cell, tableContainer, firstRow, lastRow, firstCol, lastCol, rowIndex,
                                       cellPtr, vMergedCells );
                        }
                        cellPtr++;
                        firstCol = false;
                    }
                }
            }
        }
        finally
        {
            // release the cursor even if the visit of a cell fails
            c.dispose();
        }
    }
    else
    {
        // Column number is equal to cells number.
        for ( int i = 0; i < cells.size(); i++ )
        {
            lastCol = ( i == cells.size() - 1 );
            XWPFTableCell cell = cells.get( i );
            vMergedCells = getVMergedCells( cell, rowIndex, i );
            if ( vMergedCells == null || vMergedCells.size() > 0 )
            {
                lastRow = isLastRow( lastRowIfNoneVMerge, rowIndex, rowsSize, vMergedCells );
                visitCell( cell, tableContainer, firstRow, lastRow, firstCol, lastCol, rowIndex, i, vMergedCells );
            }
            firstCol = false;
        }
    }
    // lastRow is the value computed for the last visited cell of the row
    endVisitTableRow( row, tableContainer, firstRow, lastRow, headerRow );
}
/**
 * Tells if a cell ends on the last row of the table, taking its vertical merge into account.
 *
 * @param lastRowIfNoneVMerge the answer to give when the cell has no vertical merge
 * @param rowIndex zero-based index of the row of the cell
 * @param rowsSize number of rows of the table
 * @param vMergedCells the cells vertically merged with the cell (the cell included), null if no vertical merge
 * @return true if the (merged) cell ends on the last row
 */
private boolean isLastRow( boolean lastRowIfNoneVMerge, int rowIndex, int rowsSize, List<XWPFTableCell> vMergedCells )
{
    // index of the last row spanned by the merged cell = rowIndex + number of merged cells - 1
    return vMergedCells == null ? lastRowIfNoneVMerge
                    : isLastRow( rowIndex - 1 + vMergedCells.size(), rowsSize );
}
/**
 * Advances the given column index by the number of grid columns covered by the given cell: its grid span, or one
 * when the cell declares no grid span.
 *
 * @param cellIndex the column index before the cell
 * @param cell the cell
 * @return the column index after the cell
 */
private int getCellIndex( int cellIndex, XWPFTableCell cell )
{
    final BigInteger gridSpan = stylesDocument.getTableCellGridSpan( cell.getCTTc().getTcPr() );
    final int coveredColumns = gridSpan != null ? gridSpan.intValue() : 1;
    return cellIndex + coveredColumns;
}
/**
 * Hook called by visitTableRow before the cells of the row are visited. This default implementation does nothing.
 *
 * @param row the row being visited
 * @param tableContainer the container returned by startVisitTable
 * @param rowIndex zero-based index of the row
 * @param headerRow true if the styles document tells that the row is a header row
 * @throws Exception declared for the overriding implementations
 */
protected void startVisitTableRow( XWPFTableRow row, T tableContainer, int rowIndex, boolean headerRow )
throws Exception
{
}
/**
 * Hook called by visitTableRow after the cells of the row were visited. This default implementation does nothing.
 *
 * @param row the row being visited
 * @param tableContainer the container returned by startVisitTable
 * @param firstRow true if the row is the first row of the table
 * @param lastRow the "last row" flag computed by visitTableRow for the last visited cell of the row
 * @param headerRow true if the styles document tells that the row is a header row
 * @throws Exception declared for the overriding implementations
 */
protected void endVisitTableRow( XWPFTableRow row, T tableContainer, boolean firstRow, boolean lastRow,
boolean headerRow )
throws Exception
{
}
/**
 * Visits the given cell: start callback, visit of the content of the cell (and of the cells vertically merged
 * with it), end callback.
 *
 * @param cell the cell to visit
 * @param tableContainer the container returned by startVisitTable
 * @param firstRow true if the cell belongs to the first row
 * @param lastRow true if the cell ends on the last row
 * @param firstCol true if the cell is the first cell of its row
 * @param lastCol true if the cell is the last cell of its row
 * @param rowIndex zero-based index of the row (not used by this implementation)
 * @param cellIndex zero-based index of the cell in its row (not used by this implementation)
 * @param vMergedCells the cells vertically merged with the cell (the cell included), null if no vertical merge
 * @throws Exception if a callback or the visit of the content fails
 */
protected void visitCell( XWPFTableCell cell, T tableContainer, boolean firstRow, boolean lastRow,
                          boolean firstCol, boolean lastCol, int rowIndex, int cellIndex,
                          List<XWPFTableCell> vMergedCells )
    throws Exception
{
    final T cellContainer =
        startVisitTableCell( cell, tableContainer, firstRow, lastRow, firstCol, lastCol, vMergedCells );
    visitTableCellBody( cell, vMergedCells, cellContainer );
    endVisitTableCell( cell, tableContainer, cellContainer );
}
/**
 * Collects the cells which are vertically merged with the given cell.
 *
 * @param cell the table cell.
 * @param rowIndex the index of the row which holds the cell.
 * @param cellIndex the index used to look up the cell in the following rows.
 * @return null if the cell declares no vMerge; an empty list if the cell declares a vMerge other than "restart"
 *         (the cell was already processed with the cell which started the merge); otherwise the given cell
 *         followed by the cells of the next rows, at the same index, which declare vMerge="continue".
 */
private List<XWPFTableCell> getVMergedCells( XWPFTableCell cell, int rowIndex, int cellIndex )
{
    STMerge.Enum vMerge = stylesDocument.getTableCellVMerge( cell );
    if ( vMerge == null )
    {
        // no vertical merge at all.
        return null;
    }
    if ( !vMerge.equals( STMerge.RESTART ) )
    {
        // vMerge="continue", ignore the cell because it was already processed
        return Collections.emptyList();
    }

    // vMerge="restart": walk down the following rows while the cell at the same index continues the merge.
    List<XWPFTableCell> mergedCells = new ArrayList<XWPFTableCell>();
    mergedCells.add( cell );
    XWPFTable table = cell.getTableRow().getTable();
    int nextRowIndex = rowIndex + 1;
    while ( nextRowIndex < table.getRows().size() )
    {
        XWPFTableCell candidate = table.getRow( nextRowIndex ).getCell( cellIndex );
        if ( candidate == null )
        {
            break;
        }
        STMerge.Enum candidateVMerge = stylesDocument.getTableCellVMerge( candidate );
        if ( candidateVMerge == null || !candidateVMerge.equals( STMerge.CONTINUE ) )
        {
            // a new merge starts or the cell is not merged: the current merge is finished.
            break;
        }
        mergedCells.add( candidate );
        nextRowIndex++;
    }
    return mergedCells;
}
/**
 * Visits the body elements of a table cell.
 *
 * @param cell the table cell.
 * @param vMergeCells the vertically merged cells, or null if the cell is not vertically merged. When not null,
 *            the body elements of each cell of this list are visited instead of those of the given cell.
 * @param tableCellContainer the container which receives the visited body elements.
 * @throws Exception if the visit of the body elements fails.
 */
protected void visitTableCellBody( XWPFTableCell cell, List<XWPFTableCell> vMergeCells, T tableCellContainer )
    throws Exception
{
    if ( vMergeCells == null )
    {
        // simple cell: visit its own content.
        visitBodyElements( cell.getBodyElements(), tableCellContainer );
        return;
    }
    // merged cells: the content of every merged cell goes to the same container.
    for ( XWPFTableCell mergedCell : vMergeCells )
    {
        visitBodyElements( mergedCell.getBodyElements(), tableCellContainer );
    }
}
/**
 * Starts the visit of a table cell. visitCell calls this method before visiting the cell body and passes the
 * returned container to visitTableCellBody and endVisitTableCell.
 *
 * @param cell the table cell.
 * @param tableContainer the container of the table which owns the cell.
 * @param firstRow the first-row flag supplied by the caller.
 * @param lastRow the last-row flag supplied by the caller.
 * @param firstCol the first-column flag supplied by the caller.
 * @param lastCol the last-column flag supplied by the caller.
 * @param vMergeCells the vertically merged cells as passed by visitCell, or null if the cell is not vertically
 *            merged.
 * @return the container which receives the content of the cell.
 * @throws Exception if the cell cannot be started.
 */
protected abstract T startVisitTableCell( XWPFTableCell cell, T tableContainer, boolean firstRow, boolean lastRow,
boolean firstCol, boolean lastCol, List<XWPFTableCell> vMergeCells )
throws Exception;
/**
 * Ends the visit of a table cell. visitCell calls this method after the cell body has been visited.
 *
 * @param cell the table cell.
 * @param tableContainer the container of the table which owns the cell.
 * @param tableCellContainer the container returned by startVisitTableCell for this cell.
 * @throws Exception if the cell cannot be ended.
 */
protected abstract void endVisitTableCell( XWPFTableCell cell, T tableContainer, T tableCellContainer )
throws Exception;
/**
 * Looks up a style of the document by its identifier.
 *
 * @param styleID the style identifier, may be null.
 * @return null if styleID is null, otherwise the result of the lookup in the styles of the document.
 */
protected XWPFStyle getXWPFStyle( String styleID )
{
    return ( styleID != null ) ? document.getStyles().getStyle( styleID ) : null;
}
/**
 * Returns true if word/document.xml is being parsed, that is when neither a header nor a footer is currently
 * being visited, and false otherwise.
 *
 * @return true if no header and no footer is currently being visited, false otherwise.
 */
protected boolean isWordDocumentPartParsing()
{
    boolean headerOrFooterInProgress = currentHeader != null || currentFooter != null;
    return !headerOrFooterInProgress;
}
// ------------------------------ Header/Footer visitor -----------
/**
 * Resolves the header of the given header reference and visits it. The resolved header is stored as the current
 * header for the duration of the visit (see isWordDocumentPartParsing and getPictureDataByID) and is always
 * cleared afterwards, even if the visit fails.
 *
 * @param headerRef the header reference.
 * @param sectPr the section properties, forwarded to visitHeader.
 * @param masterPage the master page, forwarded to visitHeader.
 * @throws Exception if the header cannot be loaded or visited.
 */
public void visitHeaderRef( CTHdrFtrRef headerRef, CTSectPr sectPr, E masterPage )
    throws Exception
{
    this.currentHeader = getXWPFHeader( headerRef );
    try
    {
        visitHeader( currentHeader, headerRef, sectPr, masterPage );
    }
    finally
    {
        // reset the state even on failure, otherwise the visitor would keep behaving as if a header was parsed.
        this.currentHeader = null;
    }
}
/**
 * Visits the given header. visitHeaderRef calls this method with the header resolved from the header reference.
 *
 * @param header the header to visit.
 * @param headerRef the header reference which designates the header.
 * @param sectPr the section properties supplied by the caller.
 * @param masterPage the master page supplied by the caller.
 * @throws Exception if the header cannot be visited.
 */
protected abstract void visitHeader( XWPFHeader header, CTHdrFtrRef headerRef, CTSectPr sectPr, E masterPage )
throws Exception;
/**
 * Resolves the footer of the given footer reference and visits it. The resolved footer is stored as the current
 * footer for the duration of the visit (see isWordDocumentPartParsing and getPictureDataByID) and is always
 * cleared afterwards, even if the visit fails.
 *
 * @param footerRef the footer reference.
 * @param sectPr the section properties, forwarded to visitFooter.
 * @param masterPage the master page, forwarded to visitFooter.
 * @throws Exception if the footer cannot be loaded or visited.
 */
public void visitFooterRef( CTHdrFtrRef footerRef, CTSectPr sectPr, E masterPage )
    throws Exception
{
    this.currentFooter = getXWPFFooter( footerRef );
    try
    {
        visitFooter( currentFooter, footerRef, sectPr, masterPage );
    }
    finally
    {
        // reset the state even on failure, otherwise the visitor would keep behaving as if a footer was parsed.
        this.currentFooter = null;
    }
}
/**
 * Visits the given footer. visitFooterRef calls this method with the footer resolved from the footer reference.
 *
 * @param footer the footer to visit.
 * @param footerRef the footer reference which designates the footer.
 * @param sectPr the section properties supplied by the caller.
 * @param masterPage the master page supplied by the caller.
 * @throws Exception if the footer cannot be visited.
 */
protected abstract void visitFooter( XWPFFooter footer, CTHdrFtrRef footerRef, CTSectPr sectPr, E masterPage )
throws Exception;
/**
 * Returns the list of {@link IBodyElement} of the given header/footer. The list is built here because
 * {@link XWPFHeaderFooter#getBodyElements()} doesn't contain the elements nested in
 * {@code <w:sdt><w:sdtContent>} (see JUnit Docx4j_GettingStarted, DocXperT_Output_4_3, Issue222 which define the
 * page number inside a {@code <w:sdt>}).
 *
 * @param part the header or footer.
 * @return a new list with the paragraphs and tables of the header/footer, including those nested in
 *         {@code <w:sdt><w:sdtContent>}.
 */
protected List<IBodyElement> getBodyElements( XWPFHeaderFooter part )
{
    List<IBodyElement> elements = new ArrayList<IBodyElement>();
    addBodyElements( part._getHdrFtr(), part, elements );
    return elements;
}
/**
 * Adds the body elements found in the direct children of the given token source. Paragraphs and tables are
 * wrapped and appended in document order; the content of a {@code <w:sdt><w:sdtContent>} block is processed
 * recursively so that the elements it contains are appended too. Other children are ignored.
 *
 * @param source the XML source whose direct children are inspected.
 * @param part the body which owns the created paragraphs and tables.
 * @param bodyElements the list which receives the created body elements.
 */
private void addBodyElements( XmlTokenSource source, IBody part, List<IBodyElement> bodyElements )
{
    // parse the document with cursor and add
    // the XmlObject to its lists
    XmlCursor cursor = source.newCursor();
    try
    {
        cursor.selectPath( "./*" );
        while ( cursor.toNextSelection() )
        {
            XmlObject o = cursor.getObject();
            if ( o instanceof CTSdtBlock )
            {
                // <w:sdt><w:sdtContent><p...
                CTSdtContentBlock contentBlock = ( (CTSdtBlock) o ).getSdtContent();
                if ( contentBlock != null )
                {
                    addBodyElements( contentBlock, part, bodyElements );
                }
            }
            else if ( o instanceof CTP )
            {
                bodyElements.add( new XWPFParagraph( (CTP) o, part ) );
            }
            else if ( o instanceof CTTbl )
            {
                bodyElements.add( new XWPFTable( (CTTbl) o, part ) );
            }
        }
    }
    finally
    {
        // release the cursor even if the traversal fails.
        cursor.dispose();
    }
}
/**
 * Returns the {@link XWPFHeader} of the given header reference.
 *
 * @param headerRef the header reference.
 * @return the header already loaded by the document for the referenced part, or a header parsed from the
 *         referenced part if the document has not loaded it.
 * @throws XmlException if the header part cannot be parsed.
 * @throws IOException if the header part cannot be read.
 */
protected XWPFHeader getXWPFHeader( CTHdrFtrRef headerRef )
    throws XmlException, IOException
{
    PackagePart hdrPart = document.getPartById( headerRef.getId() );
    for ( XWPFHeader header : document.getHeaderList() )
    {
        if ( header.getPackagePart().equals( hdrPart ) )
        {
            // header is already loaded, return it.
            return header;
        }
    }
    // should never come, but load the header if needed.
    java.io.InputStream in = hdrPart.getInputStream();
    try
    {
        HdrDocument hdrDoc = HdrDocument.Factory.parse( in );
        return new XWPFHeader( document, hdrDoc.getHdr() );
    }
    finally
    {
        // the stream of the package part must not stay open once the header is parsed.
        in.close();
    }
}
/**
 * Returns the {@link XWPFFooter} of the given footer reference.
 *
 * @param footerRef the footer reference.
 * @return the footer already loaded by the document for the referenced part, or a footer parsed from the
 *         referenced part if the document has not loaded it.
 * @throws XmlException if the footer part cannot be parsed.
 * @throws IOException if the footer part cannot be read.
 */
protected XWPFFooter getXWPFFooter( CTHdrFtrRef footerRef )
    throws XmlException, IOException
{
    PackagePart ftrPart = document.getPartById( footerRef.getId() );
    for ( XWPFFooter footer : document.getFooterList() )
    {
        if ( footer.getPackagePart().equals( ftrPart ) )
        {
            // footer is already loaded, return it.
            return footer;
        }
    }
    // should never come, but load the footer if needed.
    java.io.InputStream in = ftrPart.getInputStream();
    try
    {
        FtrDocument ftrDoc = FtrDocument.Factory.parse( in );
        return new XWPFFooter( document, ftrDoc.getFtr() );
    }
    finally
    {
        // the stream of the package part must not stay open once the footer is parsed.
        in.close();
    }
}
// ------------------------ Image --------------
/**
 * Visits a drawing: first all its inline elements, then all its anchor elements.
 *
 * @param drawing the drawing to visit.
 * @param parentContainer the container which receives the visited content.
 * @throws Exception if the visit of an inline or an anchor fails.
 */
protected void visitDrawing( CTDrawing drawing, T parentContainer )
    throws Exception
{
    // inline drawings first...
    for ( CTInline inline : drawing.getInlineList() )
    {
        visitInline( inline, parentContainer );
    }
    // ... then anchored drawings.
    for ( CTAnchor anchor : drawing.getAnchorList() )
    {
        visitAnchor( anchor, parentContainer );
    }
}
/**
 * Visits an anchored drawing. The horizontal/vertical position (relative origin and offset converted with
 * DxaUtil.emu2points) and the text wrapping of a square wrap are read from the anchor when they are declared,
 * and are passed as null otherwise.
 *
 * @param anchor the anchor to visit.
 * @param parentContainer the container which receives the visited content.
 * @throws Exception if the visit of the graphical object fails.
 */
protected void visitAnchor( CTAnchor anchor, T parentContainer )
    throws Exception
{
    /*
     * <wp:positionH relativeFrom="column"> <wp:posOffset>-898525</wp:posOffset> </wp:positionH>
     */
    STRelFromH.Enum horizontalOrigin = null;
    Float x = null;
    CTPosH horizontal = anchor.getPositionH();
    if ( horizontal != null )
    {
        horizontalOrigin = horizontal.getRelativeFrom();
        x = DxaUtil.emu2points( horizontal.getPosOffset() );
    }

    STRelFromV.Enum verticalOrigin = null;
    Float y = null;
    CTPosV vertical = anchor.getPositionV();
    if ( vertical != null )
    {
        verticalOrigin = vertical.getRelativeFrom();
        y = DxaUtil.emu2points( vertical.getPosOffset() );
    }

    STWrapText.Enum wrap = null;
    CTWrapSquare square = anchor.getWrapSquare();
    if ( square != null )
    {
        wrap = square.getWrapText();
    }

    visitGraphicalObject( parentContainer, anchor.getGraphic(), x, horizontalOrigin, y, verticalOrigin, wrap );
}
/**
 * Visits an inline drawing. An inline drawing carries no position and no wrapping information, so null is
 * passed for every offset, relative origin and wrapping argument.
 *
 * @param inline the inline drawing to visit.
 * @param parentContainer the container which receives the visited content.
 * @throws Exception if the visit of the graphical object fails.
 */
protected void visitInline( CTInline inline, T parentContainer )
    throws Exception
{
    visitGraphicalObject( parentContainer, inline.getGraphic(), null, null, null, null, null );
}
/**
 * Visits the pictures declared as direct children of the graphic data of the given graphical object. For each
 * picture, the image is first handed to the image extractor when one is configured (an extraction failure is
 * logged and does not stop the visit), then visitPicture is called. Nothing happens if the graphical object or
 * its graphic data is null.
 *
 * @param parentContainer the container which receives the visited content.
 * @param graphic the graphical object, may be null.
 * @param offsetX the horizontal offset, may be null.
 * @param relativeFromH the origin of the horizontal offset, may be null.
 * @param offsetY the vertical offset, may be null.
 * @param relativeFromV the origin of the vertical offset, may be null.
 * @param wrapText the text wrapping, may be null.
 * @throws Exception if the visit of a picture fails.
 */
private void visitGraphicalObject( T parentContainer, CTGraphicalObject graphic, Float offsetX,
                                   STRelFromH.Enum relativeFromH, Float offsetY, STRelFromV.Enum relativeFromV,
                                   STWrapText.Enum wrapText )
    throws Exception
{
    if ( graphic == null )
    {
        return;
    }
    CTGraphicalObjectData graphicData = graphic.getGraphicData();
    if ( graphicData == null )
    {
        return;
    }
    XmlCursor c = graphicData.newCursor();
    try
    {
        c.selectPath( "./*" );
        while ( c.toNextSelection() )
        {
            XmlObject o = c.getObject();
            if ( !( o instanceof CTPicture ) )
            {
                continue;
            }
            CTPicture picture = (CTPicture) o;
            // extract the picture if needed
            IImageExtractor extractor = getImageExtractor();
            if ( extractor != null )
            {
                XWPFPictureData pictureData = getPictureData( picture );
                if ( pictureData != null )
                {
                    try
                    {
                        extractor.extract( WORD_MEDIA + pictureData.getFileName(), pictureData.getData() );
                    }
                    catch ( Throwable e )
                    {
                        // best effort: a failing extraction must not prevent the picture from being visited.
                        LOGGER.log( Level.SEVERE,
                                    "Error while extracting the image " + pictureData.getFileName(), e );
                    }
                }
            }
            // visit the picture.
            visitPicture( picture, offsetX, relativeFromH, offsetY, relativeFromV, wrapText, parentContainer );
        }
    }
    finally
    {
        // visitPicture may throw: release the cursor in every case.
        c.dispose();
    }
}
/**
 * Returns the picture data of the given image id. The lookup is done in the header which is currently being
 * visited if any, else in the footer which is currently being visited if any, else in the document.
 *
 * @param blipId the image id.
 * @return the result of the lookup in the current header, current footer or document.
 */
protected XWPFPictureData getPictureDataByID( String blipId )
{
    XWPFPictureData pictureData;
    if ( currentHeader != null )
    {
        pictureData = currentHeader.getPictureDataByID( blipId );
    }
    else if ( currentFooter != null )
    {
        pictureData = currentFooter.getPictureDataByID( blipId );
    }
    else
    {
        pictureData = document.getPictureDataByID( blipId );
    }
    return pictureData;
}
/**
 * Returns the image extractor configured in the options.
 *
 * @return the extractor provided by the options; callers in this class treat null as "no extraction".
 */
protected IImageExtractor getImageExtractor()
{
    IImageExtractor configuredExtractor = options.getExtractor();
    return configuredExtractor;
}
/**
 * Returns the picture data of the given picture.
 *
 * @param picture the picture.
 * @return the picture data found with the embed id of the blip of the picture, or null if the picture declares
 *         no blip fill or no blip (instead of failing with a NullPointerException).
 */
public XWPFPictureData getPictureData( CTPicture picture )
{
    // guard the getter chain: a picture without blip fill / blip has no embedded image to look up.
    if ( picture.getBlipFill() == null || picture.getBlipFill().getBlip() == null )
    {
        return null;
    }
    String blipId = picture.getBlipFill().getBlip().getEmbed();
    return getPictureDataByID( blipId );
}
/**
 * Visits a picture. Within this class it is called for each picture found in the graphic data of a drawing. The
 * position and wrapping arguments are null for an inline drawing; for an anchored drawing they are read from
 * the anchor when declared.
 *
 * @param picture the picture to visit.
 * @param offsetX the horizontal offset (converted with DxaUtil.emu2points for an anchor), may be null.
 * @param relativeFromH the origin of the horizontal offset, may be null.
 * @param offsetY the vertical offset (converted with DxaUtil.emu2points for an anchor), may be null.
 * @param relativeFromV the origin of the vertical offset, may be null.
 * @param wrapText the text wrapping, may be null.
 * @param parentContainer the container which receives the visited content.
 * @throws Exception if the picture cannot be visited.
 */
protected abstract void visitPicture( CTPicture picture, Float offsetX, STRelFromH.Enum relativeFromH,
Float offsetY, STRelFromV.Enum relativeFromV, STWrapText.Enum wrapText,
T parentContainer )
throws Exception;
}