/** * Copyright (C) 2011-2015 The XDocReport Team <xdocreport@googlegroups.com> * * All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ package fr.opensagres.poi.xwpf.converter.core; import java.io.IOException; import java.lang.reflect.Field; import java.math.BigInteger; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.poi.openxml4j.opc.PackagePart; import org.apache.poi.xwpf.usermodel.*; import org.apache.xmlbeans.XmlCursor; import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlObject; import org.apache.xmlbeans.XmlTokenSource; import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObject; import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObjectData; import org.openxmlformats.schemas.drawingml.x2006.picture.CTPicture; import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTAnchor; import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline; import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTPosH; import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTPosV; import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTWrapSquare; import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.STRelFromH; import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.STRelFromV; import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.STWrapText; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDrawing; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTEmpty; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHdrFtrRef; import 
org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTLvl; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumPr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPPr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRPr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRunTrackChange; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtCell; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentBlock; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentRun; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSimpleField; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSmartTagRun; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyle; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTabs; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; import org.openxmlformats.schemas.wordprocessingml.x2006.main.FtrDocument; import org.openxmlformats.schemas.wordprocessingml.x2006.main.HdrDocument; import 
org.openxmlformats.schemas.wordprocessingml.x2006.main.STBrType;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STFldCharType;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STMerge;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.STOnOff;

import fr.opensagres.poi.xwpf.converter.core.styles.XWPFStylesDocument;
import fr.opensagres.poi.xwpf.converter.core.utils.DxaUtil;
import fr.opensagres.poi.xwpf.converter.core.utils.StringUtils;
import fr.opensagres.poi.xwpf.converter.core.utils.XWPFRunHelper;
import fr.opensagres.poi.xwpf.converter.core.utils.XWPFTableUtil;
import org.xml.sax.SAXException;

/**
 * Visitor to visit elements from entry word/document.xml, word/header*.xml, word/footer*.xml.
 * <p>
 * Subclasses supply the output-specific behaviour through the abstract <code>startVisit*</code> /
 * <code>endVisit*</code> / <code>visit*</code> methods; this class drives the traversal.
 *
 * @param <T> type of the container objects returned by the <code>startVisit*</code> methods and handed to the
 *            nested visits
 * @param <O> type of the conversion options, exposed through <code>getOptions()</code>
 * @param <E> type of the master page passed to the header/footer visit methods
 */
public abstract class XWPFDocumentVisitor<T, O extends Options, E extends IXWPFMasterPage>
    implements IMasterPageHandler<E>
{

    /** Logger of this class (java.util.logging). */
    private static final Logger LOGGER = Logger.getLogger( XWPFDocumentVisitor.class.getName() );

    /** Path prefix constant; not referenced in the visible part of this class. */
    protected static final String WORD_MEDIA = "word/media/";

    /** The document being visited. */
    protected final XWPFDocument document;

    /** Manager created in the constructor from the document and this visitor (as master page handler). */
    private final MasterPageManager masterPageManager;

    /** Header currently being visited; null when no header is being visited. */
    private XWPFHeader currentHeader;

    /** Footer currently being visited; null when no footer is being visited. */
    private XWPFFooter currentFooter;

    /** Styles document created by {@link #createStylesDocument(XWPFDocument)}. */
    protected final XWPFStylesDocument stylesDocument;

    /** Options given to the constructor. */
    protected final O options;

    /** When true, a page break is emitted before the next visited paragraph. */
    private boolean pageBreakOnNextParagraph;

    /** Flag maintained while runs are visited; reflects whether the current field is a NUMPAGES field. */
    protected boolean processingTotalPageCountField = false;

    /** Flag switched to true while field instructions are parsed; never reset in the visible code. */
    protected boolean totalPageFieldUsed = false;

    /**
     * Map of w:numId and ListContext (lazily created).
     */
    private Map<Integer, ListContext> listContextMap;

    /**
     * Creates the visitor for the given document.
     * <p>
     * NOTE(review): the constructor calls the overridable {@link #createStylesDocument(XWPFDocument)}; an override
     * runs before the subclass constructor body has executed.
     *
     * @param document the document to visit
     * @param options the conversion options
     * @throws Exception declared by the constructor; the visible calls are the styles document creation and the
     *             master page manager creation
     */
    public XWPFDocumentVisitor( XWPFDocument document, O options )
        throws Exception
    {
        this.document = document;
        this.options = options;
        this.stylesDocument = createStylesDocument( document );
        this.masterPageManager = new MasterPageManager( document.getDocument(), this );
    }

    /**
     * Creates the styles document used by this visitor. Subclasses may override it to provide another
     * implementation.
     *
     * @param document the document to visit
     * @return a new {@link XWPFStylesDocument} built from the document
     * @throws XmlException declared by this method
     * @throws IOException declared by this method
     */
    protected XWPFStylesDocument createStylesDocument( XWPFDocument document )
        throws XmlException, IOException
    {
        return new XWPFStylesDocument( document );
    }

    public XWPFStylesDocument
getStylesDocument() { return stylesDocument; } public O getOptions() { return options; } public MasterPageManager getMasterPageManager() { return masterPageManager; } // ------------------------------ Start/End document visitor ----------- /** * Main entry for visit XWPFDocument. * * @throws Exception */ public void start() throws Exception { // start document T container = startVisitDocument(); // Create IText, XHTML element for each XWPF elements from the w:body List<IBodyElement> bodyElements = document.getBodyElements(); visitBodyElements( bodyElements, container ); // end document endVisitDocument(); } /** * Start of visit document. * * @return * @throws Exception */ protected abstract T startVisitDocument() throws Exception; /** * End of visit document. * * @throws Exception */ protected abstract void endVisitDocument() throws Exception; // ------------------------------ XWPF Elements visitor ----------- protected void visitBodyElements( List<IBodyElement> bodyElements, T container ) throws Exception { if ( !masterPageManager.isInitialized() ) { // master page manager which hosts each <:w;sectPr declared in the word/document.xml // must be initialized. 
The initialization loop for each // <w:p paragraph to compute a list of <w:sectPr which contains information // about header/footer declared in the <w:headerReference/<w:footerReference masterPageManager.initialize(); } String previousParagraphStyleName = null; for ( int i = 0; i < bodyElements.size(); i++ ) { IBodyElement bodyElement = bodyElements.get( i ); switch ( bodyElement.getElementType() ) { case PARAGRAPH: XWPFParagraph paragraph = (XWPFParagraph) bodyElement; String paragraphStyleName = paragraph.getStyleID(); boolean sameStyleBelow = ( paragraphStyleName != null && paragraphStyleName.equals( previousParagraphStyleName ) ); visitParagraph( paragraph, i, container ); break; case TABLE: previousParagraphStyleName = null; visitTable( (XWPFTable) bodyElement, i, container ); break; case CONTENTCONTROL: visitSDT((XWPFSDT)bodyElement, i, container); break; } } } /** * @param contents content controls */ protected void visitSDT(XWPFSDT contents, int index, T container) throws Exception { T sdtContainer = startVisitSDT( contents, container ); visitSDTBody( contents, sdtContainer ); endVisitSDT( contents, container, sdtContainer ); } protected abstract T startVisitSDT(XWPFSDT contents, T container) throws SAXException; protected abstract void endVisitSDT(XWPFSDT contents, T container, T sdtContainer) throws SAXException; protected void visitSDTBody(XWPFSDT contents, T sdtContainer) throws Exception { ISDTContent content = contents.getContent(); Field bodyElements; try { bodyElements = content.getClass().getDeclaredField("bodyElements"); bodyElements.setAccessible(true); List<ISDTContents> isdtContents = (List<ISDTContents>) bodyElements.get(content); for (int i = 0; i < isdtContents.size(); i++) { ISDTContents isdtContent = isdtContents.get(i); if (isdtContent instanceof XWPFParagraph) { visitParagraph((XWPFParagraph) isdtContent, i, sdtContainer); } else if (isdtContent instanceof XWPFTable) { visitTable((XWPFTable) isdtContent, i, sdtContainer); } else if 
(isdtContent instanceof XWPFRun) { visitRun((XWPFParagraph) ((XWPFRun) isdtContent).getParent(), (XmlObject) isdtContent, sdtContainer); } else if (isdtContent instanceof XWPFSDT) { visitSDT((XWPFSDT) isdtContent, i, sdtContainer); } } } catch (NoSuchFieldException e) { e.printStackTrace(); } catch (IllegalAccessException e) { e.printStackTrace(); } } /** * Visit the given paragraph. * * @param paragraph * @param index * @param container * @throws Exception */ protected void visitParagraph( XWPFParagraph paragraph, int index, T container ) throws Exception { if ( isWordDocumentPartParsing() ) { // header/footer is not parsing. // It's the word/document.xml which is parsing // test if the current paragraph define a <w:sectPr // to update the header/footer declared in the <w:headerReference/<w:footerReference masterPageManager.update( paragraph.getCTP() ); } if ( pageBreakOnNextParagraph ) { pageBreak(); } this.pageBreakOnNextParagraph = false; ListItemContext itemContext = null; CTNumPr originalNumPr = stylesDocument.getParagraphNumPr( paragraph ); CTNumPr numPr = getNumPr( originalNumPr ); if ( numPr != null ) { // paragraph is a numbered/bullet list // see http://msdn.microsoft.com/en-us/library/office/ee922775%28v=office.14%29.aspx // - <w:p> // - <w:pPr> // <w:pStyle w:val="style0" /> // - <w:numPr> // <w:ilvl w:val="0" /> // <w:numId w:val="2" /> // </w:numPr> // get numbering.xml/w:num /** * <w:num w:numId="2"> <w:abstractNumId w:val="1" /> </w:num> */ XWPFNum num = getXWPFNum( numPr ); if ( num != null ) { // get the abstractNum by usisng abstractNumId /** * <w:abstractNum w:abstractNumId="1"> <w:nsid w:val="3CBA6E67" /> <w:multiLevelType * w:val="hybridMultilevel" /> <w:tmpl w:val="7416D4FA" /> - <w:lvl w:ilvl="0" w:tplc="040C0001"> * <w:start w:val="1" /> <w:numFmt w:val="bullet" /> <w:lvlText w:val="o" /> <w:lvlJc w:val="left" /> - * <w:pPr> <w:ind w:left="720" w:hanging="360" /> </w:pPr> - <w:rPr> <w:rFonts w:ascii="Symbol" * w:hAnsi="Symbol" 
w:hint="default" /> </w:rPr> </w:lvl> */ XWPFAbstractNum abstractNum = getXWPFAbstractNum( num ); // get the <w:lvl by using abstractNum and numPr level /** * <w:num w:numId="2"> <w:abstractNumId w:val="1" /> </w:num> */ CTDecimalNumber ilvl = numPr.getIlvl(); int level = ilvl != null ? ilvl.getVal().intValue() : 0; CTLvl lvl = abstractNum.getAbstractNum().getLvlArray( level ); if ( lvl != null ) { ListContext listContext = getListContext( originalNumPr.getNumId().getVal().intValue() ); itemContext = listContext.addItem( lvl ); } } } T paragraphContainer = startVisitParagraph( paragraph, itemContext, container ); visitParagraphBody( paragraph, index, paragraphContainer ); endVisitParagraph( paragraph, container, paragraphContainer ); } private CTNumPr getNumPr( CTNumPr numPr ) { if ( numPr != null ) { XWPFNum num = getXWPFNum( numPr ); if ( num != null ) { // get the abstractNum by usisng abstractNumId /** * <w:abstractNum w:abstractNumId="1"> <w:nsid w:val="3CBA6E67" /> <w:multiLevelType * w:val="hybridMultilevel" /> <w:tmpl w:val="7416D4FA" /> - <w:lvl w:ilvl="0" w:tplc="040C0001"> * <w:start w:val="1" /> <w:numFmt w:val="bullet" /> <w:lvlText w:val="o" /> <w:lvlJc w:val="left" /> - * <w:pPr> <w:ind w:left="720" w:hanging="360" /> </w:pPr> - <w:rPr> <w:rFonts w:ascii="Symbol" * w:hAnsi="Symbol" w:hint="default" /> </w:rPr> </w:lvl> */ XWPFAbstractNum abstractNum = getXWPFAbstractNum( num ); CTString numStyleLink = abstractNum.getAbstractNum().getNumStyleLink(); String styleId = numStyleLink != null ? 
numStyleLink.getVal() : null; if ( styleId != null ) { // has w:numStyleLink which reference other style /* * <w:abstractNum w:abstractNumId="0"> <w:nsid w:val="03916EF0"/> <w:multiLevelType * w:val="multilevel"/> <w:tmpl w:val="0409001D"/> <w:numStyleLink w:val="EricsListStyle"/> * </w:abstractNum> */ CTStyle style = stylesDocument.getStyle( styleId ); CTPPr ppr = style.getPPr(); if ( ppr == null ) { return null; } return getNumPr( ppr.getNumPr() ); } } } return numPr; } private ListContext getListContext( int numId ) { if ( listContextMap == null ) { listContextMap = new HashMap<Integer, ListContext>(); } ListContext listContext = listContextMap.get( numId ); if ( listContext == null ) { listContext = new ListContext(); listContextMap.put( numId, listContext ); } return listContext; } protected abstract T startVisitParagraph( XWPFParagraph paragraph, ListItemContext itemContext, T parentContainer ) throws Exception; protected abstract void endVisitParagraph( XWPFParagraph paragraph, T parentContainer, T paragraphContainer ) throws Exception; protected void visitParagraphBody( XWPFParagraph paragraph, int index, T paragraphContainer ) throws Exception { List<XWPFRun> runs = paragraph.getRuns(); if ( runs.isEmpty() ) { // a new line must be generated if : // - there is next paragraph/table // - if the body is a cell (with none vMerge) and contains just this paragraph if ( isAddNewLine( paragraph, index ) ) { visitEmptyRun( paragraphContainer ); } // sometimes, POI tells that run is empty // but it can be have w:r in the w:pPr // <w:p><w:pPr .. 
<w:r> => See the header1.xml of DocxBig.docx , // => test if it exist w:r // CTP p = paragraph.getCTP(); // CTPPr pPr = p.getPPr(); // if (pPr != null) { // XmlObject[] wRuns = // pPr.selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' .//w:r"); // if (wRuns != null) { // for ( int i = 0; i < wRuns.length; i++ ) // { // XmlObject o = wRuns[i]; // o.getDomNode().getParentNode() // if (o instanceof CTR) { // System.err.println(wRuns[i]); // } // // } // } // } // //XmlObject[] t = // o.selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' .//w:t"); // //paragraph.getCTP().get } else { // Loop for each element of <w:r, w:fldSimple // to keep the order of those elements. visitRuns( paragraph, paragraphContainer ); } // Page Break // Cannot use paragraph.isPageBreak() because it throws NPE because // pageBreak.getVal() can be null. CTPPr ppr = paragraph.getCTP().getPPr(); if ( ppr != null ) { if ( ppr.isSetPageBreakBefore() ) { CTOnOff pageBreak = ppr.getPageBreakBefore(); if ( pageBreak != null && ( pageBreak.getVal() == null || pageBreak.getVal().intValue() == STOnOff.INT_TRUE ) ) { pageBreak(); } } } } // ------------------------ Numbering -------------- protected XWPFNum getXWPFNum( CTNumPr numPr ) { CTDecimalNumber numID = numPr.getNumId(); if ( numID == null ) { // numID can be null, ignore the numbering // see https://code.google.com/p/xdocreport/issues/detail?id=239 return null; } XWPFNum num = document.getNumbering().getNum( numID.getVal() ); return num; } protected XWPFAbstractNum getXWPFAbstractNum( XWPFNum num ) { CTDecimalNumber abstractNumID = num.getCTNum().getAbstractNumId(); XWPFAbstractNum abstractNum = document.getNumbering().getAbstractNum( abstractNumID.getVal() ); return abstractNum; } /** * Returns true if the given paragraph which is empty (none <w:r> run) must generate new line and false otherwise. 
* * @param paragraph * @param index * @return */ private boolean isAddNewLine( XWPFParagraph paragraph, int index ) { // a new line must be generated if : // - there is next paragraph/table // - if the body is a cell (with none vMerge) and contains just this paragraph IBody body = paragraph.getBody(); List<IBodyElement> bodyElements = body.getBodyElements(); if ( body.getPartType() == BodyType.TABLECELL && bodyElements.size() == 1 ) { XWPFTableCell cell = (XWPFTableCell) body; STMerge.Enum vMerge = stylesDocument.getTableCellVMerge( cell ); if ( vMerge != null && vMerge.equals( STMerge.CONTINUE ) ) { // here a new line must not be generated because the body is a cell (with none vMerge) and contains just // this paragraph return false; } // Loop for each cell of the row : if all cells are empty, new line must be generated otherwise none empty // line must be generated. XWPFTableRow row = cell.getTableRow(); List<XWPFTableCell> cells = row.getTableCells(); for ( XWPFTableCell c : cells ) { if ( c.getBodyElements().size() != 1 ) { return false; } IBodyElement element = c.getBodyElements().get( 0 ); if ( element.getElementType() != BodyElementType.PARAGRAPH ) { return false; } return ( (XWPFParagraph) element ).getRuns().size() == 0; } return true; } // here a new line must be generated if there is next paragraph/table return bodyElements.size() > index + 1; } private void visitRuns( XWPFParagraph paragraph, T paragraphContainer ) throws Exception { boolean fldCharTypeParsing = false; boolean pageNumber = false; String url = null; List<XmlObject> rListAfterSeparate = null; CTP ctp = paragraph.getCTP(); XmlCursor c = ctp.newCursor(); c.selectPath( "child::*" ); while ( c.toNextSelection() ) { XmlObject o = c.getObject(); if ( o instanceof CTR ) { /* * Test if it's : <w:r> <w:rPr /> <w:fldChar w:fldCharType="begin" /> </w:r> */ CTR r = (CTR) o; STFldCharType.Enum fldCharType = XWPFRunHelper.getFldCharType( r ); if ( fldCharType != null ) { if ( fldCharType.equals( 
STFldCharType.BEGIN ) ) { process( paragraph, paragraphContainer, pageNumber, url, rListAfterSeparate ); fldCharTypeParsing = true; rListAfterSeparate = new ArrayList<XmlObject>(); pageNumber = false; url = null; } else if ( fldCharType.equals( STFldCharType.END ) ) { process( paragraph, paragraphContainer, pageNumber, url, rListAfterSeparate ); fldCharTypeParsing = false; rListAfterSeparate = null; pageNumber = false; processingTotalPageCountField = false; url = null; } } else { if ( fldCharTypeParsing ) { String instrText = XWPFRunHelper.getInstrText( r ); if ( instrText != null ) { if ( StringUtils.isNotEmpty( instrText ) ) { // test if it's <w:r><w:instrText>PAGE</w:instrText></w:r> boolean instrTextPage = XWPFRunHelper.isInstrTextPage( instrText ); if ( !instrTextPage ) { // test if it's <w:r><w:instrText>NUMPAGES</w:instrText></w:r> processingTotalPageCountField = XWPFRunHelper.isInstrTextNumpages( instrText ); if(!totalPageFieldUsed){ totalPageFieldUsed = true; } // test if it's <w:instrText>HYPERLINK // "http://code.google.com/p/xdocrepor"</w:instrText> String instrTextHyperlink = XWPFRunHelper.getInstrTextHyperlink( instrText ); if ( instrTextHyperlink != null ) { // test if it's <w:instrText>HYPERLINK \l _Toc29586</w:instrText> if (instrTextHyperlink.startsWith("\\l ")) { url = "#" + instrTextHyperlink.substring(3); } else { url = instrTextHyperlink; } } } else { pageNumber = true; } } } else { rListAfterSeparate.add( r ); } } else { XWPFRun run = new XWPFRun( r, paragraph ); visitRun( run, false, null, paragraphContainer ); } } } else { if ( fldCharTypeParsing ) { rListAfterSeparate.add( o ); } else { visitRun( paragraph, o, paragraphContainer ); } } } c.dispose(); process( paragraph, paragraphContainer, pageNumber, url, rListAfterSeparate ); fldCharTypeParsing = false; rListAfterSeparate = null; pageNumber = false; url = null; } private void process( XWPFParagraph paragraph, T paragraphContainer, boolean pageNumber, String url, List<XmlObject> 
rListAfterSeparate ) throws Exception { if ( rListAfterSeparate != null ) { for ( XmlObject oAfterSeparate : rListAfterSeparate ) { if ( oAfterSeparate instanceof CTR ) { CTR ctr = (CTR) oAfterSeparate; XWPFRun run = new XWPFRun( ctr, paragraph ); visitRun( run, pageNumber, url, paragraphContainer ); } else { visitRun( paragraph, oAfterSeparate, paragraphContainer ); } } } } private void visitRun( XWPFParagraph paragraph, XmlObject o, T paragraphContainer ) throws Exception { if ( o instanceof CTHyperlink ) { CTHyperlink link = (CTHyperlink) o; String anchor = link.getAnchor(); String href = null; // Test if the is an id for hyperlink String hyperlinkId = link.getId(); if ( StringUtils.isNotEmpty( hyperlinkId ) ) { XWPFHyperlink hyperlink = document.getHyperlinkByID( hyperlinkId ); href = hyperlink != null ? hyperlink.getURL() : null; } for ( CTR r : link.getRList() ) { XWPFRun run = new XWPFHyperlinkRun( link, r, paragraph ); visitRun( run, false, href != null ? href : "#" + anchor, paragraphContainer ); } } else if ( o instanceof CTSdtRun ) { CTSdtContentRun run = ( (CTSdtRun) o ).getSdtContent(); for ( CTR r : run.getRList() ) { XWPFRun ru = new XWPFRun( r, paragraph ); visitRun( ru, false, null, paragraphContainer ); } } else if ( o instanceof CTRunTrackChange ) { for ( CTR r : ( (CTRunTrackChange) o ).getRList() ) { XWPFRun run = new XWPFRun( r, paragraph ); visitRun( run, false, null, paragraphContainer ); } } else if ( o instanceof CTSimpleField ) { CTSimpleField simpleField = (CTSimpleField) o; String instr = simpleField.getInstr(); // 1) test if it's page number // <w:fldSimple w:instr=" PAGE \* MERGEFORMAT "> <w:r> <w:rPr> <w:noProof/> // </w:rPr> <w:t>- 1 -</w:t> </w:r> </w:fldSimple> boolean fieldPageNumber = XWPFRunHelper.isInstrTextPage( instr ); String fieldHref = null; if ( !fieldPageNumber ) { // not page number, test if it's hyperlink : // <w:instrText>HYPERLINK "http://code.google.com/p/xdocrepor"</w:instrText> fieldHref = 
XWPFRunHelper.getInstrTextHyperlink( instr ); } for ( CTR r : simpleField.getRList() ) { XWPFRun run = new XWPFRun( r, paragraph ); visitRun( run, fieldPageNumber, fieldHref, paragraphContainer ); } } else if ( o instanceof CTSmartTagRun ) { // Smart Tags can be nested many times. // This implementation does not preserve the tagging information // buildRunsInOrderFromXml(o); } else if ( o instanceof CTBookmark ) { CTBookmark bookmark = (CTBookmark) o; visitBookmark( bookmark, paragraph, paragraphContainer ); } } protected abstract void visitEmptyRun( T paragraphContainer ) throws Exception; protected void visitRun( XWPFRun run, boolean pageNumber, String url, T paragraphContainer ) throws Exception { CTR ctr = run.getCTR(); CTRPr rPr = ctr.getRPr(); boolean hasTexStyles = rPr != null && (rPr.getHighlight() != null || rPr.getStrike() != null || rPr.getDstrike() != null || rPr.getVertAlign() != null ) ; StringBuilder text = new StringBuilder(); // Loop for each element of <w:run text, tab, image etc // to keep the order of thoses elements. XmlCursor c = ctr.newCursor(); c.selectPath( "./*" ); while ( c.toNextSelection() ) { XmlObject o = c.getObject(); if ( o instanceof CTText ) { CTText ctText = (CTText) o; String tagName = o.getDomNode().getNodeName(); // Field Codes (w:instrText, defined in spec sec. 
17.16.23) // come up as instances of CTText, but we don't want them // in the normal text output if ( "w:instrText".equals( tagName ) ) { } else { if(hasTexStyles) { text.append(ctText.getStringValue()); } else { visitText( ctText, pageNumber, paragraphContainer ); } } } else if ( o instanceof CTPTab ) { visitTab( (CTPTab) o, paragraphContainer ); } else if ( o instanceof CTBr ) { visitBR( (CTBr) o, paragraphContainer ); } else if ( o instanceof CTEmpty ) { // Some inline text elements get returned not as // themselves, but as CTEmpty, owing to some odd // definitions around line 5642 of the XSDs // This bit works around it, and replicates the above // rules for that case String tagName = o.getDomNode().getNodeName(); if ( "w:tab".equals( tagName ) ) { CTTabs tabs = stylesDocument.getParagraphTabs( run.getParagraph() ); visitTabs( tabs, paragraphContainer ); } if ( "w:br".equals( tagName ) ) { visitBR( null, paragraphContainer ); } if ( "w:cr".equals( tagName ) ) { visitBR( null, paragraphContainer ); } } else if ( o instanceof CTDrawing ) { visitDrawing( (CTDrawing) o, paragraphContainer ); } } if(hasTexStyles && StringUtils.isNotEmpty(text.toString())) { visitStyleText(run, text.toString()); } c.dispose(); } /** * Text styles handling, fonts, highlighting, background colors, subscript, superscript, strikes (single strikes) etc. 
* @param run * @param text * @throws Exception */ protected void visitStyleText(XWPFRun run, String text) throws Exception { //child should implement } protected abstract void visitText( CTText ctText, boolean pageNumber, T paragraphContainer ) throws Exception; protected abstract void visitTab( CTPTab o, T paragraphContainer ) throws Exception; protected abstract void visitTabs( CTTabs tabs, T paragraphContainer ) throws Exception; protected void visitBR( CTBr br, T paragraphContainer ) throws Exception { STBrType.Enum brType = XWPFRunHelper.getBrType( br ); if ( brType.equals( STBrType.PAGE ) ) { pageBreakOnNextParagraph = true; } else { addNewLine( br, paragraphContainer ); } } protected abstract void visitBookmark( CTBookmark bookmark, XWPFParagraph paragraph, T paragraphContainer ) throws Exception; protected abstract void addNewLine( CTBr br, T paragraphContainer ) throws Exception; protected abstract void pageBreak() throws Exception; protected void visitTable( XWPFTable table, int index, T container ) throws Exception { // 1) Compute colWidth float[] colWidths = XWPFTableUtil.computeColWidths( table ); T tableContainer = startVisitTable( table, colWidths, container ); visitTableBody( table, colWidths, tableContainer ); endVisitTable( table, container, tableContainer ); } protected void visitTableBody( XWPFTable table, float[] colWidths, T tableContainer ) throws Exception { // Proces Row boolean firstRow = false; boolean lastRow = false; List<XWPFTableRow> rows = table.getRows(); int rowsSize = rows.size(); for ( int i = 0; i < rowsSize; i++ ) { firstRow = ( i == 0 ); lastRow = isLastRow( i, rowsSize ); XWPFTableRow row = rows.get( i ); visitTableRow( row, colWidths, tableContainer, firstRow, lastRow, i, rowsSize ); } } private boolean isLastRow( int rowIndex, int rowsSize ) { return rowIndex == rowsSize - 1; } protected abstract T startVisitTable( XWPFTable table, float[] colWidths, T tableContainer ) throws Exception; protected abstract void 
endVisitTable( XWPFTable table, T parentContainer, T tableContainer ) throws Exception; protected void visitTableRow( XWPFTableRow row, float[] colWidths, T tableContainer, boolean firstRow, boolean lastRowIfNoneVMerge, int rowIndex, int rowsSize ) throws Exception { boolean headerRow = stylesDocument.isTableRowHeader( row ); startVisitTableRow( row, tableContainer, rowIndex, headerRow ); int nbColumns = colWidths.length; // Process cell boolean firstCol = true; boolean lastCol = false; boolean lastRow = false; List<XWPFTableCell> vMergedCells = null; List<XWPFTableCell> cells = row.getTableCells(); if ( nbColumns > cells.size() ) { // Columns number is not equal to cells number. // POI have a bug with // <w:tr w:rsidR="00C55C20"> // <w:tc> // <w:tc>... // <w:sdt> // <w:sdtContent> // <w:tc> <= this tc which is a XWPFTableCell is not included in the row.getTableCells(); firstCol = true; int cellIndex = -1; int cellPtr = 0; CTRow ctRow = row.getCtRow(); XmlCursor c = ctRow.newCursor(); c.selectPath( "./*" ); while ( c.toNextSelection() ) { XmlObject o = c.getObject(); if ( o instanceof CTTc ) { CTTc tc = (CTTc) o; XWPFTableCell cell = row.getTableCell( tc ); cellIndex = getCellIndex( cellIndex, cell ); lastCol = ( cellIndex == nbColumns ); vMergedCells = getVMergedCells( cell, rowIndex, cellPtr ); if ( vMergedCells == null || vMergedCells.size() > 0 ) { lastRow = isLastRow( lastRowIfNoneVMerge, rowIndex, rowsSize, vMergedCells ); visitCell( cell, tableContainer, firstRow, lastRow, firstCol, lastCol, rowIndex, cellPtr, vMergedCells ); } cellPtr++; firstCol = false; } else if ( o instanceof CTSdtCell ) { // Fix bug of POI CTSdtCell sdtCell = (CTSdtCell) o; List<CTTc> tcList = sdtCell.getSdtContent().getTcList(); for ( CTTc ctTc : tcList ) { XWPFTableCell cell = new XWPFTableCell( ctTc, row, row.getTable().getBody() ); cellIndex = getCellIndex( cellIndex, cell ); lastCol = ( cellIndex == nbColumns ); List<XWPFTableCell> rowCells = row.getTableCells(); if 
(!rowCells.contains(cell)) { rowCells.add(cell); } vMergedCells = getVMergedCells( cell, rowIndex, cellPtr ); if ( vMergedCells == null || vMergedCells.size() > 0 ) { lastRow = isLastRow( lastRowIfNoneVMerge, rowIndex, rowsSize, vMergedCells ); visitCell( cell, tableContainer, firstRow, lastRow, firstCol, lastCol, rowIndex, cellPtr, vMergedCells ); } cellPtr++; firstCol = false; } } } c.dispose(); } else { // Column number is equal to cells number. for ( int i = 0; i < cells.size(); i++ ) { lastCol = ( i == cells.size() - 1 ); XWPFTableCell cell = cells.get( i ); vMergedCells = getVMergedCells( cell, rowIndex, i ); if ( vMergedCells == null || vMergedCells.size() > 0 ) { lastRow = isLastRow( lastRowIfNoneVMerge, rowIndex, rowsSize, vMergedCells ); visitCell( cell, tableContainer, firstRow, lastRow, firstCol, lastCol, rowIndex, i, vMergedCells ); } firstCol = false; } } endVisitTableRow( row, tableContainer, firstRow, lastRow, headerRow ); } private boolean isLastRow( boolean lastRowIfNoneVMerge, int rowIndex, int rowsSize, List<XWPFTableCell> vMergedCells ) { if ( vMergedCells == null ) { return lastRowIfNoneVMerge; } return isLastRow( rowIndex - 1 + vMergedCells.size(), rowsSize ); } private int getCellIndex( int cellIndex, XWPFTableCell cell ) { BigInteger gridSpan = stylesDocument.getTableCellGridSpan( cell.getCTTc().getTcPr() ); if ( gridSpan != null ) { cellIndex = cellIndex + gridSpan.intValue(); } else { cellIndex++; } return cellIndex; } protected void startVisitTableRow( XWPFTableRow row, T tableContainer, int rowIndex, boolean headerRow ) throws Exception { } protected void endVisitTableRow( XWPFTableRow row, T tableContainer, boolean firstRow, boolean lastRow, boolean headerRow ) throws Exception { } protected void visitCell( XWPFTableCell cell, T tableContainer, boolean firstRow, boolean lastRow, boolean firstCol, boolean lastCol, int rowIndex, int cellIndex, List<XWPFTableCell> vMergedCells ) throws Exception { T tableCellContainer = 
startVisitTableCell( cell, tableContainer, firstRow, lastRow, firstCol, lastCol, vMergedCells ); visitTableCellBody( cell, vMergedCells, tableCellContainer ); endVisitTableCell( cell, tableContainer, tableCellContainer ); } private List<XWPFTableCell> getVMergedCells( XWPFTableCell cell, int rowIndex, int cellIndex ) { List<XWPFTableCell> vMergedCells = null; STMerge.Enum vMerge = stylesDocument.getTableCellVMerge( cell ); if ( vMerge != null ) { if ( vMerge.equals( STMerge.RESTART ) ) { // vMerge="restart" // Loop for each table cell of each row upon vMerge="restart" was found or cell without vMerge // was declared. vMergedCells = new ArrayList<XWPFTableCell>(); vMergedCells.add( cell ); XWPFTableRow row = null; XWPFTableCell c; XWPFTable table = cell.getTableRow().getTable(); for ( int i = rowIndex + 1; i < table.getRows().size(); i++ ) { row = table.getRow( i ); c = row.getCell( cellIndex ); if ( c == null ) { break; } vMerge = stylesDocument.getTableCellVMerge( c ); if ( vMerge != null && vMerge.equals( STMerge.CONTINUE ) ) { vMergedCells.add( c ); } else { return vMergedCells; } } } else { // vMerge="continue", ignore the cell because it was already processed return Collections.emptyList(); } } return vMergedCells; } protected void visitTableCellBody( XWPFTableCell cell, List<XWPFTableCell> vMergeCells, T tableCellContainer ) throws Exception { if ( vMergeCells != null ) { for ( XWPFTableCell mergedCell : vMergeCells ) { List<IBodyElement> bodyElements = mergedCell.getBodyElements(); visitBodyElements( bodyElements, tableCellContainer ); } } else { List<IBodyElement> bodyElements = cell.getBodyElements(); visitBodyElements( bodyElements, tableCellContainer ); } } protected abstract T startVisitTableCell( XWPFTableCell cell, T tableContainer, boolean firstRow, boolean lastRow, boolean firstCol, boolean lastCol, List<XWPFTableCell> vMergeCells ) throws Exception; protected abstract void endVisitTableCell( XWPFTableCell cell, T tableContainer, T 
tableCellContainer ) throws Exception; protected XWPFStyle getXWPFStyle( String styleID ) { if ( styleID == null ) return null; else return document.getStyles().getStyle( styleID ); } /** * Returns true if word/document.xml is parsing and false otherwise. * * @return true if word/document.xml is parsing and false otherwise. */ protected boolean isWordDocumentPartParsing() { return currentHeader == null && currentFooter == null; } // ------------------------------ Header/Footer visitor ----------- public void visitHeaderRef( CTHdrFtrRef headerRef, CTSectPr sectPr, E masterPage ) throws Exception { this.currentHeader = getXWPFHeader( headerRef ); visitHeader( currentHeader, headerRef, sectPr, masterPage ); this.currentHeader = null; } protected abstract void visitHeader( XWPFHeader header, CTHdrFtrRef headerRef, CTSectPr sectPr, E masterPage ) throws Exception; public void visitFooterRef( CTHdrFtrRef footerRef, CTSectPr sectPr, E masterPage ) throws Exception { this.currentFooter = getXWPFFooter( footerRef ); visitFooter( currentFooter, footerRef, sectPr, masterPage ); this.currentFooter = null; } protected abstract void visitFooter( XWPFFooter footer, CTHdrFtrRef footerRef, CTSectPr sectPr, E masterPage ) throws Exception; /** * Returns the list of {@link IBodyElement} of the given header/footer. We do that because * {@link XWPFHeaderFooter#getBodyElements()} doesn't contains the // <w:sdt><w:sdtContent> * <p * (see JUnit Docx4j_GettingStarted, DocXperT_Output_4_3, Issue222 which defines page number in the <w:sdt. ... * * @param part * @return */ protected List<IBodyElement> getBodyElements( XWPFHeaderFooter part ) { List<IBodyElement> bodyElements = new ArrayList<IBodyElement>(); XmlTokenSource headerFooter = part._getHdrFtr(); addBodyElements( headerFooter, part, bodyElements ); return bodyElements; } /** * Add body elements from the given token source. 
* * @param source * @param part * @param bodyElements */ private void addBodyElements( XmlTokenSource source, IBody part, List<IBodyElement> bodyElements ) { // parse the document with cursor and add // the XmlObject to its lists XmlCursor cursor = source.newCursor(); cursor.selectPath( "./*" ); while ( cursor.toNextSelection() ) { XmlObject o = cursor.getObject(); if ( o instanceof CTSdtBlock ) { // <w:sdt><w:sdtContent><p... CTSdtBlock block = (CTSdtBlock) o; CTSdtContentBlock contentBlock = block.getSdtContent(); if ( contentBlock != null ) { addBodyElements( contentBlock, part, bodyElements ); } } else if ( o instanceof CTP ) { XWPFParagraph p = new XWPFParagraph( (CTP) o, part ); bodyElements.add( p ); } else if ( o instanceof CTTbl ) { XWPFTable t = new XWPFTable( (CTTbl) o, part ); bodyElements.add( t ); } } cursor.dispose(); } /** * Returns the {@link XWPFHeader} of the given header reference. * * @param headerRef the header reference. * @return * @throws XmlException * @throws IOException */ protected XWPFHeader getXWPFHeader( CTHdrFtrRef headerRef ) throws XmlException, IOException { PackagePart hdrPart = document.getPartById( headerRef.getId() ); List<XWPFHeader> headers = document.getHeaderList(); for ( XWPFHeader header : headers ) { if ( header.getPackagePart().equals( hdrPart ) ) { // header is aleady loaded, return it. return header; } } // should never come, but load the header if needed. HdrDocument hdrDoc = HdrDocument.Factory.parse( hdrPart.getInputStream() ); CTHdrFtr hdrFtr = hdrDoc.getHdr(); XWPFHeader hdr = new XWPFHeader( document, hdrFtr ); return hdr; } /** * Returns the {@link XWPFFooter} of the given footer reference. * * @param footerRef the footer reference. 
* @return * @throws XmlException * @throws IOException */ protected XWPFFooter getXWPFFooter( CTHdrFtrRef footerRef ) throws XmlException, IOException { PackagePart hdrPart = document.getPartById( footerRef.getId() ); List<XWPFFooter> footers = document.getFooterList(); for ( XWPFFooter footer : footers ) { if ( footer.getPackagePart().equals( hdrPart ) ) { // footer is aleady loaded, return it. return footer; } } // should never come, but load the footer if needed. FtrDocument hdrDoc = FtrDocument.Factory.parse( hdrPart.getInputStream() ); CTHdrFtr hdrFtr = hdrDoc.getFtr(); XWPFFooter ftr = new XWPFFooter( document, hdrFtr ); return ftr; } // ------------------------ Image -------------- protected void visitDrawing( CTDrawing drawing, T parentContainer ) throws Exception { List<CTInline> inlines = drawing.getInlineList(); for ( CTInline inline : inlines ) { visitInline( inline, parentContainer ); } List<CTAnchor> anchors = drawing.getAnchorList(); for ( CTAnchor anchor : anchors ) { visitAnchor( anchor, parentContainer ); } } protected void visitAnchor( CTAnchor anchor, T parentContainer ) throws Exception { CTGraphicalObject graphic = anchor.getGraphic(); /* * wp:positionH relativeFrom="column"> <wp:posOffset>-898525</wp:posOffset> </wp:positionH> */ STRelFromH.Enum relativeFromH = null; Float offsetX = null; CTPosH positionH = anchor.getPositionH(); if ( positionH != null ) { relativeFromH = positionH.getRelativeFrom(); offsetX = DxaUtil.emu2points( positionH.getPosOffset() ); } STRelFromV.Enum relativeFromV = null; Float offsetY = null; CTPosV positionV = anchor.getPositionV(); if ( positionV != null ) { relativeFromV = positionV.getRelativeFrom(); offsetY = DxaUtil.emu2points( positionV.getPosOffset() ); } STWrapText.Enum wrapText = null; CTWrapSquare wrapSquare = anchor.getWrapSquare(); if ( wrapSquare != null ) { wrapText = wrapSquare.getWrapText(); } visitGraphicalObject( parentContainer, graphic, offsetX, relativeFromH, offsetY, relativeFromV, wrapText ); 
} protected void visitInline( CTInline inline, T parentContainer ) throws Exception { CTGraphicalObject graphic = inline.getGraphic(); visitGraphicalObject( parentContainer, graphic, null, null, null, null, null ); } private void visitGraphicalObject( T parentContainer, CTGraphicalObject graphic, Float offsetX, STRelFromH.Enum relativeFromH, Float offsetY, STRelFromV.Enum relativeFromV, STWrapText.Enum wrapText ) throws Exception { if ( graphic != null ) { CTGraphicalObjectData graphicData = graphic.getGraphicData(); if ( graphicData != null ) { XmlCursor c = graphicData.newCursor(); c.selectPath( "./*" ); while ( c.toNextSelection() ) { XmlObject o = c.getObject(); if ( o instanceof CTPicture ) { CTPicture picture = (CTPicture) o; // extract the picture if needed IImageExtractor extractor = getImageExtractor(); if ( extractor != null ) { XWPFPictureData pictureData = getPictureData( picture ); if ( pictureData != null ) { try { extractor.extract( WORD_MEDIA + pictureData.getFileName(), pictureData.getData() ); } catch ( Throwable e ) { LOGGER.log( Level.SEVERE, "Error while extracting the image " + pictureData.getFileName(), e ); } } } // visit the picture. visitPicture( picture, offsetX, relativeFromH, offsetY, relativeFromV, wrapText, parentContainer ); } } c.dispose(); } } } /** * Returns the picture data of the given image id. * * @param blipId * @return */ protected XWPFPictureData getPictureDataByID( String blipId ) { if ( currentHeader != null ) { return currentHeader.getPictureDataByID( blipId ); } if ( currentFooter != null ) { return currentFooter.getPictureDataByID( blipId ); } return document.getPictureDataByID( blipId ); } /** * Returns the image extractor and null otherwise. * * @return */ protected IImageExtractor getImageExtractor() { return options.getExtractor(); } /** * Returns the picture data of the given picture. 
*
     * @param picture the picture.
     * @return the picture data of the image embedded by the picture, or null if the picture declares no blip
     *         fill, no blip or no embedded image id, or if no picture data is found for the image id.
     */
    public XWPFPictureData getPictureData( CTPicture picture )
    {
        // a picture without blip fill or blip has no embedded image: there is no picture data to return.
        if ( picture.getBlipFill() == null || picture.getBlipFill().getBlip() == null )
        {
            return null;
        }
        String blipId = picture.getBlipFill().getBlip().getEmbed();
        if ( blipId == null )
        {
            return null;
        }
        return getPictureDataByID( blipId );
    }

    /**
     * Visits the given picture.
     *
     * @param picture the picture to visit.
     * @param offsetX the horizontal offset, may be null.
     * @param relativeFromH the origin of the horizontal offset, may be null.
     * @param offsetY the vertical offset, may be null.
     * @param relativeFromV the origin of the vertical offset, may be null.
     * @param wrapText the text wrapping, may be null.
     * @param parentContainer the container which receives the picture.
     * @throws Exception
     */
    protected abstract void visitPicture( CTPicture picture, Float offsetX, STRelFromH.Enum relativeFromH,
                                          Float offsetY, STRelFromV.Enum relativeFromV, STWrapText.Enum wrapText,
                                          T parentContainer )
        throws Exception;
}