package com.aspose.words.examples.loading_saving;

import com.aspose.words.*;
import com.aspose.words.examples.Utils;
// NOTE(review): MediaUtils, Path and java.util.Stack are not referenced anywhere in this file.
// They are kept untouched here; confirm nothing else needs them before removing.
import com.sun.media.jfxmediaimpl.MediaUtils;
import javafx.scene.shape.Path;

import java.io.File;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Hashtable;
import java.util.Stack;

/**
 * Example entry point: splits every document found in the example data directory into
 * one output document per page, saved into an "Out" sub-folder next to the source file.
 */
public class PageSplitter {
    public static void main(String[] args) throws Exception {
        // The path to the documents directory.
        String dataDir = Utils.getDataDir(PageSplitter.class);

        SplitAllDocumentsToPages(dataDir);

        System.out.println("\nDocument split to pages successfully.\nFile saved at " + dataDir + "\\Out");
    }

    /**
     * Splits a single document into one file per page.
     * Output files are named "{file name} - page{N} Out{extension}" and written to the
     * "Out" folder beside the source document (the folder is created when missing).
     *
     * @param docName the document file to split.
     * @throws IllegalStateException if the output folder does not exist and cannot be created.
     * @throws Exception if loading, layout, splitting or saving fails.
     */
    public static void SplitDocumentToPages(File docName) throws Exception {
        String folderName = docName.getParent();
        String fileName = docName.getName();

        // A file name without a dot has no extension; substring(-1) would throw.
        int dotIndex = fileName.lastIndexOf('.');
        String extensionName = dotIndex >= 0 ? fileName.substring(dotIndex) : "";

        File outFolder = new File(folderName, "Out").getAbsoluteFile();

        System.out.println("Processing document: " + fileName );

        if (!outFolder.isDirectory() && !outFolder.mkdirs()) {
            throw new IllegalStateException("Cannot create output folder: " + outFolder);
        }

        Document doc = new Document(docName.getAbsolutePath());

        // Create and attach collector to the document before page layout is built.
        LayoutCollector layoutCollector = new LayoutCollector(doc);
        try {
            // This will build layout model and collect necessary information.
            doc.updatePageLayout();

            // Split nodes in the document into separate pages.
            DocumentPageSplitter splitter = new DocumentPageSplitter(layoutCollector);

            // Save each page to the disk as a separate document.
            for (int page = 1; page <= doc.getPageCount(); page++) {
                Document pageDoc = splitter.GetDocumentOfPage(page);
                // The page number is passed as text so MessageFormat does not apply
                // locale-specific number formatting (e.g. "1,000") to the file name.
                String outName = MessageFormat.format("{0} - page{1} Out{2}",
                        fileName, Integer.toString(page), extensionName);
                pageDoc.save(new File(outFolder, outName).getAbsolutePath());
            }
        } finally {
            // Detach the collector from the document, even when splitting or saving failed.
            layoutCollector.setDocument(null);
        }
    }

    /**
     * Splits every regular file directly inside the given folder; sub-folders are skipped.
     *
     * @param folderName path of the folder containing the documents.
     * @throws IllegalArgumentException if the path is not a directory that can be listed.
     * @throws Exception if splitting any document fails.
     */
    public static void SplitAllDocumentsToPages(String folderName) throws Exception {
        File[] files = new File(folderName).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or an I/O error occurs.
            throw new IllegalArgumentException("Not a readable directory: " + folderName);
        }

        for (File file : files) {
            if (file.isFile()) {
                SplitDocumentToPages(file);
            }
        }
    }
}

/**
 * Produces per-page (or per-page-range) documents from a document whose layout has been collected.
 */
class DocumentPageSplitter {
    /**
     * Initializes new instance of this class. This method splits the document into sections so that each page
     * begins and ends at a section boundary. It is recommended not to modify the document afterwards.
     *
     * @param collector layout collector attached to the document to split.
     */
    public DocumentPageSplitter(LayoutCollector collector) throws Exception {
        mPageNumberFinder = new PageNumberFinder(collector);
        mPageNumberFinder.SplitNodesAcrossPages();
    }

    /**
     * Gets the document of a page.
     *
     * @param pageIndex 1-based page index.
     */
    public Document GetDocumentOfPage(int pageIndex) throws Exception {
        return GetDocumentOfPageRange(pageIndex, pageIndex);
    }

    /**
     * Gets the document of a page range: a shallow clone of the source document into which every
     * section found on the requested pages is imported.
     *
     * @param startIndex 1-based index of the first page.
     * @param endIndex   1-based index of the last page (inclusive).
     */
    public Document GetDocumentOfPageRange(int startIndex, int endIndex) throws Exception {
        Document result = (Document) getDocument().deepClone(false);

        for (Section section : (Iterable<Section>) mPageNumberFinder.RetrieveAllNodesOnPages(startIndex, endIndex, NodeType.SECTION)) {
            result.appendChild(result.importNode(section, true));
        }

        return result;
    }

    /**
     * Gets the document this instance works with.
     */
    private Document getDocument() {
        return mPageNumberFinder.getDocument();
    }

    private PageNumberFinder mPageNumberFinder;
}

/**
 * Answers "which page is this node on?" questions. Page numbers come from the layout collector
 * unless they have been overridden through {@link #AddPageNumbersForNode(Node, int, int)}.
 */
class PageNumberFinder {
    /**
     * Initializes new instance of this class.
     */
    public PageNumberFinder(LayoutCollector collector) {
        mCollector = collector;
    }

    /**
     * Retrieves 1-based index of a page that the node begins on.
     */
    public int GetPage(Node node) throws Exception {
        Integer overridden = mNodeStartPageLookup.get(node);
        return overridden != null ? overridden : mCollector.getStartPageIndex(node);
    }

    /**
     * Retrieves 1-based index of a page that the node ends on.
     */
    public int GetPageEnd(Node node) throws Exception {
        Integer overridden = mNodeEndPageLookup.get(node);
        return overridden != null ? overridden : mCollector.getEndPageIndex(node);
    }

    /**
     * Returns how many pages the specified node spans over. Returns 1 if the node is contained within one page.
     */
    public int PageSpan(Node node) throws Exception {
        return GetPageEnd(node) - GetPage(node) + 1;
    }

    /**
     * Returns a list of nodes that are contained anywhere on the specified page or pages which match the specified node type.
     * Nodes that have no parent are skipped, and each node appears at most once in the result.
     *
     * @param startPage 1-based first page.
     * @param endPage   1-based last page (inclusive).
     * @param nodeType  a {@code NodeType} constant, or {@code NodeType.ANY} to match every node.
     * @throws IllegalArgumentException if either page index is outside the document or endPage precedes startPage.
     */
    public ArrayList RetrieveAllNodesOnPages(int startPage, int endPage, int nodeType) throws Exception {
        if (startPage < 1 || startPage > getDocument().getPageCount()) {
            throw new IllegalArgumentException("startPage");
        }
        if (endPage < 1 || endPage > getDocument().getPageCount() || endPage < startPage) {
            throw new IllegalArgumentException("endPage");
        }

        CheckPageListsPopulated();

        ArrayList<Node> pageNodes = new ArrayList<Node>();
        for (int page = startPage; page <= endPage; page++) {
            ArrayList<Node> nodesOnPage = mReversePageLookup.get(page);

            // Some pages can be empty.
            if (nodesOnPage == null) {
                continue;
            }

            for (Node node : nodesOnPage) {
                boolean typeMatches = nodeType == NodeType.ANY || nodeType == node.getNodeType();
                if (node.getParentNode() != null && typeMatches && !pageNodes.contains(node)) {
                    pageNodes.add(node);
                }
            }
        }

        return pageNodes;
    }

    /**
     * Splits nodes which appear over two or more pages into separate nodes so that they still appear in the same way
     * but no longer appear across a page.
     */
    public void SplitNodesAcrossPages() throws Exception {
        // Visit any composites which are possibly split across pages and split them into separate nodes.
        getDocument().accept(new SectionSplitter(this));
    }

    /**
     * Gets the document this instance works with.
     */
    public Document getDocument() {
        return mCollector.getDocument();
    }

    /**
     * This is called by {@link SectionSplitter} to update page numbers of split nodes.
     * A non-positive page value leaves the corresponding override untouched.
     */
    void AddPageNumbersForNode(Node node, int startPage, int endPage) {
        if (startPage > 0) {
            mNodeStartPageLookup.put(node, startPage);
        }
        if (endPage > 0) {
            mNodeEndPageLookup.put(node, endPage);
        }
    }

    /**
     * Lazily builds the page -> nodes lookup on first use.
     */
    private void CheckPageListsPopulated() throws Exception {
        if (mReversePageLookup != null) {
            return;
        }

        // Build into a local table and publish it only when complete, so a failure part-way
        // through cannot leave a half-populated lookup cached in the field.
        Hashtable<Integer, ArrayList<Node>> lookup = new Hashtable<Integer, ArrayList<Node>>();

        // Add each node to a list which represent the nodes found on each page.
        for (Node node : (Iterable<Node>) getDocument().getChildNodes(NodeType.ANY, true)) {
            // Headers/Footers follow sections. They are not split by themselves.
            if (IsHeaderFooterType(node)) {
                continue;
            }

            int startPage = GetPage(node);
            int endPage = GetPageEnd(node);
            for (int page = startPage; page <= endPage; page++) {
                ArrayList<Node> nodesOnPage = lookup.get(page);
                if (nodesOnPage == null) {
                    nodesOnPage = new ArrayList<Node>();
                    lookup.put(page, nodesOnPage);
                }
                nodesOnPage.add(node);
            }
        }

        mReversePageLookup = lookup;
    }

    /**
     * Returns true for a header/footer node or any node that has a header/footer ancestor.
     */
    private static boolean IsHeaderFooterType(Node node) {
        return node.getNodeType() == NodeType.HEADER_FOOTER || node.getAncestor(NodeType.HEADER_FOOTER) != null;
    }

    // Maps node to a start/end page numbers. This is used to override baseline page numbers provided by collector when document is split.
    private Hashtable<Node, Integer> mNodeStartPageLookup = new Hashtable<Node, Integer>();
    private Hashtable<Node, Integer> mNodeEndPageLookup = new Hashtable<Node, Integer>();
    // Maps page number to a list of nodes found on that page.
    private Hashtable<Integer, ArrayList<Node>> mReversePageLookup;
    private LayoutCollector mCollector;
}

/**
 * Document visitor that splits every composite spanning more than one page into separate nodes,
 * one per page, and keeps list numbering, paragraph spacing, headers/footers and field results
 * consistent across the resulting pieces.
 */
class SectionSplitter extends DocumentVisitor {
    public SectionSplitter(PageNumberFinder pageNumberFinder) {
        mPageNumberFinder = pageNumberFinder;
    }

    public int visitParagraphStart(Paragraph paragraph) throws Exception {
        if (paragraph.isListItem()) {
            List paraList = paragraph.getListFormat().getList();
            ListLevel currentLevel = paragraph.getListFormat().getListLevel();

            // Since we have encountered a list item we need to check if this will reset
            // any subsequent list levels and if so then update the numbering of the level.
            int currentListLevelNumber = paragraph.getListFormat().getListLevelNumber();
            for (int i = currentListLevelNumber + 1; i < paraList.getListLevels().getCount(); i++) {
                ListLevel paraLevel = paraList.getListLevels().get(i);

                if (paraLevel.getRestartAfterLevel() >= currentListLevelNumber) {
                    // This list level needs to be reset after the current list number.
                    mListLevelToListNumberLookup.put(paraLevel, paraLevel.getStartAt());
                }
            }

            // A list which was used on a previous page is present on a different page, the list
            // needs to be copied so list numbering is retained when extracting individual pages.
            if (ContainsListLevelAndPageChanged(paragraph)) {
                List copyList = paragraph.getDocument().getLists().addCopy(paraList);
                mListLevelToListNumberLookup.put(currentLevel, paragraph.getListLabel().getLabelValue());

                // Set the numbering of each list level to start at the numbering of the level on the previous page.
                for (int i = 0; i < paraList.getListLevels().getCount(); i++) {
                    ListLevel paraLevel = paraList.getListLevels().get(i);
                    Integer startAt = mListLevelToListNumberLookup.get(paraLevel);

                    if (startAt != null) {
                        copyList.getListLevels().get(i).setStartAt(startAt);
                    }
                }

                mListToReplacementListLookup.put(paraList, copyList);
            }

            List replacementList = mListToReplacementListLookup.get(paraList);
            if (replacementList != null) {
                // This paragraph belongs to a list from a previous page. Apply the replacement list.
                paragraph.getListFormat().setList(replacementList);
                // This is a trick to get the spacing of the list level to set correctly.
                paragraph.getListFormat().setListLevelNumber(paragraph.getListFormat().getListLevelNumber() + 0);
            }

            mListLevelToPageLookup.put(currentLevel, mPageNumberFinder.GetPage(paragraph));
            mListLevelToListNumberLookup.put(currentLevel, paragraph.getListLabel().getLabelValue());
        }

        Section prevSection = (Section) paragraph.getParentSection().getPreviousSibling();

        Paragraph prevBodyPara = null;
        if (paragraph.getPreviousSibling() != null && paragraph.getPreviousSibling().getNodeType() == NodeType.PARAGRAPH) {
            prevBodyPara = (Paragraph) paragraph.getPreviousSibling();
        }

        Paragraph prevSectionPara = prevSection != null && paragraph == paragraph.getParentSection().getBody().getFirstChild()
                ? prevSection.getBody().getLastParagraph()
                : null;
        Paragraph prevParagraph = prevBodyPara != null ? prevBodyPara : prevSectionPara;

        if (paragraph.isEndOfSection() && !paragraph.hasChildNodes()) {
            paragraph.remove();
        }

        // Paragraphs across pages can merge or remove spacing depending upon the previous paragraph.
        if (prevParagraph != null
                && mPageNumberFinder.GetPage(paragraph) != mPageNumberFinder.GetPageEnd(prevParagraph)) {
            ParagraphFormat format = paragraph.getParagraphFormat();
            ParagraphFormat prevFormat = prevParagraph.getParagraphFormat();

            if (paragraph.isListItem() && prevParagraph.isListItem() && !prevParagraph.isEndOfSection()) {
                prevFormat.setSpaceAfter(0);
            } else if (SameText(prevFormat.getStyleName(), format.getStyleName())
                    && format.getNoSpaceBetweenParagraphsOfSameStyle()) {
                // Style names are compared by value; reference comparison (==) is not reliable for strings.
                format.setSpaceBefore(0);
            } else if (format.getPageBreakBefore()
                    || (prevParagraph.isEndOfSection()
                        // prevSection is null when the paragraph lives in the first section.
                        && prevSection != null
                        && prevSection.getPageSetup().getSectionStart() != SectionStart.NEW_COLUMN)) {
                format.setSpaceBefore(Math.max(format.getSpaceBefore() - prevFormat.getSpaceAfter(), 0));
            } else {
                format.setSpaceBefore(0);
            }
        }

        return VisitorAction.CONTINUE;
    }

    public int visitSectionStart(Section section) throws Exception {
        mSectionCount++;
        Section previousSection = (Section) section.getPreviousSibling();

        // If there is a previous section attempt to copy any linked header footers otherwise they will not appear in an
        // extracted document if the previous section is missing.
        if (previousSection != null) {
            if (!section.getPageSetup().getRestartPageNumbering()) {
                section.getPageSetup().setRestartPageNumbering(true);
                section.getPageSetup().setPageStartingNumber(
                        previousSection.getPageSetup().getPageStartingNumber() + mPageNumberFinder.PageSpan(previousSection));
            }

            for (HeaderFooter previousHeaderFooter : previousSection.getHeadersFooters()) {
                if (section.getHeadersFooters().getByHeaderFooterType(previousHeaderFooter.getHeaderFooterType()) == null) {
                    HeaderFooter newHeaderFooter = (HeaderFooter) previousSection.getHeadersFooters()
                            .getByHeaderFooterType(previousHeaderFooter.getHeaderFooterType()).deepClone(true);
                    section.getHeadersFooters().add(newHeaderFooter);
                }
            }
        }

        // Manually set the result of these fields before sections are split.
        for (HeaderFooter headerFooter : section.getHeadersFooters()) {
            for (Field field : headerFooter.getRange().getFields()) {
                if (field.getType() == FieldType.FIELD_SECTION || field.getType() == FieldType.FIELD_SECTION_PAGES) {
                    field.setResult((field.getType() == FieldType.FIELD_SECTION)
                            ? Integer.toString(mSectionCount)
                            : Integer.toString(mPageNumberFinder.PageSpan(section)));
                    field.isLocked(true);
                }
            }
        }

        // All fields in the body should stay the same, this also improves field update time.
        for (Field field : section.getBody().getRange().getFields()) {
            field.isLocked(true);
        }

        return VisitorAction.CONTINUE;
    }

    public int visitDocumentEnd(Document doc) throws Exception {
        // All sections have separate headers and footers now, update the fields in all headers and footers
        // to the correct values. This allows each page to maintain the correct field results even when
        // PAGE or IF fields are used.
        doc.updateFields();

        for (HeaderFooter headerFooter : (Iterable<HeaderFooter>) doc.getChildNodes(NodeType.HEADER_FOOTER, true)) {
            for (Field field : headerFooter.getRange().getFields()) {
                field.isLocked(true);
            }
        }

        return VisitorAction.CONTINUE;
    }

    public int visitSmartTagEnd(SmartTag smartTag) throws Exception {
        if (IsCompositeAcrossPage(smartTag)) {
            SplitComposite(smartTag);
        }

        return VisitorAction.CONTINUE;
    }

    // public int visitCustomXmlMarkupEnd(CustomXmlMarkup customXmlMarkup) throws Exception {
    //     if (IsCompositeAcrossPage(customXmlMarkup))
    //         SplitComposite(customXmlMarkup);
    //
    //     return VisitorAction.CONTINUE;
    // }

    public int visitStructuredDocumentTagEnd(StructuredDocumentTag sdt) throws Exception {
        if (IsCompositeAcrossPage(sdt)) {
            SplitComposite(sdt);
        }

        return VisitorAction.CONTINUE;
    }

    public int visitCellEnd(Cell cell) throws Exception {
        if (IsCompositeAcrossPage(cell)) {
            SplitComposite(cell);
        }

        return VisitorAction.CONTINUE;
    }

    public int visitRowEnd(Row row) throws Exception {
        if (IsCompositeAcrossPage(row)) {
            SplitComposite(row);
        }

        return VisitorAction.CONTINUE;
    }

    public int visitTableEnd(Table table) throws Exception {
        if (IsCompositeAcrossPage(table)) {
            // Copy any header rows to other pages. The rows are captured before the split
            // moves some of them into the cloned tables.
            Row[] rows = table.getRows().toArray();

            for (Table cloneTable : (Iterable<Table>) SplitComposite(table)) {
                // Each heading row is prepended, so walk the rows last-to-first; walking
                // first-to-last would leave multi-row headings in reverse order.
                for (int i = rows.length - 1; i >= 0; i--) {
                    if (rows[i].getRowFormat().getHeadingFormat()) {
                        cloneTable.prependChild(rows[i].deepClone(true));
                    }
                }
            }
        }

        return VisitorAction.CONTINUE;
    }

    public int visitParagraphEnd(Paragraph paragraph) throws Exception {
        if (IsCompositeAcrossPage(paragraph)) {
            for (Paragraph clonePara : (Iterable<Paragraph>) SplitComposite(paragraph)) {
                // Remove list numbering from the cloned paragraph but leave the indent the same
                // as the paragraph is supposed to be part of the item before.
                if (paragraph.isListItem()) {
                    double textPosition = clonePara.getListFormat().getListLevel().getTextPosition();
                    clonePara.getListFormat().removeNumbers();
                    clonePara.getParagraphFormat().setLeftIndent(textPosition);
                }

                // Reset spacing of split paragraphs as additional spacing is removed.
                clonePara.getParagraphFormat().setSpaceBefore(0);
                paragraph.getParagraphFormat().setSpaceAfter(0);
            }
        }

        return VisitorAction.CONTINUE;
    }

    public int visitSectionEnd(Section section) throws Exception {
        if (IsCompositeAcrossPage(section)) {
            // If a TOC field spans across more than one page then the hyperlink formatting may show through.
            // Remove direct formatting to avoid this.
            for (FieldStart start : (Iterable<FieldStart>) section.getChildNodes(NodeType.FIELD_START, true)) {
                if (start.getFieldType() == FieldType.FIELD_TOC) {
                    Field field = start.getField();
                    Node fieldEnd = field.getEnd();
                    Node node = field.getSeparator();

                    // Stop when the traversal runs out of nodes under this section as well as at the field end,
                    // so a missing separator or an end node outside the section cannot cause a null dereference.
                    while (node != null && (node = node.nextPreOrder(section)) != null && node != fieldEnd) {
                        if (node.getNodeType() == NodeType.RUN) {
                            ((Run) node).getFont().clearFormatting();
                        }
                    }
                }
            }

            for (Section cloneSection : (Iterable<Section>) SplitComposite(section)) {
                cloneSection.getPageSetup().setSectionStart(SectionStart.NEW_PAGE);
                cloneSection.getPageSetup().setRestartPageNumbering(true);
                cloneSection.getPageSetup().setPageStartingNumber(section.getPageSetup().getPageStartingNumber()
                        + (section.getDocument().indexOf(cloneSection) - section.getDocument().indexOf(section)));
                cloneSection.getPageSetup().setDifferentFirstPageHeaderFooter(false);

                RemovePageBreaksFromParagraph(cloneSection.getBody().getLastParagraph());
            }

            RemovePageBreaksFromParagraph(section.getBody().getLastParagraph());

            // Add new page numbering for the body of the section as well.
            mPageNumberFinder.AddPageNumbersForNode(section.getBody(),
                    mPageNumberFinder.GetPage(section), mPageNumberFinder.GetPageEnd(section));
        }

        return VisitorAction.CONTINUE;
    }

    /**
     * Returns true when the composite spans more than one page.
     */
    private boolean IsCompositeAcrossPage(CompositeNode composite) throws Exception {
        return (mPageNumberFinder.PageSpan(composite) > 1);
    }

    /**
     * Returns true when the paragraph's list level was already seen and was last recorded on a different page.
     */
    private boolean ContainsListLevelAndPageChanged(Paragraph para) throws Exception {
        Integer lastPage = mListLevelToPageLookup.get(para.getListFormat().getListLevel());
        return lastPage != null && lastPage.intValue() != mPageNumberFinder.GetPage(para);
    }

    /**
     * Strips page-break characters from every run of the paragraph; a null paragraph is ignored.
     */
    private void RemovePageBreaksFromParagraph(Paragraph para) throws Exception {
        if (para != null) {
            for (Run run : para.getRuns()) {
                run.setText(run.getText().replace(ControlChar.PAGE_BREAK, ""));
            }
        }
    }

    /**
     * Null-safe value comparison of two strings.
     */
    private static boolean SameText(String first, String second) {
        return first == null ? second == null : first.equals(second);
    }

    /**
     * Splits the composite at every page boundary found among its children.
     *
     * @return the cloned nodes created by the split, in the order they were created (last page first).
     */
    private ArrayList SplitComposite(CompositeNode composite) throws Exception {
        ArrayList<CompositeNode> splitNodes = new ArrayList<CompositeNode>();
        for (Node splitNode : FindChildSplitPositions(composite)) {
            splitNodes.add(SplitCompositeAtNode(composite, splitNode));
        }

        return splitNodes;
    }

    /**
     * Finds the children at which the composite must be split: each child that starts on a later page
     * than the children before it. For a section the children of its body are examined.
     */
    private ArrayList<Node> FindChildSplitPositions(CompositeNode node) throws Exception {
        // A node may span across multiple pages so a list of split positions is returned.
        // The split node is the first node on the next page.
        ArrayList<Node> splitList = new ArrayList<Node>();

        int startingPage = mPageNumberFinder.GetPage(node);

        Node[] childNodes = node.getNodeType() == NodeType.SECTION
                ? ((Section) node).getBody().getChildNodes().toArray()
                : node.getChildNodes().toArray();

        for (Node childNode : childNodes) {
            int pageNum = mPageNumberFinder.GetPage(childNode);

            // If the page of the child node has changed then this is the split position. Add
            // this to the list.
            if (pageNum > startingPage) {
                splitList.add(childNode);
                startingPage = pageNum;
            }

            if (mPageNumberFinder.PageSpan(childNode) > 1) {
                mPageNumberFinder.AddPageNumbersForNode(childNode, pageNum, pageNum);
            }
        }

        // Split composites backward so the cloned nodes are inserted in the right order.
        Collections.reverse(splitList);

        return splitList;
    }

    /**
     * Moves {@code targetNode} and everything after it into a clone of {@code baseNode}, inserts the clone
     * right after {@code baseNode} and records the new page numbers of both.
     */
    private CompositeNode SplitCompositeAtNode(CompositeNode baseNode, Node targetNode) throws Exception {
        CompositeNode cloneNode = (CompositeNode) baseNode.deepClone(false);

        Node node = targetNode;
        int currentPageNum = mPageNumberFinder.GetPage(baseNode);

        // Move all nodes found on the next page into the copied node. Handle row nodes separately.
        if (baseNode.getNodeType() != NodeType.ROW) {
            CompositeNode composite = cloneNode;

            if (baseNode.getNodeType() == NodeType.SECTION) {
                // A section is cloned deeply and only its body is emptied; the moved nodes go into that body.
                cloneNode = (CompositeNode) baseNode.deepClone(true);
                Section section = (Section) cloneNode;
                section.getBody().removeAllChildren();

                composite = section.getBody();
            }

            while (node != null) {
                Node nextNode = node.getNextSibling();
                composite.appendChild(node);
                node = nextNode;
            }
        } else {
            // If we are dealing with a row then we need to add in dummy cells for the cloned row.
            int targetPageNum = mPageNumberFinder.GetPage(targetNode);
            Node[] childNodes = baseNode.getChildNodes().toArray();

            for (Node childNode : childNodes) {
                int pageNum = mPageNumberFinder.GetPage(childNode);

                if (pageNum == targetPageNum) {
                    // NOTE(review): assumes a dummy cell was appended before this one - confirm for rows
                    // whose first cell already starts on the target page.
                    cloneNode.getLastChild().remove();
                    cloneNode.appendChild(childNode);
                } else if (pageNum == currentPageNum) {
                    cloneNode.appendChild(childNode.deepClone(false));
                    if (cloneNode.getLastChild().getNodeType() != NodeType.CELL) {
                        ((CompositeNode) cloneNode.getLastChild()).appendChild(
                                ((CompositeNode) childNode).getFirstChild().deepClone(false));
                    }
                }
            }
        }

        // Insert the split node after the original.
        baseNode.getParentNode().insertAfter(cloneNode, baseNode);

        // Update the new page numbers of the base node and the clone node including its descendents.
        // This will only be a single page as the cloned composite is split to be on one page.
        int currentEndPageNum = mPageNumberFinder.GetPageEnd(baseNode);
        mPageNumberFinder.AddPageNumbersForNode(baseNode, currentPageNum, currentEndPageNum - 1);
        mPageNumberFinder.AddPageNumbersForNode(cloneNode, currentEndPageNum, currentEndPageNum);

        for (Node childNode : (Iterable<Node>) cloneNode.getChildNodes(NodeType.ANY, true)) {
            mPageNumberFinder.AddPageNumbersForNode(childNode, currentEndPageNum, currentEndPageNum);
        }

        return cloneNode;
    }

    // Last known list number for each list level.
    private Hashtable<ListLevel, Integer> mListLevelToListNumberLookup = new Hashtable<ListLevel, Integer>();
    // Original list -> copy created for a later page.
    private Hashtable<List, List> mListToReplacementListLookup = new Hashtable<List, List>();
    // Page on which each list level was last seen.
    private Hashtable<ListLevel, Integer> mListLevelToPageLookup = new Hashtable<ListLevel, Integer>();
    private PageNumberFinder mPageNumberFinder;
    // Number of sections visited so far.
    private int mSectionCount;
}