/*
* Copyright 2001-2015 Aspose Pty Ltd. All Rights Reserved.
*
* This file is part of Aspose.Words. The source code in this file
* is only intended as a supplement to the documentation, and is provided
* "as is", without warranty of any kind, either expressed or implied.
*/
package com.aspose.words.examples.loading_saving;
import com.aspose.words.*;
import com.aspose.words.examples.Utils;
import java.io.File;
import java.util.ArrayList;
public class SplitIntoHtmlPages
{
public static void main(String[] args) throws Exception
{
// You need to have a valid license for Aspose.Words.
// The best way is to embed the license as a resource into the project
// and specify only file name without path in the following call.
// Aspose.Words.License license = new Aspose.Words.License();
// license.SetLicense(@"Aspose.Words.lic");
// The path to the documents directory.
String dataDir = Utils.getDataDir(SplitIntoHtmlPages.class);
String srcFileName = dataDir + "SOI 2007-2012-DeeM with footnote added.doc";
String tocTemplate = dataDir + "TocTemplate.doc";
File outDir = new File(dataDir, "Out");
outDir.mkdirs();
// This class does the job.
Worker w = new Worker();
w.execute(srcFileName, tocTemplate, outDir.getPath());
System.out.println("Document split into HTML pages successfully.");
}
}
/**
* A custom data source for Aspose.Words mail merge.
* Returns topic objects.
*/
class TocMailMergeDataSource implements IMailMergeDataSource
{
TocMailMergeDataSource(ArrayList topics) throws Exception
{
mTopics = topics;
// Initialize to BOF.
mIndex = -1;
}
public boolean moveNext() throws Exception
{
if (mIndex < mTopics.size() - 1)
{
mIndex++;
return true;
}
else
{
// Reached EOF, return false.
return false;
}
}
public boolean getValue(String fieldName, Object[] fieldValue) throws Exception
{
if ("TocEntry".equals(fieldName))
{
// The template document is supposed to have only one field called "TocEntry".
fieldValue[0] = mTopics.get(mIndex);
return true;
}
else
{
fieldValue[0] = null;
return false;
}
}
public String getTableName() throws Exception { return "TOC"; }
public IMailMergeDataSource getChildDataSource(String tableName) throws Exception
{
return null;
}
private final ArrayList mTopics;
private int mIndex;
}
/**
* A simple class to hold a topic title and HTML file name together.
*/
class Topic
{
Topic(String title, String fileName) throws Exception
{
mTitle = title;
mFileName = fileName;
}
String getTitle() throws Exception { return mTitle; }
String getFileName() throws Exception { return mFileName; }
private final String mTitle;
private final String mFileName;
}
/**
*This class takes a Microsoft Word document, splits it into topics at paragraphs formatted
* with the Heading 1 style and saves every topic as an HTML file.
*
* Also generates contents.html file that provides links to all saved topics.
*/
class Worker
{
/**
* Performs the Word to HTML conversion.
*
* @param srcFileName The MS Word file to convert.
* @param tocTemplate An MS Word file that is used as a template to build
* a table of contents. This file needs to have a mail merge region called "TOC" defined
* and one mail merge field called "TocEntry".
* @param dstDir The output directory where to write HTML files. Must exist.
*/
void execute(String srcFileName, String tocTemplate, String dstDir) throws Exception
{
mDoc = new Document(srcFileName);
mTocTemplate = tocTemplate;
mDstDir = dstDir;
ArrayList topicStartParas = selectTopicStarts();
insertSectionBreaks(topicStartParas);
ArrayList topics = saveHtmlTopics();
saveTableOfContents(topics);
}
/**
* Selects heading paragraphs that must become topic starts.
* We can't modify them in this loop, we have to remember them in an array first.
*/
private ArrayList selectTopicStarts() throws Exception
{
NodeCollection paras = mDoc.getChildNodes(NodeType.PARAGRAPH, true, false);
ArrayList topicStartParas = new ArrayList();
for (Paragraph para : (Iterable<Paragraph>) paras)
{
int style = para.getParagraphFormat().getStyleIdentifier();
if (style == StyleIdentifier.HEADING_1)
topicStartParas.add(para);
}
return topicStartParas;
}
/**
* Inserts section breaks before the specified paragraphs.
*/
private void insertSectionBreaks(ArrayList topicStartParas) throws Exception
{
DocumentBuilder builder = new DocumentBuilder(mDoc);
for (Paragraph para : (Iterable<Paragraph>) topicStartParas)
{
Section section = para.getParentSection();
// Insert section break if the paragraph is not at the beginning of a section already.
if (para != section.getBody().getFirstParagraph())
{
builder.moveTo(para.getFirstChild());
builder.insertBreak(BreakType.SECTION_BREAK_NEW_PAGE);
// This is the paragraph that was inserted at the end of the now old section.
// We don't really need the extra paragraph, we just needed the section.
section.getBody().getLastParagraph().remove();
}
}
}
/**
* Splits the current document into one topic per section and saves each topic
* as an HTML file. Returns a collection of Topic objects.
*/
private ArrayList saveHtmlTopics() throws Exception
{
ArrayList topics = new ArrayList();
for (int sectionIdx = 0; sectionIdx < mDoc.getSections().getCount(); sectionIdx++)
{
Section section = mDoc.getSections().get(sectionIdx);
String paraText = section.getBody().getFirstParagraph().getText();
// The text of the heading paragaph is used to generate the HTML file name.
String fileName = makeTopicFileName(paraText);
if ("".equals(fileName))
fileName = "UNTITLED SECTION " + sectionIdx;
fileName = new File(mDstDir, fileName + ".html").getPath();
// The text of the heading paragraph is also used to generate the title for the TOC.
String title = makeTopicTitle(paraText);
if ("".equals(title))
title = "UNTITLED SECTION " + sectionIdx;
Topic topic = new Topic(title, fileName);
topics.add(topic);
saveHtmlTopic(section, topic);
}
return topics;
}
/**
* Leaves alphanumeric characters, replaces white space with underscore
* and removes all other characters from a string.
*/
private static String makeTopicFileName(String paraText) throws Exception
{
StringBuilder b = new StringBuilder();
for (int i = 0; i < paraText.length(); i++)
{
char c = paraText.charAt(i);
if (Character.isLetterOrDigit(c))
b.append(c);
else if (c == ' ')
b.append('_');
}
return b.toString();
}
/**
* Removes the last character (which is a paragraph break character from the given string).
*/
private static String makeTopicTitle(String paraText) throws Exception
{
return paraText.substring((0), (0) + (paraText.length() - 1));
}
/**
* Saves one section of a document as an HTML file.
* Any embedded images are saved as separate files in the same folder as the HTML file.
*/
private static void saveHtmlTopic(Section section, Topic topic) throws Exception
{
Document dummyDoc = new Document();
dummyDoc.removeAllChildren();
dummyDoc.appendChild(dummyDoc.importNode(section, true, ImportFormatMode.KEEP_SOURCE_FORMATTING));
dummyDoc.getBuiltInDocumentProperties().setTitle(topic.getTitle());
HtmlSaveOptions saveOptions = new HtmlSaveOptions();
saveOptions.setPrettyFormat(true);
// This is to allow headings to appear to the left of main text.
saveOptions.setAllowNegativeLeftIndent(true);
saveOptions.setExportHeadersFootersMode(ExportHeadersFootersMode.NONE);
dummyDoc.save(topic.getFileName(), saveOptions);
}
/**
* Generates a table of contents for the topics and saves to contents.html.
*/
private void saveTableOfContents(ArrayList topics) throws Exception
{
Document tocDoc = new Document(mTocTemplate);
// We use a custom mail merge even handler defined below.
tocDoc.getMailMerge().setFieldMergingCallback(new HandleTocMergeField());
// We use a custom mail merge data source based on the collection of the topics we created.
tocDoc.getMailMerge().executeWithRegions(new TocMailMergeDataSource(topics));
tocDoc.save(new File(mDstDir, "contents.html").getPath());
}
private class HandleTocMergeField implements IFieldMergingCallback
{
public void fieldMerging(FieldMergingArgs e) throws Exception
{
if (mBuilder == null)
mBuilder = new DocumentBuilder(e.getDocument());
// Our custom data source returns topic objects.
Topic topic = (Topic)e.getFieldValue();
// We use the document builder to move to the current merge field and insert a hyperlink.
mBuilder.moveToMergeField(e.getFieldName());
mBuilder.insertHyperlink(topic.getTitle(), topic.getFileName(), false);
// Signal to the mail merge engine that it does not need to insert text into the field
// as we've done it already.
e.setText("");
}
public void imageFieldMerging(ImageFieldMergingArgs args) throws Exception
{
// Do nothing.
}
private DocumentBuilder mBuilder;
}
private Document mDoc;
private String mTocTemplate;
private String mDstDir;
}