package com.aspose.words.examples.loading_saving;
import com.aspose.words.*;
import com.aspose.words.examples.Utils;
import org.w3c.dom.Element;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Converts documentation stored in DOC files into a series of HTML documents. The output is in
 * a form that can then be compiled into a single compiled help file (CHM) by using
 * the Microsoft HTML Help Workshop application.
 */
public class Word2Help
{
    /**
     * Entry point: recreates the "Out" folder under the data directory, splits every DOC file found
     * in the data directory into topics, and writes the topic HTML files plus content.xml.
     *
     * @param args Not used.
     * @throws Exception If the output directory cannot be prepared or the conversion fails.
     */
    public static void main(String[] args) throws Exception
    {
        // The path to the documents directory.
        String dataDir = Utils.getDataDir(Word2Help.class);

        // Specifies the destination directory where the HTML files are output.
        File outPath = new File(dataDir, "Out");

        // Remove any existing output and recreate the Out folder.
        // listFiles() returns null when the path does not exist or is not a directory,
        // so it must be checked before iterating.
        File[] staleFiles = outPath.listFiles();
        if (staleFiles != null)
        {
            for (File file : staleFiles)
            {
                if (!file.delete())
                    System.out.println("Could not delete " + file.getAbsolutePath());
            }
        }
        outPath.mkdirs();
        if (!outPath.isDirectory())
            throw new IOException("Cannot create output directory: " + outPath.getAbsolutePath());
        String outDir = outPath.getAbsolutePath();

        // Specifies the part of the URLs to remove. If there are any hyperlinks that start
        // with the URL below, this URL is removed. This allows the document designer to include
        // links to the HTML API and they will be "corrected" so they work both in the online
        // HTML and also in the compiled CHM.
        String fixUrl = "";

        // *** LICENSING ***
        // An Aspose.Words license is required to use this project fully.
        // Without a license Aspose.Words will work in evaluation mode and truncate documents
        // and output watermarks.
        //
        // You can download a free 30-day trial license from the Aspose site. The easiest way to set the license is to
        // include the license in the executing directory and uncomment the following code.
        //
        // License license = new License();
        // license.setLicense("Aspose.Words.lic");

        System.out.println(MessageFormat.format("Extracting topics from {0}.", dataDir));

        // The data directory doubles as the HTML templates directory.
        TopicCollection topics = new TopicCollection(dataDir, fixUrl);
        topics.addFromDir(dataDir);
        topics.writeHtml(outDir);
        topics.writeContentXml(outDir);

        System.out.println("Conversion completed successfully.");
    }
}
/**
 * This "facade" class makes it easier to work with a hyperlink field in a Word document.
 *
 * A hyperlink is represented by a HYPERLINK field in a Word document. A field in Aspose.Words
 * consists of several nodes and it might be difficult to work with all those nodes directly.
 * This is a simple implementation: the setters assume that the node right after the field start
 * (field code) and the node right after the field separator (field result) are Run nodes.
 *
 * [FieldStart][Run - field code][FieldSeparator][Run - field result][FieldEnd]
 *
 * The field code contains a string in one of these formats:
 * HYPERLINK "url"
 * HYPERLINK \l "bookmark name"
 *
 * The field result contains text that is displayed to the user.
 */
class Hyperlink
{
    /**
     * Wraps the HYPERLINK field that begins at the given field start node and parses its field code.
     * If the field code does not match the expected format, the target stays null and the
     * hyperlink is reported as non-local.
     *
     * @param fieldStart The start node of a HYPERLINK field.
     * @throws IllegalArgumentException If fieldStart is null or is not a HYPERLINK field start.
     * @throws Exception If no field separator follows the field start among its siblings.
     */
    public Hyperlink(FieldStart fieldStart) throws Exception
    {
        if (fieldStart == null)
            throw new IllegalArgumentException("fieldStart");
        if (fieldStart.getFieldType() != FieldType.FIELD_HYPERLINK)
            throw new IllegalArgumentException("Field start type must be FieldHyperlink.");

        mFieldStart = fieldStart;

        // Find field separator node.
        mFieldSeparator = findNextSibling(mFieldStart, NodeType.FIELD_SEPARATOR);
        if (mFieldSeparator == null)
            throw new Exception("Cannot find field separator.");

        // Find field end node. Normally field end will always be found, but in the example document
        // there happens to be a paragraph break included in the hyperlink and this puts the field end
        // in the next paragraph. It will be much more complicated to handle fields which span several
        // paragraphs correctly, but in this case allowing field end to be null is enough for our purposes.
        mFieldEnd = findNextSibling(mFieldSeparator, NodeType.FIELD_END);

        // Field code looks something like [ HYPERLINK "http:\\www.myurl.com" ], but it can consist of several runs.
        String fieldCode = getTextSameParent(mFieldStart.getNextSibling(), mFieldSeparator);
        Matcher match = G_REGEX.matcher(fieldCode.trim());
        if (match.find())
        {
            // Group 1 is the optional "\l" switch, group 2 is the quoted target.
            mIsLocal = match.group(1) != null;
            mTarget = match.group(2);
        }
    }

    /**
     * Gets the display name of the hyperlink: the text of the nodes between the field separator
     * and the field end (or the end of the parent when the field end is in another paragraph).
     * The field separator node itself is skipped so that its control character is not part of the name.
     */
    public String getName() throws Exception
    {
        return getTextSameParent(mFieldSeparator.getNextSibling(), mFieldEnd);
    }

    /**
     * Sets the display name of the hyperlink.
     * Assumes the node right after the field separator is a Run.
     */
    public void setName(String value) throws Exception
    {
        // Hyperlink display name is stored in the field result which is a Run
        // node between field separator and field end.
        Run fieldResult = (Run)mFieldSeparator.getNextSibling();
        fieldResult.setText(value);

        // But sometimes the field result can consist of more than one run, delete these runs.
        removeSameParent(fieldResult.getNextSibling(), mFieldEnd);
    }

    /**
     * Gets the target url or bookmark name of the hyperlink.
     * Returns null when the field code could not be parsed and no target has been set.
     */
    public String getTarget() throws Exception
    {
        return mTarget;
    }

    /**
     * Sets the target url or bookmark name of the hyperlink and rewrites the field code.
     */
    public void setTarget(String value) throws Exception
    {
        mTarget = value;
        updateFieldCode();
    }

    /**
     * True if the hyperlink's target is a bookmark inside the document. False if the hyperlink is a url.
     */
    public boolean isLocal() throws Exception
    {
        return mIsLocal;
    }

    /**
     * Changes whether the target is a bookmark (true) or a url (false) and rewrites the field code.
     */
    public void setLocal(boolean value) throws Exception
    {
        mIsLocal = value;
        updateFieldCode();
    }

    /**
     * Rewrites the field code from the current target and local flag.
     * Assumes the node right after the field start is a Run.
     */
    private void updateFieldCode() throws Exception
    {
        // Field code is stored in a Run node between field start and field separator.
        Run fieldCode = (Run)mFieldStart.getNextSibling();
        fieldCode.setText(java.text.MessageFormat.format("HYPERLINK {0}\"{1}\"", ((mIsLocal) ? "\\l " : ""), mTarget));

        // But sometimes the field code can consist of more than one run, delete these runs.
        removeSameParent(fieldCode.getNextSibling(), mFieldSeparator);
    }

    /**
     * Goes through siblings starting from the start node (inclusive) until it finds a node of the
     * specified type. Returns null if there is no such sibling.
     */
    private static Node findNextSibling(Node start, int nodeType) throws Exception
    {
        for (Node node = start; node != null; node = node.getNextSibling())
        {
            if (node.getNodeType() == nodeType)
                return node;
        }
        return null;
    }

    /**
     * Retrieves text from start up to but not including the end node.
     * A null end means "up to the last sibling"; a null start yields an empty string.
     */
    private static String getTextSameParent(Node start, Node end) throws Exception
    {
        if ((start != null) && (end != null) && (start.getParentNode() != end.getParentNode()))
            throw new IllegalArgumentException("Start and end nodes are expected to have the same parent.");

        StringBuilder builder = new StringBuilder();
        // The null check stops the walk at the last sibling even if end is never reached.
        for (Node child = start; (child != null) && (child != end); child = child.getNextSibling())
            builder.append(child.getText());
        return builder.toString();
    }

    /**
     * Removes nodes from start up to but not including the end node.
     * Start and end are expected to have the same parent. A null end means "up to the last sibling".
     */
    private static void removeSameParent(Node start, Node end) throws Exception
    {
        if ((start != null) && (end != null) && (start.getParentNode() != end.getParentNode()))
            throw new IllegalArgumentException("Start and end nodes are expected to have the same parent.");

        Node curChild = start;
        while ((curChild != null) && (curChild != end))
        {
            // Remember the successor before detaching the node from its parent.
            Node nextChild = curChild.getNextSibling();
            curChild.remove();
            curChild = nextChild;
        }
    }

    // Field start, separator and end nodes of the wrapped field; mFieldEnd may be null (see constructor).
    private final Node mFieldStart;
    private final Node mFieldSeparator;
    private final Node mFieldEnd;

    // Parsed (or last set) target and local flag.
    private String mTarget;
    private boolean mIsLocal;

    private static final Pattern G_REGEX = Pattern.compile(
        "\\S+" + // One or more non spaces HYPERLINK or other word in other languages
        "\\s+" + // One or more spaces
        "(?:\"\"\\s+)?" + // Non capturing optional "" and one or more spaces, found in one of the customers files.
        "(\\\\l\\s+)?" + // Optional \l flag followed by one or more spaces
        "\"" + // One opening double quote
        "([^\"]+)" + // One or more chars except double quote (hyperlink target)
        "\"" // One closing double quote
    );
}
/**
 * Central storage for regular expressions used in the project.
 *
 * Every expression carries the inline flags (?is) (case-insensitive, dot matches line terminators)
 * in its pattern text. This keeps the behavior the same whether a caller uses the compiled
 * Pattern directly or passes {@code Pattern.pattern()} to a String-based API such as
 * {@code String.replaceFirst}, which ignores flags given to {@code Pattern.compile}.
 * Dot-all matching is needed because the matched HTML elements usually span several lines.
 */
class RegularExpressions
{
    // This class is static. No instance creation is allowed.
    private RegularExpressions() {}

    /**
     * Regular expression specifying html title (framing tags excluded).
     */
    public static Pattern getHtmlTitle() throws Exception
    {
        return HTML_TITLE;
    }

    /**
     * Regular expression specifying html head.
     */
    public static Pattern getHtmlHead() throws Exception
    {
        return HTML_HEAD;
    }

    /**
     * Regular expression specifying space right after div keyword in the first div declaration of html body.
     */
    public static Pattern getHtmlBodyDivStart() throws Exception
    {
        return HTML_BODY_DIV_START;
    }

    // Flags are both embedded in the pattern text and passed to compile() so that
    // Pattern.flags() and Pattern.pattern() each describe the full behavior.
    private static final int FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    private static final String HTML_TITLE_PATTERN = "(?is)(?<=\\<title\\>).*?(?=\\</title\\>)";
    private static final String HTML_HEAD_PATTERN = "(?is)\\<head\\>.*?\\</head\\>";
    // The look-behind must have a bounded length in Java, hence \s{0,200}.
    private static final String HTML_BODY_DIV_START_PATTERN = "(?is)(?<=\\<body\\>\\s{0,200}\\<div)\\s";

    // Compiled once at class initialization; Pattern instances are immutable and safe to share.
    private static final Pattern HTML_TITLE = Pattern.compile(HTML_TITLE_PATTERN, FLAGS);
    private static final Pattern HTML_HEAD = Pattern.compile(HTML_HEAD_PATTERN, FLAGS);
    private static final Pattern HTML_BODY_DIV_START = Pattern.compile(HTML_BODY_DIV_START_PATTERN, FLAGS);
}
/**
 * Represents a single topic that will be written as an HTML file.
 */
class TopicWord2Help
{
    /**
     * Creates a topic from a section of the source document.
     *
     * The section is imported into a new document, its first paragraph (which must use one of the
     * Heading 1-9 styles and contain text) becomes the topic title and is removed from the body,
     * and hyperlinks inside the topic are rewritten (see fixHyperlinks).
     *
     * @param section The source section; its first body node must be a heading paragraph.
     * @param fixUrl The url prefix to strip from external hyperlinks; empty means "strip nothing".
     * @throws Exception If the section does not start with a non-empty heading paragraph
     * or a hyperlink cannot be processed.
     */
    public TopicWord2Help(Section section, String fixUrl) throws Exception
    {
        mTopicDoc = new Document();
        mTopicDoc.appendChild(mTopicDoc.importNode(section, true, ImportFormatMode.KEEP_SOURCE_FORMATTING));
        // Drop the section that the new document was created with; the imported one is now first.
        mTopicDoc.getFirstSection().remove();

        // The first body node may be null or something other than a paragraph;
        // check before casting so the intended error message is reported.
        Node firstNode = mTopicDoc.getFirstSection().getBody().getFirstChild();
        if (!(firstNode instanceof Paragraph))
            throwTopicException("The section does not start with a paragraph.", section);
        Paragraph headingPara = (Paragraph)firstNode;

        // Zero-based heading level: Heading 1 -> 0 ... Heading 9 -> 8.
        mHeadingLevel = headingPara.getParagraphFormat().getStyleIdentifier() - StyleIdentifier.HEADING_1;
        if ((mHeadingLevel < 0) || (mHeadingLevel > 8))
            throwTopicException("This topic does not start with a heading style paragraph.", section);

        mTitle = headingPara.getText().trim();
        if ("".equals(mTitle))
            throwTopicException("This topic heading does not have text.", section);

        // We actually remove the heading paragraph because <h1> will be output in the banner.
        headingPara.remove();

        mTopicDoc.getBuiltInDocumentProperties().setTitle(mTitle);

        fixHyperlinks(section.getDocument(), fixUrl);
    }

    /**
     * Always throws an exception made of the message and up to the first 50 characters of the section text.
     */
    private static void throwTopicException(String message, Section section) throws Exception
    {
        String sectionText = section.getBody().toString(SaveFormat.TEXT);
        // The section text can be shorter than 50 characters.
        int previewLength = Math.min(sectionText.length(), 50);
        throw new Exception(message + " Section text: " + sectionText.substring(0, previewLength));
    }

    /**
     * Rewrites the hyperlinks of the topic document.
     *
     * Local hyperlinks (to a bookmark) are turned into links to the html file of the topic whose
     * heading paragraph contains the bookmark in the original document. External hyperlinks that
     * start with fixUrl followed by "/" have that prefix removed.
     *
     * @param originalDoc The document the topic was taken from; used to resolve bookmarks.
     * @param fixUrl The url prefix to strip, with or without a trailing "/"; null or empty disables stripping.
     */
    private void fixHyperlinks(DocumentBase originalDoc, String fixUrl) throws Exception
    {
        String baseUrl = (fixUrl == null) ? "" : fixUrl;
        if (baseUrl.endsWith("/"))
            baseUrl = baseUrl.substring(0, baseUrl.length() - 1);
        // With an empty base url every target would "start with" it and lose its first character,
        // so stripping is disabled in that case. The "/" makes sure only whole path prefixes match.
        String urlPrefix = (baseUrl.length() > 0) ? (baseUrl + "/") : null;

        NodeCollection fieldStarts = mTopicDoc.getChildNodes(NodeType.FIELD_START, true);
        for (FieldStart fieldStart : (Iterable<FieldStart>) fieldStarts)
        {
            if (fieldStart.getFieldType() != FieldType.FIELD_HYPERLINK)
                continue;

            Hyperlink hyperlink = new Hyperlink(fieldStart);
            if (hyperlink.isLocal())
            {
                // We use "Hyperlink to a place in this document" feature of Microsoft Word
                // to create local hyperlinks between topics within the same doc file.
                // It causes MS Word to auto generate the bookmark name.
                String bmkName = hyperlink.getTarget();

                // But we have to follow the bookmark to get the text of the topic heading paragraph
                // in order to be able to build the proper filename of the topic file.
                Bookmark bmk = originalDoc.getRange().getBookmarks().get(bmkName);
                if (bmk == null)
                    throw new Exception(MessageFormat.format("Found a link to a bookmark, but cannot locate the bookmark. Name:{0}.", bmkName));

                Paragraph para = (Paragraph)bmk.getBookmarkStart().getParentNode();
                String topicName = para.getText().trim();

                hyperlink.setTarget(headingToFileName(topicName) + ".html");
                hyperlink.setLocal(false);
            }
            else
            {
                // We "fix" URL like this:
                // http://www.aspose.com/Products/Aspose.Words/Api/Aspose.Words.Body.html
                // by changing them into this:
                // Aspose.Words.Body.html
                // The target is null when the field code could not be parsed; such links are left alone.
                String target = hyperlink.getTarget();
                if ((urlPrefix != null) && (target != null) &&
                    target.startsWith(urlPrefix) && (target.length() > urlPrefix.length()))
                {
                    hyperlink.setTarget(target.substring(urlPrefix.length()));
                }
            }
        }
    }

    /**
     * Saves the topic as an HTML file in outDir and then patches the saved HTML: the head element
     * is replaced by htmlHeader (with the topic title filled in), the first div of the body gets
     * id="nstext", the banner is inserted right after the opening body tag and the footer right before the closing one.
     *
     * @param htmlHeader The replacement head element; the content of its title element is replaced by the topic title.
     * @param htmlBanner The banner html; every ###TOPIC_NAME### is replaced by the topic title.
     * @param htmlFooter The footer html.
     * @param outDir The output directory.
     */
    public void writeHtml(String htmlHeader, String htmlBanner, String htmlFooter, String outDir) throws Exception
    {
        String fileName = new File(outDir, getFileName()).getAbsolutePath();

        HtmlSaveOptions saveOptions = new HtmlSaveOptions();
        saveOptions.setPrettyFormat(true);
        // This is to allow headings to appear to the left of main text.
        saveOptions.setAllowNegativeLeftIndent(true);
        // Disable headers and footers.
        saveOptions.setExportHeadersFootersMode(ExportHeadersFootersMode.NONE);

        // Export the document to HTML.
        mTopicDoc.save(fileName, saveOptions);

        // We need to modify the HTML string, read HTML back.
        String html = readTextFile(fileName);

        // The title comes from document text, so it has to be escaped before it is placed into markup.
        String htmlTitle = escapeHtml(mTitle);

        // Builds the HTML <head> element.
        String header = replaceFirstLiteral(htmlHeader, RegularExpressions.getHtmlTitle(), htmlTitle);

        // Applies the new <head> element instead of the original one.
        html = replaceFirstLiteral(html, RegularExpressions.getHtmlHead(), header);
        // The expression consumes the whitespace after "<div", so the replacement ends with a space
        // to keep the id separated from the attribute that follows.
        html = replaceFirstLiteral(html, RegularExpressions.getHtmlBodyDivStart(), " id=\"nstext\" ");

        String banner = htmlBanner.replace("###TOPIC_NAME###", htmlTitle);

        // Add the standard banner.
        html = html.replace("<body>", "<body>" + banner);

        // Add the standard footer.
        html = html.replace("</body>", htmlFooter + "</body>");

        // NOTE(review): bytes are decoded and encoded with the platform default charset, as before;
        // confirm this matches the encoding of the saved HTML and of the templates.
        FileOutputStream writer = new FileOutputStream(fileName);
        try
        {
            writer.write(html.getBytes());
        }
        finally
        {
            writer.close();
        }
    }

    /**
     * Replaces the first match of the pattern with the replacement taken literally
     * ("$" and "\" in the replacement have no special meaning). The pattern's own flags are
     * honored and dot-all is added because the matched html elements can span several lines.
     */
    private static String replaceFirstLiteral(String input, Pattern pattern, String replacement)
    {
        Pattern effective = Pattern.compile(pattern.pattern(), pattern.flags() | Pattern.DOTALL);
        return effective.matcher(input).replaceFirst(Matcher.quoteReplacement(replacement));
    }

    /**
     * Escapes the characters that have a special meaning in html text and attribute values.
     */
    private static String escapeHtml(String text)
    {
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++)
        {
            char c = text.charAt(i);
            switch (c)
            {
                case '&': escaped.append("&amp;"); break;
                case '<': escaped.append("&lt;"); break;
                case '>': escaped.append("&gt;"); break;
                case '"': escaped.append("&quot;"); break;
                default: escaped.append(c);
            }
        }
        return escaped.toString();
    }

    /**
     * Reads the whole file into a string using the platform default charset.
     * Reads in a loop because a single read() call is not guaranteed to fill the buffer.
     */
    private static String readTextFile(String fileName) throws IOException
    {
        FileInputStream reader = new FileInputStream(fileName);
        try
        {
            ByteArrayOutputStream content = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int count;
            while ((count = reader.read(chunk)) != -1)
                content.write(chunk, 0, count);
            return content.toString();
        }
        finally
        {
            reader.close();
        }
    }

    /**
     * Removes every character that is not a letter or a digit from the heading
     * to form a file name that does not require escaping.
     */
    private static String headingToFileName(String heading) throws Exception
    {
        StringBuilder b = new StringBuilder();
        for (int i = 0; i < heading.length(); i++)
        {
            char c = heading.charAt(i);
            if (Character.isLetterOrDigit(c))
                b.append(c);
        }
        return b.toString();
    }

    /**
     * Gets the document that holds the topic content.
     */
    public Document getDocument() throws Exception { return mTopicDoc; }

    /**
     * Gets the name of the topic html file without path.
     */
    public String getFileName() throws Exception { return headingToFileName(mTitle) + ".html"; }

    /**
     * Gets the trimmed text of the topic heading.
     */
    public String getTitle() throws Exception { return mTitle; }

    /**
     * Gets the zero-based heading level of the topic (Heading 1 is level 0).
     */
    public int getHeadingLevel() throws Exception { return mHeadingLevel; }

    /**
     * Returns true if the topic has no text (the heading paragraph has already been removed from the topic).
     */
    public boolean isHeadingOnly() throws Exception
    {
        Body body = mTopicDoc.getFirstSection().getBody();
        return (body.getFirstParagraph() == null);
    }

    private final Document mTopicDoc;
    private final String mTitle;
    private final int mHeadingLevel;
}
/**
 * This is the main class.
 * Loads Word document(s), splits them into topics, saves HTML files and builds content.xml.
 */
class TopicCollection
{
    /**
     * Ctor. Reads the three html templates into memory.
     *
     * @param htmlTemplatesDir The directory that contains header.html, banner.html and footer.html files,
     * with or without a trailing path separator.
     *
     * @param fixUrl The url that will be removed from any hyperlinks that start with this url.
     * This allows turning some absolute URLS into relative ones.
     *
     * @throws Exception If a template file cannot be read.
     */
    public TopicCollection(String htmlTemplatesDir, String fixUrl) throws Exception
    {
        mTopics = new ArrayList<TopicWord2Help>();
        mFixUrl = fixUrl;
        mHtmlHeader = readFile(templatePath(htmlTemplatesDir, "header.html"));
        mHtmlBanner = readFile(templatePath(htmlTemplatesDir, "banner.html"));
        mHtmlFooter = readFile(templatePath(htmlTemplatesDir, "footer.html"));
    }

    /**
     * Builds the path of a template file. Unlike plain string concatenation this does not
     * depend on the directory name ending with a path separator.
     */
    private static String templatePath(String dir, String name)
    {
        // new File("", name) would resolve against the file system root, so an empty
        // directory keeps meaning "relative to the current directory".
        if ((dir == null) || (dir.length() == 0))
            return name;
        return new File(dir, name).getPath();
    }

    /**
     * Processes all DOC files found in the specified directory.
     * Loads and splits each document into separate topics.
     * Files are processed in path order so that the topic order does not depend on the file system.
     *
     * @throws FileNotFoundException If dirName is not a directory that can be listed.
     */
    public void addFromDir(String dirName) throws Exception
    {
        FilenameFilter fileFilter = new FilenameFilter() {
            public boolean accept(File dir, String name) {
                // Accept the extension in any letter case (e.g. "Manual.DOC").
                return name.toLowerCase(java.util.Locale.ROOT).endsWith(".doc");
            }
        };

        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] docFiles = new File(dirName).listFiles(fileFilter);
        if (docFiles == null)
            throw new FileNotFoundException("Cannot list directory: " + dirName);

        // The order returned by listFiles() is unspecified.
        java.util.Arrays.sort(docFiles);

        for (File docFile : docFiles)
            addFromFile(docFile.getAbsolutePath());
    }

    /**
     * Processes a specified DOC file. Loads and splits into topics.
     */
    public void addFromFile(String fileName) throws Exception
    {
        Document doc = new Document(fileName);
        insertTopicSections(doc);
        addTopics(doc);
    }

    /**
     * Saves all topics as HTML files. Topics that consist of a heading only are skipped.
     */
    public void writeHtml(String outDir) throws Exception
    {
        for (TopicWord2Help topic : mTopics)
        {
            if (!topic.isHeadingOnly())
                topic.writeHtml(mHtmlHeader, mHtmlBanner, mHtmlFooter, outDir);
        }
    }

    /**
     * Saves the content.xml file that describes the tree of topics.
     *
     * A topic followed by a deeper topic becomes a "book" element that contains the following
     * topics; every other topic becomes an "item" element.
     *
     * @throws Exception If a topic is nested more than one level deeper than the previous one
     * or the file cannot be written.
     */
    public void writeContentXml(String outDir) throws Exception
    {
        DocumentBuilderFactory fact = DocumentBuilderFactory.newInstance();
        javax.xml.parsers.DocumentBuilder parser = fact.newDocumentBuilder();
        org.w3c.dom.Document doc = parser.newDocument();

        Element root = doc.createElement("content");
        root.setAttribute("dir", outDir);
        doc.appendChild(root);

        Element currentElement = root;
        for (int i = 0; i < mTopics.size(); i++)
        {
            TopicWord2Help topic = mTopics.get(i);

            int nextTopicIdx = i + 1;
            TopicWord2Help nextTopic = (nextTopicIdx < mTopics.size()) ? mTopics.get(nextTopicIdx) : null;

            // After the last topic everything is closed back to level 0.
            int nextHeadingLevel = (nextTopic != null) ? nextTopic.getHeadingLevel() : 0;

            if (nextHeadingLevel > topic.getHeadingLevel())
            {
                // Next topic is nested, therefore we have to start a book.
                // We only allow increase level at a time.
                if (nextHeadingLevel != topic.getHeadingLevel() + 1)
                    throw new Exception("Topic is nested for more than one level at a time. Title: " + topic.getTitle());

                currentElement = writeBookStart(currentElement, topic);
            }
            else if (nextHeadingLevel < topic.getHeadingLevel())
            {
                // Next topic is one or more levels higher in the outline.
                // Write out the current topic.
                writeItem(currentElement, topic.getTitle(), topic.getFileName());

                // One or more nested topics could have ended at this point.
                // Never climb above the root: its parent is the DOM document, not an element
                // (this can happen when the first topic does not use the top heading level).
                int levelsToClose = topic.getHeadingLevel() - nextHeadingLevel;
                while ((levelsToClose > 0) && (currentElement != root))
                {
                    currentElement = (Element)currentElement.getParentNode();
                    levelsToClose--;
                }
            }
            else
            {
                // A topic at the current level and it has no children.
                writeItem(currentElement, topic.getTitle(), topic.getFileName());
            }
        }

        // Prepare the DOM document for writing
        Source source = new DOMSource(doc);

        TransformerFactory tf = TransformerFactory.newInstance();
        try
        {
            tf.setAttribute("indent-number", 2); // Set the indentation for child elements.
        }
        catch (IllegalArgumentException ignored)
        {
            // The attribute is implementation specific; without it only the indent width differs.
        }

        // Export as XML.
        Transformer transformer = tf.newTransformer();
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");

        // Prepare the output file
        File file = new File(outDir, "content.xml");
        // UTF-8 encoding must be specified in order for the output to have proper indentation.
        Writer writer = new OutputStreamWriter(new FileOutputStream(file.getAbsolutePath()), "UTF-8");
        try
        {
            // Write the DOM document to disk.
            transformer.transform(source, new StreamResult(writer));
        }
        finally
        {
            // Closing the writer flushes it and releases the underlying file handle.
            writer.close();
        }
    }

    /**
     * Inserts section breaks that delimit the topics.
     *
     * Non-empty paragraphs styled Heading 1 up to MAX_TOPIC_HEADING become the first paragraph
     * of a section. Deeper headings (up to Heading 9) are promoted by one level.
     *
     * @param doc The document where to insert the section breaks.
     */
    private static void insertTopicSections(Document doc) throws Exception
    {
        DocumentBuilder builder = new DocumentBuilder(doc);

        NodeCollection paras = doc.getChildNodes(NodeType.PARAGRAPH, true, false);
        ArrayList<Paragraph> topicStartParas = new ArrayList<Paragraph>();

        for (Paragraph para : (Iterable<Paragraph>) paras)
        {
            int style = para.getParagraphFormat().getStyleIdentifier();
            if ((style >= StyleIdentifier.HEADING_1) && (style <= MAX_TOPIC_HEADING) &&
                (para.hasChildNodes()))
            {
                // Select heading paragraphs that must become topic starts.
                // We can't modify them in this loop, we have to remember them in an array first.
                topicStartParas.add(para);
            }
            else if ((style > MAX_TOPIC_HEADING) && (style <= StyleIdentifier.HEADING_9))
            {
                // Pull up headings. For example: if Heading 1-4 become topics, then I want Headings 5+
                // to become Headings 4+. Maybe I want to pull up even higher?
                para.getParagraphFormat().setStyleIdentifier(style - 1);
            }
        }

        for (Paragraph para : topicStartParas)
        {
            Section section = para.getParentSection();

            // Insert section break if the paragraph is not at the beginning of a section already.
            if (para != section.getBody().getFirstParagraph())
            {
                builder.moveTo(para.getFirstChild());
                builder.insertBreak(BreakType.SECTION_BREAK_NEW_PAGE);

                // This is the paragraph that was inserted at the end of the now old section.
                // We don't really need the extra paragraph, we just needed the section.
                section.getBody().getLastParagraph().remove();
            }
        }
    }

    /**
     * Goes through the sections in the document and adds them as topics to the collection.
     * A section that cannot be turned into a topic is reported on standard output and skipped.
     */
    private void addTopics(Document doc) throws Exception
    {
        for (Section section : doc.getSections())
        {
            try
            {
                TopicWord2Help topic = new TopicWord2Help(section, mFixUrl);
                mTopics.add(topic);
            }
            catch (Exception e)
            {
                // If one topic fails, we continue with others.
                // Some exceptions carry no message; fall back to the exception itself so the line is not just "null".
                System.out.println((e.getMessage() != null) ? e.getMessage() : e.toString());
            }
        }
    }

    /**
     * Appends a "book" element for the topic to the given element and returns the new element.
     * The href attribute is only written when the topic has content of its own.
     */
    private static Element writeBookStart(Element root, TopicWord2Help topic) throws Exception
    {
        Element book = root.getOwnerDocument().createElement("book");
        root.appendChild(book);
        book.setAttribute("name", topic.getTitle());

        if (!topic.isHeadingOnly())
            book.setAttribute("href", topic.getFileName());

        return book;
    }

    /**
     * Appends an "item" element with the given name and href to the given element.
     */
    private static void writeItem(Element root, String name, String href) throws Exception
    {
        Element item = root.getOwnerDocument().createElement("item");
        root.appendChild(item);
        item.setAttribute("name", name);
        item.setAttribute("href", href);
    }

    /**
     * Reads the whole file into a string using the platform default charset.
     * Reads in a loop because a single read() call is not guaranteed to fill the buffer.
     */
    private static String readFile(String fileName) throws Exception
    {
        FileInputStream reader = new FileInputStream(fileName);
        try
        {
            ByteArrayOutputStream content = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int count;
            while ((count = reader.read(chunk)) != -1)
                content.write(chunk, 0, count);
            return content.toString();
        }
        finally
        {
            reader.close();
        }
    }

    // Topics in document order.
    private final ArrayList<TopicWord2Help> mTopics;
    private final String mFixUrl;
    // Contents of header.html, banner.html and footer.html.
    private final String mHtmlHeader;
    private final String mHtmlBanner;
    private final String mHtmlFooter;

    /**
     * Specifies the maximum Heading X number.
     * All of the headings above or equal to this will be put into their own topics.
     */
    private static final int MAX_TOPIC_HEADING = StyleIdentifier.HEADING_4;
}