/* * ModeShape (http://www.modeshape.org) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.modeshape.sequencer.msoffice.word; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.model.StyleSheet; import org.apache.poi.hwpf.usermodel.Paragraph; import org.apache.poi.hwpf.usermodel.Range; import org.modeshape.common.logging.Logger; /** * Infers table of contents from Word document by reading all paragraphs with style <code>Heading*</code>. This is analogous to * the default behavior of Word when generating a table of contents. 
*/
public class WordMetadataReader {

    private static final Logger log = Logger.getLogger(WordMetadataReader.class);

    /** Prefix for styles that will be extracted and treated as outline information for the document */
    private static final String HEADER_PREFIX = "Heading";

    /**
     * Reads a Word document from the supplied stream and collects its headings and summary information.
     * <p>
     * Every paragraph whose style name starts with {@link #HEADER_PREFIX} is recorded as a heading. The heading level is the
     * integer that follows the prefix in the style name (for example <code>Heading 2</code> gives level 2); when no integer
     * can be parsed the level is 0. A single trailing carriage return is removed from the heading text.
     * </p>
     * <p>
     * This method does not itself close the supplied stream.
     * </p>
     * 
     * @param stream the stream from which the {@link HWPFDocument} is constructed
     * @return the metadata holding the collected headings and the document's summary information
     * @throws IOException if constructing the {@link HWPFDocument} from the stream fails with an I/O error
     */
    public static WordMetadata instance( InputStream stream ) throws IOException {
        WordMetadata metadata = new WordMetadata();
        List<WordMetadata.WordHeading> headings = new ArrayList<WordMetadata.WordHeading>();

        HWPFDocument document = new HWPFDocument(stream);
        Range range = document.getRange();
        StyleSheet stylesheet = document.getStyleSheet();

        // The range is not modified while iterating, so the paragraph count is read once.
        int paragraphCount = range.numParagraphs();
        for (int i = 0; i < paragraphCount; i++) {
            Paragraph paragraph = range.getParagraph(i);
            String styleName = stylesheet.getStyleDescription(paragraph.getStyleIndex()).getName();

            // Paragraphs without a style name, or with a non-heading style, are not part of the outline.
            if (styleName == null || !styleName.startsWith(HEADER_PREFIX)) {
                continue;
            }

            int levelNum = parseHeadingLevel(styleName);
            String text = stripTrailingCarriageReturn(Paragraph.stripFields(paragraph.text()));
            headings.add(new WordMetadata.WordHeading(text, levelNum));
        }

        metadata.setHeadings(headings);
        metadata.setMetadata(document.getSummaryInformation());
        return metadata;
    }

    /**
     * Extracts the heading level from a style name that starts with {@link #HEADER_PREFIX}.
     * <p>
     * Everything after the prefix is trimmed and parsed as an integer, so both <code>Heading 1</code> and
     * <code>Heading1</code> give 1. A bare <code>Heading</code>, or a non-numeric suffix, gives 0 rather than failing.
     * </p>
     * 
     * @param styleName a style name beginning with {@link #HEADER_PREFIX}
     * @return the parsed level, or 0 if the suffix is not an integer
     */
    private static int parseHeadingLevel( String styleName ) {
        // Taking the suffix right after the prefix (not prefix + 1) keeps this in bounds for a bare "Heading".
        String rawLevelNum = styleName.substring(HEADER_PREFIX.length()).trim();
        try {
            return Integer.parseInt(rawLevelNum);
        } catch (NumberFormatException nfe) {
            log.debug("Could not parse heading level from: " + styleName);
            return 0;
        }
    }

    /**
     * Removes a single trailing carriage return from the supplied text, if present.
     * 
     * @param text the paragraph text; may be empty
     * @return the text without its trailing <code>\r</code>, or the text unchanged if it does not end with one
     */
    private static String stripTrailingCarriageReturn( String text ) {
        // endsWith is safe on an empty string, unlike charAt(length() - 1).
        if (text.endsWith("\r")) {
            return text.substring(0, text.length() - 1);
        }
        return text;
    }
}