/******************************************************************************* * Copyright (c) 2009, Adobe Systems Incorporated * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * · Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * · Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * · Neither the name of Adobe Systems Incorporated nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ package com.adobe.dp.office.word; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.Hashtable; import java.util.Iterator; import java.util.Stack; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import com.adobe.dp.office.embedded.EmbeddedObject; import com.adobe.dp.office.types.BorderSide; import com.adobe.dp.office.types.FontFamily; import com.adobe.dp.office.types.Paint; import com.adobe.dp.office.types.RGBColor; import com.adobe.dp.office.vml.VMLElement; import com.adobe.dp.office.vml.VMLElementFactory; import com.adobe.dp.office.vml.VMLFormulasElement; import com.adobe.dp.office.vml.VMLShapeTypeElement; public class WordDocumentParser { static final String cpNS = "http://schemas.openxmlformats.org/package/2006/metadata/core-properties"; static final String dcNS = "http://purl.org/dc/elements/1.1/"; static final String dctNS = "http://purl.org/dc/terms/"; static final String wNS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"; static final String wpNS = "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"; static final String aNS = "http://schemas.openxmlformats.org/drawingml/2006/main"; static final String picNS = "http://schemas.openxmlformats.org/drawingml/2006/picture"; static final String vNS = "urn:schemas-microsoft-com:vml"; static final String rNS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"; static final String rPkNS = "http://schemas.openxmlformats.org/package/2006/relationships"; static final String hyperlinkRel = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"; static final String imageRel = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"; static final String stylesRel = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"; static final String numberingRel = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering"; static final String themeRel = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme"; static final String fontsRel = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable"; static final String footnotesRel = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes"; WordDocument doc; Hashtable rels; File docFile; ZipFile zip; Stack contextStack; SAXParserFactory factory; String stylesName; String fontsName; String themeName; String footnotesName; String numberingName; Hashtable fonts = new Hashtable(); String majorFontName; String minorFontName; Hashtable vmldefs = new Hashtable(); static final Hashtable propertyParsers = new Hashtable(); static final Hashtable colorTable = new Hashtable(); static { SimplePropertyParser simpleParser = new SimplePropertyParser(""); propertyParsers.put("vertAlign", simpleParser); propertyParsers.put("u", simpleParser); propertyParsers.put("jc", simpleParser); propertyParsers.put("lang", simpleParser); propertyParsers.put("vAlign", simpleParser); NumberPropertyParser numberParser = new NumberPropertyParser(); propertyParsers.put("sz", numberParser); propertyParsers.put("spacing-r", numberParser); OnOffPropertyParser onOffParser = new OnOffPropertyParser(); propertyParsers.put("b", onOffParser); propertyParsers.put("i", onOffParser); propertyParsers.put("webHidden", onOffParser); propertyParsers.put("strike", onOffParser); propertyParsers.put("keepNext", onOffParser); propertyParsers.put("keepLines", onOffParser); propertyParsers.put("pageBreakBefore", onOffParser); propertyParsers.put("contextualSpacing", onOffParser); IntegerPropertyParser integerParser = new IntegerPropertyParser(); propertyParsers.put("ilvl", integerParser); propertyParsers.put("outlineLvl", integerParser); propertyParsers.put("numId", integerParser); propertyParsers.put("gridSpan", integerParser); SpacingPropertyParser insetsParser = new SpacingPropertyParser(); propertyParsers.put("spacing", insetsParser); PaintPropertyParser paintParser = new PaintPropertyParser(); propertyParsers.put("color", paintParser); propertyParsers.put("highlight", paintParser); ShadingPropertyParser shdParser = new ShadingPropertyParser(); propertyParsers.put("shd", shdParser); FontsPropertyParser fontsParser = new FontsPropertyParser(); propertyParsers.put("rFonts", fontsParser); IndentParser indentParser = new IndentParser(); propertyParsers.put("ind", indentParser); FrameParser frameParser = new FrameParser(); propertyParsers.put("framePr", frameParser); colorTable.put("white", new RGBColor(0xFFFFFF)); colorTable.put("black", new RGBColor(0x000000)); colorTable.put("red", new RGBColor(0xFF0000)); colorTable.put("green", new RGBColor(0x00FF00)); colorTable.put("blue", new RGBColor(0x0000FF)); colorTable.put("yellow", new RGBColor(0xFFFF00)); colorTable.put("magenta", new RGBColor(0xFF00FF)); colorTable.put("cyan", new RGBColor(0x00FFFF)); colorTable.put("lightGray", new RGBColor(0xCCCCCC)); colorTable.put("gray", new RGBColor(0x999999)); colorTable.put("darkGray", new RGBColor(0x777777)); } static class ParseContext { Element parentElement; Style parentStyle; BaseProperties properties; BaseProperties borderProp; EmbeddedObject embedded; MetadataItem item; FontFamily font; String state; AbstractNumberingDefinition abstractNumberingDefinition; NumberingDefinitionInstance numberingDefinitionInstance; NumberingLevelDefinition numberingLevelDefinition; Integer ilvlOverride; } static class Relationship { String type; String target; String targetMode; Relationship(String type, String target, String targetMode) { this.target = target; this.type = type; this.targetMode = targetMode; } } abstract static class PropertyParser { abstract boolean parse(BaseProperties target, String localName, Attributes attributes, WordDocumentParser self); } static class SimplePropertyParser extends PropertyParser { SimplePropertyParser(Object defaultValue) { this.defaultValue = defaultValue; } boolean parse(BaseProperties target, String localName, Attributes attributes, WordDocumentParser self) { String propertyName = localName; Object propertyValue = attributes.getValue(wNS, "val"); if (propertyValue == null) propertyValue = defaultValue; target.put(propertyName, propertyValue); return true; } Object defaultValue; } static class OnOffPropertyParser extends PropertyParser { boolean parse(BaseProperties target, String localName, Attributes attributes, WordDocumentParser self) { String propertyName = localName; Object propertyValue; String val = attributes.getValue(wNS, "val"); if (val == null || val.equals("on")) propertyValue = Boolean.TRUE; else propertyValue = Boolean.FALSE; target.put(propertyName, propertyValue); return true; } } static class NumberPropertyParser extends PropertyParser { boolean parse(BaseProperties target, String localName, Attributes attributes, WordDocumentParser self) { String propertyName = localName; String val = attributes.getValue(wNS, "val"); if (val != null) { try { Object propertyValue = new Double(val); target.put(propertyName, propertyValue); return true; } catch (NumberFormatException e) { e.printStackTrace(); } } return false; } } static class IntegerPropertyParser extends PropertyParser { boolean parse(BaseProperties target, String localName, Attributes attributes, WordDocumentParser self) { String propertyName = localName; String val = attributes.getValue(wNS, "val"); if (val != null) { try { Object propertyValue = new Integer(val); target.put(propertyName, propertyValue); return true; } catch (NumberFormatException e) { e.printStackTrace(); } } return false; } } static abstract class CompoundPropertyParser extends PropertyParser { void parseComponent(BaseProperties target, String baseName, String componentName, Attributes attributes, WordDocumentParser self) { try { String before = attributes.getValue(wNS, componentName); if (before != null) { Object propertyValue = new Double(before); target.put(baseName + "-" + componentName, propertyValue); } } catch (NumberFormatException e) { e.printStackTrace(); } } } static class SpacingPropertyParser extends CompoundPropertyParser { boolean parse(BaseProperties target, String localName, Attributes attributes, WordDocumentParser self) { parseComponent(target, localName, "before", attributes, self); parseComponent(target, localName, "after", attributes, self); parseComponent(target, localName, "line", attributes, self); String lineRule = attributes.getValue(wNS, "lineRule"); if (lineRule != null) { target.put(localName + "-lineRule", lineRule); } return true; } } static class PaintPropertyParser extends PropertyParser { boolean parse(BaseProperties target, String localName, Attributes attributes, WordDocumentParser self) { String propertyName = localName; String val = attributes.getValue(wNS, "val"); Object propertyValue = parsePaint(val); if (propertyValue != null) { target.put(propertyName, propertyValue); return true; } return false; } } static class ShadingPropertyParser extends PropertyParser { boolean parse(BaseProperties target, String localName, Attributes attributes, WordDocumentParser self) { String propertyName = localName; String val = attributes.getValue(wNS, "fill"); if (val == null) return false; Object propertyValue = parsePaint(val); if (propertyValue != null) { target.put(propertyName, propertyValue); return true; } return false; } } static class FontsPropertyParser extends PropertyParser { boolean parse(BaseProperties target, String localName, Attributes attributes, WordDocumentParser self) { String propertyName = localName; String name = attributes.getValue(wNS, "ascii"); if (name == null) { String themeName = attributes.getValue(wNS, "asciiTheme"); if (themeName != null) { if (themeName.equals("majorHAnsi")) name = self.majorFontName; else if (themeName.equals("minorHAnsi")) name = self.minorFontName; } } if (name != null) { Object propertyValue = self.fonts.get(name); if (propertyValue != null) { target.put(propertyName, propertyValue); return true; } } return false; } } static class IndentParser extends CompoundPropertyParser { boolean parse(BaseProperties target, String localName, Attributes attributes, WordDocumentParser self) { parseComponent(target, localName, "left", attributes, self); parseComponent(target, localName, "right", attributes, self); parseComponent(target, localName, "firstLine", attributes, self); parseComponent(target, localName, "hanging", attributes, self); return true; } } static class FrameParser extends CompoundPropertyParser { boolean parse(BaseProperties target, String localName, Attributes attributes, WordDocumentParser self) { parseComponent(target, localName, "w", attributes, self); parseComponent(target, localName, "hSpace", attributes, self); parseComponent(target, localName, "vSpace", attributes, self); String align = attributes.getValue(wNS, "xAlign"); if (align != null) { target.put(localName + "-align", align); } return true; } } static Paint parsePaint(String val) { if (val == null) return null; if (val.equals("auto")) val = "black"; Paint propertyValue = (Paint) colorTable.get(val); if (propertyValue != null) return propertyValue; try { int ival = Integer.parseInt(val, 16); return new RGBColor(ival); } catch (NumberFormatException e) { e.printStackTrace(); } return null; } static PropertyParser getPropertyParser(String localName) { return (PropertyParser) propertyParsers.get(localName); } private BorderSide parseBorderSide(Attributes attributes) { String val = attributes.getValue(wNS, "val"); String szVal = attributes.getValue(wNS, "sz"); float sz = 0; if (szVal != null) try { sz = Float.parseFloat(szVal); } catch (NumberFormatException e) { e.printStackTrace(); } String spaceVal = attributes.getValue(wNS, "space"); float space = 0; if (spaceVal != null) try { space = Float.parseFloat(spaceVal); } catch (NumberFormatException e) { e.printStackTrace(); } String colorVal = attributes.getValue(wNS, "color"); Paint color = parsePaint(colorVal); return new BorderSide(val, sz, space, color); } class XMLHandler extends DefaultHandler implements DrawingElement.Context { XMLHandler(String prefix) { xrefPrefix = prefix; } String xrefPrefix; public void characters(char[] ch, int start, int length) throws SAXException { ParseContext context = (ParseContext) contextStack.peek(); Element p = context.parentElement; if (p instanceof TextElement) { ((TextElement) p).text += new String(ch, start, length); } if (context.item != null) { StringBuffer sb = new StringBuffer(); String value = context.item.getValue(); if (value != null) sb.append(value); sb.append(ch, start, length); context.item.setValue(sb.toString()); } } public void endElement(String uri, String localName, String qName) throws SAXException { ParseContext context = (ParseContext) contextStack.pop(); EmbeddedObject embedded = context.embedded; if (embedded != null) { embedded.finish(this); if (!contextStack.isEmpty()) { ParseContext parentContext = (ParseContext) contextStack.peek(); if (parentContext.embedded != null) parentContext.embedded.finishChild(this, embedded); } } if (context.item != null && context.item.getValue() != null) { doc.metadata.add(context.item); } if (uri.equals(vNS)) { if (!contextStack.isEmpty() && context.parentElement instanceof VMLFormulasElement) { ParseContext parentContext = (ParseContext) contextStack.peek(); if (parentContext.parentElement instanceof VMLShapeTypeElement) { ((VMLShapeTypeElement) parentContext.parentElement) .setFormulas((VMLFormulasElement) context.parentElement); } } } if (doc.defaultParagraphStyle != null && context.parentElement instanceof ParagraphElement) { ParagraphElement p = (ParagraphElement) context.parentElement; if (p.paragraphProperties == null) p.paragraphProperties = new ParagraphProperties(); if (p.paragraphProperties.paragraphStyle == null) p.paragraphProperties.paragraphStyle = doc.defaultParagraphStyle; } if (doc.defaultRunStyle != null && context.parentElement instanceof RunElement) { RunElement r = (RunElement) context.parentElement; if (r.runProperties == null) r.runProperties = new RunProperties(); if (r.runProperties.runStyle == null) r.runProperties.runStyle = doc.defaultRunStyle; } if (context.parentStyle != null) { Style style = context.parentStyle; if (style.parent == null) { if (style != doc.docDefaultParagraphStyle && style != doc.docDefaultRunStyle) { if (style.type != null) { if (style.type.equals("paragraph")) style.parent = doc.docDefaultParagraphStyle; } } } } } public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { ParseContext newContext = new ParseContext(); if (contextStack.isEmpty()) { if (uri.equals(wNS) && localName.equals("footnotes")) { Element p = createWordElement(localName, attributes); newContext.parentElement = p; doc.footnotes = (BodyElement) p; } } else { ParseContext parentContext = (ParseContext) contextStack.peek(); if (uri != null) { if (uri.equals(wNS)) { if (localName.equals("style")) { newContext.parentStyle = new Style(); String styleId = attributes.getValue(wNS, "styleId"); if (styleId != null) { newContext.parentStyle.styleId = styleId; doc.stylesById.put(styleId, newContext.parentStyle); } newContext.parentStyle.type = attributes.getValue(wNS, "type"); if (newContext.parentStyle.type != null) { String defStr = attributes.getValue(wNS, "default"); if (defStr != null && defStr.equals("1")) { if (newContext.parentStyle.type.equals("character")) doc.defaultRunStyle = newContext.parentStyle; else if (newContext.parentStyle.type.equals("paragraph")) doc.defaultParagraphStyle = newContext.parentStyle; } } } else if (localName.equals("abstractNum")) { newContext.abstractNumberingDefinition = new AbstractNumberingDefinition(); String abstractNumIdStr = attributes.getValue(wNS, "abstractNumId"); try { Integer abstractNumId = new Integer(abstractNumIdStr); doc.abstractNumberingDefinitions.put(abstractNumId, newContext.abstractNumberingDefinition); } catch (Exception e) { e.printStackTrace(); } } else if (localName.equals("num")) { String numIdStr = attributes.getValue(wNS, "numId"); try { Integer numId = new Integer(numIdStr); newContext.numberingDefinitionInstance = new NumberingDefinitionInstance(doc, numId .intValue()); doc.numberingDefinitions.put(numId, newContext.numberingDefinitionInstance); } catch (Exception e) { e.printStackTrace(); } } else if (localName.equals("lvlOverride")) { if (parentContext.numberingDefinitionInstance != null) { newContext.numberingDefinitionInstance = parentContext.numberingDefinitionInstance; String ilvlStr = attributes.getValue(wNS, "ilvl"); if (ilvlStr != null) { try { newContext.ilvlOverride = new Integer(ilvlStr); } catch (Exception e) { e.printStackTrace(); } } } } else if (localName.equals("startOverride")) { if (parentContext.ilvlOverride != null && parentContext.numberingDefinitionInstance != null) { String startStr = attributes.getValue(wNS, "val"); if (startStr != null) { try { Integer start = new Integer(startStr); parentContext.numberingDefinitionInstance.startOverrides.put( parentContext.ilvlOverride, start); } catch (Exception e) { e.printStackTrace(); } } } } else if (localName.equals("lvl")) { newContext.numberingLevelDefinition = new NumberingLevelDefinition(); String ilvlStr = attributes.getValue(wNS, "ilvl"); if (ilvlStr != null) { try { Integer ilvl = new Integer(ilvlStr); newContext.numberingLevelDefinition.lvl = ilvl.intValue(); newContext.numberingLevelDefinition.lvlRestart = newContext.numberingLevelDefinition.lvl - 1; if (parentContext.abstractNumberingDefinition != null) { parentContext.abstractNumberingDefinition.numberingLevelDefinitions.put(ilvl, newContext.numberingLevelDefinition); } else if (parentContext.numberingDefinitionInstance != null) { parentContext.numberingDefinitionInstance.numberingLevelDefinitions.put(ilvl, newContext.numberingLevelDefinition); } } catch (Exception e) { e.printStackTrace(); } } } else if (localName.equals("font")) { String name = attributes.getValue(wNS, "name"); if (name != null) { FontFamily font = new FontFamily(name); newContext.font = font; fonts.put(name, font); } } else if (localName.endsWith("Pr") && !localName.equals("framePr")) { BaseProperties prop = createWordProperties(localName); newContext.properties = prop; if (parentContext.parentStyle != null) assignStyleProperties(parentContext.parentStyle, prop); else if (parentContext.parentElement != null) assignElementProperties(parentContext.parentElement, prop); else if (parentContext.properties != null) assignInnerProperties(parentContext.properties, prop); else if (parentContext.numberingLevelDefinition != null) { if (localName.equals("pPr")) parentContext.numberingLevelDefinition.paragraphProperties = (ParagraphProperties) prop; else if (localName.equals("rPr")) parentContext.numberingLevelDefinition.runProperties = (RunProperties) prop; } } else if (parentContext.parentStyle != null) { if (localName.equals("name")) { parentContext.parentStyle.name = attributes.getValue(wNS, "val"); } else if (localName.equals("basedOn")) { String parentStyle = attributes.getValue(wNS, "val"); if (parentStyle != null) parentContext.parentStyle.parent = doc.getStyleById(parentStyle); } } else if (parentContext.font != null) { if (localName.equals("panose1")) { parentContext.font.setPanose(attributes.getValue(wNS, "val")); } else if (localName.equals("family")) { parentContext.font.setFamily(attributes.getValue(wNS, "val")); } else if (localName.equals("pitch")) { parentContext.font.setPitch(attributes.getValue(wNS, "val")); } } else if (parentContext.abstractNumberingDefinition != null) { if (localName.equals("numStyleLink")) { parentContext.abstractNumberingDefinition.numStyleLink = attributes .getValue(wNS, "val"); } } else if (parentContext.numberingDefinitionInstance != null) { if (localName.equals("abstractNumId")) { String abstractNumIdStr = attributes.getValue(wNS, "val"); try { Integer abstractNumId = new Integer(abstractNumIdStr); parentContext.numberingDefinitionInstance .setAbstractNumbering((AbstractNumberingDefinition) doc.abstractNumberingDefinitions .get(abstractNumId)); } catch (Exception e) { e.printStackTrace(); } } } else if (parentContext.numberingLevelDefinition != null) { if (localName.equals("start")) { try { parentContext.numberingLevelDefinition.start = Integer.parseInt(attributes .getValue(wNS, "val")); } catch (Exception e) { e.printStackTrace(); } } else if (localName.equals("lvlRestart")) { try { parentContext.numberingLevelDefinition.lvlRestart = Integer.parseInt(attributes .getValue(wNS, "val")); } catch (Exception e) { e.printStackTrace(); } } else if (localName.equals("numFmt")) parentContext.numberingLevelDefinition.numFmt = attributes.getValue(wNS, "val"); else if (localName.equals("lvlText")) parentContext.numberingLevelDefinition.lvlText = attributes.getValue(wNS, "val"); else if (localName.equals("lvlJc")) parentContext.numberingLevelDefinition.lvlJc = attributes.getValue(wNS, "val"); } else if (parentContext.borderProp != null) { if (localName.equals("top") || localName.equals("bottom") || localName.equals("left") || localName.equals("right") || localName.equals("insideH") || localName.equals("insideV")) { BorderSide side = parseBorderSide(attributes); if (localName.equals("top")) parentContext.borderProp.put("border-top", side); else if (localName.equals("bottom")) parentContext.borderProp.put("border-bottom", side); else if (localName.equals("left")) parentContext.borderProp.put("border-left", side); else if (localName.equals("right")) parentContext.borderProp.put("border-right", side); else if (localName.equals("insideH")) parentContext.borderProp.put("border-insideH", side); else if (localName.equals("insideV")) parentContext.borderProp.put("border-insideV", side); } } else if (parentContext.properties != null) { BaseProperties prop = parentContext.properties; if (localName.equals("rStyle") || localName.equals("pStyle") || localName.equals("tblStyle")) { String val = attributes.getValue(wNS, "val"); if (val != null) { Style style = doc.getStyleById(val); if (prop instanceof ParagraphProperties && localName.equals("pStyle")) ((ParagraphProperties) prop).paragraphStyle = style; else if (prop instanceof RunProperties && localName.equals("rStyle")) ((RunProperties) prop).runStyle = style; else if (prop instanceof TableProperties && localName.equals("tblStyle")) ((TableProperties) prop).tableStyle = style; } } else if (localName.equals("pBdr") || localName.equals("tblBorders") || localName.equals("tcBorders")) { newContext.borderProp = prop; } else { if (localName.equals("spacing") && parentContext.properties instanceof RunProperties) localName = "spacing-r"; PropertyParser propertyParser = getPropertyParser(localName); if (propertyParser != null) { propertyParser.parse(parentContext.properties, localName, attributes, WordDocumentParser.this); } else { // System.out.println("unknown property: " + // localName); } } } else if (localName.equals("rPrDefault")) { newContext.parentStyle = doc.docDefaultRunStyle; } else if (localName.equals("pPrDefault")) { newContext.parentStyle = doc.docDefaultParagraphStyle; } else { Element p = createWordElement(localName, attributes); if (p != null) { Element parent = parentContext.parentElement; if (parent instanceof ContainerElement) { ((ContainerElement) parent).add(p); } if (contextStack.size() == 1) { if (localName.equals("body")) doc.body = (BodyElement) p; } newContext.parentElement = p; if (p instanceof EmbeddedObject) { newContext.embedded = (EmbeddedObject) p; } } } } else if (uri.equals(cpNS) || uri.equals(dcNS) || uri.equals(dctNS)) { if (contextStack.size() == 1) { newContext.item = new MetadataItem(uri, localName, null); } } else if (uri.equals(vNS)) { Element parent = parentContext.parentElement; VMLElement vmlp = null; if (parent instanceof VMLElement) vmlp = (VMLElement) parent; Element p = VMLElementFactory.createVMLElement(vmlp, vmldefs, localName, attributes); if (p != null) { if (parent instanceof ContainerElement) { ((ContainerElement) parent).add(p); } newContext.parentElement = p; } } else if (uri.equals(rPkNS)) { if (localName.equals("Relationship")) { String id = attributes.getValue("Id"); String type = attributes.getValue("Type"); String target = attributes.getValue("Target"); String targetMode = attributes.getValue("TargetMode"); if (id != null) rels.put(id, new Relationship(type, target, targetMode)); if (type.equals(numberingRel)) { if (numberingName == null) numberingName = target; } else if (type.equals(stylesRel)) { if (stylesName == null) stylesName = target; } else if (type.equals(fontsRel)) { if (fontsName == null) fontsName = target; } else if (type.equals(themeRel)) { if (themeName == null) themeName = target; } else if (type.equals(footnotesRel)) { if (footnotesName == null) footnotesName = target; } } } else { if (uri.equals(aNS)) { if (localName.equals("majorFont")) newContext.state = "majorFont"; else if (localName.equals("minorFont")) newContext.state = "minorFont"; else if (localName.equals("latin")) { String typeface = attributes.getValue("typeface"); if (parentContext.state == "majorFont") majorFontName = typeface; else if (parentContext.state == "minorFont") minorFontName = typeface; } } if (parentContext.embedded != null) newContext.embedded = parentContext.embedded.newChild(this, uri, localName, attributes); } } } contextStack.push(newContext); } public String getPictureURL(String resId) { Relationship rel = (Relationship) rels.get(resId); if (rel != null && rel.type.equals(imageRel)) return xrefPrefix + rel.target; return null; } } public WordDocumentParser(File docFile) { this.docFile = docFile; } public WordDocument parse() throws IOException { doc = new WordDocument(); parseInternal(); return doc; } void parseInternal() throws IOException { doc.body = null; doc.docDefaultParagraphStyle = new Style("__p"); doc.docDefaultRunStyle = new Style("__r"); doc.stylesById = new Hashtable(); zip = new ZipFile(docFile); factory = SAXParserFactory.newInstance(); factory.setNamespaceAware(true); rels = new Hashtable(); contextStack = new Stack(); numberingName = null; stylesName = null; fontsName = null; themeName = null; parseXML("docProps/core.xml"); contextStack.clear(); parseXML("word/_rels/document.xml.rels"); contextStack.clear(); if (themeName != null) { parseXML("word/" + themeName); contextStack.clear(); } if (fontsName != null) { parseXML("word/" + fontsName); contextStack.clear(); } if (stylesName != null) { parseXML("word/" + stylesName); contextStack.clear(); } if (numberingName != null) { parseXML("word/" + numberingName); contextStack.clear(); } if (footnotesName != null) { parseXML("word/" + footnotesName); contextStack.clear(); } parseXML("word/document.xml"); contextStack.clear(); if (doc.body != null) number(doc.body); if (doc.footnotes != null) number(doc.footnotes); zip.close(); } private void number(Element e) { if (e instanceof ParagraphElement) { ParagraphProperties pp = ((ParagraphElement) e).paragraphProperties; if (pp != null) { NumberingProperties np = pp.getNumberingProperties(); if (np != null) { Integer numId = (Integer) np.get("numId"); Integer ilvl = (Integer) np.get("ilvl"); if (numId != null) { NumberingDefinitionInstance inst = (NumberingDefinitionInstance) doc.numberingDefinitions .get(numId); if (inst != null) { int lvl = ilvl != null ? ilvl.intValue() : 0; Iterator it = inst.iteratorForLevel(lvl); if (it != null) { NumberingLabel label = (NumberingLabel) it.next(); pp.numberingLabel = label; } } } } } } Iterator it = e.content(); while (it.hasNext()) { Object child = it.next(); if (child instanceof ContainerElement) number((ContainerElement) child); } } private void parseXML(String entryName) throws IOException { ZipEntry entry = zip.getEntry(entryName); if (entry == null) return; try { SAXParser parser = factory.newSAXParser(); XMLReader reader = parser.getXMLReader(); int index = entryName.lastIndexOf('/'); String xrefPrefix = entryName.substring(0, index + 1); XMLHandler handler = new XMLHandler(xrefPrefix); reader.setContentHandler(handler); InputStream in = zip.getInputStream(entry); InputSource source = new InputSource(in); source.setSystemId(entryName); reader.parse(source); } catch (ParserConfigurationException e) { e.printStackTrace(); } catch (SAXException e) { e.printStackTrace(); } } private void assignStyleProperties(Style style, BaseProperties prop) { if (prop instanceof ParagraphProperties) { style.paragraphProperties = (ParagraphProperties) prop; } else if (prop instanceof RunProperties) { style.runProperties = (RunProperties) prop; } else if (prop instanceof TableProperties) { style.tableProperties = (TableProperties) prop; } } private void assignElementProperties(Element element, BaseProperties prop) { if (prop instanceof ParagraphProperties) { if (element instanceof ParagraphElement) { ((ParagraphElement) element).paragraphProperties = (ParagraphProperties) prop; } } else if (prop instanceof RunProperties) { if (element instanceof RunElement) { ((RunElement) element).runProperties = (RunProperties) prop; } } else if (prop instanceof TableProperties) { if (element instanceof TableElement) { ((TableElement) element).tableProperties = (TableProperties) prop; } } else if (prop instanceof TableRowProperties) { if (element instanceof TableRowElement) { ((TableRowElement) element).tableRowProperties = (TableRowProperties) prop; } } else if (prop instanceof TableCellProperties) { if (element instanceof TableCellElement) { ((TableCellElement) element).tableCellProperties = (TableCellProperties) prop; } } } private void assignInnerProperties(BaseProperties parent, BaseProperties prop) { if (parent instanceof ParagraphProperties) { ParagraphProperties p = (ParagraphProperties) parent; if (prop instanceof RunProperties) p.runProperties = (RunProperties) prop; else if (prop instanceof NumberingProperties) p.numberingProperties = (NumberingProperties) prop; } } private BaseProperties createWordProperties(String localName) { if (localName.equals("pPr")) return new ParagraphProperties(); if (localName.equals("rPr")) return new RunProperties(); if (localName.equals("numPr")) return new NumberingProperties(); if (localName.equals("tblPr")) return new TableProperties(); if (localName.equals("trPr")) return new TableRowProperties(); if (localName.equals("tcPr")) return new TableCellProperties(); return null; } private Element createWordElement(String localName, Attributes attributes) { if (localName.equals("t")) { TextElement te = new TextElement(); String val = attributes.getValue("http://www.w3.org/XML/1998/namespace", "space"); te.preserveSpace = val != null && val.equals("preserve"); return te; } if (localName.equals("p")) return new ParagraphElement(); if (localName.equals("body") || localName.equals("footnotes")) return new BodyElement(); if (localName.equals("r")) return new RunElement(); if (localName.equals("tab")) return new TabElement(); if (localName.equals("br")) return new BRElement(); if (localName.equals("drawing")) return new DrawingElement(); if (localName.equals("pict")) return new PictElement(); if (localName.equals("tbl")) return new TableElement(); if (localName.equals("tr")) return new TableRowElement(); if (localName.equals("tc")) return new TableCellElement(); if (localName.equals("txbxContent")) return new TXBXContentElement(); if (localName.equals("smartTag")) return new SmartTagElement(); if (localName.equals("hyperlink")) { HyperlinkElement he = new HyperlinkElement(); String rid = attributes.getValue(rNS, "id"); if (rid != null) { Relationship rel = (Relationship) rels.get(rid); if (rel != null) he.href = rel.target; } return he; } if (localName.equals("footnote")) { FootnoteElement fe = new FootnoteElement(); fe.id = attributes.getValue(wNS, "id"); if (fe.id == null || fe.id.equals("0") || fe.id.startsWith("-")) return null; return fe; } if (localName.equals("footnoteReference")) { FootnoteReferenceElement fe = new FootnoteReferenceElement(); fe.id = attributes.getValue(wNS, "id"); return fe; } if (localName.equals("lastRenderedPageBreak")) return new LastRenderedPageBreakElement(); return null; } }