/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package mj.ocraptor.extraction.tika.parser.odf; import static org.apache.tika.sax.XHTMLContentHandler.XHTML; import java.io.IOException; import java.io.InputStream; import java.util.BitSet; import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.Set; import java.util.Stack; import javax.xml.XMLConstants; import javax.xml.namespace.QName; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.tika.exception.TikaException; import org.apache.tika.io.CloseShieldInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.ElementMappingContentHandler; import org.apache.tika.sax.OfflineContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.sax.ElementMappingContentHandler.TargetElement; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.SAXNotRecognizedException; import org.xml.sax.helpers.AttributesImpl; import org.xml.sax.helpers.DefaultHandler; /** * Parser for ODF <code>content.xml</code> files. */ public class OpenDocumentContentParser extends AbstractParser { private static final class OpenDocumentElementMappingContentHandler extends ElementMappingContentHandler { private final ContentHandler handler; private final BitSet textNodeStack = new BitSet(); private int nodeDepth = 0; private int completelyFiltered = 0; private Stack<String> headingStack = new Stack<String>(); private OpenDocumentElementMappingContentHandler(ContentHandler handler, Map<QName, TargetElement> mappings) { super(handler, mappings); this.handler = handler; } @Override public void characters(char[] ch, int start, int length) throws SAXException { // only forward content of tags from text:-namespace if (completelyFiltered == 0 && nodeDepth > 0 && textNodeStack.get(nodeDepth - 1)) { super.characters(ch, start, length); } } // helper for checking tags which need complete filtering // (with sub-tags) private boolean needsCompleteFiltering(String namespaceURI, String localName) { if (TEXT_NS.equals(namespaceURI)) { return localName.endsWith("-template") || localName.endsWith("-style"); } else if (TABLE_NS.equals(namespaceURI)) { return "covered-table-cell".equals(localName); } else { return false; } } // map the heading level to <hX> HTML tags private String getXHTMLHeaderTagName(Attributes atts) { String depthStr = atts.getValue(TEXT_NS, "outline-level"); if (depthStr == null) { return "h1"; } int depth = Integer.parseInt(depthStr); if (depth >= 6) { return "h6"; } else if (depth <= 1) { return "h1"; } else { return "h" + depth; } } /** * Check if a node is a text node */ private boolean isTextNode(String namespaceURI, String localName) { if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) { return true; } if (SVG_NS.equals(namespaceURI)) { return "title".equals(localName) || "desc".equals(localName); } return false; } @Override public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { // keep track of current node type. If it is a text node, // a bit at the current depth ist set in textNodeStack. // characters() checks the top bit to determine, if the // actual node is a text node to print out nodeDepth contains // the depth of the current node and also marks top of stack. assert nodeDepth >= 0; textNodeStack.set(nodeDepth++, isTextNode(namespaceURI, localName)); // filter *all* content of some tags assert completelyFiltered >= 0; if (needsCompleteFiltering(namespaceURI, localName)) { completelyFiltered++; } // call next handler if no filtering if (completelyFiltered == 0) { // special handling of text:h, that are directly passed // to incoming handler if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) { final String el = headingStack.push(getXHTMLHeaderTagName(atts)); handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES); } else { super.startElement(namespaceURI, localName, qName, atts); } } } @Override public void endElement(String namespaceURI, String localName, String qName) throws SAXException { // call next handler if no filtering if (completelyFiltered == 0) { // special handling of text:h, that are directly passed // to incoming handler if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) { final String el = headingStack.pop(); handler.endElement(XHTMLContentHandler.XHTML, el, el); } else { super.endElement(namespaceURI, localName, qName); } // special handling of tabulators if (TEXT_NS.equals(namespaceURI) && ("tab-stop".equals(localName) || "tab".equals(localName))) { this.characters(TAB, 0, TAB.length); } } // revert filter for *all* content of some tags if (needsCompleteFiltering(namespaceURI, localName)) { completelyFiltered--; } assert completelyFiltered >= 0; // reduce current node depth nodeDepth--; assert nodeDepth >= 0; } @Override public void startPrefixMapping(String prefix, String uri) { // remove prefix mappings as they should not occur in XHTML } @Override public void endPrefixMapping(String prefix) { // remove prefix mappings as they should not occur in XHTML } } public static final String TEXT_NS = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"; public static final String TABLE_NS = "urn:oasis:names:tc:opendocument:xmlns:table:1.0"; public static final String OFFICE_NS = "urn:oasis:names:tc:opendocument:xmlns:office:1.0"; public static final String SVG_NS = "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0"; public static final String PRESENTATION_NS = "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0"; public static final String DRAW_NS = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"; public static final String XLINK_NS = "http://www.w3.org/1999/xlink"; protected static final char[] TAB = new char[] { '\t' }; private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); /** * Mappings between ODF tag names and XHTML tag names (including attributes). * All other tag names/attributes are ignored and left out from event stream. */ private static final HashMap<QName, TargetElement> MAPPINGS = new HashMap<QName, TargetElement>(); static { // general mappings of text:-tags MAPPINGS.put(new QName(TEXT_NS, "p"), new TargetElement(XHTML, "p")); // text:h-tags are mapped specifically in startElement/endElement MAPPINGS.put(new QName(TEXT_NS, "line-break"), new TargetElement(XHTML, "br")); MAPPINGS.put(new QName(TEXT_NS, "list"), new TargetElement(XHTML, "ul")); MAPPINGS.put(new QName(TEXT_NS, "list-item"), new TargetElement(XHTML, "li")); MAPPINGS.put(new QName(TEXT_NS, "note"), new TargetElement(XHTML, "div")); MAPPINGS.put(new QName(OFFICE_NS, "annotation"), new TargetElement(XHTML, "div")); MAPPINGS.put(new QName(PRESENTATION_NS, "notes"), new TargetElement(XHTML, "div")); MAPPINGS.put(new QName(DRAW_NS, "object"), new TargetElement(XHTML, "object")); MAPPINGS.put(new QName(DRAW_NS, "text-box"), new TargetElement(XHTML, "div")); MAPPINGS.put(new QName(SVG_NS, "title"), new TargetElement(XHTML, "span")); MAPPINGS.put(new QName(SVG_NS, "desc"), new TargetElement(XHTML, "span")); MAPPINGS.put(new QName(TEXT_NS, "span"), new TargetElement(XHTML, "span")); final HashMap<QName, QName> aAttsMapping = new HashMap<QName, QName>(); aAttsMapping.put(new QName(XLINK_NS, "href"), new QName("href")); aAttsMapping.put(new QName(XLINK_NS, "title"), new QName("title")); MAPPINGS.put(new QName(TEXT_NS, "a"), new TargetElement(XHTML, "a", aAttsMapping)); // create HTML tables from table:-tags MAPPINGS.put(new QName(TABLE_NS, "table"), new TargetElement(XHTML, "table")); // repeating of rows is ignored; for columns, see below! MAPPINGS.put(new QName(TABLE_NS, "table-row"), new TargetElement(XHTML, "tr")); // special mapping for rowspan/colspan attributes final HashMap<QName, QName> tableCellAttsMapping = new HashMap<QName, QName>(); tableCellAttsMapping.put(new QName(TABLE_NS, "number-columns-spanned"), new QName("colspan")); tableCellAttsMapping.put(new QName(TABLE_NS, "number-rows-spanned"), new QName("rowspan")); /* * TODO: The following is not correct, the cell should be repeated not * spanned! Code generates a HTML cell, spanning all repeated columns, to * make the cell look correct. Problems may occur when both spanning and * repeating is given, which is not allowed by spec. Cell spanning instead * of repeating is not a problem, because OpenOffice uses it only for empty * cells. */ tableCellAttsMapping.put(new QName(TABLE_NS, "number-columns-repeated"), new QName("colspan")); MAPPINGS.put(new QName(TABLE_NS, "table-cell"), new TargetElement(XHTML, "td", tableCellAttsMapping)); } public Set<MediaType> getSupportedTypes(ParseContext context) { return Collections.emptySet(); // not a top-level parser } public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { parseInternal(stream, new XHTMLContentHandler(handler, metadata), metadata, context); } void parseInternal(InputStream stream, final ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS); try { SAXParserFactory factory = SAXParserFactory.newInstance(); factory.setValidating(false); factory.setNamespaceAware(true); try { // TODO: mj factory.setFeature(XMLConstants.DEFAULT_NS_PREFIX, true); } catch (SAXNotRecognizedException e) { // TIKA-329: Some XML parsers do not support the secure-processing // feature, even though it's required by JAXP in Java 5. Ignoring // the exception is fine here, deployments without this feature // are inherently vulnerable to XML denial-of-service attacks. } SAXParser parser = factory.newSAXParser(); parser.parse(new CloseShieldInputStream(stream), new OfflineContentHandler(new NSNormalizerContentHandler(dh))); } catch (ParserConfigurationException e) { throw new TikaException("XML parser configuration error", e); } } }