/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package mj.ocraptor.extraction.tika.parser.odf;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
import java.io.IOException;
import java.io.InputStream;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import javax.xml.XMLConstants;
import javax.xml.namespace.QName;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ElementMappingContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
/**
 * Parser for ODF <code>content.xml</code> files.
 * <p>
 * The ODF SAX event stream is translated into XHTML events: elements listed
 * in {@link #MAPPINGS} are renamed, <code>text:h</code> headings become
 * <code>h1</code>..<code>h6</code>, and all other elements are dropped from
 * the event stream. This parser reports no supported media types, see
 * {@link #getSupportedTypes(ParseContext)}.
 */
public class OpenDocumentContentParser extends AbstractParser {

    /**
     * Element-mapping handler that additionally
     * <ul>
     * <li>forwards character data only while the innermost open element is a
     * text node (see {@link #isTextNode(String, String)}),</li>
     * <li>suppresses complete subtrees of elements selected by
     * {@link #needsCompleteFiltering(String, String)},</li>
     * <li>emits <code>text:h</code> directly to the wrapped handler as an XHTML
     * heading element, and</li>
     * <li>emits a tab character after <code>text:tab</code> and
     * <code>text:tab-stop</code> elements.</li>
     * </ul>
     */
    private static final class OpenDocumentElementMappingContentHandler extends ElementMappingContentHandler {

        /** Wrapped handler; heading elements are written to it directly. */
        private final ContentHandler handler;

        /** Bit <code>i</code> is set if the open element at depth <code>i</code> is a text node. */
        private final BitSet textNodeStack = new BitSet();

        /** Number of currently open elements; also the top-of-stack index for {@link #textNodeStack}. */
        private int nodeDepth = 0;

        /** Number of currently open elements that require complete filtering. */
        private int completelyFiltered = 0;

        /** XHTML heading tag names of the currently open <code>text:h</code> elements. */
        private final Stack<String> headingStack = new Stack<String>();

        private OpenDocumentElementMappingContentHandler(ContentHandler handler, Map<QName, TargetElement> mappings) {
            super(handler, mappings);
            this.handler = handler;
        }

        /**
         * Forwards character data only when no completely filtered element is
         * open and the innermost open element is a text node.
         */
        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // only forward content of tags from text:-namespace
            if (completelyFiltered == 0 && nodeDepth > 0 && textNodeStack.get(nodeDepth - 1)) {
                super.characters(ch, start, length);
            }
        }

        /**
         * Tells whether an element must be filtered completely, i.e. together
         * with all of its sub-tags.
         *
         * @return <code>true</code> for <code>text:*-template</code>,
         *         <code>text:*-style</code> and
         *         <code>table:covered-table-cell</code> elements
         */
        private boolean needsCompleteFiltering(String namespaceURI, String localName) {
            if (TEXT_NS.equals(namespaceURI)) {
                return localName.endsWith("-template") || localName.endsWith("-style");
            } else if (TABLE_NS.equals(namespaceURI)) {
                return "covered-table-cell".equals(localName);
            } else {
                return false;
            }
        }

        /**
         * Maps the <code>text:outline-level</code> attribute to an XHTML heading
         * tag name. Levels are clamped to the range <code>h1</code>..<code>h6</code>.
         *
         * @param atts attributes of the <code>text:h</code> element
         * @return <code>"h1"</code>..<code>"h6"</code>; <code>"h1"</code> if the
         *         attribute is missing or is not a valid integer
         */
        private String getXHTMLHeaderTagName(Attributes atts) {
            String depthStr = atts.getValue(TEXT_NS, "outline-level");
            if (depthStr == null) {
                return "h1";
            }
            int depth;
            try {
                depth = Integer.parseInt(depthStr.trim());
            } catch (NumberFormatException e) {
                // A malformed outline level must not abort parsing of the whole
                // document with an unchecked exception; treat it like a missing
                // attribute.
                return "h1";
            }
            if (depth >= 6) {
                return "h6";
            } else if (depth <= 1) {
                return "h1";
            } else {
                return "h" + depth;
            }
        }

        /**
         * Checks whether an element is a text node, i.e. whether character data
         * directly inside it is forwarded.
         *
         * @return <code>true</code> for all <code>text:</code> elements except
         *         <code>page-number</code> and <code>page-count</code>, and for
         *         <code>svg:title</code> and <code>svg:desc</code>
         */
        private boolean isTextNode(String namespaceURI, String localName) {
            if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
                return true;
            }
            if (SVG_NS.equals(namespaceURI)) {
                return "title".equals(localName) || "desc".equals(localName);
            }
            return false;
        }

        @Override
        public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
                throws SAXException {
            // Keep track of the current node type. If it is a text node, a bit
            // at the current depth is set in textNodeStack. characters() checks
            // the top bit to determine whether the current node is a text node
            // whose content is printed. nodeDepth contains the depth of the
            // current node and also marks the top of the stack.
            assert nodeDepth >= 0;
            textNodeStack.set(nodeDepth++, isTextNode(namespaceURI, localName));
            // filter *all* content of some tags
            assert completelyFiltered >= 0;
            if (needsCompleteFiltering(namespaceURI, localName)) {
                completelyFiltered++;
            }
            // call next handler if no filtering
            if (completelyFiltered == 0) {
                // special handling of text:h, which is passed directly
                // to the incoming handler
                if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                    final String el = headingStack.push(getXHTMLHeaderTagName(atts));
                    handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
                } else {
                    super.startElement(namespaceURI, localName, qName, atts);
                }
            }
        }

        @Override
        public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
            // call next handler if no filtering
            if (completelyFiltered == 0) {
                // special handling of text:h, which is passed directly
                // to the incoming handler
                if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
                    final String el = headingStack.pop();
                    handler.endElement(XHTMLContentHandler.XHTML, el, el);
                } else {
                    super.endElement(namespaceURI, localName, qName);
                }
                // special handling of tabulators; this runs before nodeDepth is
                // decremented, so characters() still sees the tab element itself
                // as the innermost node
                if (TEXT_NS.equals(namespaceURI) && ("tab-stop".equals(localName) || "tab".equals(localName))) {
                    this.characters(TAB, 0, TAB.length);
                }
            }
            // revert filter for *all* content of some tags
            if (needsCompleteFiltering(namespaceURI, localName)) {
                completelyFiltered--;
            }
            assert completelyFiltered >= 0;
            // reduce current node depth
            nodeDepth--;
            assert nodeDepth >= 0;
        }

        @Override
        public void startPrefixMapping(String prefix, String uri) {
            // remove prefix mappings as they should not occur in XHTML
        }

        @Override
        public void endPrefixMapping(String prefix) {
            // remove prefix mappings as they should not occur in XHTML
        }
    }

    public static final String TEXT_NS = "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
    public static final String TABLE_NS = "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
    public static final String OFFICE_NS = "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
    public static final String SVG_NS = "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
    public static final String PRESENTATION_NS = "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
    public static final String DRAW_NS = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
    public static final String XLINK_NS = "http://www.w3.org/1999/xlink";

    /** Character data emitted after <code>text:tab</code> / <code>text:tab-stop</code> elements. */
    protected static final char[] TAB = new char[] { '\t' };

    /** Attribute list used for the heading elements, which carry no attributes. */
    private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();

    /**
     * Mappings between ODF tag names and XHTML tag names (including attributes).
     * All other tag names/attributes are ignored and left out from event stream.
     */
    private static final HashMap<QName, TargetElement> MAPPINGS = new HashMap<QName, TargetElement>();

    static {
        // general mappings of text:-tags
        MAPPINGS.put(new QName(TEXT_NS, "p"), new TargetElement(XHTML, "p"));
        // text:h-tags are mapped specifically in startElement/endElement
        MAPPINGS.put(new QName(TEXT_NS, "line-break"), new TargetElement(XHTML, "br"));
        MAPPINGS.put(new QName(TEXT_NS, "list"), new TargetElement(XHTML, "ul"));
        MAPPINGS.put(new QName(TEXT_NS, "list-item"), new TargetElement(XHTML, "li"));
        MAPPINGS.put(new QName(TEXT_NS, "note"), new TargetElement(XHTML, "div"));
        MAPPINGS.put(new QName(OFFICE_NS, "annotation"), new TargetElement(XHTML, "div"));
        MAPPINGS.put(new QName(PRESENTATION_NS, "notes"), new TargetElement(XHTML, "div"));
        MAPPINGS.put(new QName(DRAW_NS, "object"), new TargetElement(XHTML, "object"));
        MAPPINGS.put(new QName(DRAW_NS, "text-box"), new TargetElement(XHTML, "div"));
        MAPPINGS.put(new QName(SVG_NS, "title"), new TargetElement(XHTML, "span"));
        MAPPINGS.put(new QName(SVG_NS, "desc"), new TargetElement(XHTML, "span"));
        MAPPINGS.put(new QName(TEXT_NS, "span"), new TargetElement(XHTML, "span"));

        // links: copy xlink:href / xlink:title to the plain XHTML attributes
        final HashMap<QName, QName> aAttsMapping = new HashMap<QName, QName>();
        aAttsMapping.put(new QName(XLINK_NS, "href"), new QName("href"));
        aAttsMapping.put(new QName(XLINK_NS, "title"), new QName("title"));
        MAPPINGS.put(new QName(TEXT_NS, "a"), new TargetElement(XHTML, "a", aAttsMapping));

        // create HTML tables from table:-tags
        MAPPINGS.put(new QName(TABLE_NS, "table"), new TargetElement(XHTML, "table"));
        // repeating of rows is ignored; for columns, see below!
        MAPPINGS.put(new QName(TABLE_NS, "table-row"), new TargetElement(XHTML, "tr"));
        // special mapping for rowspan/colspan attributes
        final HashMap<QName, QName> tableCellAttsMapping = new HashMap<QName, QName>();
        tableCellAttsMapping.put(new QName(TABLE_NS, "number-columns-spanned"), new QName("colspan"));
        tableCellAttsMapping.put(new QName(TABLE_NS, "number-rows-spanned"), new QName("rowspan"));
        /*
         * TODO: The following is not correct, the cell should be repeated not
         * spanned! Code generates a HTML cell, spanning all repeated columns, to
         * make the cell look correct. Problems may occur when both spanning and
         * repeating is given, which is not allowed by spec. Cell spanning instead
         * of repeating is not a problem, because OpenOffice uses it only for empty
         * cells.
         */
        tableCellAttsMapping.put(new QName(TABLE_NS, "number-columns-repeated"), new QName("colspan"));
        MAPPINGS.put(new QName(TABLE_NS, "table-cell"), new TargetElement(XHTML, "td", tableCellAttsMapping));
    }

    /**
     * Returns an empty set: this is not a top-level parser.
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return Collections.emptySet(); // not a top-level parser
    }

    /**
     * Parses the given stream, wrapping <code>handler</code> in a new
     * {@link XHTMLContentHandler} before delegating to
     * {@link #parseInternal(InputStream, ContentHandler, Metadata, ParseContext)}.
     */
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        parseInternal(stream, new XHTMLContentHandler(handler, metadata), metadata, context);
    }

    /**
     * Parses the given stream with a non-validating, namespace-aware SAX parser
     * and sends the mapped events to <code>handler</code>. The stream is wrapped
     * in a {@link CloseShieldInputStream} before it is handed to the SAX parser.
     *
     * @throws TikaException if the XML parser cannot be configured
     */
    void parseInternal(InputStream stream, final ContentHandler handler, Metadata metadata, ParseContext context)
            throws IOException, SAXException, TikaException {
        DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            factory.setValidating(false);
            factory.setNamespaceAware(true);
            try {
                // Request JAXP secure processing. The feature name must be
                // FEATURE_SECURE_PROCESSING; XMLConstants.DEFAULT_NS_PREFIX is
                // the empty string and does not name any parser feature.
                factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
            } catch (SAXNotRecognizedException e) {
                // TIKA-329: Some XML parsers do not support the secure-processing
                // feature, even though it's required by JAXP in Java 5. Ignoring
                // the exception is fine here, deployments without this feature
                // are inherently vulnerable to XML denial-of-service attacks.
            }
            SAXParser parser = factory.newSAXParser();
            parser.parse(new CloseShieldInputStream(stream),
                    new OfflineContentHandler(new NSNormalizerContentHandler(dh)));
        } catch (ParserConfigurationException e) {
            throw new TikaException("XML parser configuration error", e);
        }
    }
}