/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.ooxml;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.HeaderFooter;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

/**
 * Writes the sheets of an OOXML spreadsheet as XHTML, reading each sheet
 * through POI's {@link XSSFReader} and feeding the sheet XML to an
 * {@link XSSFSheetXMLHandler}.
 */
public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
    /**
     * Allows access to headers/footers from raw xml strings
     */
    protected static HeaderFooterHelper hfHelper = new HeaderFooterHelper();

    protected final DataFormatter formatter;
    /** Sheet parts visited by {@link #buildXHTML}, in visiting order. */
    protected final List<PackagePart> sheetParts = new ArrayList<>();
    /** Relationship id -> target URI for hyperlinks attached to sheet drawings. */
    protected final Map<String, String> drawingHyperlinks = new HashMap<>();
    protected Metadata metadata;
    protected ParseContext parseContext;

    /**
     * @param context   parse context, later used to obtain the XML reader
     * @param extractor must be an {@link XSSFEventBasedExcelExtractor}
     * @param locale    locale for cell formatting; {@code null} selects the
     *                  formatter's no-argument constructor
     */
    public XSSFExcelExtractorDecorator(
            ParseContext context, POIXMLTextExtractor extractor, Locale locale) {
        super(context, extractor);

        this.parseContext = context;
        this.extractor = (XSSFEventBasedExcelExtractor) extractor;
        configureExtractor(this.extractor, locale);

        formatter = (locale == null)
                ? new TikaExcelDataFormatter()
                : new TikaExcelDataFormatter(locale);
    }

    /**
     * Applies the text-box, formula and locale settings to the extractor.
     *
     * @param extractor extractor to configure; cast to
     *                  {@link XSSFEventBasedExcelExtractor}
     * @param locale    locale handed to the extractor
     */
    protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
        XSSFEventBasedExcelExtractor eventExtractor = (XSSFEventBasedExcelExtractor) extractor;
        eventExtractor.setIncludeTextBoxes(config.getIncludeShapeBasedContent());
        eventExtractor.setFormulasNotResults(false);
        eventExtractor.setLocale(locale);
    }

    /**
     * Records the metadata and parse context, initialises the
     * {@code PROTECTED} metadata entry to {@code "false"} and delegates to
     * the superclass.
     */
    @Override
    public void getXHTML(
            ContentHandler handler, Metadata metadata, ParseContext context)
            throws SAXException, XmlException, IOException, TikaException {

        this.metadata = metadata;
        this.parseContext = context;
        // processSheet() flips this to "true" when a sheetProtection element is seen
        metadata.set(TikaMetadataKeys.PROTECTED, "false");

        super.getXHTML(handler, metadata, context);
    }

    /**
     * Emits one {@code div} per sheet: the sheet name, the cell table,
     * headers/footers, optional shape text and finally the sheet hyperlinks.
     *
     * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
     */
    @Override
    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
            XmlException, IOException {
        OPCPackage container = extractor.getPackage();

        ReadOnlySharedStringsTable strings;
        XSSFReader.SheetIterator iter;
        XSSFReader xssfReader;
        StylesTable styles;
        try {
            xssfReader = new XSSFReader(container);
            styles = xssfReader.getStylesTable();
            iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
            strings = new ReadOnlySharedStringsTable(container);
        } catch (OpenXML4JException e) {
            // InvalidFormatException is an OpenXML4JException, one handler covers both
            throw new XmlException(e);
        }

        //temporary workaround for POI-61034
        //remove once POI 3.17-beta1 is released
        Set<String> seen = new HashSet<>();

        while (iter.hasNext()) {

            SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml);
            PackagePart sheetPart = null;
            try (InputStream stream = iter.next()) {
                sheetPart = iter.getSheetPart();
                final String partName = sheetPart.getPartName().toString();
                if (seen.contains(partName)) {
                    // same part served twice by the iterator; skip the repeat
                    continue;
                }
                seen.add(partName);

                addDrawingHyperLinks(sheetPart);
                sheetParts.add(sheetPart);

                CommentsTable comments = iter.getSheetComments();

                // Start, and output the sheet name
                xhtml.startElement("div");
                xhtml.element("h1", iter.getSheetName());

                // Extract the main sheet contents
                xhtml.startElement("table");
                xhtml.startElement("tbody");

                processSheet(sheetExtractor, comments, styles, strings, stream);
            }
            xhtml.endElement("tbody");
            xhtml.endElement("table");

            // Output any headers and footers
            // (Need to process the sheet to get them, so we can't
            //  do the headers before the contents)
            for (String header : sheetExtractor.headers) {
                extractHeaderFooter(header, xhtml);
            }
            for (String footer : sheetExtractor.footers) {
                extractHeaderFooter(footer, xhtml);
            }

            // Do text held in shapes, if required
            if (config.getIncludeShapeBasedContent()) {
                List<XSSFShape> shapes = iter.getShapes();
                processShapes(shapes, xhtml);
            }

            //for now dump sheet hyperlinks at bottom of page
            //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
            //step 1: extract hyperlink info from bottom of page
            //step 2: process as we do now, but with cached hyperlink relationship info
            extractHyperLinks(sheetPart, xhtml);

            // All done with this sheet
            xhtml.endElement("div");
        }
    }

    /**
     * Caches, in {@link #drawingHyperlinks}, the hyperlink relationships of
     * every internal drawing part referenced by the given sheet.
     */
    protected void addDrawingHyperLinks(PackagePart sheetPart) {
        try {
            for (PackageRelationship rel
                    : sheetPart.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
                if (rel.getTargetMode() != TargetMode.INTERNAL) {
                    continue;
                }
                PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
                PackagePart part = rel.getPackage().getPart(relName);
                //parts can go missing, and Excel quietly ignores missing images -- TIKA-2134
                if (part == null) {
                    continue;
                }
                for (PackageRelationship drawRel
                        : part.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
                    drawingHyperlinks.put(drawRel.getId(), drawRel.getTargetURI().toString());
                }
            }
        } catch (InvalidFormatException ignored) {
            //swallow
            //an exception trying to extract
            //hyperlinks on drawings should not cause a parse failure
        }
    }

    /** Writes an {@code a} element for each hyperlink relationship of the sheet. */
    private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml)
            throws SAXException {
        try {
            for (PackageRelationship rel
                    : sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
                String target = rel.getTargetURI().toString();
                xhtml.startElement("a", "href", target);
                xhtml.characters(target);
                xhtml.endElement("a");
            }
        } catch (InvalidFormatException ignored) {
            //swallow
        }
    }

    /**
     * Writes the text of a raw header/footer string as a {@code p} element,
     * unless the extracted text is empty.
     */
    protected void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
            throws SAXException {
        String content = ExcelExtractor._extractHeaderFooter(
                new HeaderFooterFromString(hf));
        if (content.length() > 0) {
            xhtml.element("p", content);
        }
    }

    /**
     * Writes the text and hyperlinks of every {@link XSSFSimpleShape} in the
     * list; other shape types are skipped. A {@code null} list is a no-op.
     */
    private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml)
            throws SAXException {
        if (shapes == null) {
            return;
        }
        for (XSSFShape shape : shapes) {
            if (!(shape instanceof XSSFSimpleShape)) {
                continue;
            }
            XSSFSimpleShape simpleShape = (XSSFSimpleShape) shape;
            String sText = simpleShape.getText();
            if (sText != null && sText.length() > 0) {
                xhtml.element("p", sText);
            }
            extractHyperLinksFromShape(simpleShape.getCTShape(), xhtml);
        }
    }

    /**
     * Writes the click hyperlink and then the hover hyperlink of a shape,
     * resolving each relationship id through {@link #drawingHyperlinks}.
     * The two links are independent: a shape carrying only a hover link
     * still has it emitted.
     */
    private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml)
            throws SAXException {
        if (ctShape == null) {
            return;
        }
        CTShapeNonVisual nvSpPr = ctShape.getNvSpPr();
        if (nvSpPr == null) {
            return;
        }
        CTNonVisualDrawingProps cNvPr = nvSpPr.getCNvPr();
        if (cNvPr == null) {
            return;
        }

        CTHyperlink[] links = {cNvPr.getHlinkClick(), cNvPr.getHlinkHover()};
        for (CTHyperlink link : links) {
            if (link == null) {
                continue;
            }
            String url = drawingHyperlinks.get(link.getId());
            if (url != null) {
                xhtml.startElement("a", "href", url);
                xhtml.characters(url);
                xhtml.endElement("a");
            }
        }
    }

    /**
     * Parses one sheet's XML, forwarding cell events to the given handler,
     * and sets the {@code PROTECTED} metadata entry to {@code "true"} when a
     * {@code sheetProtection} element was encountered.
     *
     * @param sheetContentsExtractor receiver of row/cell/header-footer events
     * @param comments               comments table of the sheet
     * @param styles                 workbook styles
     * @param strings                shared strings table
     * @param sheetInputStream       sheet XML; closed here after a successful parse
     * @throws IOException      on read failure
     * @throws SAXException     on XML parse failure
     * @throws RuntimeException if the parse context cannot supply an XML
     *                          reader; the underlying {@link TikaException}
     *                          is attached as the cause
     */
    public void processSheet(
            SheetContentsHandler sheetContentsExtractor,
            CommentsTable comments,
            StylesTable styles,
            ReadOnlySharedStringsTable strings,
            InputStream sheetInputStream)
            throws IOException, SAXException {
        InputSource sheetSource = new InputSource(sheetInputStream);

        try {
            XMLReader sheetParser = parseContext.getXMLReader();
            XSSFSheetInterestingPartsCapturer handler =
                    new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler(
                            styles, comments, strings, sheetContentsExtractor, formatter, false));
            sheetParser.setContentHandler(handler);
            sheetParser.parse(sheetSource);
            sheetInputStream.close();

            if (handler.hasProtection) {
                metadata.set(TikaMetadataKeys.PROTECTED, "true");
            }
        } catch (TikaException e) {
            // keep the original exception as the cause so the stack trace is not lost
            throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
        }
    }

    /**
     * In Excel files, sheets have things embedded in them,
     * and sheet drawings which have the images
     */
    @Override
    protected List<PackagePart> getMainDocumentParts() throws TikaException {
        List<PackagePart> parts = new ArrayList<>();
        for (PackagePart part : sheetParts) {
            // Add the sheet
            parts.add(part);

            // If it has drawings, return those too
            try {
                addInternalTargets(part, XSSFRelation.DRAWINGS.getRelation(), parts);
                addInternalTargets(part, XSSFRelation.VML_DRAWINGS.getRelation(), parts);
            } catch (InvalidFormatException e) {
                throw new TikaException("Broken OOXML file", e);
            }
        }

        //add main document so that macros can be extracted
        //by AbstractOOXMLExtractor
        for (PackagePart part : extractor.getPackage().
                getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
            parts.add(part);
        }

        return parts;
    }

    /**
     * Appends the targets of all internal relationships of the given type.
     * Targets that are absent from the package are skipped rather than
     * added as {@code null} entries.
     */
    private static void addInternalTargets(
            PackagePart source, String relationType, List<PackagePart> parts)
            throws InvalidFormatException {
        for (PackageRelationship rel : source.getRelationshipsByType(relationType)) {
            if (rel.getTargetMode() != TargetMode.INTERNAL) {
                continue;
            }
            PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
            PackagePart target = rel.getPackage().getPart(relName);
            //parts can go missing -- TIKA-2134
            if (target != null) {
                parts.add(target);
            }
        }
    }

    /**
     * Turns formatted sheet events into HTML
     */
    protected static class SheetTextAsHTML implements SheetContentsHandler {
        private final XHTMLContentHandler xhtml;
        protected List<String> headers;
        protected List<String> footers;

        protected SheetTextAsHTML(XHTMLContentHandler xhtml) {
            this.xhtml = xhtml;
            headers = new ArrayList<>();
            footers = new ArrayList<>();
        }

        // NOTE(review): the SheetContentsHandler callbacks below declare no
        // checked exceptions, so SAXExceptions raised while writing are
        // dropped. Behaviour kept as-is; consider propagating them.

        @Override
        public void startRow(int rowNum) {
            try {
                xhtml.startElement("tr");
            } catch (SAXException ignored) {
                // see NOTE(review) above
            }
        }

        @Override
        public void endRow(int rowNum) {
            try {
                xhtml.endElement("tr");
            } catch (SAXException ignored) {
                // see NOTE(review) above
            }
        }

        /**
         * Writes a {@code td} holding the formatted value and, when a
         * comment is attached, a line break followed by "author: text".
         */
        @Override
        public void cell(String cellRef, String formattedValue, XSSFComment comment) {
            try {
                xhtml.startElement("td");

                // Main cell contents
                if (formattedValue != null) {
                    xhtml.characters(formattedValue);
                }

                // Comments
                if (comment != null) {
                    xhtml.startElement("br");
                    xhtml.endElement("br");
                    xhtml.characters(comment.getAuthor());
                    xhtml.characters(": ");
                    xhtml.characters(comment.getString().getString());
                }

                xhtml.endElement("td");
            } catch (SAXException ignored) {
                // see NOTE(review) above
            }
        }

        /** Stores the raw header/footer text for output after the table. */
        @Override
        public void headerFooter(String text, boolean isHeader, String tagName) {
            if (isHeader) {
                headers.add(text);
            } else {
                footers.add(text);
            }
        }
    }

    /**
     * Read-only {@link HeaderFooter} view over a raw header/footer string;
     * the setters do nothing.
     */
    protected static class HeaderFooterFromString implements HeaderFooter {
        private final String text;

        protected HeaderFooterFromString(String text) {
            this.text = text;
        }

        @Override
        public String getCenter() {
            return hfHelper.getCenterSection(text);
        }

        @Override
        public void setCenter(String paramString) {
        }

        @Override
        public String getLeft() {
            return hfHelper.getLeftSection(text);
        }

        @Override
        public void setLeft(String paramString) {
        }

        @Override
        public String getRight() {
            return hfHelper.getRightSection(text);
        }

        @Override
        public void setRight(String paramString) {
        }
    }

    /**
     * Captures information on interesting tags, whilst
     * delegating the main work to the formatting handler
     */
    protected static class XSSFSheetInterestingPartsCapturer implements ContentHandler {
        private static final String SHEET_PROTECTION = "sheetProtection";

        private final ContentHandler delegate;
        private boolean hasProtection = false;

        protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
            this.delegate = delegate;
        }

        @Override
        public void startElement(String uri, String localName, String qName,
                                 Attributes atts) throws SAXException {
            // Accept either name form: qName carries a prefix when the
            // document uses one, and localName is empty when the parser is
            // not namespace-aware.
            if (SHEET_PROTECTION.equals(qName) || SHEET_PROTECTION.equals(localName)) {
                hasProtection = true;
            }
            delegate.startElement(uri, localName, qName, atts);
        }

        @Override
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            delegate.characters(ch, start, length);
        }

        @Override
        public void endDocument() throws SAXException {
            delegate.endDocument();
        }

        @Override
        public void endElement(String uri, String localName, String qName)
                throws SAXException {
            delegate.endElement(uri, localName, qName);
        }

        @Override
        public void endPrefixMapping(String prefix) throws SAXException {
            delegate.endPrefixMapping(prefix);
        }

        @Override
        public void ignorableWhitespace(char[] ch, int start, int length)
                throws SAXException {
            delegate.ignorableWhitespace(ch, start, length);
        }

        @Override
        public void processingInstruction(String target, String data)
                throws SAXException {
            delegate.processingInstruction(target, data);
        }

        @Override
        public void setDocumentLocator(Locator locator) {
            delegate.setDocumentLocator(locator);
        }

        @Override
        public void skippedEntity(String name) throws SAXException {
            delegate.skippedEntity(name);
        }

        @Override
        public void startDocument() throws SAXException {
            delegate.startDocument();
        }

        @Override
        public void startPrefixMapping(String prefix, String uri)
                throws SAXException {
            delegate.startPrefixMapping(prefix, uri);
        }
    }
}