/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.parser.microsoft; import java.awt.*; import java.io.IOException; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Comparator; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.SortedMap; import java.util.TreeMap; import org.apache.poi.ddf.EscherBSERecord; import org.apache.poi.ddf.EscherBlipRecord; import org.apache.poi.ddf.EscherRecord; import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFRequest; import org.apache.poi.hssf.extractor.OldExcelExtractor; import org.apache.poi.hssf.record.BOFRecord; import org.apache.poi.hssf.record.BoundSheetRecord; import org.apache.poi.hssf.record.CellValueRecordInterface; import org.apache.poi.hssf.record.CountryRecord; import org.apache.poi.hssf.record.DateWindow1904Record; import org.apache.poi.hssf.record.DrawingGroupRecord; import org.apache.poi.hssf.record.EOFRecord; import org.apache.poi.hssf.record.ExtendedFormatRecord; import org.apache.poi.hssf.record.FooterRecord; import org.apache.poi.hssf.record.FormatRecord; import org.apache.poi.hssf.record.FormulaRecord; import org.apache.poi.hssf.record.HeaderRecord; import org.apache.poi.hssf.record.HyperlinkRecord; import org.apache.poi.hssf.record.LabelRecord; import org.apache.poi.hssf.record.LabelSSTRecord; import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.RKRecord; import org.apache.poi.hssf.record.Record; import org.apache.poi.hssf.record.SSTRecord; import org.apache.poi.hssf.record.StringRecord; import org.apache.poi.hssf.record.TextObjectRecord; import org.apache.poi.hssf.record.chart.SeriesTextRecord; import org.apache.poi.hssf.record.common.UnicodeString; import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; import org.apache.poi.hssf.usermodel.HSSFPictureData; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; /** * Excel parser implementation which uses POI's Event API * to handle the contents of a Workbook. * <p/> * The Event API uses a much smaller memory footprint than * <code>HSSFWorkbook</code> when processing excel files * but at the cost of more complexity. * <p/> * With the Event API a <i>listener</i> is registered for * specific record types and those records are created, * fired off to the listener and then discarded as the stream * is being processed. * * @see org.apache.poi.hssf.eventusermodel.HSSFListener * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api"> * POI Event API How To</a> */ public class ExcelExtractor extends AbstractPOIFSExtractor { private static final String WORKBOOK_ENTRY = "Workbook"; private static final String BOOK_ENTRY = "Book"; /** * <code>true</code> if the HSSFListener should be registered * to listen for all records or <code>false</code> (the default) * if the listener should be configured to only receive specified * records. */ private boolean listenForAllRecords = false; public ExcelExtractor(ParseContext context, Metadata metadata) { super(context, metadata); } /** * Returns <code>true</code> if this parser is configured to listen * for all records instead of just the specified few. */ public boolean isListenForAllRecords() { return listenForAllRecords; } /** * Specifies whether this parser should to listen for all * records or just for the specified few. * <p/> * <strong>Note:</strong> Under normal operation this setting should * be <code>false</code> (the default), but you can experiment with * this setting for testing and debugging purposes. * * @param listenForAllRecords <code>true</code> if the HSSFListener * should be registered to listen for all records or <code>false</code> * if the listener should be configured to only receive specified records. */ public void setListenForAllRecords(boolean listenForAllRecords) { this.listenForAllRecords = listenForAllRecords; } /** * Extracts text from an Excel Workbook writing the extracted content * to the specified {@link Appendable}. * * @param filesystem POI file system * @throws IOException if an error occurs processing the workbook * or writing the extracted content */ protected void parse( NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException { parse(filesystem.getRoot(), xhtml, locale); } protected void parse( DirectoryNode root, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException { if (!root.hasEntry(WORKBOOK_ENTRY)) { if (root.hasEntry(BOOK_ENTRY)) { // Excel 5 / Excel 95 file // Records are in a different structure so needs a // different parser to process them OldExcelExtractor extractor = new OldExcelExtractor(root); OldExcelParser.parse(extractor, xhtml); return; } else { // Corrupt file / very old file, just skip text extraction return; } } // If a password was supplied, use it, otherwise the default Biff8EncryptionKey.setCurrentUserPassword(getPassword()); // Have the file processed in event mode TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this); listener.processFile(root, isListenForAllRecords()); listener.throwStoredException(); for (Entry entry : root) { if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) { try { handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml); } catch (TikaException e) { // ignore parse errors from embedded documents } } } } // ====================================================================== /** * HSSF Listener implementation which processes the HSSF records. */ private static class TikaHSSFListener implements HSSFListener { /** * XHTML content handler to which the document content is rendered. */ private final XHTMLContentHandler handler; /** * The POIFS Extractor, used for embeded resources. */ private final AbstractPOIFSExtractor extractor; /** * Format for rendering numbers in the worksheet. Currently we just * use the platform default formatting. * * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a> */ private final NumberFormat format; /** * Potential exception thrown by the content handler. When set to * non-<code>null</code>, causes all subsequent HSSF records to be * ignored and the stored exception to be thrown when * {@link #throwStoredException()} is invoked. */ private Exception exception = null; private SSTRecord sstRecord; private FormulaRecord stringFormulaRecord; private short previousSid; /** * Internal <code>FormatTrackingHSSFListener</code> to handle cell * formatting within the extraction. */ private FormatTrackingHSSFListener formatListener; private final TikaExcelDataFormatter tikaExcelDataFormatter; /** * List of worksheet names. */ private List<String> sheetNames = new ArrayList<String>(); /** * Index of the current worksheet within the workbook. * Used to find the worksheet name in the {@link #sheetNames} list. */ private short currentSheetIndex; /** * Content of the current worksheet, or <code>null</code> if no * worksheet is currently active. */ private SortedMap<Point, Cell> currentSheet = null; /** * Extra text or cells that crops up, typically as part of a * worksheet but not always. */ private List<Cell> extraTextCells = new ArrayList<Cell>(); /** * These aren't complete when we first see them, as the * depend on continue records that aren't always * contiguous. Collect them for later processing. */ private List<DrawingGroupRecord> drawingGroups = new ArrayList<DrawingGroupRecord>(); /** * Construct a new listener instance outputting parsed data to * the specified XHTML content handler. * * @param handler Destination to write the parsed output to */ private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor) { this.handler = handler; this.extractor = extractor; this.format = NumberFormat.getInstance(locale); this.formatListener = new TikaFormatTrackingHSSFListener(this, locale); this.tikaExcelDataFormatter = new TikaExcelDataFormatter(locale); } /** * Entry point to listener to start the processing of a file. * * @param filesystem POI file system. * @param listenForAllRecords sets whether the listener is configured to listen * for all records types or not. * @throws IOException on any IO errors. * @throws SAXException on any SAX parsing errors. */ public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords) throws IOException, SAXException, TikaException { processFile(filesystem.getRoot(), listenForAllRecords); } public void processFile(DirectoryNode root, boolean listenForAllRecords) throws IOException, SAXException, TikaException { // Set up listener and register the records we want to process HSSFRequest hssfRequest = new HSSFRequest(); if (listenForAllRecords) { hssfRequest.addListenerForAllRecords(formatListener); } else { hssfRequest.addListener(formatListener, BOFRecord.sid); hssfRequest.addListener(formatListener, EOFRecord.sid); hssfRequest.addListener(formatListener, DateWindow1904Record.sid); hssfRequest.addListener(formatListener, CountryRecord.sid); hssfRequest.addListener(formatListener, BoundSheetRecord.sid); hssfRequest.addListener(formatListener, SSTRecord.sid); hssfRequest.addListener(formatListener, FormulaRecord.sid); hssfRequest.addListener(formatListener, LabelRecord.sid); hssfRequest.addListener(formatListener, LabelSSTRecord.sid); hssfRequest.addListener(formatListener, NumberRecord.sid); hssfRequest.addListener(formatListener, RKRecord.sid); hssfRequest.addListener(formatListener, StringRecord.sid); hssfRequest.addListener(formatListener, HyperlinkRecord.sid); hssfRequest.addListener(formatListener, TextObjectRecord.sid); hssfRequest.addListener(formatListener, SeriesTextRecord.sid); hssfRequest.addListener(formatListener, FormatRecord.sid); hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid); hssfRequest.addListener(formatListener, DrawingGroupRecord.sid); hssfRequest.addListener(formatListener, HeaderRecord.sid); hssfRequest.addListener(formatListener, FooterRecord.sid); } // Create event factory and process Workbook (fire events) DocumentInputStream documentInputStream = root.createDocumentInputStream(WORKBOOK_ENTRY); HSSFEventFactory eventFactory = new HSSFEventFactory(); try { eventFactory.processEvents(hssfRequest, documentInputStream); } catch (org.apache.poi.EncryptedDocumentException e) { throw new EncryptedDocumentException(e); } // Output any extra text that came after all the sheets processExtraText(); // Look for embeded images, now that the drawing records // have been fully matched with their continue data for (DrawingGroupRecord dgr : drawingGroups) { dgr.decode(); findPictures(dgr.getEscherRecords()); } } /** * Process a HSSF record. * * @param record HSSF Record */ public void processRecord(Record record) { if (exception == null) { try { internalProcessRecord(record); } catch (TikaException te) { exception = te; } catch (IOException ie) { exception = ie; } catch (SAXException se) { exception = se; } } } public void throwStoredException() throws TikaException, SAXException, IOException { if (exception != null) { if (exception instanceof IOException) throw (IOException) exception; if (exception instanceof SAXException) throw (SAXException) exception; if (exception instanceof TikaException) throw (TikaException) exception; throw new TikaException(exception.getMessage()); } } private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException { switch (record.getSid()) { case BOFRecord.sid: // start of workbook, worksheet etc. records BOFRecord bof = (BOFRecord) record; if (bof.getType() == BOFRecord.TYPE_WORKBOOK) { currentSheetIndex = -1; } else if (bof.getType() == BOFRecord.TYPE_CHART) { if (previousSid == EOFRecord.sid) { // This is a sheet which contains only a chart newSheet(); } else { // This is a chart within a normal sheet // Handling of this is a bit hacky... if (currentSheet != null) { processSheet(); currentSheetIndex--; newSheet(); } } } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) { newSheet(); } break; case EOFRecord.sid: // end of workbook, worksheet etc. records if (currentSheet != null) { processSheet(); } currentSheet = null; break; case BoundSheetRecord.sid: // Worksheet index record BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record; sheetNames.add(boundSheetRecord.getSheetname()); break; case SSTRecord.sid: // holds all the strings for LabelSSTRecords sstRecord = (SSTRecord) record; break; case FormulaRecord.sid: // Cell value from a formula FormulaRecord formula = (FormulaRecord) record; if (formula.hasCachedResultString()) { // The String itself should be the next record stringFormulaRecord = formula; } else { addTextCell(record, formatListener.formatNumberDateCell(formula)); } break; case StringRecord.sid: if (previousSid == FormulaRecord.sid) { // Cached string value of a string formula StringRecord sr = (StringRecord) record; addTextCell(stringFormulaRecord, sr.getString()); } else { // Some other string not associated with a cell, skip } break; case LabelRecord.sid: // strings stored directly in the cell LabelRecord label = (LabelRecord) record; addTextCell(record, label.getValue()); break; case LabelSSTRecord.sid: // Ref. a string in the shared string table LabelSSTRecord sst = (LabelSSTRecord) record; UnicodeString unicode = sstRecord.getString(sst.getSSTIndex()); addTextCell(record, unicode.getString()); break; case NumberRecord.sid: // Contains a numeric cell value NumberRecord number = (NumberRecord) record; addTextCell(record, formatListener.formatNumberDateCell(number)); break; case RKRecord.sid: // Excel internal number record RKRecord rk = (RKRecord) record; addCell(record, new NumberCell(rk.getRKNumber(), format)); break; case HyperlinkRecord.sid: // holds a URL associated with a cell if (currentSheet != null) { HyperlinkRecord link = (HyperlinkRecord) record; Point point = new Point(link.getFirstColumn(), link.getFirstRow()); Cell cell = currentSheet.get(point); if (cell != null) { String address = link.getAddress(); if (address != null) { addCell(record, new LinkedCell(cell, address)); } else { addCell(record, cell); } } } break; case TextObjectRecord.sid: if (extractor.officeParserConfig.getIncludeShapeBasedContent()) { TextObjectRecord tor = (TextObjectRecord) record; addTextCell(record, tor.getStr().getString()); } break; case SeriesTextRecord.sid: // Chart label or title SeriesTextRecord str = (SeriesTextRecord) record; addTextCell(record, str.getText()); break; case DrawingGroupRecord.sid: // Collect this now, we'll process later when all // the continue records are in drawingGroups.add((DrawingGroupRecord) record); break; case HeaderRecord.sid: HeaderRecord headerRecord = (HeaderRecord) record; addTextCell(record, headerRecord.getText()); break; case FooterRecord.sid: FooterRecord footerRecord = (FooterRecord) record; addTextCell(record, footerRecord.getText()); break; } previousSid = record.getSid(); if (stringFormulaRecord != record) { stringFormulaRecord = null; } } private void processExtraText() throws SAXException { if (extraTextCells.size() > 0) { for (Cell cell : extraTextCells) { handler.startElement("div", "class", "outside"); cell.render(handler); handler.endElement("div"); } // Reset extraTextCells.clear(); } } /** * Adds the given cell (unless <code>null</code>) to the current * worksheet (if any) at the position (if any) of the given record. * * @param record record that holds the cell value * @param cell cell value (or <code>null</code>) */ private void addCell(Record record, Cell cell) throws SAXException { if (cell == null) { // Ignore empty cells } else if (currentSheet != null && record instanceof CellValueRecordInterface) { // Normal cell inside a worksheet CellValueRecordInterface value = (CellValueRecordInterface) record; Point point = new Point(value.getColumn(), value.getRow()); currentSheet.put(point, cell); } else { // Cell outside the worksheets extraTextCells.add(cell); } } /** * Adds a text cell with the given text comment. The given text * is trimmed, and ignored if <code>null</code> or empty. * * @param record record that holds the text value * @param text text content, may be <code>null</code> * @throws SAXException */ private void addTextCell(Record record, String text) throws SAXException { if (text != null) { text = text.trim(); if (text.length() > 0) { addCell(record, new TextCell(text)); } } } private void newSheet() { currentSheetIndex++; currentSheet = new TreeMap<Point, Cell>(new PointComparator()); } /** * Process an excel sheet. * * @throws SAXException if an error occurs */ private void processSheet() throws SAXException { // Sheet Start handler.startElement("div", "class", "page"); if (currentSheetIndex < sheetNames.size()) { handler.element("h1", sheetNames.get(currentSheetIndex)); } handler.startElement("table"); handler.startElement("tbody"); // Process Rows int currentRow = 0; int currentColumn = 0; handler.startElement("tr"); handler.startElement("td"); for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) { while (currentRow < entry.getKey().y) { handler.endElement("td"); handler.endElement("tr"); handler.startElement("tr"); handler.startElement("td"); currentRow++; currentColumn = 0; } while (currentColumn < entry.getKey().x) { handler.endElement("td"); handler.startElement("td"); currentColumn++; } entry.getValue().render(handler); } handler.endElement("td"); handler.endElement("tr"); // Sheet End handler.endElement("tbody"); handler.endElement("table"); // Finish up processExtraText(); handler.endElement("div"); } private void findPictures(List<EscherRecord> records) throws IOException, SAXException, TikaException { for (EscherRecord escherRecord : records) { if (escherRecord instanceof EscherBSERecord) { EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord(); if (blip != null) { HSSFPictureData picture = new HSSFPictureData(blip); String mimeType = picture.getMimeType(); TikaInputStream stream = TikaInputStream.get(picture.getData()); // Handle the embeded resource extractor.handleEmbeddedResource( stream, null, null, mimeType, handler, true ); } } // Recursive call. findPictures(escherRecord.getChildRecords()); } } private class TikaFormatTrackingHSSFListener extends FormatTrackingHSSFListener { //TIKA-2025 -- use this to preserve large numbers in "General" format //against the MS spec. final TikaExcelGeneralFormat generalFormat; public TikaFormatTrackingHSSFListener(HSSFListener childListener, Locale locale) { super(childListener, locale); generalFormat = new TikaExcelGeneralFormat(locale); } @Override public String formatNumberDateCell(CellValueRecordInterface cell) { String formatString = this.getFormatString(cell); if (formatString != null && ! formatString.equals("General")) { return super.formatNumberDateCell(cell); } double value; if(cell instanceof NumberRecord) { value = ((NumberRecord)cell).getValue(); } else { if(!(cell instanceof FormulaRecord)) { throw new IllegalArgumentException("Unsupported CellValue Record passed in " + cell); } value = ((FormulaRecord)cell).getValue(); } return generalFormat.format(value); } } } /** * Utility comparator for points. */ private static class PointComparator implements Comparator<Point> { public int compare(Point a, Point b) { int diff = a.y - b.y; if (diff == 0) { diff = a.x - b.x; } return diff; } } }