/* * ModeShape (http://www.modeshape.org) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.modeshape.sequencer.msoffice.excel; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFComment; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** * Extracts data and some metadata from Excel files */ public class ExcelMetadataReader { /** The character to output after each row. */ private static final char ROW_DELIMITER_CHAR = '\n'; /** The character to output after each cell (column). */ private static final char CELL_DELIMITER_CHAR = '\t'; public static ExcelMetadata instance( InputStream stream ) throws IOException { ExcelMetadata metadata = new ExcelMetadata(); HSSFWorkbook wb = new HSSFWorkbook(new POIFSFileSystem(stream)); List<ExcelSheetMetadata> sheets = new ArrayList<ExcelSheetMetadata>(); for (int sheetInd = 0; sheetInd < wb.getNumberOfSheets(); sheetInd++) { ExcelSheetMetadata meta = new ExcelSheetMetadata(); meta.setName(wb.getSheetName(sheetInd)); sheets.add(meta); HSSFSheet worksheet = wb.getSheetAt(sheetInd); int lastRowNum = worksheet.getLastRowNum(); StringBuilder buff = new StringBuilder(); for (int rowNum = worksheet.getFirstRowNum(); rowNum <= lastRowNum; rowNum++) { HSSFRow row = worksheet.getRow(rowNum); // Empty rows are returned as null if (row == null) { continue; } int lastCellNum = row.getLastCellNum(); for (int cellNum = row.getFirstCellNum(); cellNum < lastCellNum; cellNum++) { HSSFCell cell = row.getCell(cellNum); // Undefined cells are returned as null if (cell == null) { continue; } /* * Builds a string of body content from all string, numeric, * and formula values in the body of each worksheet. * * This code currently duplicates the POI 3.1 ExcelExtractor behavior of * combining the body text from all worksheets into a single string. */ switch (cell.getCellType()) { case HSSFCell.CELL_TYPE_STRING: buff.append(cell.getRichStringCellValue().getString()); break; case HSSFCell.CELL_TYPE_NUMERIC: buff.append(cell.getNumericCellValue()); break; case HSSFCell.CELL_TYPE_FORMULA: buff.append(cell.getCellFormula()); break; } HSSFComment comment = cell.getCellComment(); if (comment != null) { // Filter out row delimiter characters from comment String commentText = comment.getString().getString().replace(ROW_DELIMITER_CHAR, ' '); buff.append(" ["); buff.append(commentText); buff.append(" by "); buff.append(comment.getAuthor()); buff.append(']'); } if (cellNum < lastCellNum - 1) { buff.append(CELL_DELIMITER_CHAR); } else { buff.append(ROW_DELIMITER_CHAR); } } } meta.setText(buff.toString()); } metadata.setSheets(sheets); metadata.setMetadata(wb.getSummaryInformation()); return metadata; } }