/*
* See the NOTICE file distributed with this work for additional
* information regarding copyright ownership.
*
* This is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this software; if not, write to the Free
* Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA, or see the FSF site: http://www.fsf.org.
*/
package com.xpn.xwiki.plugin.lucene.textextraction;
import java.io.ByteArrayInputStream;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFDataFormat;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
public class MSExcelTextExtractor implements MimetypeTextExtractor
{
/** Logging helper. */
private static final Log LOG = LogFactory.getLog(MSExcelTextExtractor.class);
/** The currently preparing Excel workbook. */
private HSSFWorkbook mWorkbook;
/**
* Contains all data formats used in the currently preparing Excel workbook.
*/
private HSSFDataFormat mDataFormat;
public static final int DEFAULT_BUFFER_SIZE = 16384; // 16 k
/**
* Extracts all text from an Excel by parsing all the sheets in that excel document.
*
* @return String
*/
public String getText(byte[] data) throws Exception
{
POIFSFileSystem poiFs = new POIFSFileSystem(new ByteArrayInputStream(data));
mWorkbook = new HSSFWorkbook(poiFs);
mDataFormat = mWorkbook.createDataFormat();
StringBuffer cleanBuffer = new StringBuffer(DEFAULT_BUFFER_SIZE);
for (int sheetIdx = 0; sheetIdx < mWorkbook.getNumberOfSheets(); sheetIdx++) {
HSSFSheet sheet = mWorkbook.getSheetAt(sheetIdx);
if (sheet != null) {
parseSheet(sheet, cleanBuffer);
}
}
return cleanBuffer.toString();
}
/**
* It will parse the sheet with row wise and get the text from the sheet.
*/
private void parseSheet(HSSFSheet sheet, StringBuffer cleanBuffer)
{
int firstRow = sheet.getFirstRowNum();
int lastRow = sheet.getLastRowNum();
for (int rowIdx = firstRow; rowIdx <= lastRow; rowIdx++) {
HSSFRow row = sheet.getRow(rowIdx);
if (row != null) {
parseRow(row, cleanBuffer);
}
}
}
/**
* It will parse row and return the text
*/
private void parseRow(HSSFRow row, StringBuffer cleanBuffer)
{
short firstCell = row.getFirstCellNum();
short lastCell = row.getLastCellNum();
for (short cellIdx = firstCell; cellIdx <= lastCell; cellIdx++) {
HSSFCell cell = row.getCell(cellIdx);
if (cell != null) {
parseCell(cell, cleanBuffer);
}
}
}
/**
* Extracts all text from each cell of the sheet
*/
private void parseCell(HSSFCell cell, StringBuffer cleanBuffer)
{
String cellValue = null;
if (cell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
cellValue = cell.getRichStringCellValue().getString();
} else if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
HSSFCellStyle style = cell.getCellStyle();
short formatId = style.getDataFormat();
String formatPattern = mDataFormat.getFormat(formatId);
formatPattern = replace(formatPattern, "\\ ", " ");
if (isCellDateFormatted(cell)) {
// This is a date
formatPattern = replace(formatPattern, "mmmm", "MMMM");
formatPattern = replace(formatPattern, "/", ".");
SimpleDateFormat format;
try {
format = new SimpleDateFormat(formatPattern);
} catch (Throwable thr) {
if (LOG.isDebugEnabled()) {
LOG.debug("Creating date format failed: '" + formatPattern + "'", thr);
}
format = new SimpleDateFormat();
}
double numberValue = cell.getNumericCellValue();
Date date = HSSFDateUtil.getJavaDate(numberValue);
cellValue = format.format(date);
} else {
// This is a Number
DecimalFormat format;
try {
format = new DecimalFormat(formatPattern);
} catch (Throwable thr) {
if (LOG.isDebugEnabled()) {
LOG.debug("Creating number format failed: '" + formatPattern + "'", thr);
}
format = new DecimalFormat();
}
double numberValue = cell.getNumericCellValue();
cellValue = format.format(numberValue);
}
}
if (cellValue != null) {
cellValue = cellValue.trim();
if (cellValue.length() != 0) {
cleanBuffer.append(cellValue);
cleanBuffer.append(" ");
}
}
}
/**
* Checks cell is date formatted or not.
*
* @return boolean
*/
private boolean isCellDateFormatted(HSSFCell cell)
{
short format = cell.getCellStyle().getDataFormat();
if (HSSFDateUtil.isValidExcelDate(cell.getNumericCellValue())) {
if (HSSFDateUtil.isCellDateFormatted(cell)) {
return true;
} else {
String fmtText = mDataFormat.getFormat(format);
if (fmtText != null) {
fmtText = fmtText.toLowerCase();
if (fmtText.indexOf("d") >= 0 || fmtText.indexOf("m") >= 0
|| fmtText.indexOf("y") >= 0 || fmtText.indexOf("h") >= 0
|| fmtText.indexOf("s") >= 0) {
return true;
}
}
}
}
return false;
}
/**
* It will replace all occurances of pattern in the source with replacement value
*
* @return String
*/
public static String replace(String source, String pattern, String replacement)
{
// Check whether the pattern occurs in the source at all
int firstPatternPos = source.indexOf(pattern);
if (firstPatternPos == -1) {
// The pattern does not occur in the source -> return the source
return source;
}
// Build a new String where pattern is replaced by the replacement
StringBuffer target = new StringBuffer(source.length());
int start = 0; // The start of a part without the pattern
int end = firstPatternPos; // The end of a part without the pattern
do {
target.append(source.substring(start, end));
target.append(replacement);
start = end + pattern.length();
} while ((end = source.indexOf(pattern, start)) != -1);
target.append(source.substring(start, source.length()));
// return the String
return target.toString();
}
}