package org.gbif.ipt.model;
import org.gbif.ipt.utils.FileUtils;
import org.gbif.utils.file.ClosableReportingIterator;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.log4j.Logger;
import org.apache.poi.hssf.usermodel.HSSFFormulaEvaluator;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.FormulaEvaluator;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.apache.poi.xssf.usermodel.XSSFFormulaEvaluator;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
/**
* Uses apache POI to parse excel spreadsheets.
* A single file can have multiple sheets which each act as a separate source.
* The same file can therefore be used for multiple ExcelFileSource instances.
* POI usage example, see http://svn.apache.org/repos/asf/poi/trunk/src/examples/src/org/apache/poi/ss/examples/ToCSV.java
*/
public class ExcelFileSource extends SourceBase implements FileSource {
private static final Logger LOG = Logger.getLogger(ExcelFileSource.class);
private static final String SUFFIX = ".xls";
private int sheetIdx = 0;
private int ignoreHeaderLines = 0;
private File file;
private long fileSize;
private int rows;
protected Date lastModified;
private String escape(String x) {
if (x == null) {
return null;
}
return x.replaceAll("\\t", "\\\\t").replaceAll("\\n", "\\\\n").replaceAll("\\r", "\\\\r")
.replaceAll("\\f", "\\\\f");
}
public File getFile() {
return file;
}
public long getFileSize() {
return fileSize;
}
public String getFileSizeFormatted() {
return FileUtils.formatSize(fileSize, 1);
}
public int getIgnoreHeaderLines() {
return ignoreHeaderLines;
}
public void setIgnoreHeaderLines(int ignoreHeaderLines) {
this.ignoreHeaderLines = ignoreHeaderLines;
}
public Date getLastModified() {
return lastModified;
}
public int getSheetIdx() {
return sheetIdx;
}
public void setSheetIdx(int sheetIdx) {
this.sheetIdx = sheetIdx;
}
private Workbook openBook() throws IOException {
LOG.info("Opening excel workbook [" + file.getName() + "]");
try {
FileInputStream fis = new FileInputStream(file);
return WorkbookFactory.create(fis);
} catch (InvalidFormatException e) {
throw new IOException("Cannot open invalid excel speadsheet", e);
}
}
private Sheet getSheet(Workbook book) throws IOException {
return book.getSheetAt(sheetIdx);
}
public int getRows() {
return rows;
}
private class RowIterator implements ClosableReportingIterator<String[]> {
private final Sheet sheet; // 0 based
private final Iterator<Row> iter;
private final int rowSize;
// DataFormatter displays data exactly as it appears in Excel
private final DataFormatter dataFormatter = new DataFormatter();
private boolean rowError;
private String errorMessage;
private Exception exception;
// FormulaEvaluator evaluate any formula in Excel cell and returns result
private FormulaEvaluator formulaEvaluator;
RowIterator(ExcelFileSource source) throws IOException, InvalidFormatException {
Workbook book = openBook();
sheet = getSheet(book);
// instantiate the appropriate FormulaEvaluator, depending on whether workbook is .xls or .xlsx
formulaEvaluator = (book instanceof XSSFWorkbook) ? new XSSFFormulaEvaluator((XSSFWorkbook) book)
: new HSSFFormulaEvaluator((HSSFWorkbook) book);
iter = sheet.rowIterator();
rowSize = source.getColumns();
}
RowIterator(ExcelFileSource source, int skipRows) throws IOException, InvalidFormatException {
this(source);
while (skipRows > 0) {
iter.next();
skipRows--;
}
}
public void close() {
// nothing to do
}
public boolean hasNext() {
return iter.hasNext();
}
public String[] next() {
//TODO: log empty or irregular rows, setting rowError to true and populating errorMessage
String[] val = new String[rowSize];
if (hasNext()) {
resetReportingIterator();
try {
Row row = iter.next();
for (int i = 0; i < rowSize; i++) {
Cell c = row.getCell(i, Row.CREATE_NULL_AS_BLANK);
formulaEvaluator.evaluate(c);
val[i] = dataFormatter.formatCellValue(c, formulaEvaluator);
}
} catch (Exception e) {
LOG.debug("Exception caught: " + e.getMessage(), e);
exception = e;
errorMessage = e.getMessage();
}
}
return val;
}
/**
* Reset all reporting parameters.
*/
private void resetReportingIterator() {
rowError = false;
exception = null;
errorMessage = null;
}
public void remove() {
// unsupported
}
public boolean hasRowError() {
return rowError;
}
public String getErrorMessage() {
return errorMessage;
}
public Exception getException() {
return exception;
}
}
public ClosableReportingIterator<String[]> rowIterator() {
try {
return new RowIterator(this, ignoreHeaderLines);
} catch (Exception e) {
LOG.error("Exception while reading excel source " + name, e);
}
return null;
}
/**
* @return list of available sheets, keyed on sheet index
*/
public Map<Integer, String> sheets() throws IOException {
Workbook book = openBook();
int cnt = book.getNumberOfSheets();
Map<Integer, String> sheets = Maps.newHashMap();
for (int x = 0; x < cnt; x++) {
sheets.put(x, book.getSheetName(x));
}
return sheets;
}
public List<String> columns() {
if (rows > 0) {
try {
if (ignoreHeaderLines > 0) {
return Lists.newArrayList(new RowIterator(this, ignoreHeaderLines - 1).next());
} else {
List<String> columnList = Lists.newArrayList();
for (int x = 1; x <= columns; x++) {
columnList.add("Column #" + x);
}
return columnList;
}
} catch (Exception e) {
LOG.error("Exception while reading excel source " + name, e);
}
}
// no rows, no columns
return Lists.newArrayList();
}
public void setFile(File file) {
this.file = file;
}
public void setFileSize(long fileSize) {
this.fileSize = fileSize;
}
public void setIgnoreHeaderLines(Integer ignoreHeaderLines) {
this.ignoreHeaderLines = ignoreHeaderLines == null ? 0 : ignoreHeaderLines;
}
public void setLastModified(Date lastModified) {
this.lastModified = lastModified;
}
public String getPreferredFileSuffix() {
return SUFFIX;
}
public void setRows(int rows) {
this.rows = rows;
}
public Set<Integer> analyze() throws IOException {
setFileSize(getFile().length());
// find row size
Workbook book = openBook();
Sheet sheet = getSheet(book);
setRows(sheet.getPhysicalNumberOfRows());
Iterator<Row> iter = sheet.rowIterator();
if (iter.hasNext()) {
setColumns(iter.next().getLastCellNum());
setReadable(true);
} else {
setColumns(0);
setReadable(false);
}
//TODO: report empty or irregular rows
return Sets.newHashSet();
}
private String unescape(String x) {
if (x == null) {
return null;
}
return x.replaceAll("\\\\t", String.valueOf('\t')).replaceAll("\\\\n", String.valueOf('\n'))
.replaceAll("\\\\r", String.valueOf('\r')).replaceAll("\\\\f", String.valueOf('\f'));
}
}