/* * Copyright (c) 2017 OBiBa. All rights reserved. * * This program and the accompanying materials * are made available under the terms of the GNU Public License v3.0. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.obiba.magma.datasource.excel; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.PushbackInputStream; import java.util.Arrays; import java.util.Collection; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import javax.validation.constraints.NotNull; import org.apache.poi.POIXMLDocument; import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFRequest; import org.apache.poi.hssf.record.BOFRecord; import org.apache.poi.hssf.record.BoundSheetRecord; import org.apache.poi.hssf.record.LabelSSTRecord; import org.apache.poi.hssf.record.NumberRecord; import org.apache.poi.hssf.record.Record; import org.apache.poi.hssf.record.RowRecord; import org.apache.poi.hssf.record.SSTRecord; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.ss.usermodel.BuiltinFormats; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.CellStyle; import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.ss.usermodel.Font; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.WorkbookFactory; import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable; import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.model.StylesTable; import org.apache.poi.xssf.usermodel.XSSFCellStyle; import org.apache.poi.xssf.usermodel.XSSFRichTextString; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.obiba.magma.Disposable; import org.obiba.magma.MagmaRuntimeException; import org.obiba.magma.Timestamps; import org.obiba.magma.ValueTable; import org.obiba.magma.ValueTableWriter; import org.obiba.magma.datasource.excel.support.ExcelDatasourceParsingException; import org.obiba.magma.datasource.excel.support.ExcelUtil; import org.obiba.magma.datasource.excel.support.NameConverter; import org.obiba.magma.datasource.excel.support.VariableConverter; import org.obiba.magma.support.AbstractDatasource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; import com.google.common.collect.Lists; import com.google.common.collect.Sets; /** * Implements a {@code Datasource} on top of an Excel Workbook. */ public class ExcelDatasource extends AbstractDatasource { private static final Logger log = LoggerFactory.getLogger(ExcelDatasource.class); public static final String VARIABLES_SHEET = "Variables"; public static final String CATEGORIES_SHEET = "Categories"; public static final String HELP_SHEET = "Help"; public static final String DEFAULT_TABLE_NAME = "Table"; public static final Set<String> SHEET_RESERVED_NAMES = Sets.newHashSet(VARIABLES_SHEET, CATEGORIES_SHEET, HELP_SHEET); private static final int SHEET_NAME_MAX_LENGTH = 30; private static final int BOLD_WEIGHT = 700; private Workbook excelWorkbook; private Sheet variablesSheet; private Sheet categoriesSheet; private File excelFile; private OutputStream excelOutput; private InputStream excelInput; private Map<String, CellStyle> excelStyles; private final Map<String, ExcelValueTable> valueTablesMapOnInit = new LinkedHashMap<>(100); private ExcelValueTableWriter excelValueTableWriter; /** * Excel workbook will be read from the provided file if it exists, and will be written in the file at datasource * disposal. * * @param name * @param excelFile */ public ExcelDatasource(String name, File excelFile) { super(name, "excel"); this.excelFile = excelFile; } /** * Excel workbook will be written in the output stream at datasource disposal. * * @param name * @param output */ public ExcelDatasource(String name, OutputStream output) { super(name, "excel"); excelOutput = output; } /** * Excel workbook will be read from input stream. * * @param name * @param input */ public ExcelDatasource(String name, InputStream input) { super(name, "excel"); excelInput = input; } @Override protected void onInitialise() { if(excelFile != null) { createWorbookFromFile(); } else if(excelInput != null) { createWorkbookFromInputStream(); } else { // Create a XSSFWorkbook that will be written in output stream excelWorkbook = new XSSFWorkbook(); } createExcelStyles(); } private void createWorbookFromFile(){ if(excelFile.exists()) { try (InputStream input = new FileInputStream(excelFile)) { createWorkbookFromInputStream(input); } catch(IOException e) { throw new MagmaRuntimeException("Exception reading excel spreadsheet " + excelFile.getName(), e); } } else { if(excelFile.getName().endsWith("xls")) log.warn( "Creating an ExcelDatasource using Excel 97 format which only supports 256 columns. This may not be sufficient for large amounts of variables. Specify a filename with an extension other than 'xls' to use Excel 2007 format."); excelWorkbook = excelFile.getName().endsWith("xls") ? new HSSFWorkbook() : new XSSFWorkbook(); } } private void createWorkbookFromInputStream() { createWorkbookFromInputStream(excelInput); } private void createWorkbookFromInputStream(InputStream inpOrig) { try(InputStream inp = !inpOrig.markSupported() ? new PushbackInputStream(inpOrig, 8) : inpOrig) { if(POIFSFileSystem.hasPOIFSHeader(inp)) { createHSSFWorkbook(inp); } else if(POIXMLDocument.hasOOXMLHeader(inp)) { createXSSFWorkbook(inp); } else { excelWorkbook = WorkbookFactory.create(inpOrig); } } catch(IOException e) { throw new MagmaRuntimeException("Exception reading excel spreadsheet " + excelFile.getName(), e); } catch(OpenXML4JException | SAXException e) { throw new MagmaRuntimeException("Invalid excel spreadsheet format " + excelFile.getName(), e); } catch(IllegalArgumentException e) { throw new MagmaRuntimeException( "Invalid excel spreadsheet format from input stream (neither an OLE2 stream nor an OOXML stream)."); } } private void createHSSFWorkbook(InputStream inp) throws IOException { POIFSFileSystem poifs = new POIFSFileSystem(inp); try (InputStream din = poifs.createDocumentInputStream("Workbook")) { HSSFRequest req = new HSSFRequest(); excelWorkbook = new HSSFWorkbook(); req.addListenerForAllRecords(new SheetExtractorListener(excelWorkbook, VARIABLES_SHEET, CATEGORIES_SHEET)); HSSFEventFactory factory = new HSSFEventFactory(); factory.processEvents(req, din); } } private void createXSSFWorkbook(InputStream inp) throws IOException, SAXException, OpenXML4JException { excelWorkbook = new XSSFWorkbook(); OPCPackage container = OPCPackage.open(inp); ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(container); XSSFReader reader = new XSSFReader(container); parseSheets(reader, strings, excelWorkbook, VARIABLES_SHEET, CATEGORIES_SHEET); } private void parseSheets(XSSFReader reader, ReadOnlySharedStringsTable strings, Workbook excelWorkbook, String... sheetNames) throws SAXException, IOException, InvalidFormatException { XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) reader.getSheetsData(); while (iter.hasNext()) { try (InputStream sheet = iter.next()) { String sName = iter.getSheetName(); if(Arrays.asList(sheetNames).contains(sName)) { XMLReader parser = buildSheetParser(strings, reader.getStylesTable(), excelWorkbook, sName); parser.parse(new InputSource(sheet)); } } } } /** * Set the output stream to which the Excel workbook will be persisted at datasource disposal. * * @param excelOutput */ public void setExcelOutput(OutputStream excelOutput) { this.excelOutput = excelOutput; } @Override @NotNull public ValueTableWriter createWriter(@NotNull String name, @NotNull String entityType) { ExcelValueTable valueTable; if(hasValueTable(name)) { valueTable = (ExcelValueTable) getValueTable(name); } else { addValueTable(valueTable = new ExcelValueTable(this, name, entityType)); } if (excelValueTableWriter == null) excelValueTableWriter = new ExcelValueTableWriter(valueTable); else excelValueTableWriter = new ExcelValueTableWriter(valueTable, excelValueTableWriter); return excelValueTableWriter; } @Override public void dispose() { if (excelValueTableWriter != null && excelValueTableWriter.writeVariables() != null) ((Disposable) excelValueTableWriter.writeVariables()).dispose(); super.dispose(); } /** * Write the Excel workbook into provided output stream. * * @param excelOutputStream * @throws IOException */ private void writeWorkbook(OutputStream excelOutputStream) throws IOException { excelWorkbook.write(excelOutputStream); } @Override protected void onDispose() { // Write the workbook (datasource) to file/OutputStream if any of them is defined try(OutputStream out = excelFile == null ? excelOutput : new FileOutputStream(excelFile)) { if(out != null) { writeWorkbook(out); } } catch(Exception e) { throw new MagmaRuntimeException("Could not write to excel output stream", e); } } @Override protected Set<String> getValueTableNames() { Collection<String> sheetNames = new LinkedHashSet<>(100); // find the table names from the Variables sheet if(hasVariablesSheet()) { Row headerVariables = getVariablesSheet().getRow(0); if(headerVariables != null) { Map<String, Integer> headerMapVariables = getVariablesHeaderMap(); if(headerMapVariables != null) { List<ExcelDatasourceParsingException> errors = readValueTablesFromVariableSheet(headerMapVariables, sheetNames); if(errors.size() > 0) { ExcelDatasourceParsingException parent = new ExcelDatasourceParsingException( "Errors while parsing variables", // "TableDefinitionErrors", getVariablesSheet().getSheetName(), 1, getName()); parent.setChildren(errors); throw parent; } } } } // find other tables from their sheet name int sheetCount = excelWorkbook.getNumberOfSheets(); for(int i = 0; i < sheetCount; i++) { String sheetName = excelWorkbook.getSheetAt(i).getSheetName(); if(!sheetNames.contains(sheetName) && !SHEET_RESERVED_NAMES.contains(sheetName)) { valueTablesMapOnInit.put(sheetName, new ExcelValueTable(this, sheetName, "Participant")); } } return valueTablesMapOnInit.keySet(); } private List<ExcelDatasourceParsingException> readValueTablesFromVariableSheet( Map<String, Integer> headerMapVariables, Collection<String> sheetNames) { for(int i = 1; i < getVariablesSheet().getPhysicalNumberOfRows(); i++) { Row variableRow = getVariablesSheet().getRow(i); readValueTablesFromVariableRow(headerMapVariables, sheetNames, variableRow); } return Lists.newArrayList(); } private void readValueTablesFromVariableRow(Map<String, Integer> headerMapVariables, Collection<String> sheetNames, Row variableRow) { if (variableRow == null) return; String tableHeader = ExcelUtil.findNormalizedHeader(headerMapVariables.keySet(), VariableConverter.TABLE); String tableName = DEFAULT_TABLE_NAME; if(tableHeader != null) { tableName = ExcelUtil.getCellValueAsString(variableRow.getCell(headerMapVariables.get(tableHeader))); if(tableName.trim().isEmpty()) { return; // ignore rows without table name } } if(!valueTablesMapOnInit.containsKey(tableName)) { String entityTypeHeader = ExcelUtil .findNormalizedHeader(headerMapVariables.keySet(), VariableConverter.ENTITY_TYPE); String entityType = "Participant"; if(entityTypeHeader != null) { entityType = ExcelUtil.getCellValueAsString(variableRow.getCell(headerMapVariables.get(entityTypeHeader))); } valueTablesMapOnInit.put(tableName, new ExcelValueTable(this, tableName, entityType)); sheetNames.add(getSheetName(tableName)); } } @Override protected ValueTable initialiseValueTable(String tableName) { return valueTablesMapOnInit.get(tableName); } public boolean hasVariablesSheet() { return excelWorkbook.getSheet("Variables") != null; } public Sheet getVariablesSheet() { if(variablesSheet == null) { variablesSheet = createSheetIfNotExist("Variables"); } return variablesSheet; } public Sheet getCategoriesSheet() { if(categoriesSheet == null) { categoriesSheet = createSheetIfNotExist("Categories"); } return categoriesSheet; } public Set<String> getVariablesCustomAttributeNames() { return getCustomAttributeNames(getVariablesSheet(), VariableConverter.reservedVariableHeaders); } public Set<String> getCategoriesCustomAttributeNames() { return getCustomAttributeNames(getCategoriesSheet(), VariableConverter.reservedCategoryHeaders); } private Set<String> getCustomAttributeNames(Sheet sheet, Iterable<String> reservedAttributeNames) { Row rowHeader = sheet.getRow(0); Set<String> attributesNames = new HashSet<>(); int cellCount = rowHeader.getPhysicalNumberOfCells(); for(int i = 0; i < cellCount; i++) { Cell c = rowHeader.getCell(i, Row.RETURN_BLANK_AS_NULL); if (c == null) { throw new MagmaRuntimeException("Missing header: " + sheet.getSheetName() + " at column " + colToName(i)); } String attributeName = ExcelUtil.getCellValueAsString(c).trim(); if(ExcelUtil.findNormalizedHeader(reservedAttributeNames, attributeName) == null) { attributesNames.add(attributeName); } } return attributesNames; } public Map<String, Integer> getVariablesHeaderMap() { return getMapSheetHeader(getVariablesSheet()); } public Map<String, Integer> getCategoriesHeaderMap() { return getMapSheetHeader(getCategoriesSheet()); } private Map<String, Integer> getMapSheetHeader(Sheet sheet) { Row rowHeader = sheet.getRow(0); Map<String, Integer> headerMap = null; if(rowHeader != null) { headerMap = new HashMap<>(); int cellCount = rowHeader.getPhysicalNumberOfCells(); Cell cell; for(int i = 0; i < cellCount; i++) { cell = rowHeader.getCell(i, Row.RETURN_BLANK_AS_NULL); if (cell == null) { throw new MagmaRuntimeException("Missing header: " + sheet.getSheetName() + " at column " + colToName(i)); } headerMap.put(cell.getStringCellValue().trim(), i); } } return headerMap; } Sheet createSheetIfNotExist(String tableName) { Sheet sheet; String sheetName = getSheetName(tableName); sheet = excelWorkbook.getSheet(sheetName); if(sheet == null) { sheet = excelWorkbook.createSheet(sheetName); } return sheet; } /** * Get converted sheet name from table name. * * @param tableName * @return */ private String getSheetName(String tableName) { String sheetName = NameConverter.toExcelName(tableName); // Excel allows a maximum of 30 chars for table names if(sheetName.length() > SHEET_NAME_MAX_LENGTH) { sheetName = sheetName.substring(0, 27) + "$" + (sheetName.length() - SHEET_NAME_MAX_LENGTH); } log.debug("{}={}", tableName, sheetName); return sheetName; } /** * Get the sheet from table name. * * @param tableName * @return null if sheet does not exists */ Sheet getSheet(String tableName) { return excelWorkbook.getSheet(getSheetName(tableName)); } private void createExcelStyles() { excelStyles = new HashMap<>(); CellStyle headerCellStyle = excelWorkbook.createCellStyle(); Font headerFont = excelWorkbook.createFont(); headerFont.setBoldweight((short) BOLD_WEIGHT); headerCellStyle.setFont(headerFont); excelStyles.put("headerCellStyle", headerCellStyle); } @Override @NotNull public Timestamps getTimestamps() { return new ExcelTimestamps(excelFile); } public CellStyle getHeaderCellStyle() { return excelStyles.get("headerCellStyle"); } private String colToName(int colNum) { String colName = ""; int colNumTemp = colNum; do { colName = String.valueOf(Character.toChars('A' + (colNumTemp % 26))) + colName; colNumTemp = Math.floorDiv(colNumTemp, 26) - 1; } while (colNumTemp >= 0); return colName; } private XMLReader buildSheetParser(ReadOnlySharedStringsTable strings, StylesTable st, Workbook workbook, String name) throws SAXException { XMLReader parser = XMLReaderFactory.createXMLReader(); ContentHandler handler = new SheetHandler(strings, st, workbook, name); parser.setContentHandler(handler); return parser; } enum xssfDataType { BOOL, ERROR, FORMULA, INLINESTR, SSTINDEX, NUMBER, } private static class SheetHandler extends DefaultHandler { private Workbook workbook; private Sheet sh; private Row row; private StylesTable stylesTable; private ReadOnlySharedStringsTable sharedStringsTable; private boolean vIsOpen; private xssfDataType nextDataType; private short formatIndex; private String formatString; private final DataFormatter formatter; private int thisColumn = -1; private int lastColumnNumber = -1; private StringBuffer value; private SheetHandler(ReadOnlySharedStringsTable sst, StylesTable st, Workbook workbook, String name) { this.sharedStringsTable = sst; this.stylesTable = st; this.workbook = workbook; sh = workbook.createSheet(name); row = sh.createRow(0); this.value = new StringBuffer(); this.formatter = new DataFormatter(); } @Override @SuppressWarnings({ "PMD.NcssMethodCount", "OverlyLongMethod" }) public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException { if ("inlineStr".equals(name) || "v".equals(name)) { vIsOpen = true; value.setLength(0); } else if ("c".equals(name)) { String r = attributes.getValue("r"); int firstDigit = -1; for (int c = 0; c < r.length(); ++c) { if (Character.isDigit(r.charAt(c))) { firstDigit = c; break; } } thisColumn = nameToColumn(r.substring(0, firstDigit)); this.nextDataType = xssfDataType.NUMBER; this.formatIndex = -1; this.formatString = null; String cellType = attributes.getValue("t"); String cellStyleStr = attributes.getValue("s"); if ("b".equals(cellType)) nextDataType = xssfDataType.BOOL; else if ("e".equals(cellType)) nextDataType = xssfDataType.ERROR; else if ("inlineStr".equals(cellType)) nextDataType = xssfDataType.INLINESTR; else if ("s".equals(cellType)) nextDataType = xssfDataType.SSTINDEX; else if ("str".equals(cellType)) nextDataType = xssfDataType.FORMULA; else if (cellStyleStr != null) { int styleIndex = Integer.parseInt(cellStyleStr); XSSFCellStyle style = stylesTable.getStyleAt(styleIndex); this.formatIndex = style.getDataFormat(); this.formatString = style.getDataFormatString(); if (this.formatString == null) this.formatString = BuiltinFormats.getBuiltinFormat(this.formatIndex); } } } @Override @SuppressWarnings({ "PMD.NcssMethodCount", "OverlyLongMethod" }) public void endElement(String uri, String localName, String name) throws SAXException { String thisStr; if ("v".equals(name)) { switch (nextDataType) { case BOOL: char first = value.charAt(0); thisStr = "" + first; break; case ERROR: thisStr = value.toString(); break; case FORMULA: thisStr = value.toString(); break; case INLINESTR: XSSFRichTextString rtsi = new XSSFRichTextString(value.toString()); thisStr = rtsi.toString(); break; case SSTINDEX: String sstIndex = value.toString(); try { int idx = Integer.parseInt(sstIndex); XSSFRichTextString rtss = new XSSFRichTextString(sharedStringsTable.getEntryAt(idx)); thisStr = rtss.toString(); } catch (NumberFormatException ex) { throw new MagmaRuntimeException("Failed to parse SST index '" + sstIndex + "': " + ex.toString()); } break; case NUMBER: String n = value.toString(); if (this.formatString != null) thisStr = formatter.formatRawCellContents(Double.parseDouble(n), this.formatIndex, this.formatString); else thisStr = n; break; default: thisStr = nextDataType.toString(); break; } Cell cell = row.createCell(thisColumn); cell.setCellValue(thisStr); if (thisColumn > -1) lastColumnNumber = thisColumn; } else if ("row".equals(name)) { lastColumnNumber = -1; row = sh.createRow(sh.getPhysicalNumberOfRows()); } } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (vIsOpen) value.append(ch, start, length); } private int nameToColumn(String name) { int column = -1; for (int i = 0; i < name.length(); ++i) { int c = name.charAt(i); column = (column + 1) * 26 + c - 'A'; } return column; } } private static class SheetExtractorListener implements HSSFListener { private SSTRecord sstrec; private Workbook workbook; private LinkedList<Sheet> sheets = new LinkedList<>(); private Sheet sheet; private List<String> sheetNames; public SheetExtractorListener(Workbook workbook, String... sheetNames) { this.workbook = workbook; this.sheetNames = Lists.newArrayList(sheetNames); } /** * * @param record */ @Override @SuppressWarnings({ "PMD.NcssMethodCount", "OverlyLongMethod" }) public void processRecord(Record record) { switch(record.getSid()) { case BOFRecord.sid: BOFRecord bof = (BOFRecord) record; if (bof.getType() == bof.TYPE_WORKBOOK) { //ignore } else if (bof.getType() == bof.TYPE_WORKSHEET) { sheet = sheets.poll(); } break; case BoundSheetRecord.sid: BoundSheetRecord bsr = (BoundSheetRecord) record; if (sheetNames.contains(bsr.getSheetname())) { sheets.add(this.workbook.createSheet(bsr.getSheetname())); } else { sheets.add(null); } break; case RowRecord.sid: RowRecord rowrec = (RowRecord) record; if(sheet != null) { sheet.createRow(rowrec.getRowNumber()); } break; case NumberRecord.sid: NumberRecord numrec = (NumberRecord) record; if(sheet != null) { sheet.getRow(numrec.getRow()).createCell(numrec.getColumn()).setCellValue(numrec.getValue()); } break; case SSTRecord.sid: sstrec = (SSTRecord) record; break; case LabelSSTRecord.sid: LabelSSTRecord lrec = (LabelSSTRecord) record; if(sheet != null) { sheet.getRow(lrec.getRow()).createCell(lrec.getColumn()) .setCellValue(sstrec.getString(lrec.getSSTIndex()).getString()); } break; } } } }