package org.molgenis.data.excel; import com.google.common.collect.Iterables; import org.apache.commons.lang3.StringUtils; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.util.CellReference; import org.molgenis.data.Entity; import org.molgenis.data.MolgenisDataException; import org.molgenis.data.RepositoryCapability; import org.molgenis.data.meta.model.AttributeFactory; import org.molgenis.data.meta.model.EntityType; import org.molgenis.data.meta.model.EntityTypeFactory; import org.molgenis.data.processor.AbstractCellProcessor; import org.molgenis.data.processor.CellProcessor; import org.molgenis.data.support.AbstractRepository; import org.springframework.util.LinkedCaseInsensitiveMap; import java.util.*; import static java.lang.String.format; import static java.util.Objects.requireNonNull; import static org.molgenis.data.meta.AttributeType.STRING; /** * ExcelSheet {@link org.molgenis.data.Repository} implementation * <p> * It is assumed that the first row of the sheet is the header row. * <p> * All attributes will be of the string type. The cell values are converted to string. * <p> * The url of this Repository is defined as excel://${filename}/${sheetname} */ public class ExcelRepository extends AbstractRepository { private final Sheet sheet; private final EntityTypeFactory entityTypeFactory; private final AttributeFactory attrMetaFactory; /** * process cells after reading */ private List<CellProcessor> cellProcessors; /** * column names index */ private Map<String, Integer> colNamesMap; private EntityType entityType; public ExcelRepository(String fileName, Sheet sheet, EntityTypeFactory entityTypeFactory, AttributeFactory attrMetaFactory) { this(fileName, sheet, entityTypeFactory, attrMetaFactory, null); } public ExcelRepository(String fileName, Sheet sheet, EntityTypeFactory entityTypeFactory, AttributeFactory attrMetaFactory, List<CellProcessor> cellProcessors) { this.sheet = requireNonNull(sheet); if (sheet.getNumMergedRegions() > 0) { throw new MolgenisDataException( format("Sheet [%s] contains merged regions which is not supported", sheet.getSheetName())); } this.entityTypeFactory = requireNonNull(entityTypeFactory); this.attrMetaFactory = requireNonNull(attrMetaFactory); this.cellProcessors = cellProcessors; } public int getNrRows() { return sheet.getLastRowNum() + 1; // getLastRowNum is 0-based } @Override public Iterator<Entity> iterator() { final Iterator<Row> it = sheet.iterator(); if (!it.hasNext()) return Collections.<Entity>emptyList().iterator(); // create column header index once and reuse Row headerRow = it.next(); if (colNamesMap == null) { colNamesMap = toColNamesMap(headerRow); } if (!it.hasNext()) return Collections.<Entity>emptyList().iterator(); return new Iterator<Entity>() { ExcelEntity next = null; @Override public boolean hasNext() { // iterator skips empty lines. if (it.hasNext() && next == null) { ExcelEntity entity = new ExcelEntity(it.next(), colNamesMap, cellProcessors, getEntityType()); // check if there is any column containing a value for (String name : entity.getAttributeNames()) { if (StringUtils.isNotEmpty(entity.getString(name))) { next = entity; break; } } // next line not empty? if (next == null) { hasNext(); } } return next != null; } @Override public ExcelEntity next() { hasNext(); ExcelEntity result = next; next = null; return result; } @Override public void remove() { throw new UnsupportedOperationException(); } }; } public void addCellProcessor(CellProcessor cellProcessor) { if (cellProcessors == null) cellProcessors = new ArrayList<>(); cellProcessors.add(cellProcessor); } public EntityType getEntityType() { if (entityType == null) { EntityType entityType = entityTypeFactory.create().setName(sheet.getSheetName()); if (colNamesMap == null) { Iterator<Row> it = sheet.iterator(); if (it.hasNext()) { // First row contains the headers colNamesMap = toColNamesMap(it.next()); } } if (colNamesMap != null) { for (String colName : colNamesMap.keySet()) { entityType.addAttribute(attrMetaFactory.create().setName(colName).setDataType(STRING)); } } this.entityType = entityType; } return entityType; } private Map<String, Integer> toColNamesMap(Row headerRow) { if (headerRow == null) return null; Map<String, Integer> columnIdx = new LinkedCaseInsensitiveMap<>(); int i = 0; for (Iterator<Cell> it = headerRow.cellIterator(); it.hasNext(); ) { try { String header = AbstractCellProcessor.processCell(ExcelUtils.toValue(it.next()), true, cellProcessors); if (null != header) columnIdx.put(header, i++); } catch (final IllegalStateException ex) { final int row = headerRow.getRowNum(); final String column = CellReference.convertNumToColString(i); throw new IllegalStateException("Invalid value at [" + sheet.getSheetName() + "] " + column + row + 1, ex); } } return columnIdx; } @Override public Set<RepositoryCapability> getCapabilities() { return Collections.emptySet(); } @Override public long count() { return Iterables.size(this); } }