// $HeadURL: // svn+ssh://ant4@orchestra.med.harvard.edu/svn/iccb/screensaver/trunk/src/edu/harvard/med/screensaver/io/screenresults/ScreenResultParser.java // $ // $Id$ // // Copyright © 2006, 2010, 2011, 2012 by the President and Fellows of Harvard College. // // Screensaver is an open-source project developed by the ICCB-L and NSRB labs // at Harvard Medical School. This software is distributed under the terms of // the GNU General Public License. package edu.harvard.med.screensaver.io.screenresults; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.google.common.base.Predicates; import com.google.common.collect.Iterables; import com.google.common.collect.Maps; import com.google.common.collect.Sets; import org.apache.commons.lang.math.IntRange; import org.apache.log4j.Logger; import edu.harvard.med.screensaver.db.AbstractDAO; import edu.harvard.med.screensaver.db.GenericEntityDAO; import edu.harvard.med.screensaver.db.LibrariesDAO; import edu.harvard.med.screensaver.db.ScreenResultsDAO; import edu.harvard.med.screensaver.io.libraries.ExtantLibraryException; import edu.harvard.med.screensaver.io.workbook2.Cell; import edu.harvard.med.screensaver.io.workbook2.CellValueParser; import edu.harvard.med.screensaver.io.workbook2.CellVocabularyParser; import edu.harvard.med.screensaver.io.workbook2.Row; import edu.harvard.med.screensaver.io.workbook2.WellNameParser; import edu.harvard.med.screensaver.io.workbook2.Workbook; import edu.harvard.med.screensaver.io.workbook2.WorkbookParseError; import edu.harvard.med.screensaver.io.workbook2.Worksheet; import edu.harvard.med.screensaver.model.DataModelViolationException; import edu.harvard.med.screensaver.model.libraries.Library; import edu.harvard.med.screensaver.model.libraries.Well; import edu.harvard.med.screensaver.model.libraries.WellKey; import edu.harvard.med.screensaver.model.screenresults.AssayWell; import edu.harvard.med.screensaver.model.screenresults.AssayWellControlType; import edu.harvard.med.screensaver.model.screenresults.ConfirmedPositiveValue; import edu.harvard.med.screensaver.model.screenresults.DataColumn; import edu.harvard.med.screensaver.model.screenresults.DataType; import edu.harvard.med.screensaver.model.screenresults.PartitionedValue; import edu.harvard.med.screensaver.model.screenresults.ResultValue; import edu.harvard.med.screensaver.model.screenresults.ScreenResult; import edu.harvard.med.screensaver.model.screens.AssayReadoutType; import edu.harvard.med.screensaver.model.screens.Screen; import edu.harvard.med.screensaver.util.AlphabeticCounter; import edu.harvard.med.screensaver.util.DevelopmentException; import edu.harvard.med.screensaver.util.StringUtils; /** * Parses data from a workbook files (a.k.a. Excel spreadsheets) necessary for * instantiating a * {@link edu.harvard.med.screensaver.model.screenresults.ScreenResult}. A * {@link ScreenResult} is comprised of {@link DataColumn}s that in turn contain * {@link ResultValue}s. By convention, each worksheet contains the raw data for * a single plate, but the parser is indifferent to how data may be arranged * across worksheets. * <p> * The "Data Columns" worksheet is used to create * {@link DataColumn} objects, * while the raw data is used to instantiate each of the {@link DataColumn}s' * {@link ResultValue} * objects. Together, these objects are used instantiate a {@link ScreenResult} * object, which is the returned result of the {@link #parse} method. * <p> * The class attempts to parse the file(s) as fully as possible, carrying on in * the face of errors, in order to catch as many errors as possible, as this * will aid the manual effort of correcting the files' format and content * between import attempts. The {@link #getErrors()} method will * the errors messages encountered during parsing. * <p> * Each call to {@link #parse} will clear the errors accumulated from the * previous call, and so the result of calling {@link #getErrors()} will change * after each call to {@link #parse}. * * @author <a mailto="andrew_tolopko@hms.harvard.edu">Andrew Tolopko</a> * @author <a mailto="john_sullivan@hms.harvard.edu">John Sullivan</a> */ public class ScreenResultParser implements ScreenResultWorkbookSpecification { private static final Logger log = Logger.getLogger(ScreenResultParser.class); private static final String NO_SCREEN_ID_FOUND_ERROR = "Screen ID not found for row: "; private static final String DATA_COLUMNS_SHEET_NOT_FOUND_ERROR = "\"Data Columns\" sheet not found"; private static final String UNKNOWN_ERROR = "unknown error"; private static final String NO_DATA_SHEETS_FOUND_ERROR = "no data worksheets were found; no result data was imported"; private static final String NO_SUCH_WELL = "library well does not exist"; private static final String NO_SUCH_LIBRARY_WITH_PLATE = "no library with given plate number"; private static final String ASSAY_WELL_TYPE_INCONSISTENCY = "assay well type cannot be changed"; private static SortedMap<String,AssayReadoutType> assayReadoutTypeMap = new TreeMap<String,AssayReadoutType>(); private static SortedMap<String,DataType> dataTypeMap = new TreeMap<String,DataType>(); private static SortedMap<String,Boolean> primaryOrFollowUpMap = new TreeMap<String,Boolean>(); private static SortedMap<String,Boolean> booleanMap = new TreeMap<String,Boolean>(); private static SortedMap<String,PartitionedValue> partitionedValueMap = new TreeMap<String,PartitionedValue>(); private static SortedMap<String,ConfirmedPositiveValue> confirmedPositiveValueMap = new TreeMap<String,ConfirmedPositiveValue>(); private static SortedMap<String,AssayWellControlType> assayWellControlTypeMap = new TreeMap<String,AssayWellControlType>(); static { for (AssayReadoutType assayReadoutType : AssayReadoutType.values()) { assayReadoutTypeMap.put(assayReadoutType.getValue(), assayReadoutType); } for (DataType dataType : DataType.values()) { dataTypeMap.put(dataType.getValue(), dataType); } primaryOrFollowUpMap.put("", false); primaryOrFollowUpMap.put(PRIMARY_VALUE, false); primaryOrFollowUpMap.put(FOLLOWUP_VALUE, true); booleanMap.put("", false); booleanMap.put("false", false); booleanMap.put("no", false); booleanMap.put("n", false); booleanMap.put("0", false); booleanMap.put("true", true); booleanMap.put("yes", true); booleanMap.put("y", true); booleanMap.put("1", true); for (PartitionedValue pv : PartitionedValue.values()) { partitionedValueMap.put(pv.getValue().toLowerCase(), pv); partitionedValueMap.put(pv.getValue().toUpperCase(), pv); } for (ConfirmedPositiveValue pv : ConfirmedPositiveValue.values()) { confirmedPositiveValueMap.put(pv.getValue().toLowerCase(), pv); confirmedPositiveValueMap.put(pv.getValue().toUpperCase(), pv); } for (AssayWellControlType awct : AssayWellControlType.values()) { assayWellControlTypeMap.put(awct.getAbbreviation(), awct); } } private LibrariesDAO _librariesDao; private ScreenResultsDAO _screenResultsDao; private edu.harvard.med.screensaver.db.GenericEntityDAO _genericEntityDao; /** * The ScreenResult object to be populated with data parsed from the spreadsheet. */ private ScreenResult _screenResult; private Workbook _workbook; private ColumnLabelsParser _columnsDerivedFromParser; private ExcludeParser _excludeParser; private CellVocabularyParser<AssayReadoutType> _assayReadoutTypeParser; private CellVocabularyParser<DataType> _dataTypeParser; private CellVocabularyParser<Boolean> _rawOrDerivedParser; private CellVocabularyParser<Boolean> _primaryOrFollowUpParser; private CellVocabularyParser<Boolean> _booleanParser; private CellVocabularyParser<PartitionedValue> _partitionedValueParser; private CellVocabularyParser<ConfirmedPositiveValue> _confirmedPositiveValueParser; private CellVocabularyParser<AssayWellControlType> _assayWellControlTypeParser; private WellNameParser _wellNameParser; private SortedMap<String,DataColumn> _worksheetColumnLabel2DataColumnObjectMap; /** * @motivation runtime detection of duplicate wells in the input stream */ private Set<String> parsedWellKeys = Sets.newHashSet(); private Map<Integer,Integer> _plateNumbersLoadedWithMaxReplicates = Maps.newHashMap(); private Map<Integer,Integer> _dataColumnIndex2WorksheetColumnIndex; private boolean _ignoreDuplicateErrors = false; /** * The library that was associated with the plate that was last accessed. * @motivation optimization for findLibraryWithPlate(); reduce db I/O */ private Library _lastLibrary; public ScreenResultParser(LibrariesDAO librariesDao, ScreenResultsDAO screenResultsDao, GenericEntityDAO genericEntityDao) { _librariesDao = librariesDao; _screenResultsDao = screenResultsDao; _genericEntityDao = genericEntityDao; } public void setIgnoreDuplicateErrors(boolean value) { _ignoreDuplicateErrors = value; } /** * Main function. * Parses the specified workbook file that contains Screen Result data in the * <a * href="https://wiki.med.harvard.edu/ICCBL/NewScreenResultFileFormat">"new" * format</a>. Errors encountered during parsing are stored with this object * until a parse() method is called again, and these errors can be retrieved * via {@link #getErrors}. The returned <code>ScreenResult</code> may only * be partially populated if errors are encountered, so always call * getErrors() to determine parsing success. * * @param screen the parent Screen of the Screen Result being parsed * @param workbook the workbook to be parsed * @param plateNumberRange the range of plate numbers to be parsed, allowing for only a subset * of the data to be imported. This may be required for resource * utilization purposes, where the ScreenResult must be imported over * multiple passes. If null, well data for all plates will be imported. * @return a ScreenResult object containing the data parsed from the workbook * file; <code>null</code> if a fatal error occurs (e.g. file not * found) * @see #getErrors() */ public ScreenResult parse(Screen screen, Workbook workbook, IntRange plateNumberRange, boolean incrementalFlush) { _workbook = workbook; return doParse(screen, plateNumberRange, incrementalFlush); } /** * Note: incrementalFlush is turned off by default */ public ScreenResult parse(Screen screen, File workbookFile) throws FileNotFoundException { return parse(screen, workbookFile, null, false); } public ScreenResult parse(Screen screen, File workbookFile, boolean incrementalFlush) throws FileNotFoundException { return parse(screen, workbookFile, null, incrementalFlush); } public ScreenResult parse(Screen screen, File workbookFile, IntRange plateNumberRange, boolean incrementalFlush) throws FileNotFoundException { return parse(screen, new Workbook(workbookFile), plateNumberRange, incrementalFlush); } private ScreenResult doParse(Screen screen, IntRange plateNumberRange, boolean incrementalFlush) { _screenResult = screen.getScreenResult(); _lastLibrary = null; _assayReadoutTypeParser = new CellVocabularyParser<AssayReadoutType>(assayReadoutTypeMap); _worksheetColumnLabel2DataColumnObjectMap = new TreeMap<String,DataColumn>(); _columnsDerivedFromParser = new ColumnLabelsParser(_worksheetColumnLabel2DataColumnObjectMap); _excludeParser = new ExcludeParser(_worksheetColumnLabel2DataColumnObjectMap); _dataTypeParser = new CellVocabularyParser<DataType>(dataTypeMap); _primaryOrFollowUpParser = new CellVocabularyParser<Boolean>(primaryOrFollowUpMap, Boolean.FALSE); _booleanParser = new CellVocabularyParser<Boolean>(booleanMap, Boolean.FALSE); _partitionedValueParser = new CellVocabularyParser<PartitionedValue>(partitionedValueMap, PartitionedValue.NOT_POSITIVE); _confirmedPositiveValueParser = new CellVocabularyParser<ConfirmedPositiveValue>(confirmedPositiveValueMap, ConfirmedPositiveValue.INCONCLUSIVE); _assayWellControlTypeParser = new CellVocabularyParser<AssayWellControlType>(assayWellControlTypeMap); _wellNameParser = new WellNameParser(); try { log.info("parsing " + _workbook.getName()); if (_screenResult == null) { _screenResult = screen.createScreenResult(); log.debug("created screen result"); if (!parseDataColumnDefinitions(_screenResult, _workbook)) { log.info("errors found in data column definitions, will not attempt to parse data sheets"); return _screenResult; } } initializeDataColumnLocations(_screenResult, _workbook); log.debug("parsing data sheets"); parseData(_workbook, _screenResult, plateNumberRange, incrementalFlush); } catch (UnrecoverableScreenResultParseException e) { _workbook.addError("serious parse error encountered (could not continue further parsing): " + e.getMessage()); } catch (Exception e) { e.printStackTrace(); String errorMsg = UNKNOWN_ERROR + " of type : " + e.getClass() + ": " + e.getMessage(); _workbook.addError(errorMsg); } return _screenResult; } public boolean getHasErrors() { return _workbook.getHasErrors(); } /** * Return all errors that were detected during parsing. This class attempts to * parse as much of the workbook as possible, continuing on after finding an * error. The hope is that multiple errors will help a user/administrator * correct a workbook's errors in a batch fashion, rather than in a piecemeal * fashion. * * @return a <code>List<String></code> of all errors generated during * parsing */ public List<WorkbookParseError> getErrors() { return _workbook.getErrors(); } public Map<Integer,Integer> getPlateNumbersLoadedWithMaxReplicates() { return _plateNumbersLoadedWithMaxReplicates; } public ScreenResult getParsedScreenResult() { return _screenResult; } /** * Finds the total number of data columns. * TODO: this does not account for non-contiguous blocks of empty cells * @param dataColumnsSheet * @return highest cell column index plus 1 * @throws UnrecoverableScreenResultParseException */ private int findDataColumnCount(Worksheet dataColumnsSheet) { dataColumnsSheet.getColumns(); int rows = dataColumnsSheet.getRows(); if (rows == 0) { return 0; } int n = 0; for (Cell cell : dataColumnsSheet.getRow(0)) { if(cell.isEmpty()) break; n++; } return n; } private int getDataColumn(int dataColumn) { return _dataColumnIndex2WorksheetColumnIndex.get(dataColumn); } /** * Parse the worksheet containing the ScreenResult data columns. * This method returns error results by add them to the {@link Workbook}. * Therefore check the workbook after running for errors. * @param workbook * @throws UnrecoverableScreenResultParseException * */ private boolean parseDataColumnDefinitions(ScreenResult screenResult, Workbook workbook) throws UnrecoverableScreenResultParseException { log.debug("parse data columns sheet"); Worksheet dataColumnsSheet = _workbook.getWorksheet(DATA_COLUMNS_SHEET_NAME).forOrigin(DATA_COLUMNS_SHEET__FIRST_DATA_COLUMN__WORKSHEET_COLUMN_INDEX, 0); if (dataColumnsSheet == null) { _workbook.addError(DATA_COLUMNS_SHEET_NOT_FOUND_ERROR); return false; } Map<DataColumnProperty,Row> dataColumnPropertyRows = parseDataColumnPropertyNames(dataColumnsSheet); int dataColumnCount = findDataColumnCount(dataColumnsSheet); for (int iDataColumn = 0; iDataColumn < dataColumnCount; ++iDataColumn) { if (!!!dataColumnPropertyRows.containsKey(DataColumnProperty.NAME)) { throw new UnrecoverableScreenResultParseException(DataColumnProperty.NAME + " data column property is required"); } DataColumn dataColumn = screenResult.createDataColumn(dataColumnPropertyRows.get(DataColumnProperty.NAME).getCell(iDataColumn, true).getString()); if (dataColumnPropertyRows.containsKey(DataColumnProperty.DATA_TYPE)) { Cell cell = dataColumnPropertyRows.get(DataColumnProperty.DATA_TYPE).getCell(iDataColumn, true); DataType dataType = _dataTypeParser.parse(cell); if (dataType != null) { switch (dataType) { case NUMERIC: { Integer decimalPlaces = null; if (dataColumnPropertyRows.containsKey(DataColumnProperty.DECIMAL_PLACES)) { Cell cell2 = dataColumnPropertyRows.get(DataColumnProperty.DECIMAL_PLACES).getCell(iDataColumn); decimalPlaces = cell2.getInteger(); if (decimalPlaces != null && decimalPlaces < 0) { cell2.addError("illegal value"); } } dataColumn.makeNumeric(decimalPlaces); break; } case TEXT: dataColumn.makeTextual(); break; case POSITIVE_INDICATOR_BOOLEAN: dataColumn.makeBooleanPositiveIndicator(); break; case POSITIVE_INDICATOR_PARTITION: dataColumn.makePartitionPositiveIndicator(); break; case CONFIRMED_POSITIVE_INDICATOR: dataColumn.makeConfirmedPositiveIndicator(); break; default: throw new DevelopmentException("unhandled data type " + dataType); } } } else { dataColumn.makeNumeric(null); } if (dataColumnPropertyRows.containsKey(DataColumnProperty.REPLICATE)) { dataColumn.forReplicate(dataColumnPropertyRows.get(DataColumnProperty.REPLICATE).getCell(iDataColumn).getInteger()); } String howDerived = null; if (dataColumnPropertyRows.containsKey(DataColumnProperty.HOW_DERIVED)) { howDerived = dataColumnPropertyRows.get(DataColumnProperty.HOW_DERIVED).getCell(iDataColumn).getString(); } Set<DataColumn> columnsDerivedFrom = Collections.emptySet(); if (dataColumnPropertyRows.containsKey(DataColumnProperty.COLUMNS_DERIVED_FROM)) { columnsDerivedFrom = Sets.newHashSet(Iterables.filter(_columnsDerivedFromParser.parseList(dataColumnPropertyRows.get(DataColumnProperty.COLUMNS_DERIVED_FROM).getCell(iDataColumn)), Predicates.notNull())); } if (!!!(StringUtils.isEmpty(howDerived) && columnsDerivedFrom.isEmpty())) { dataColumn.makeDerived(howDerived, columnsDerivedFrom); } if (!!!dataColumn.isDerived() && !!!dataColumn.isPositiveIndicator()) { if (dataColumnPropertyRows.containsKey(DataColumnProperty.ASSAY_READOUT_TYPE)) { dataColumn.setAssayReadoutType(_assayReadoutTypeParser.parse(dataColumnPropertyRows.get(DataColumnProperty.ASSAY_READOUT_TYPE).getCell(iDataColumn, false))); } } if (dataColumnPropertyRows.containsKey(DataColumnProperty.PRIMARY_OR_FOLLOWUP)) { dataColumn.setFollowUpData(_primaryOrFollowUpParser.parse(dataColumnPropertyRows.get(DataColumnProperty.PRIMARY_OR_FOLLOWUP).getCell(iDataColumn))); } if (dataColumnPropertyRows.containsKey(DataColumnProperty.ASSAY_PHENOTYPE)) { dataColumn.forPhenotype(dataColumnPropertyRows.get(DataColumnProperty.ASSAY_PHENOTYPE).getCell(iDataColumn).getString()); } if (dataColumnPropertyRows.containsKey(DataColumnProperty.DESCRIPTION)) { dataColumn.setDescription(dataColumnPropertyRows.get(DataColumnProperty.DESCRIPTION).getCell(iDataColumn).getString()); } if (dataColumnPropertyRows.containsKey(DataColumnProperty.COMMENTS)) { dataColumn.setComments(dataColumnPropertyRows.get(DataColumnProperty.COMMENTS).getCell(iDataColumn).getString()); } if (dataColumnPropertyRows.containsKey(DataColumnProperty.TIME_POINT)) { dataColumn.forTimePoint(dataColumnPropertyRows.get(DataColumnProperty.TIME_POINT).getCell(iDataColumn).getString()); } if (dataColumnPropertyRows.containsKey(DataColumnProperty.TIME_POINT_ORDINAL)) { dataColumn.forTimePointOrdinal(dataColumnPropertyRows.get(DataColumnProperty.TIME_POINT_ORDINAL).getCell(iDataColumn).getInteger()); } if (dataColumnPropertyRows.containsKey(DataColumnProperty.CHANNEL)) { dataColumn.forChannel(dataColumnPropertyRows.get(DataColumnProperty.CHANNEL).getCell(iDataColumn).getInteger()); } if (dataColumnPropertyRows.containsKey(DataColumnProperty.ZDEPTH_ORDINAL)) { dataColumn.forZdepthOrdinal(dataColumnPropertyRows.get(DataColumnProperty.ZDEPTH_ORDINAL).getCell(iDataColumn).getInteger()); } // if (dataColumnPropertyRows.containsKey(DataColumnProperty.CELL_LINE)) { // dataColumn.forCellLine(dataColumnPropertyRows.get(DataColumnProperty.CELL_LINE).getCell(iDataColumn).getString()); // } // note: we do this last so that _columnsDerivedFromParser does not allow the current column to be considered a valid "derived from" value _worksheetColumnLabel2DataColumnObjectMap.put(dataColumnPropertyRows.get(DataColumnProperty.COLUMN_IN_DATA_WORKSHEET).getCell(iDataColumn, true).getAsString(), dataColumn); } return !!!_workbook.getHasErrors(); } private Map<DataColumnProperty,Row> parseDataColumnPropertyNames(Worksheet dataColumnsSheet) { Map<DataColumnProperty,Row> result = Maps.newHashMap(); Iterator<Row> dataRows = dataColumnsSheet.forOrigin(1, 0).iterator(); for (Row row : dataColumnsSheet.forOrigin(0, 0)) { if (row.isEmpty()) { break; } String dataColumnPropertyLabel = row.getCell(0).getString().trim(); DataColumnProperty dataColumnProperty = DataColumnProperty.fromDisplayText(dataColumnPropertyLabel); Row dataOnlyRow = dataRows.next(); if (dataColumnProperty != null) { result.put(dataColumnProperty, dataOnlyRow); } else { row.getCell(0).addError("unknown data column property: " + dataColumnPropertyLabel); } } return result; } private void initializeDataColumnLocations(ScreenResult screenResult, Workbook workbook) throws UnrecoverableScreenResultParseException { Worksheet dataColumnsSheet = workbook.getWorksheet(DATA_COLUMNS_SHEET_NAME).forOrigin(DATA_COLUMNS_SHEET__FIRST_DATA_COLUMN__WORKSHEET_COLUMN_INDEX, 0); if (dataColumnsSheet == null) { throw new UnrecoverableScreenResultParseException(DATA_COLUMNS_SHEET_NOT_FOUND_ERROR); } _dataColumnIndex2WorksheetColumnIndex = Maps.newHashMap(); int dataColumnCount = findDataColumnCount(dataColumnsSheet); Map<DataColumnProperty,Row> dataColumnProperties = parseDataColumnPropertyNames(dataColumnsSheet); for (int iDataColumn = 0; iDataColumn < dataColumnCount; ++iDataColumn) { Cell cell = dataColumnProperties.get(DataColumnProperty.COLUMN_IN_DATA_WORKSHEET).getCell(iDataColumn, true); String forColumnInDataWorksheet = cell.getString().trim(); try { if (forColumnInDataWorksheet != null) { _dataColumnIndex2WorksheetColumnIndex.put(iDataColumn, AlphabeticCounter.toIndex(forColumnInDataWorksheet)); DataColumn dataColumn = screenResult.getDataColumnsList().get(iDataColumn); _worksheetColumnLabel2DataColumnObjectMap.put(forColumnInDataWorksheet, dataColumn); } } catch (IllegalArgumentException e) { cell.addError(e.getMessage()); } } } /** * Parse the workbook containing the ScreenResult data. * * @param workbook the workbook containing some or all of the raw data for a * ScreenResult * @throws ExtantLibraryException if an existing Well entity cannot be found * in the database * @throws IOException * @throws UnrecoverableScreenResultParseException */ private void parseData(Workbook workbook, ScreenResult screenResult, IntRange plateNumberRange, boolean incrementalFlush) throws ExtantLibraryException, IOException, UnrecoverableScreenResultParseException { log.debug("incrementalFlush:" + incrementalFlush); long startTime = System.currentTimeMillis(); long loopTime = startTime; int wellsWithDataLoaded = 0; int dataSheetsParsed = 0; int totalSheets = workbook.getWorkbook().getNumberOfSheets(); int firstDataSheetIndex = workbook.getWorksheet(DATA_COLUMNS_SHEET_NAME).getSheetIndex() + 1; int totalDataSheets = Math.max(0, totalSheets - firstDataSheetIndex); plateNumberRange = plateNumberRange == null ? new IntRange(Integer.MIN_VALUE, Integer.MAX_VALUE) : plateNumberRange; // Note: we do this to make sure that the DataColumn's are persisted before we persist and clear the RV's if (screenResult.getEntityId() == null) { _genericEntityDao.persistEntity(screenResult); } else { _genericEntityDao.saveOrUpdateEntity(screenResult); } _genericEntityDao.flush(); for (int iSheet = firstDataSheetIndex; iSheet < totalSheets; ++iSheet) { String sheetName = workbook.getWorkbook().getSheet(iSheet).getName(); log.info("parsing sheet " + (dataSheetsParsed + 1) + " of " + totalDataSheets + ", " + sheetName); Worksheet worksheet = workbook.getWorksheet(iSheet).forOrigin(0, DATA_SHEET__FIRST_DATA_ROW_INDEX); for (Row row : worksheet) { // bring in the old findNextRow() logic if (row.getColumns() > 0 && !row.getCell(0).isEmpty() && row.getCell(0).getAsString().trim().length() > 0 ) { Integer plateNumber = row.getCell(WellInfoColumn.PLATE.ordinal(), true).getInteger(); Cell wellNameCell = row.getCell(WellInfoColumn.WELL_NAME.ordinal()); String wellName = _wellNameParser.parse(wellNameCell); if (!wellName.equals("")) { WellKey wellKey = new WellKey(plateNumber, wellName); if (!plateNumberRange.containsInteger(wellKey.getPlateNumber())) { if (log.isDebugEnabled()) { log.debug("Skipping, excluded range: " + plateNumberRange + ", row: " + row.getRow()); } } else { boolean duplicate = !parsedWellKeys.add(wellKey.getKey()); if (duplicate) { if (!_ignoreDuplicateErrors) { wellNameCell.addError("duplicate well: " + wellKey); } else { log.debug("duplicate well: " + wellKey + ", duplicate found at: " + wellNameCell); } } else { if (findLibraryWithPlate(wellKey.getPlateNumber()) == null) { wellNameCell.addError(NO_SUCH_LIBRARY_WITH_PLATE); } else { Well well = _librariesDao.findWell(wellKey); if (well == null) { wellNameCell.addError(NO_SUCH_WELL + ": " + wellKey); } else if (findAssayWell(well) == null) { readResultValues(screenResult, row, well, incrementalFlush); ++wellsWithDataLoaded; if (incrementalFlush && wellsWithDataLoaded % AbstractDAO.ROWS_TO_CACHE == 0) { saveResultValuesAndFlush(screenResult, incrementalFlush); if (log.isInfoEnabled() && wellsWithDataLoaded % (AbstractDAO.ROWS_TO_CACHE * 100) == 0) { long time = System.currentTimeMillis(); long cumulativeTime = time - startTime; log.info("wellsWithDataLoaded: " + wellsWithDataLoaded + ", cumulative time: " + (double)cumulativeTime/(double)60000 + " min, avg row time: " + (double)cumulativeTime/(double)wellsWithDataLoaded + ", loopTime: " + (time - loopTime) ); loopTime = time; } } // incremental } } } } } } } // for row ++dataSheetsParsed; if (wellsWithDataLoaded > 0) { saveResultValuesAndFlush(screenResult, incrementalFlush); log.info("Sheet: " + sheetName + " done, save, count: " + wellsWithDataLoaded); long time = System.currentTimeMillis(); long cumulativeTime = time - startTime; log.info("wellsWithDataLoaded: " + wellsWithDataLoaded + ", cumulative time: " + (double)cumulativeTime/(double)60000 + " min, avg row time: " + (double)cumulativeTime/(double)wellsWithDataLoaded ); } } if (dataSheetsParsed == 0) { _workbook.addError(NO_DATA_SHEETS_FOUND_ERROR); } else { log.info("done parsing " + dataSheetsParsed + " data sheet(s) " + workbook.getName()); log.info("loaded data for " + wellsWithDataLoaded + " well(s) "); } } private void saveResultValuesAndFlush(ScreenResult screenResult, boolean incrementalFlush) { log.debug("incrementally save the screen result and clear values"); for (DataColumn dataColumn : screenResult.getDataColumns()) { _genericEntityDao.saveOrUpdateEntity(dataColumn); //TODO: this should not be required, but the writes to the DB were missing some of the RV's and this fixed it -sde4 for (ResultValue rv : dataColumn.getResultValues()) { _genericEntityDao.saveOrUpdateEntity(rv); } } if (incrementalFlush) { for (DataColumn dataColumn : screenResult.getDataColumns()) { dataColumn.clearResultValues(); } for (AssayWell assayWell: screenResult.getAssayWells()) { _genericEntityDao.saveOrUpdateEntity(assayWell); } screenResult.getAssayWells().clear(); _genericEntityDao.flush(); _genericEntityDao.clear(); } } private void readResultValues(ScreenResult screenResult, Row row, Well well, boolean incrementalFlush) { AssayWellControlType assayWellControlType = _assayWellControlTypeParser.parse(row.getCell(WellInfoColumn.ASSAY_WELL_TYPE.ordinal())); try { AssayWell assayWell = createAssayWell(well, assayWellControlType); List<DataColumn> wellExcludes = _excludeParser.parseList(row.getCell(WellInfoColumn.EXCLUDE.ordinal())); int iDataColumn = 0; int maxReplicateOrdinal = 1; for (DataColumn dataColumn : screenResult.getDataColumns()) { Cell cell = row.getCell(getDataColumn(iDataColumn)); boolean isExclude = (wellExcludes != null && wellExcludes.contains(dataColumn)); ResultValue newResultValue = null; if (dataColumn.isBooleanPositiveIndicator()) { newResultValue = dataColumn.createBooleanPositiveResultValue(assayWell, cell.isBoolean() ? cell.getBoolean() : _booleanParser.parse(cell), isExclude); } else if (dataColumn.isPartitionPositiveIndicator()) { newResultValue = dataColumn.createPartitionedPositiveResultValue(assayWell, _partitionedValueParser.parse(cell), isExclude); } else if (dataColumn.isConfirmedPositiveIndicator()) { newResultValue = dataColumn.createConfirmedPositiveResultValue(assayWell, _confirmedPositiveValueParser.parse(cell), isExclude); } else if (dataColumn.isNumeric()) { newResultValue = dataColumn.createResultValue(assayWell, cell.getDouble(), isExclude); } else { newResultValue = dataColumn.createResultValue(assayWell, cell.getString(), isExclude); } // update the maxReplicateOrdinal to track the actual number of // replicates that have data loaded for a particular plate // (this allows us to later calculate which how many replicates // actually have associated data, in case some of the "extra" screened // replicates were ignored by the screener) if (!dataColumn.isDerived() && !newResultValue.isNull()) { if (dataColumn.getReplicateOrdinal() != null) { maxReplicateOrdinal = Math.max(maxReplicateOrdinal, dataColumn.getReplicateOrdinal()); } } if (newResultValue == null) { cell.addError("duplicate well"); } ++iDataColumn; } recordAssayPlatesDataLoaded(well, maxReplicateOrdinal); if (incrementalFlush) { // [#2119] Optimize ScreenResultParser for scalability: // - in memory RV's must be reloaded as needed assayWell.getLibraryWell().getResultValues().clear(); } } catch (DataModelViolationException e) { row.getCell(WellInfoColumn.ASSAY_WELL_TYPE.ordinal()).addError(e.getMessage()); } } private AssayWell createAssayWell(Well well, AssayWellControlType assayWellControlType) throws DataModelViolationException { AssayWell assayWell = _screenResult.createAssayWell(well); assayWell.setAssayWellControlType(assayWellControlType); return assayWell; } private AssayWell findAssayWell(Well well) { AssayWell assayWell = _screenResultsDao.findAssayWell(_screenResult, well.getWellKey()); // if (assayWell != null) { // if (assayWell.getAssayWellControlType() != assayWellControlType) { // _workbook.addError(ASSAY_WELL_TYPE_INCONSISTENCY + ": " + well); // return null; // } // } return assayWell; } private void recordAssayPlatesDataLoaded(Well well, int replicateCount) { Integer maxReplicateCount = _plateNumbersLoadedWithMaxReplicates.get(well.getPlateNumber()); if (maxReplicateCount == null || maxReplicateCount.compareTo(replicateCount) < 0) { _plateNumbersLoadedWithMaxReplicates.put(well.getPlateNumber(), replicateCount); } } /** * @motivation database I/O optimization */ private Library findLibraryWithPlate(Integer plateNumber) { if (_lastLibrary == null || !_lastLibrary.containsPlate(plateNumber)) { _lastLibrary = _librariesDao.findLibraryWithPlate(plateNumber); } return _lastLibrary; } private static class ParsedScreenInfo { private Integer _screenId; public Integer getScreenId() { return _screenId; } public void setScreenId(Integer screenId) { _screenId = screenId; } } public class ColumnLabelsParser implements CellValueParser<DataColumn> { protected Map<String,DataColumn> _columnLabel2ColMap; private Pattern columnIdPattern = Pattern.compile("[A-Z]+"); public ColumnLabelsParser(Map<String,DataColumn> columnLabel2ColMap) { _columnLabel2ColMap = columnLabel2ColMap; } public DataColumn parse(Cell cell) { throw new UnsupportedOperationException(); } public List<DataColumn> parseList(Cell cell) { String textMultiValue = cell.getString(); List<DataColumn> result = new ArrayList<DataColumn>(); if (textMultiValue == null || textMultiValue.trim().length() == 0) { return result; } String[] textValues = textMultiValue.split(","); for (int i = 0; i < textValues.length; i++) { String text = textValues[i].trim(); DataColumn dataColumn = doParseSingleValue(text, cell); if (dataColumn != null) { result.add(dataColumn); } else { cell.addError("invalid Data Column worksheet column label '" + text + "' (expected one of " + _columnLabel2ColMap.keySet() + ")"); } } return result; } protected DataColumn doParseSingleValue(String value, Cell cell) { Matcher matcher = columnIdPattern.matcher(value); if (!matcher.matches()) { return null; } String columnLabel = matcher.group(0); return _columnLabel2ColMap.get(columnLabel); } } private class ExcludeParser extends ColumnLabelsParser { public ExcludeParser(Map<String,DataColumn> columnLabel2ColMap) { super(columnLabel2ColMap); } public List<DataColumn> parseList(Cell cell) { String textMultiValue = cell.getString(); if (textMultiValue != null && textMultiValue.equalsIgnoreCase(ScreenResultWorkbookSpecification.EXCLUDE_ALL_VALUE)) { return new ArrayList<DataColumn>(_columnLabel2ColMap.values()); } if (textMultiValue == null) { return Collections.emptyList(); } return super.parseList(cell); } } }