ScreenResultParser.java example

Explorer
screensaver-master
// $HeadURL:
// svn+ssh://ant4@orchestra.med.harvard.edu/svn/iccb/screensaver/trunk/src/edu/harvard/med/screensaver/io/screenresults/ScreenResultParser.java
// $
// $Id$
//
// Copyright © 2006, 2010, 2011, 2012 by the President and Fellows of Harvard College.
//
// Screensaver is an open-source project developed by the ICCB-L and NSRB labs
// at Harvard Medical School. This software is distributed under the terms of
// the GNU General Public License.

package edu.harvard.med.screensaver.io.screenresults;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.base.Predicates;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.commons.lang.math.IntRange;
import org.apache.log4j.Logger;

import edu.harvard.med.screensaver.db.AbstractDAO;
import edu.harvard.med.screensaver.db.GenericEntityDAO;
import edu.harvard.med.screensaver.db.LibrariesDAO;
import edu.harvard.med.screensaver.db.ScreenResultsDAO;
import edu.harvard.med.screensaver.io.libraries.ExtantLibraryException;
import edu.harvard.med.screensaver.io.workbook2.Cell;
import edu.harvard.med.screensaver.io.workbook2.CellValueParser;
import edu.harvard.med.screensaver.io.workbook2.CellVocabularyParser;
import edu.harvard.med.screensaver.io.workbook2.Row;
import edu.harvard.med.screensaver.io.workbook2.WellNameParser;
import edu.harvard.med.screensaver.io.workbook2.Workbook;
import edu.harvard.med.screensaver.io.workbook2.WorkbookParseError;
import edu.harvard.med.screensaver.io.workbook2.Worksheet;
import edu.harvard.med.screensaver.model.DataModelViolationException;
import edu.harvard.med.screensaver.model.libraries.Library;
import edu.harvard.med.screensaver.model.libraries.Well;
import edu.harvard.med.screensaver.model.libraries.WellKey;
import edu.harvard.med.screensaver.model.screenresults.AssayWell;
import edu.harvard.med.screensaver.model.screenresults.AssayWellControlType;
import edu.harvard.med.screensaver.model.screenresults.ConfirmedPositiveValue;
import edu.harvard.med.screensaver.model.screenresults.DataColumn;
import edu.harvard.med.screensaver.model.screenresults.DataType;
import edu.harvard.med.screensaver.model.screenresults.PartitionedValue;
import edu.harvard.med.screensaver.model.screenresults.ResultValue;
import edu.harvard.med.screensaver.model.screenresults.ScreenResult;
import edu.harvard.med.screensaver.model.screens.AssayReadoutType;
import edu.harvard.med.screensaver.model.screens.Screen;
import edu.harvard.med.screensaver.util.AlphabeticCounter;
import edu.harvard.med.screensaver.util.DevelopmentException;
import edu.harvard.med.screensaver.util.StringUtils;

/**
 * Parses data from a workbook files (a.k.a. Excel spreadsheets) necessary for
 * instantiating a
 * {@link edu.harvard.med.screensaver.model.screenresults.ScreenResult}. A
 * {@link ScreenResult} is comprised of {@link DataColumn}s that in turn contain
 * {@link ResultValue}s. By convention, each worksheet contains the raw data for
 * a single plate, but the parser is indifferent to how data may be arranged
 * across worksheets.
 * <p>
 * The "Data Columns" worksheet is used to create
 * {@link DataColumn} objects,
 * while the raw data is used to instantiate each of the {@link DataColumn}s'
 * {@link ResultValue}
 * objects. Together, these objects are used instantiate a {@link ScreenResult}
 * object, which is the returned result of the {@link #parse} method.
 * <p>
 * The class attempts to parse the file(s) as fully as possible, carrying on in
 * the face of errors, in order to catch as many errors as possible, as this
 * will aid the manual effort of correcting the files' format and content
 * between import attempts. The {@link #getErrors()} method will
 * the errors messages encountered during parsing.
 * <p>
 * Each call to {@link #parse} will clear the errors accumulated from the
 * previous call, and so the result of calling {@link #getErrors()} will change
 * after each call to {@link #parse}.
 * 
 * @author <a mailto="andrew_tolopko@hms.harvard.edu">Andrew Tolopko</a>
 * @author <a mailto="john_sullivan@hms.harvard.edu">John Sullivan</a>
 */
public class ScreenResultParser implements ScreenResultWorkbookSpecification
{
  private static final Logger log = Logger.getLogger(ScreenResultParser.class);

  private static final String NO_SCREEN_ID_FOUND_ERROR = "Screen ID not found for row: ";
  private static final String DATA_COLUMNS_SHEET_NOT_FOUND_ERROR = "\"Data Columns\" sheet not found";
  private static final String UNKNOWN_ERROR = "unknown error";
  private static final String NO_DATA_SHEETS_FOUND_ERROR = "no data worksheets were found; no result data was imported";
  private static final String NO_SUCH_WELL = "library well does not exist";
  private static final String NO_SUCH_LIBRARY_WITH_PLATE = "no library with given plate number";
  private static final String ASSAY_WELL_TYPE_INCONSISTENCY = "assay well type cannot be changed";

  private static SortedMap<String,AssayReadoutType> assayReadoutTypeMap = new TreeMap<String,AssayReadoutType>();
  private static SortedMap<String,DataType> dataTypeMap = new TreeMap<String,DataType>();
  private static SortedMap<String,Boolean> primaryOrFollowUpMap = new TreeMap<String,Boolean>();
  private static SortedMap<String,Boolean> booleanMap = new TreeMap<String,Boolean>();
  private static SortedMap<String,PartitionedValue> partitionedValueMap = new TreeMap<String,PartitionedValue>();
  private static SortedMap<String,ConfirmedPositiveValue> confirmedPositiveValueMap = new TreeMap<String,ConfirmedPositiveValue>();
  private static SortedMap<String,AssayWellControlType> assayWellControlTypeMap = new TreeMap<String,AssayWellControlType>();
  static {
    for (AssayReadoutType assayReadoutType : AssayReadoutType.values()) {
      assayReadoutTypeMap.put(assayReadoutType.getValue(),
                              assayReadoutType);
    }

    for (DataType dataType : DataType.values()) {
      dataTypeMap.put(dataType.getValue(), dataType);
    }

    primaryOrFollowUpMap.put("", false);
    primaryOrFollowUpMap.put(PRIMARY_VALUE, false);
    primaryOrFollowUpMap.put(FOLLOWUP_VALUE, true);

    booleanMap.put("", false);
    booleanMap.put("false", false);
    booleanMap.put("no", false);
    booleanMap.put("n", false);
    booleanMap.put("0", false);
    booleanMap.put("true", true);
    booleanMap.put("yes", true);
    booleanMap.put("y", true);
    booleanMap.put("1", true);

    for (PartitionedValue pv : PartitionedValue.values()) {
      partitionedValueMap.put(pv.getValue().toLowerCase(), pv);
      partitionedValueMap.put(pv.getValue().toUpperCase(), pv);
    }

    for (ConfirmedPositiveValue pv : ConfirmedPositiveValue.values()) {
      confirmedPositiveValueMap.put(pv.getValue().toLowerCase(), pv);
      confirmedPositiveValueMap.put(pv.getValue().toUpperCase(), pv);
    }

    for (AssayWellControlType awct : AssayWellControlType.values()) {
      assayWellControlTypeMap.put(awct.getAbbreviation(), awct);
    }
  }

  private LibrariesDAO _librariesDao;
  private ScreenResultsDAO _screenResultsDao;
  private edu.harvard.med.screensaver.db.GenericEntityDAO _genericEntityDao;

  /**
   * The ScreenResult object to be populated with data parsed from the spreadsheet.
   */
  private ScreenResult _screenResult;
  private Workbook _workbook;

  private ColumnLabelsParser _columnsDerivedFromParser;
  private ExcludeParser _excludeParser;
  private CellVocabularyParser<AssayReadoutType> _assayReadoutTypeParser;
  private CellVocabularyParser<DataType> _dataTypeParser;
  private CellVocabularyParser<Boolean> _rawOrDerivedParser;
  private CellVocabularyParser<Boolean> _primaryOrFollowUpParser;
  private CellVocabularyParser<Boolean> _booleanParser;
  private CellVocabularyParser<PartitionedValue> _partitionedValueParser;
  private CellVocabularyParser<ConfirmedPositiveValue> _confirmedPositiveValueParser;
  private CellVocabularyParser<AssayWellControlType> _assayWellControlTypeParser;
  private WellNameParser _wellNameParser;
  
  private SortedMap<String,DataColumn> _worksheetColumnLabel2DataColumnObjectMap;
  /**
   * @motivation runtime detection of duplicate wells in the input stream
   */
  private Set<String> parsedWellKeys = Sets.newHashSet();
  private Map<Integer,Integer> _plateNumbersLoadedWithMaxReplicates = Maps.newHashMap();
  private Map<Integer,Integer> _dataColumnIndex2WorksheetColumnIndex;
  private boolean _ignoreDuplicateErrors = false;
  /**
   * The library that was associated with the plate that was last accessed.
   * @motivation optimization for findLibraryWithPlate(); reduce db I/O
   */
  private Library _lastLibrary;





  public ScreenResultParser(LibrariesDAO librariesDao, 
                            ScreenResultsDAO screenResultsDao, 
                            GenericEntityDAO genericEntityDao)
  {
    _librariesDao = librariesDao;
    _screenResultsDao = screenResultsDao;
    _genericEntityDao = genericEntityDao;
  }

  public void setIgnoreDuplicateErrors(boolean value)
  {
    _ignoreDuplicateErrors = value;
  }

  /**
   * Main function.
   * Parses the specified workbook file that contains Screen Result data in the
   * <a
   * href="https://wiki.med.harvard.edu/ICCBL/NewScreenResultFileFormat">"new"
   * format</a>. Errors encountered during parsing are stored with this object
   * until a parse() method is called again, and these errors can be retrieved
   * via {@link #getErrors}. The returned <code>ScreenResult</code> may only
   * be partially populated if errors are encountered, so always call
   * getErrors() to determine parsing success.
   *
   * @param screen the parent Screen of the Screen Result being parsed
   * @param workbook the workbook to be parsed
   * @param plateNumberRange the range of plate numbers to be parsed, allowing for only a subset
   *          of the data to be imported. This may be required for resource
   *          utilization purposes, where the ScreenResult must be imported over
   *          multiple passes. If null, well data for all plates will be imported.
   * @return a ScreenResult object containing the data parsed from the workbook
   *         file; <code>null</code> if a fatal error occurs (e.g. file not
   *         found)
   * @see #getErrors()
   */
  public ScreenResult parse(Screen screen, Workbook workbook, IntRange plateNumberRange, boolean incrementalFlush)
  {
    _workbook = workbook;
    return doParse(screen,
                   plateNumberRange,
                   incrementalFlush);
  }

  /**
   * Note: incrementalFlush is turned off by default
   */
  public ScreenResult parse(Screen screen, File workbookFile)
    throws FileNotFoundException
  {
    return parse(screen, workbookFile, null, false);
  }
  
  public ScreenResult parse(Screen screen, File workbookFile, boolean incrementalFlush)
    throws FileNotFoundException
  {
    return parse(screen, workbookFile, null, incrementalFlush);
  }

  public ScreenResult parse(Screen screen, File workbookFile, IntRange plateNumberRange, boolean incrementalFlush)
    throws FileNotFoundException
  {
    return parse(screen, new Workbook(workbookFile), plateNumberRange, incrementalFlush);
  }

  private ScreenResult doParse(Screen screen,
                               IntRange plateNumberRange,
                               boolean incrementalFlush)
  {
    _screenResult = screen.getScreenResult();
    _lastLibrary = null;
    _assayReadoutTypeParser = new CellVocabularyParser<AssayReadoutType>(assayReadoutTypeMap);
    _worksheetColumnLabel2DataColumnObjectMap = new TreeMap<String,DataColumn>();
    _columnsDerivedFromParser = new ColumnLabelsParser(_worksheetColumnLabel2DataColumnObjectMap);
    _excludeParser = new ExcludeParser(_worksheetColumnLabel2DataColumnObjectMap);
    _dataTypeParser = new CellVocabularyParser<DataType>(dataTypeMap);
    _primaryOrFollowUpParser = new CellVocabularyParser<Boolean>(primaryOrFollowUpMap, Boolean.FALSE);
    _booleanParser = new CellVocabularyParser<Boolean>(booleanMap, Boolean.FALSE);
    _partitionedValueParser = new CellVocabularyParser<PartitionedValue>(partitionedValueMap, PartitionedValue.NOT_POSITIVE);
    _confirmedPositiveValueParser = new CellVocabularyParser<ConfirmedPositiveValue>(confirmedPositiveValueMap, ConfirmedPositiveValue.INCONCLUSIVE);
    _assayWellControlTypeParser = new CellVocabularyParser<AssayWellControlType>(assayWellControlTypeMap);
    _wellNameParser = new WellNameParser();

    try {
      log.info("parsing " + _workbook.getName());
      if (_screenResult == null) {
        _screenResult = screen.createScreenResult();
        log.debug("created screen result");
        if (!parseDataColumnDefinitions(_screenResult, _workbook)) {
          log.info("errors found in data column definitions, will not attempt to parse data sheets");
          return _screenResult;
        }
      }

      initializeDataColumnLocations(_screenResult, _workbook);
      log.debug("parsing data sheets");
      parseData(_workbook,
                _screenResult,
                plateNumberRange,
                incrementalFlush);
    }
    catch (UnrecoverableScreenResultParseException e) {
      _workbook.addError("serious parse error encountered (could not continue further parsing): " + e.getMessage());
    }
    catch (Exception e) {
      e.printStackTrace();
      String errorMsg = UNKNOWN_ERROR + " of type : " + e.getClass() + ": " + e.getMessage();
      _workbook.addError(errorMsg);
    }
    return _screenResult;
  }

  public boolean getHasErrors()
  {
    return _workbook.getHasErrors();
  }

  /**
   * Return all errors that were detected during parsing. This class attempts to
   * parse as much of the workbook as possible, continuing on after finding an
   * error. The hope is that multiple errors will help a user/administrator
   * correct a workbook's errors in a batch fashion, rather than in a piecemeal
   * fashion.
   *
   * @return a <code>List<String></code> of all errors generated during
   *         parsing
   */
  public List<WorkbookParseError> getErrors()
  {
    return _workbook.getErrors();
  }

  public Map<Integer,Integer> getPlateNumbersLoadedWithMaxReplicates()
  {
    return _plateNumbersLoadedWithMaxReplicates;
  }

  public ScreenResult getParsedScreenResult()
  {
    return _screenResult;
  }

  /**
   * Finds the total number of data columns.
   * TODO: this does not account for non-contiguous blocks of empty cells
   * @param dataColumnsSheet
   * @return highest cell column index plus 1
   * @throws UnrecoverableScreenResultParseException
   */
  private int findDataColumnCount(Worksheet dataColumnsSheet) 
  {
    dataColumnsSheet.getColumns();
    int rows = dataColumnsSheet.getRows();
    
    if (rows == 0) {
      return 0;
    }
    
    int n = 0;
    for (Cell cell : dataColumnsSheet.getRow(0)) {
      if(cell.isEmpty()) break;
      n++;
    }
    return n;
  }

  private int getDataColumn(int dataColumn)
  {
    return _dataColumnIndex2WorksheetColumnIndex.get(dataColumn);
  }

  /**
   * Parse the worksheet containing the ScreenResult data columns.
   * This method returns error results by add them to the {@link Workbook}. 
   * Therefore check the workbook after running for errors.
   * @param workbook
   * @throws UnrecoverableScreenResultParseException 
   * 
   */
  private boolean parseDataColumnDefinitions(ScreenResult screenResult, Workbook workbook) throws UnrecoverableScreenResultParseException
  {
    log.debug("parse data columns sheet");
    
    Worksheet dataColumnsSheet = 
      _workbook.getWorksheet(DATA_COLUMNS_SHEET_NAME).forOrigin(DATA_COLUMNS_SHEET__FIRST_DATA_COLUMN__WORKSHEET_COLUMN_INDEX, 0);
    if (dataColumnsSheet == null) {
      _workbook.addError(DATA_COLUMNS_SHEET_NOT_FOUND_ERROR);
      return false;
    } 
    Map<DataColumnProperty,Row> dataColumnPropertyRows = parseDataColumnPropertyNames(dataColumnsSheet);
    int dataColumnCount = findDataColumnCount(dataColumnsSheet);
    for (int iDataColumn = 0; iDataColumn < dataColumnCount; ++iDataColumn) {
      if (!!!dataColumnPropertyRows.containsKey(DataColumnProperty.NAME)) {
        throw new UnrecoverableScreenResultParseException(DataColumnProperty.NAME + " data column property is required");
      }
      DataColumn dataColumn = 
        screenResult.createDataColumn(dataColumnPropertyRows.get(DataColumnProperty.NAME).getCell(iDataColumn, true).getString());
      if (dataColumnPropertyRows.containsKey(DataColumnProperty.DATA_TYPE)) {
        Cell cell = dataColumnPropertyRows.get(DataColumnProperty.DATA_TYPE).getCell(iDataColumn, true);
        DataType dataType = _dataTypeParser.parse(cell);
        if (dataType != null) {
          switch (dataType) {
          case NUMERIC: {
            Integer decimalPlaces = null;
            if (dataColumnPropertyRows.containsKey(DataColumnProperty.DECIMAL_PLACES)) {
              Cell cell2 = dataColumnPropertyRows.get(DataColumnProperty.DECIMAL_PLACES).getCell(iDataColumn);
              decimalPlaces = cell2.getInteger();
              if (decimalPlaces != null && decimalPlaces < 0) {
                cell2.addError("illegal value");
              }
            }
            dataColumn.makeNumeric(decimalPlaces);
            break;
          }
          case TEXT: dataColumn.makeTextual(); break;
          case POSITIVE_INDICATOR_BOOLEAN: dataColumn.makeBooleanPositiveIndicator(); break;
          case POSITIVE_INDICATOR_PARTITION: dataColumn.makePartitionPositiveIndicator(); break;
          case CONFIRMED_POSITIVE_INDICATOR: dataColumn.makeConfirmedPositiveIndicator(); break; 
          default: throw new DevelopmentException("unhandled data type " + dataType);
          }
        }
      }
      else {
        dataColumn.makeNumeric(null);
      }
      
      if (dataColumnPropertyRows.containsKey(DataColumnProperty.REPLICATE)) {
        dataColumn.forReplicate(dataColumnPropertyRows.get(DataColumnProperty.REPLICATE).getCell(iDataColumn).getInteger());
      }
      String howDerived = null;
      if (dataColumnPropertyRows.containsKey(DataColumnProperty.HOW_DERIVED)) {
        howDerived = dataColumnPropertyRows.get(DataColumnProperty.HOW_DERIVED).getCell(iDataColumn).getString();
      }
      Set<DataColumn> columnsDerivedFrom = Collections.emptySet();
      if (dataColumnPropertyRows.containsKey(DataColumnProperty.COLUMNS_DERIVED_FROM)) {
        columnsDerivedFrom = Sets.newHashSet(Iterables.filter(_columnsDerivedFromParser.parseList(dataColumnPropertyRows.get(DataColumnProperty.COLUMNS_DERIVED_FROM).getCell(iDataColumn)),
                                                              Predicates.notNull()));
      }
      if (!!!(StringUtils.isEmpty(howDerived) && columnsDerivedFrom.isEmpty())) { 
        dataColumn.makeDerived(howDerived, columnsDerivedFrom);
      }
      if (!!!dataColumn.isDerived() && !!!dataColumn.isPositiveIndicator()) {
        if (dataColumnPropertyRows.containsKey(DataColumnProperty.ASSAY_READOUT_TYPE)) {
          dataColumn.setAssayReadoutType(_assayReadoutTypeParser.parse(dataColumnPropertyRows.get(DataColumnProperty.ASSAY_READOUT_TYPE).getCell(iDataColumn, false)));
        }
      }
      if (dataColumnPropertyRows.containsKey(DataColumnProperty.PRIMARY_OR_FOLLOWUP)) {
        dataColumn.setFollowUpData(_primaryOrFollowUpParser.parse(dataColumnPropertyRows.get(DataColumnProperty.PRIMARY_OR_FOLLOWUP).getCell(iDataColumn)));
      }
      if (dataColumnPropertyRows.containsKey(DataColumnProperty.ASSAY_PHENOTYPE)) {
        dataColumn.forPhenotype(dataColumnPropertyRows.get(DataColumnProperty.ASSAY_PHENOTYPE).getCell(iDataColumn).getString());
      }
      if (dataColumnPropertyRows.containsKey(DataColumnProperty.DESCRIPTION)) {
        dataColumn.setDescription(dataColumnPropertyRows.get(DataColumnProperty.DESCRIPTION).getCell(iDataColumn).getString());
      }
      if (dataColumnPropertyRows.containsKey(DataColumnProperty.COMMENTS)) {
        dataColumn.setComments(dataColumnPropertyRows.get(DataColumnProperty.COMMENTS).getCell(iDataColumn).getString());
      }
      if (dataColumnPropertyRows.containsKey(DataColumnProperty.TIME_POINT)) {
        dataColumn.forTimePoint(dataColumnPropertyRows.get(DataColumnProperty.TIME_POINT).getCell(iDataColumn).getString());
      }
      if (dataColumnPropertyRows.containsKey(DataColumnProperty.TIME_POINT_ORDINAL)) {
        dataColumn.forTimePointOrdinal(dataColumnPropertyRows.get(DataColumnProperty.TIME_POINT_ORDINAL).getCell(iDataColumn).getInteger());
      }
      if (dataColumnPropertyRows.containsKey(DataColumnProperty.CHANNEL)) {
        dataColumn.forChannel(dataColumnPropertyRows.get(DataColumnProperty.CHANNEL).getCell(iDataColumn).getInteger());
      }
      if (dataColumnPropertyRows.containsKey(DataColumnProperty.ZDEPTH_ORDINAL)) {
        dataColumn.forZdepthOrdinal(dataColumnPropertyRows.get(DataColumnProperty.ZDEPTH_ORDINAL).getCell(iDataColumn).getInteger());
      }
//      if (dataColumnPropertyRows.containsKey(DataColumnProperty.CELL_LINE)) {
//        dataColumn.forCellLine(dataColumnPropertyRows.get(DataColumnProperty.CELL_LINE).getCell(iDataColumn).getString());
//      }
      // note: we do this last so that _columnsDerivedFromParser does not allow the current column to be considered a valid "derived from" value
      _worksheetColumnLabel2DataColumnObjectMap.put(dataColumnPropertyRows.get(DataColumnProperty.COLUMN_IN_DATA_WORKSHEET).getCell(iDataColumn, true).getAsString(), dataColumn);
    }
    return !!!_workbook.getHasErrors();
  }

  private Map<DataColumnProperty,Row> parseDataColumnPropertyNames(Worksheet dataColumnsSheet)
  {
    Map<DataColumnProperty,Row> result = Maps.newHashMap();
    Iterator<Row> dataRows = dataColumnsSheet.forOrigin(1, 0).iterator(); 
    for (Row row : dataColumnsSheet.forOrigin(0, 0)) {
      if (row.isEmpty()) {
        break;
      }
      
      String dataColumnPropertyLabel = row.getCell(0).getString().trim();
      DataColumnProperty dataColumnProperty = DataColumnProperty.fromDisplayText(dataColumnPropertyLabel);
      Row dataOnlyRow = dataRows.next();
      if (dataColumnProperty != null) {
        result.put(dataColumnProperty, dataOnlyRow);
      }
      else {
        row.getCell(0).addError("unknown data column property: " + dataColumnPropertyLabel);
      }
    }
    return result;
  }

  private void initializeDataColumnLocations(ScreenResult screenResult,
                                             Workbook workbook)
    throws UnrecoverableScreenResultParseException
  {
    Worksheet dataColumnsSheet = workbook.getWorksheet(DATA_COLUMNS_SHEET_NAME).forOrigin(DATA_COLUMNS_SHEET__FIRST_DATA_COLUMN__WORKSHEET_COLUMN_INDEX, 0);
    if (dataColumnsSheet == null) { 
      throw new UnrecoverableScreenResultParseException(DATA_COLUMNS_SHEET_NOT_FOUND_ERROR);
    }

    _dataColumnIndex2WorksheetColumnIndex = Maps.newHashMap();

    int dataColumnCount = findDataColumnCount(dataColumnsSheet);
    Map<DataColumnProperty,Row> dataColumnProperties = parseDataColumnPropertyNames(dataColumnsSheet);
    for (int iDataColumn = 0; iDataColumn < dataColumnCount; ++iDataColumn) {
      Cell cell = dataColumnProperties.get(DataColumnProperty.COLUMN_IN_DATA_WORKSHEET).getCell(iDataColumn, true);
      String forColumnInDataWorksheet = cell.getString().trim();
      try {
        if (forColumnInDataWorksheet != null) {
          _dataColumnIndex2WorksheetColumnIndex.put(iDataColumn,
                                                    AlphabeticCounter.toIndex(forColumnInDataWorksheet));
          DataColumn dataColumn = screenResult.getDataColumnsList().get(iDataColumn);
          _worksheetColumnLabel2DataColumnObjectMap.put(forColumnInDataWorksheet, dataColumn);
        }
      }
      catch (IllegalArgumentException e) {
        cell.addError(e.getMessage());
      }
    }
  }

  /**
   * Parse the workbook containing the ScreenResult data.
   *
   * @param workbook the workbook containing some or all of the raw data for a
   *          ScreenResult
   * @throws ExtantLibraryException if an existing Well entity cannot be found
   *           in the database
   * @throws IOException
   * @throws UnrecoverableScreenResultParseException
   */
  private void parseData(Workbook workbook,
                         ScreenResult screenResult,
                         IntRange plateNumberRange,
                         boolean incrementalFlush)
    throws ExtantLibraryException, IOException, UnrecoverableScreenResultParseException
  {
    log.debug("incrementalFlush:" + incrementalFlush);
    long startTime = System.currentTimeMillis();
    long loopTime = startTime;
    
    int wellsWithDataLoaded = 0;
    int dataSheetsParsed = 0;
    int totalSheets = workbook.getWorkbook().getNumberOfSheets();
    int firstDataSheetIndex = workbook.getWorksheet(DATA_COLUMNS_SHEET_NAME).getSheetIndex() + 1;
    int totalDataSheets = Math.max(0, totalSheets - firstDataSheetIndex);
    plateNumberRange = plateNumberRange == null ? new IntRange(Integer.MIN_VALUE, Integer.MAX_VALUE) : plateNumberRange;

    // Note: we do this to make sure that the DataColumn's are persisted before we persist and clear the RV's
    if (screenResult.getEntityId() == null) {
      _genericEntityDao.persistEntity(screenResult);
    }
    else {
      _genericEntityDao.saveOrUpdateEntity(screenResult);
    }
    _genericEntityDao.flush();

    for (int iSheet = firstDataSheetIndex; iSheet < totalSheets; ++iSheet) {
      String sheetName = workbook.getWorkbook().getSheet(iSheet).getName();
      log.info("parsing sheet " + (dataSheetsParsed + 1) + " of " + totalDataSheets + ", " + sheetName);
      Worksheet worksheet = workbook.getWorksheet(iSheet).forOrigin(0, DATA_SHEET__FIRST_DATA_ROW_INDEX);

      for (Row row : worksheet) {
        // bring in the old findNextRow() logic
        if (row.getColumns() > 0 
          && !row.getCell(0).isEmpty()            
          && row.getCell(0).getAsString().trim().length() > 0 ) 
        {
          Integer plateNumber = row.getCell(WellInfoColumn.PLATE.ordinal(), true).getInteger();
          Cell wellNameCell = row.getCell(WellInfoColumn.WELL_NAME.ordinal()); 
          String wellName = _wellNameParser.parse(wellNameCell);
          if (!wellName.equals("")) {
            WellKey wellKey = new WellKey(plateNumber, wellName);
            if (!plateNumberRange.containsInteger(wellKey.getPlateNumber())) {
              if (log.isDebugEnabled()) {
                log.debug("Skipping, excluded range: " + plateNumberRange + ", row: " + row.getRow());
              }
            } 
            else {
              boolean duplicate = !parsedWellKeys.add(wellKey.getKey());
              if (duplicate) {
                if (!_ignoreDuplicateErrors) {
                  wellNameCell.addError("duplicate well: " + wellKey);
                }
                else {
                  log.debug("duplicate well: " + wellKey + ", duplicate found at: " + wellNameCell);
                }
              } 
              else {
                if (findLibraryWithPlate(wellKey.getPlateNumber()) == null) {
                  wellNameCell.addError(NO_SUCH_LIBRARY_WITH_PLATE);
                } 
                else {
                  Well well = _librariesDao.findWell(wellKey);
                  if (well == null) {
                    wellNameCell.addError(NO_SUCH_WELL + ": " + wellKey);
                  }
                  else if (findAssayWell(well) == null) {
                    readResultValues(screenResult, row, well, incrementalFlush);
                    ++wellsWithDataLoaded;
                    if (incrementalFlush && wellsWithDataLoaded % AbstractDAO.ROWS_TO_CACHE == 0) {
                      saveResultValuesAndFlush(screenResult, incrementalFlush);
                      if (log.isInfoEnabled() && wellsWithDataLoaded % (AbstractDAO.ROWS_TO_CACHE * 100) == 0) {
                        long time = System.currentTimeMillis();
                        long cumulativeTime = time - startTime;
                        log.info("wellsWithDataLoaded: " + wellsWithDataLoaded 
                                 + ", cumulative time: " + (double)cumulativeTime/(double)60000 
                                 + " min, avg row time: " + (double)cumulativeTime/(double)wellsWithDataLoaded 
                                 + ", loopTime: " + (time - loopTime) );
                        loopTime = time;
                      }
                    } // incremental
                  }
                }
              }
            }
          }
        }
      } // for row
      ++dataSheetsParsed;
      if (wellsWithDataLoaded > 0) {
        saveResultValuesAndFlush(screenResult, incrementalFlush);
        log.info("Sheet: " + sheetName + " done, save, count: " + wellsWithDataLoaded);
        long time = System.currentTimeMillis();
        long cumulativeTime = time - startTime;
        log.info("wellsWithDataLoaded: " + wellsWithDataLoaded 
                 + ", cumulative time: " + (double)cumulativeTime/(double)60000 
                 + " min, avg row time: " + (double)cumulativeTime/(double)wellsWithDataLoaded );
      }    
    }

    if (dataSheetsParsed == 0) {
      _workbook.addError(NO_DATA_SHEETS_FOUND_ERROR);
    } else {
      log.info("done parsing " + dataSheetsParsed + " data sheet(s) " + workbook.getName());
      log.info("loaded data for " + wellsWithDataLoaded + " well(s) ");
    }
  }

  private void saveResultValuesAndFlush(ScreenResult screenResult, boolean incrementalFlush)
  {
    log.debug("incrementally save the screen result and clear values");
    for (DataColumn dataColumn : screenResult.getDataColumns()) {
      _genericEntityDao.saveOrUpdateEntity(dataColumn); 
      //TODO: this should not be required, but the writes to the DB were missing some of the RV's and this fixed it -sde4
      for (ResultValue rv : dataColumn.getResultValues()) {
        _genericEntityDao.saveOrUpdateEntity(rv);
      }
    }
    if (incrementalFlush) {
      for (DataColumn dataColumn : screenResult.getDataColumns()) {
        dataColumn.clearResultValues();
      }
      for (AssayWell assayWell: screenResult.getAssayWells()) {
        _genericEntityDao.saveOrUpdateEntity(assayWell);
      }
      screenResult.getAssayWells().clear();
      _genericEntityDao.flush();
      _genericEntityDao.clear();
    }
  }

  private void readResultValues(ScreenResult screenResult, Row row, Well well, boolean incrementalFlush)
  {
    AssayWellControlType assayWellControlType = 
      _assayWellControlTypeParser.parse(row.getCell(WellInfoColumn.ASSAY_WELL_TYPE.ordinal()));
    try {
      AssayWell assayWell = createAssayWell(well, assayWellControlType);
      List<DataColumn> wellExcludes = _excludeParser.parseList(row.getCell(WellInfoColumn.EXCLUDE.ordinal()));
      int iDataColumn = 0;
      int maxReplicateOrdinal = 1;
      for (DataColumn dataColumn : screenResult.getDataColumns()) {
        Cell cell = row.getCell(getDataColumn(iDataColumn));
        boolean isExclude = (wellExcludes != null && wellExcludes.contains(dataColumn));

        ResultValue newResultValue = null;
        if (dataColumn.isBooleanPositiveIndicator()) {
          newResultValue =
            dataColumn.createBooleanPositiveResultValue(assayWell,
                                                        cell.isBoolean() ? cell.getBoolean() : _booleanParser.parse(cell),
                                                          isExclude);
        }
        else if (dataColumn.isPartitionPositiveIndicator()) {
          newResultValue =
            dataColumn.createPartitionedPositiveResultValue(assayWell,
                                                            _partitionedValueParser.parse(cell),
                                                            isExclude);
        }
        else if (dataColumn.isConfirmedPositiveIndicator()) {
          newResultValue =
            dataColumn.createConfirmedPositiveResultValue(assayWell,
                                                            _confirmedPositiveValueParser.parse(cell),
                                                            isExclude);
        }
        else if (dataColumn.isNumeric()) {
          newResultValue =
            dataColumn.createResultValue(assayWell,
                                         cell.getDouble(),
                                         isExclude);
        }
        else {
          newResultValue =
            dataColumn.createResultValue(assayWell,
                                         cell.getString(),
                                         isExclude);
        }
          
        // update the maxReplicateOrdinal to track the actual number of
        // replicates that have data loaded for a particular plate
        // (this allows us to later calculate which how many replicates
        // actually have associated data, in case some of the "extra" screened
        // replicates were ignored by the screener)
        if (!dataColumn.isDerived() && !newResultValue.isNull()) {
          if (dataColumn.getReplicateOrdinal() != null) {
            maxReplicateOrdinal = Math.max(maxReplicateOrdinal, dataColumn.getReplicateOrdinal());
          }
        }
        if (newResultValue == null) {
          cell.addError("duplicate well");
        }

        ++iDataColumn;
      }

      recordAssayPlatesDataLoaded(well, maxReplicateOrdinal);

      if (incrementalFlush) {
        // [#2119] Optimize ScreenResultParser for scalability:
        // - in memory RV's must be reloaded as needed
        assayWell.getLibraryWell().getResultValues().clear();
      }
    }
    catch (DataModelViolationException e) {
      row.getCell(WellInfoColumn.ASSAY_WELL_TYPE.ordinal()).addError(e.getMessage());
    }
  }

  private AssayWell createAssayWell(Well well, AssayWellControlType assayWellControlType)
    throws DataModelViolationException
  {
    AssayWell assayWell = _screenResult.createAssayWell(well);
    assayWell.setAssayWellControlType(assayWellControlType);
    return assayWell;
  }

  private AssayWell findAssayWell(Well well)
  {
    AssayWell assayWell = _screenResultsDao.findAssayWell(_screenResult, well.getWellKey());
//    if (assayWell != null) {
//      if (assayWell.getAssayWellControlType() != assayWellControlType) {
//        _workbook.addError(ASSAY_WELL_TYPE_INCONSISTENCY + ": " + well);
//        return null;
//      }
//    }
    return assayWell;
  }

  private void recordAssayPlatesDataLoaded(Well well, int replicateCount)
  {
    Integer maxReplicateCount = _plateNumbersLoadedWithMaxReplicates.get(well.getPlateNumber());
    if (maxReplicateCount == null || maxReplicateCount.compareTo(replicateCount) < 0) {
      _plateNumbersLoadedWithMaxReplicates.put(well.getPlateNumber(), replicateCount);
    }
  }  
  
  /**
   * @motivation database I/O optimization
   */
  private Library findLibraryWithPlate(Integer plateNumber)
  {
    if (_lastLibrary == null ||
      !_lastLibrary.containsPlate(plateNumber)) {
      _lastLibrary = _librariesDao.findLibraryWithPlate(plateNumber);
    }
    return _lastLibrary;
  }

  private static class ParsedScreenInfo {
    private Integer _screenId;

    public Integer getScreenId()
    {
      return _screenId;
    }

    public void setScreenId(Integer screenId)
    {
      _screenId = screenId;
    }
  }

  public class ColumnLabelsParser implements CellValueParser<DataColumn>
  {
    protected Map<String,DataColumn> _columnLabel2ColMap;
    private Pattern columnIdPattern = Pattern.compile("[A-Z]+");

    public ColumnLabelsParser(Map<String,DataColumn> columnLabel2ColMap)
    {
      _columnLabel2ColMap =  columnLabel2ColMap;
    }

    public DataColumn parse(Cell cell)
    {
      throw new UnsupportedOperationException();
    }

    public List<DataColumn> parseList(Cell cell)
    {
      String textMultiValue = cell.getString();
      List<DataColumn> result = new ArrayList<DataColumn>();

      if (textMultiValue == null || textMultiValue.trim().length() == 0) {
        return result;
      }

      String[] textValues = textMultiValue.split(",");
      for (int i = 0; i < textValues.length; i++) {
        String text = textValues[i].trim();
        DataColumn dataColumn = doParseSingleValue(text, cell);
        if (dataColumn != null) {
          result.add(dataColumn);
        }
        else {
          cell.addError("invalid Data Column worksheet column label '" + text +
            "' (expected one of " + _columnLabel2ColMap.keySet() + ")");
        }
      }
      return result;
    }

    protected DataColumn doParseSingleValue(String value, Cell cell)
    {
      Matcher matcher = columnIdPattern.matcher(value);
      if (!matcher.matches()) {
        return null;
      }
      String columnLabel = matcher.group(0);
      return _columnLabel2ColMap.get(columnLabel);
    }
  }

  private class ExcludeParser extends ColumnLabelsParser
  {
    public ExcludeParser(Map<String,DataColumn> columnLabel2ColMap)
    {
      super(columnLabel2ColMap);
    }

    public List<DataColumn> parseList(Cell cell)
    {
      String textMultiValue = cell.getString();

      if (textMultiValue != null &&
        textMultiValue.equalsIgnoreCase(ScreenResultWorkbookSpecification.EXCLUDE_ALL_VALUE)) {
        return new ArrayList<DataColumn>(_columnLabel2ColMap.values());
      }

      if (textMultiValue == null) {
        return Collections.emptyList();
      }

      return super.parseList(cell);
    }
  }
}