package edu.harvard.med.screensaver.io.cells;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.SortedSet;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.springframework.transaction.annotation.Transactional;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import edu.harvard.med.screensaver.db.GenericEntityDAO;
import edu.harvard.med.screensaver.io.ParseError;
import edu.harvard.med.screensaver.io.ParseException;
import edu.harvard.med.screensaver.io.UnrecoverableParseException;
import edu.harvard.med.screensaver.model.cells.Cell;
import edu.harvard.med.screensaver.model.cells.CellLineage;
import edu.harvard.med.screensaver.model.cells.PrimaryCell;
/**
 * Parses a worksheet of cell records into {@link Cell} entities and persists
 * them through a {@link GenericEntityDAO}.
 * <p>
 * The first worksheet row is the header: columns whose header is empty are
 * skipped, and the remaining columns are mapped, in order, onto the
 * {@code COL_*} field indices declared below. The second row is read and
 * discarded; every following row is parsed as one record.
 */
public class CellParser
{
  private static final Logger log = Logger.getLogger(CellParser.class);

  private GenericEntityDAO _dao;

  /** Separator between the individual values of a multi-valued field. */
  public static String DELIMITER_INSIDE_FIELD = ";";
  public static String DELIMITER_FIELD = ",";

  // Running counter used to assign consecutive indices to the COL_* constants
  // below; the declaration order of those constants therefore defines the
  // expected order of the (non-blank) worksheet columns.
  private static int i = 0;
  // public static final int COL_TYPE = i++;
  public static final int COL_FACILITY_ID = i++; // Added HMS column, not in the
                                                 // DWG Standards
  // CL:1 CL_Name
  public static final int COL_NAME = i++;
  public static final int COL_CLOID = i++;
  public static final int COL_ALTERNATE_NAME = i++;
  public static final int COL_ALTERNATE_ID = i++;
  public static final int COL_CENTER_NAME = i++;
  public static final int COL_CENTER_SPECIFIC_ID = i++;
  public static final int COL_VENDOR = i++;
  public static final int COL_VENDOR_CAT = i++;
  public static final int COL_BATCH_ID = i++;
  public static final int COL_ORGANISM = i++;
  public static final int COL_ORGAN = i++;
  public static final int COL_TISSUE = i++;
  public static final int COL_CELL_TYPE = i++;
  public static final int COL_CELL_TYPE_DETAIL = i++;
  public static final int COL_DISEASE = i++;
  private static final int COL_DISEASE_DETAIL = i++;
  public static final int COL_GROWTH_PROPERTIES = i++;
  public static final int COL_GENETIC_MOD = i++;
  public static final int COL_RELATED_PROJECTS = i++;
  public static final int COL_REC_CULTURE_COND = i++;
  public static final int COL_VERIFICATION = i++;
  public static final int COL_VERIFICATION_REFERENCE_PROFILE = i++;
  private static final int COL_MUTATIONS_REFERENCE = i++;
  private static final int COL_MUTATIONS_EXPLICIT = i++;
  public static final int COL_ORGANISM_GENDER = i++;
  // this column marks the end of the generic columns
  public static final int COLUMN_COUNT_MIN = COL_ORGANISM_GENDER + 1;
  // These are for primary cells
  public static final int COL_DONOR_ETHNICITY = i++;
  public static final int COL_AGE_IN_YEARS = i++;
  public static final int COL_DONOR_HEALTH_STATUS = i++;
  public static final int COL_CELL_MARKERS = i++;
  public static final int COL_PASSAGE_NUMBER = i++;

  // Maps a COL_* field index to the worksheet column index it is read from.
  // Filled in by parseColumnPositions(); entries that the header does not
  // cover keep their default value of 0.
  private int[] columnPositions = new int[i];

  protected CellParser() {}

  /**
   * @param dao the DAO used by {@link #load(File)} to persist parsed cells
   */
  public CellParser(GenericEntityDAO dao) {
    _dao = dao;
  }

  /**
   * Reads the given worksheet file, parses every data row into a
   * {@link Cell} and persists it.
   * <p>
   * Each entity is flushed and the DAO cleared immediately after it is
   * persisted, so that errors are reported at the row that caused them.
   *
   * @param file the worksheet file to read
   * @throws ParseException if a row cannot be parsed; the error location is
   *           set to the number of lines read so far before rethrowing
   * @throws IOException declared for I/O failures while reading the file
   * @throws UnrecoverableParseException if the file has no header row, or the
   *           header defines more non-blank columns than there are fields
   */
  @Transactional(rollbackForClassName="Exception")
  public void load(File file) throws ParseException, IOException
  {
    log.info("begin parsing");
    WorksheetReader reader = new WorksheetReader(file);
    String[] line = reader.parseNext(); // Header Row, blanks indicate non-LINCS columns
    if (line == null) {
      // Without a header there is nothing to map columns from; fail clearly
      // instead of with a NullPointerException further down.
      throw new UnrecoverableParseException("no header row found in file: " + file);
    }
    parseColumnPositions(line); // determine where the blanks are, exclude these columns
    reader.parseNext(); // second row is not used programmatically

    // Blank header columns are skipped, so a mapped worksheet index can be
    // larger than the number of fields. Rows must be padded up to the highest
    // mapped index, not merely to the field count.
    int requiredLength = requiredFieldCount();
    while ((line = reader.parseNext()) != null) {
      if (line.length < requiredLength) {
        log.info("warn, line is short: " + Joiner.on(",").join(line) + ", padding");
        line = Arrays.copyOf(line, requiredLength); // padded entries are null
      }
      try {
        Cell cell = parse(line);
        _dao.persistEntity(cell);
        _dao.flush();
        _dao.clear(); // flush and clear so errors are reported early
      } catch (ParseException e) {
        e.getError().setErrorLocation("at line: " + reader.getLinesRead());
        throw e;
      }
    }
    log.info("rows read: " + reader.getLinesRead());
  }

  /**
   * Returns the minimum row length needed so that every mapped column index
   * can be read: the larger of the field count and the highest mapped
   * worksheet index plus one.
   */
  private int requiredFieldCount() {
    int required = columnPositions.length;
    for (int position : columnPositions) {
      required = Math.max(required, position + 1);
    }
    return required;
  }

  /*
   * Re-index the columns, ignoring the columns with empty headers.
   *
   * The n-th non-empty header is assigned to field index n. More non-empty
   * headers than defined fields is an unrecoverable error.
   */
  private void parseColumnPositions(String[] line) {
    int fieldIndex = 0;
    log.info("headers: " + Joiner.on(",").join(line));
    for (int column = 0; column < line.length; column++) {
      if (StringUtils.isEmpty(line[column])) {
        continue;
      }
      if (fieldIndex > columnPositions.length - 1) {
        throw new UnrecoverableParseException("Unrecognized column: " + line[column] + ", only " + columnPositions.length
          + ", column positions are defined. Please delete this column, or define more columns to be parsed.");
      }
      columnPositions[fieldIndex] = column;
      fieldIndex++;
    }
    log.info("parsed: " + fieldIndex + " columns from line: " + Joiner.on(",").join(line) + ", " + line.length);
  }

  /**
   * Translates a {@code COL_*} field index into the worksheet column index
   * recorded by {@link #parseColumnPositions(String[])}.
   *
   * @throws UnrecoverableParseException if the field index is outside the
   *           range of defined column positions
   */
  private int getColumnForField(int field) {
    if (field < 0 || field > columnPositions.length - 1) {
      // The exception must actually be thrown; merely constructing it would
      // let execution fall through to an ArrayIndexOutOfBoundsException.
      throw new UnrecoverableParseException("Unrecognized column: " + field + ", only " + columnPositions.length
        + ", column positions are defined. Please delete this column, or define more columns to be parsed.");
    }
    return columnPositions[field];
  }

  /**
   * Builds a {@link Cell} from one worksheet row. Currently always creates a
   * {@link CellLineage}.
   *
   * @param fields the raw field values of one row
   * @return the populated cell
   * @throws ParseException if a field value cannot be parsed
   */
  public Cell parse(String[] fields) throws ParseException {
    // CellLineType type = (new
    // VocabularyTermParser<CellLineType>(CellLineType.class)).forValue(fields[COL_TYPE]);
    // if (type == null)
    // throw new ParseException(new
    // ParseError("unknown cell line type, allowed values: "
    // + Joiner.on(",").join(CellLineType.values()) + ", found: " +
    // fields[COL_TYPE], COL_TYPE));
    Cell cell = null;
    // TODO: will use a command line switch to determine type
    // switch (type) {
    // case LINE:
    cell = new CellLineage();
    parseCellLineage(cell, fields);
    // break;
    // case PRIMARY:
    // cell = new PrimaryCell();
    // parseCellLineage(cell, fields);
    // parsePrimaryCell((PrimaryCell) cell, fields);
    // break;
    // }
    return cell;
  }

  /**
   * Copies the generic (non primary-cell) columns of a row onto the cell.
   * Multi-valued fields (growth properties, related projects) are split on
   * {@link #DELIMITER_INSIDE_FIELD} and are left untouched when the field is
   * null or empty.
   *
   * @param cell the cell to populate
   * @param fields the raw field values of one row
   */
  public void parseCellLineage(Cell cell, String[] fields) throws ParseException {
    cell.setFacilityId(fields[getColumnForField(COL_FACILITY_ID)]);
    cell.setName(fields[getColumnForField(COL_NAME)]);
    cell.setCloId(fields[getColumnForField(COL_CLOID)]);
    cell.setAlternateName(fields[getColumnForField(COL_ALTERNATE_NAME)]);
    cell.setAlternateId(fields[getColumnForField(COL_ALTERNATE_ID)]);
    cell.setCenterName(fields[getColumnForField(COL_CENTER_NAME)]);
    cell.setCenterSpecificId(fields[getColumnForField(COL_CENTER_SPECIFIC_ID)]);
    cell.setVendor(fields[getColumnForField(COL_VENDOR)]);
    cell.setVendorCatalogId(fields[getColumnForField(COL_VENDOR_CAT)]);
    cell.setBatchId(fields[getColumnForField(COL_BATCH_ID)]);
    cell.setOrganism(fields[getColumnForField(COL_ORGANISM)]);
    cell.setOrgan(fields[getColumnForField(COL_ORGAN)]);
    cell.setTissue(fields[getColumnForField(COL_TISSUE)]);
    cell.setCellType(fields[getColumnForField(COL_CELL_TYPE)]);
    cell.setCellTypeDetail(fields[getColumnForField(COL_CELL_TYPE_DETAIL)]);
    cell.setDisease(fields[getColumnForField(COL_DISEASE)]);
    cell.setDiseaseDetail(fields[getColumnForField(COL_DISEASE_DETAIL)]);

    String growthProperties = fields[getColumnForField(COL_GROWTH_PROPERTIES)];
    if (!StringUtils.isEmpty(growthProperties)) {
      cell.setGrowthProperties(Sets.newTreeSet(Lists.newArrayList(growthProperties.split(DELIMITER_INSIDE_FIELD))));
    }

    cell.setGeneticModification(fields[getColumnForField(COL_GENETIC_MOD)]);

    String relatedProjects = fields[getColumnForField(COL_RELATED_PROJECTS)];
    if (!StringUtils.isEmpty(relatedProjects)) {
      cell.setRelatedProjects(Sets.newTreeSet(Lists.newArrayList(relatedProjects.split(DELIMITER_INSIDE_FIELD))));
    }

    cell.setVerification(fields[getColumnForField(COL_VERIFICATION)]);
    cell.setVerificationReferenceProfile(fields[getColumnForField(COL_VERIFICATION_REFERENCE_PROFILE)]);
    cell.setRecommendedCultureConditions(fields[getColumnForField(COL_REC_CULTURE_COND)]);
    cell.setMutationsReference(fields[getColumnForField(COL_MUTATIONS_REFERENCE)]);
    cell.setMutationsExplicit(fields[getColumnForField(COL_MUTATIONS_EXPLICIT)]);
    cell.setOrganismGender(fields[getColumnForField(COL_ORGANISM_GENDER)]);
  }

  /**
   * Copies the primary-cell specific columns of a row onto the cell.
   *
   * @param cell the primary cell to populate
   * @param fields the raw field values of one row
   * @throws ParseException if the age or the passage number is not an integer
   *           (this includes a missing value)
   */
  public void parsePrimaryCell(PrimaryCell cell, String[] fields) throws ParseException {
    cell.setDonorEthnicity(fields[getColumnForField(COL_DONOR_ETHNICITY)]);

    String age = fields[getColumnForField(COL_AGE_IN_YEARS)];
    try {
      cell.setAgeInYears(Integer.parseInt(age));
    } catch (NumberFormatException e) {
      throw new ParseException(new ParseError("age must be an integer: " + age,
                                              COL_AGE_IN_YEARS));
    }

    cell.setDonorHealthStatus(fields[getColumnForField(COL_DONOR_HEALTH_STATUS)]);

    // Padded rows carry null here; skip empty values the same way
    // parseCellLineage does for its multi-valued fields, rather than failing
    // with a NullPointerException on split().
    String markers = fields[getColumnForField(COL_CELL_MARKERS)];
    if (!StringUtils.isEmpty(markers)) {
      SortedSet<String> cellMarkers = Sets.newTreeSet(Lists.newArrayList(markers.split(DELIMITER_INSIDE_FIELD)));
      cell.setCellMarkers(cellMarkers);
    }

    String passageNumber = fields[getColumnForField(COL_PASSAGE_NUMBER)];
    try {
      cell.setPassageNumber(Integer.parseInt(passageNumber));
    } catch (NumberFormatException e) {
      throw new ParseException(new ParseError("Passage Number must be an integer: "
        + passageNumber, COL_PASSAGE_NUMBER));
    }
  }
}