// $HeadURL$ // $Id$ // // Copyright © 2006, 2010, 2011, 2012 by the President and Fellows of Harvard College. // // Screensaver is an open-source project developed by the ICCB-L and NSRB labs // at Harvard Medical School. This software is distributed under the terms of // the GNU General Public License. package edu.harvard.med.screensaver.io.libraries.smallmolecule; import java.io.BufferedReader; import java.io.IOException; import java.math.BigDecimal; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.google.common.collect.Sets; import org.apache.log4j.Logger; import edu.harvard.med.screensaver.io.DataExporter; import edu.harvard.med.screensaver.io.ParseError; import edu.harvard.med.screensaver.io.ParseException; import edu.harvard.med.screensaver.io.libraries.LibraryContentsParser; import edu.harvard.med.screensaver.model.MolarConcentration; import edu.harvard.med.screensaver.model.MolarUnit; import edu.harvard.med.screensaver.model.libraries.LibraryWellType; import edu.harvard.med.screensaver.model.libraries.MolecularFormula; import edu.harvard.med.screensaver.model.libraries.WellName; //TODO: use common file specification with WellSdfWriter public class SDRecordParser { private static final String END_OF_MOLFILE_MARKER = "M END"; private static final String END_OF_RECORD_DELIMITER = "$$$$"; public static Set EMPTY_FIELD_MARKERS = Sets.newHashSet(new String[] { "n/a", "none" }); private static final Logger log = Logger.getLogger(SDRecordParser.class); private static final Pattern dataHeaderPattern = Pattern.compile("^>.*<(.*)>.*"); private BufferedReader _sdFileReader; private int _lineNumber; SDRecordParser(BufferedReader sdFileReader) throws IOException { _sdFileReader = sdFileReader; } public SDRecord next() throws IOException, ParseException { return parseNextRecord(); } private String readNextLine() throws IOException { String line = _sdFileReader.readLine(); if (line != null) { ++_lineNumber; } return line; } private SDRecord parseNextRecord() throws IOException, ParseException { SDRecord sdRecord = new SDRecord(); boolean emptyRecord = true; StringBuilder molfile = new StringBuilder(); String line = readNextLine(); // read the molfile, unless it is missing while (line != null && ! line.startsWith(">") && !line.equals(END_OF_MOLFILE_MARKER)) { molfile.append(line).append("\n"); line = readNextLine(); } if (molfile.length() > 0) { emptyRecord = false; molfile.append(END_OF_MOLFILE_MARKER).append("\n"); sdRecord.setMolfile(molfile.toString()); if (log.isDebugEnabled()) { log.debug("molfile: " + molfile.toString()); } } // read the "associated data" part of the SD record while (line != null && ! line.equals(END_OF_RECORD_DELIMITER)) { Matcher dataHeaderMatcher = dataHeaderPattern.matcher(line); if (dataHeaderMatcher.matches()) { String fieldName = dataHeaderMatcher.group(1).toLowerCase(); line = readNextLine().trim(); if (line.length() == 0) continue; if (EMPTY_FIELD_MARKERS.contains(line)) continue; try { boolean unusedField = false; if (fieldName.equals("plate")) { sdRecord.setPlateNumber(Integer.parseInt(line)); } else if (fieldName.equals("well")) { sdRecord.setWellName(new WellName(line)); } else if (fieldName.equals("well_type")) { sdRecord.setLibraryWellType(LibraryWellType.valueOf(line.toUpperCase())); } else if (fieldName.equals("facility_reagent_id")) { log.info("facility reagent id: " + line); sdRecord.setFacilityId(line); } else if (fieldName.equals("barcode")) { log.info("barcode: " + line); sdRecord.setBarcode(line); } else if (fieldName.equals("vendor")) { sdRecord.setVendor(line); } else if (fieldName.equals("vendor_reagent_id")) { sdRecord.setVendorIdentifier(line); } else if (fieldName.equals("vendor_batch_id")) { sdRecord.setVendorBatchId(line); } else if (fieldName.equals("facility_batch_id")) { sdRecord.setFacilityBatchId(Integer.parseInt(line)); } else if (fieldName.equals("salt_form_id")) { sdRecord.setSaltFormId(Integer.parseInt(line)); } else if (fieldName.equals("chemical_name")) { String[] names = line.split(DataExporter.LIST_DELIMITER); for (String name : names) { // trim whitespace from the names name = name.trim(); sdRecord.getCompoundNames().add(name); } //sdRecord.getCompoundNames().add(line); } else if (fieldName.equals("pubchem_cid")) { String[] ids = line.split(DataExporter.LIST_DELIMITER); for(String id:ids) { sdRecord.getPubchemCids().add(Integer.parseInt(id)); } } else if (fieldName.equals("chembank_id")) { String[] ids = line.split(DataExporter.LIST_DELIMITER); for(String id:ids) { sdRecord.getChembankIds().add(Integer.parseInt(id)); } } else if (fieldName.equals("chembl_id")) { String[] ids = line.split(DataExporter.LIST_DELIMITER); for(String id:ids) { sdRecord.getChemblIds().add(Integer.parseInt(id)); } } else if (fieldName.equals("pubmed_id")) { String[] ids = line.split(DataExporter.LIST_DELIMITER); for (String id : ids) { sdRecord.getPubmedIds().add(Integer.parseInt(id)); log.info("pubmed id: " + id); } } else if (fieldName.equals("concentration")) { Matcher matcher = LibraryContentsParser.molarConcentrationPattern.matcher(line); if (matcher.matches()) { MolarUnit unit = MolarUnit.forSymbol(matcher.group(2)); sdRecord.setMolarConcentration(MolarConcentration.makeConcentration(matcher.group(1), unit)); } else { matcher = LibraryContentsParser.mgMlConcentrationPattern.matcher(line); if (matcher.matches()) { sdRecord.setMgMlConcentration(new BigDecimal(matcher.group(1) )); } else { throw new ParseException(new ParseError("field 'concentration' value could not be interpreted as a molar or mg/ml concentration:" + line)); } } } else if (fieldName.equals("molecular_mass")) { sdRecord.setMolecularMass(new BigDecimal(line)); } else if (fieldName.equals("molecular_weight")) { sdRecord.setMolecularWeight(new BigDecimal(line)); } else if (fieldName.equals("molecular_formula")) { sdRecord.setMolecularFormula(new MolecularFormula(line)); } else if (fieldName.equals("smiles")) { sdRecord.setSmiles(line); } else if (fieldName.equals("inchi")) { sdRecord.setInChi(line); } else { unusedField = true; if (log.isDebugEnabled()) { log.debug("unused field: " + fieldName + ": " + line); } } if (!unusedField) { emptyRecord = false; } } catch (Exception e) { skipRestOfRecord(); log.warn("bad value in field '" + fieldName + "'" + _lineNumber, e); if (e instanceof ParseException) throw (ParseException) e; //TODO: why not add something from the exception message here (this would show up in the UI)? - sde4 throw new ParseException(new ParseError("field '" + fieldName + "' error: " + e.getMessage(), _lineNumber)); } } line = readNextLine(); } return emptyRecord ? null : sdRecord; } private void skipRestOfRecord() throws IOException { String line; do { line = readNextLine(); } while (line != null && ! line.equals(END_OF_RECORD_DELIMITER)); } public int getLineNumber() { return _lineNumber; } }