/*
* Copyright (C) 2010 Mark Rijnbeek <mark_rynbeek@users.sf.net>
*
* Contact: cdk-devel@lists.sourceforge.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
* All we ask is that proper credit is given for our work, which includes
* - but is not limited to - adding the above copyright notice to the beginning
* of your source code files, and to any copyright notice that you may
* distribute with programs based on this work.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
*/
package org.openscience.cdk.io;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import org.openscience.cdk.annotations.TestClass;
import org.openscience.cdk.annotations.TestMethod;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IAtom;
import org.openscience.cdk.interfaces.IBond;
import org.openscience.cdk.interfaces.IChemObject;
import org.openscience.cdk.interfaces.IMolecule;
import org.openscience.cdk.interfaces.IPseudoAtom;
import org.openscience.cdk.io.formats.IResourceFormat;
import org.openscience.cdk.io.formats.RGroupQueryFormat;
import org.openscience.cdk.isomorphism.matchers.IRGroupQuery;
import org.openscience.cdk.isomorphism.matchers.RGroup;
import org.openscience.cdk.isomorphism.matchers.RGroupList;
import org.openscience.cdk.isomorphism.matchers.RGroupQuery;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;
/**
 * A reader for Symyx' Rgroup files (RGFiles).
 * An RGfile describes a single molecular query with Rgroups.
 * Each RGfile is a combination of Ctabs defining the root molecule and each
 * member of each Rgroup in the query.
 *
 * <p>The RGFile format is described in the manual
 * <a href="http://www.symyx.com/downloads/public/ctfile/ctfile.pdf">
 * "CTFile Formats"</a> , Chapter 5.
 *
 * @cdk.module io
 * @cdk.githash
 *
 * @cdk.keyword Rgroup
 * @cdk.keyword R group
 * @cdk.keyword R-group
 * @author Mark Rijnbeek
 */
@TestClass("org.openscience.cdk.io.RGroupQueryReaderTest")
public class RGroupQueryReader extends DefaultChemObjectReader {

    /**
     * Private bean style holder for the values parsed from one LOG (logic)
     * line. Declared static because it never touches the enclosing reader.
     */
    private static final class RGroupLogic {
        int requiredRGroupNumber;
        boolean restH;
        String occurrence;
    }

    /** Line prefix that closes a $CTAB block, for the root and for every Rgroup member. */
    private static final String END_CTAB = "$END CTAB";

    /**
     * Delimiters used to split an assembled CTAB string back into lines.
     * The lines come from {@link BufferedReader#readLine()} and therefore
     * never contain these characters themselves.
     */
    private static final String LINE_BREAKS = "\r\n";

    BufferedReader input = null;

    private static final ILoggingTool logger =
        LoggingToolFactory.createLoggingTool(RGroupQueryReader.class);

    /**
     * Default constructor, input not set (an empty reader is installed).
     */
    public RGroupQueryReader() {
        this(new StringReader(""));
    }

    /**
     * Constructs a new RGroupQueryReader that reads from a given InputStream.
     * @param in The InputStream to read from.
     */
    public RGroupQueryReader(InputStream in) {
        this(new InputStreamReader(in));
    }

    /**
     * Constructs a new RGroupQueryReader that reads from a given Reader.
     * @param in The Reader to read from.
     */
    public RGroupQueryReader(Reader in) {
        input = new BufferedReader(in);
    }

    /**
     * Sets the input Reader. A BufferedReader is used as is, any other
     * Reader is wrapped in a BufferedReader.
     * @param input Reader object
     * @throws CDKException declared by the signature; not thrown by this implementation
     */
    @TestMethod("testSetReader_Reader")
    public void setReader(Reader input) throws CDKException {
        if (input instanceof BufferedReader) {
            this.input = (BufferedReader) input;
        } else {
            this.input = new BufferedReader(input);
        }
    }

    /**
     * Sets the input stream, wrapped in an InputStreamReader.
     * @param input InputStream to read from
     * @throws CDKException declared by the signature; not thrown by this implementation
     */
    @TestMethod("testSetReader_InputStream")
    public void setReader(InputStream input) throws CDKException {
        setReader(new InputStreamReader(input));
    }

    /**
     * @return the shared {@link RGroupQueryFormat} instance
     */
    @TestMethod("testGetFormat")
    public IResourceFormat getFormat() {
        return RGroupQueryFormat.getInstance();
    }

    /**
     * Tells whether the given class is an {@link IRGroupQuery} type.
     * Uses assignability, so the interface itself, its sub-interfaces and
     * any (indirect) implementation are all accepted.
     *
     * @param classObject class to test, may be null
     * @return true if classObject is IRGroupQuery or a subtype of it
     */
    @TestMethod("testAccepts")
    public boolean accepts(Class classObject) {
        return classObject != null && IRGroupQuery.class.isAssignableFrom(classObject);
    }

    /**
     * Closes the underlying reader.
     * @throws IOException if closing the reader fails
     */
    @TestMethod("testClose")
    public void close() throws IOException {
        input.close();
    }

    /**
     * Check input IChemObject and proceed to parse.
     * Accepts/returns IChemObject of type RGroupQuery only.
     * @return IChemObject read from file
     * @param object class must be of type RGroupQuery
     * @throws CDKException if object is not an RGroupQuery or parsing fails
     */
    public IChemObject read(IChemObject object) throws CDKException {
        if (object instanceof RGroupQuery) {
            return parseRGFile((RGroupQuery) object);
        } else {
            throw new CDKException
                ("Reader only supports "+RGroupQuery.class.getName()+" objects");
        }
    }

    /**
     * Parse the RGFile. Uses {@link org.openscience.cdk.io.MDLV2000Reader}
     * to parse individual $CTAB blocks.
     *
     * <p>Steps: validate the header block, read the root $CTAB (collecting
     * LOG lines on the way), derive the attachment points of the root's
     * R# atoms, create one RGroupList per R# number, then read every
     * $RGP block and add each of its $CTAB members to the matching list.
     *
     * @param rGroupQuery empty
     * @return populated query
     * @throws CDKException on any format violation; other exceptions raised
     *         while parsing are wrapped in a CDKException
     */
    private RGroupQuery parseRGFile(RGroupQuery rGroupQuery) throws CDKException {
        // Both are reported by the catch blocks, hence declared outside the try.
        String line = "";
        int lineCount = 0;
        String eol = System.getProperty("line.separator");

        /* Captures the LOG line(s), keyed by Rgroup number */
        Map<Integer, RGroupLogic> logicDefinitions = new HashMap<Integer, RGroupLogic>();

        /* Captures attachment order for Rgroups.
         * Contains:
         * - pseudo atom (Rgroup)
         * - map with (integer,bond) meaning "bond" has attachment
         *   order "integer" (1,2,3) for the Rgroup
         * The order is based on the atom block, unless there is an AAL line
         * for the pseudo atom.
         */
        Map<IAtom, Map<Integer, IBond>> attachmentPoints = new HashMap<IAtom, Map<Integer, IBond>>();

        try {
            // Process the Header block
            logger.info("Process the Header block");
            line = input.readLine();
            checkLineBeginsWith(line, "$MDL", ++lineCount);
            line = input.readLine();
            checkLineBeginsWith(line, "$MOL", ++lineCount);
            line = input.readLine();
            checkLineBeginsWith(line, "$HDR", ++lineCount);
            for (int i = 1; i <= 3; i++) {
                lineCount++;
                line = input.readLine();
                if (line == null) {
                    throw new CDKException("RGFile invalid, empty/null header line at #" + lineCount);
                }
                //optional: parse header info here (not implemented)
            }
            line = input.readLine();
            checkLineBeginsWith(line, "$END HDR", ++lineCount);

            // Process the root structure (scaffold)
            logger.info("Process the root structure (scaffold)");
            line = input.readLine();
            checkLineBeginsWith(line, "$CTAB", ++lineCount);
            // Force a three line header in front of the CTAB lines
            StringBuilder sb = new StringBuilder(RGroup.ROOT_LABEL+"\n\n\n");
            line = input.readLine();
            ++lineCount;
            while (line != null && !line.startsWith(END_CTAB)) {
                sb.append(line).append(eol);
                //LOG lines: Logic, Unsatisfied Sites, Range of Occurrence.
                if (isPropertyLine(line, "LOG")) {
                    parseLogicLine(line, logicDefinitions);
                }
                line = input.readLine();
                ++lineCount;
            }
            if (line == null) {
                throw new CDKException("RGFile invalid, $CTAB block not closed by $END CTAB at #" + lineCount);
            }
            String rootStr = sb.toString();

            //Let MDL reader process $CTAB block of the root structure.
            MDLV2000Reader reader =
                new MDLV2000Reader(new StringReader(rootStr), ISimpleChemObjectReader.Mode.STRICT);
            IMolecule root = (IMolecule) reader.read(rGroupQuery.getBuilder().newMolecule());
            rGroupQuery.setRootStructure(root);
            List<IAtom> atomsByLinePosition = reader.getAtomsByLinePosition();

            //Atom attachment order: AAL lines first, then the remaining R# atoms
            parseAtomAttachmentOrder(rootStr, root, atomsByLinePosition, attachmentPoints);
            addDefaultAttachmentPoints(root, atomsByLinePosition, attachmentPoints);
            rGroupQuery.setRootAttachmentPoints(attachmentPoints);
            logger.info("Attachm.points defined for " + attachmentPoints.size() + " R# atoms");

            // Process each Rgroup's $CTAB block(s)
            Map<Integer, RGroupList> rGroupDefinitions = defineRGroupLists(root, logicDefinitions);

            //Parse all $CTAB blocks per Rgroup (there can be more than one)
            line = input.readLine();
            ++lineCount;
            boolean hasMoreRGP = true;
            while (hasMoreRGP) {
                checkLineBeginsWith(line, "$RGP", lineCount);
                line = input.readLine();
                ++lineCount;
                requireLine(line, lineCount);
                logger.info("line for num is " + line);
                int rgroupNum = Integer.parseInt(line.trim());
                line = input.readLine();
                ++lineCount;

                boolean hasMoreCTAB = true;
                while (hasMoreCTAB) {
                    checkLineBeginsWith(line, "$CTAB", lineCount);
                    sb = new StringBuilder(RGroup.makeLabel(rgroupNum)+"\n\n\n");
                    line = input.readLine();
                    ++lineCount; // every line read is counted, keeping diagnostics accurate
                    while (line != null && !line.startsWith(END_CTAB)) {
                        sb.append(line).append(eol);
                        line = input.readLine();
                        ++lineCount;
                    }
                    if (line == null) {
                        throw new CDKException("RGFile invalid, $CTAB block not closed by $END CTAB at #" + lineCount);
                    }
                    String groupStr = sb.toString();

                    reader = new MDLV2000Reader
                        (new StringReader(groupStr), ISimpleChemObjectReader.Mode.STRICT);
                    IMolecule group = (IMolecule) reader.read(rGroupQuery.getBuilder().newMolecule());
                    RGroup rGroup = new RGroup();
                    rGroup.setGroup(group);

                    //Parse the Rgroup's attachment points (APO)
                    parseGroupAttachmentPoints(groupStr, reader.getAtomsByLinePosition(), rGroup);

                    RGroupList rList = rGroupDefinitions.get(rgroupNum);
                    if (rList == null) {
                        throw new CDKException("R"+rgroupNum+" not defined but referenced in $RGP.");
                    }
                    rList.getRGroups().add(rGroup);

                    line = input.readLine();
                    ++lineCount;
                    requireLine(line, lineCount);
                    if (line.startsWith("$END RGP")) {
                        logger.info("end of RGP block");
                        hasMoreCTAB = false;
                    }
                }

                line = input.readLine();
                ++lineCount;
                requireLine(line, lineCount);
                if (line.startsWith("$END MOL")) {
                    hasMoreRGP = false;
                }
            }

            rGroupQuery.setRGroupDefinitions(rGroupDefinitions);
            logger.info("Number of lines was " + lineCount);
            return rGroupQuery;

        } catch (CDKException exception) {
            String error = "CDK Error while parsing line " + lineCount + ": " + line + " -> " + exception.getMessage();
            logger.error(error);
            logger.debug(exception);
            throw exception;
        } catch (Exception exception) {
            // The logger records the exception; the cause is kept on the rethrown CDKException.
            String error = exception.getClass().getName() + ": error while parsing line "
                + lineCount + ": " + line + " -> " + exception.getMessage();
            logger.error(error);
            logger.debug(exception);
            throw new CDKException(error, exception);
        }
    }

    /**
     * Tells whether a line is a property line of the given kind, i.e. an
     * "M", whitespace, then the property code. The amount of whitespace is
     * not fixed so that lines are recognised however they are spaced.
     *
     * @param line line to test, not null
     * @param code property code, for example "LOG", "AAL" or "APO"
     * @return true if the line is a property line with that code
     */
    private static boolean isPropertyLine(String line, String code) {
        return line.length() > 1
            && line.charAt(0) == 'M'
            && Character.isWhitespace(line.charAt(1))
            && line.substring(1).trim().startsWith(code);
    }

    /**
     * Parses one LOG line and stores the result under its Rgroup number.
     * Whitespace separated tokens, in order: "M", "LOG", one skipped token
     * (presumably the entry count - confirm against the CTFile manual),
     * Rgroup number, required Rgroup number, RestH flag ("1" means true),
     * and the occurrence expression (all remaining tokens, concatenated).
     *
     * @param line the LOG line
     * @param logicDefinitions map to add the parsed logic to
     */
    private static void parseLogicLine(String line, Map<Integer, RGroupLogic> logicDefinitions) {
        StringTokenizer tokens = new StringTokenizer(line);
        tokens.nextToken();
        tokens.nextToken();
        tokens.nextToken();
        int rgroupNumber = Integer.parseInt(tokens.nextToken());

        RGroupLogic logic = new RGroupLogic();
        logic.requiredRGroupNumber = Integer.parseInt(tokens.nextToken());
        logic.restH = "1".equals(tokens.nextToken());
        StringBuilder occurrence = new StringBuilder();
        while (tokens.hasMoreTokens()) {
            occurrence.append(tokens.nextToken());
        }
        logic.occurrence = occurrence.toString();
        logicDefinitions.put(rgroupNumber, logic);
    }

    /**
     * Reads the AAL lines of the root CTAB and records, per Rgroup atom,
     * which bond has which attachment order. Tokens, in order: "M", "AAL",
     * position of the Rgroup atom, one skipped token, then pairs of
     * (partner atom position, attachment order).
     *
     * @param rootStr the root CTAB text
     * @param root the root structure parsed from rootStr
     * @param atomsByLinePosition atoms of root, looked up by the positions
     *        written in the file (indexing as delivered by MDLV2000Reader)
     * @param attachmentPoints map to add the result to
     * @throws CDKException declared for the CDK calls made here
     */
    private static void parseAtomAttachmentOrder(String rootStr, IMolecule root,
            List<IAtom> atomsByLinePosition,
            Map<IAtom, Map<Integer, IBond>> attachmentPoints) throws CDKException {
        StringTokenizer lines = new StringTokenizer(rootStr, LINE_BREAKS);
        while (lines.hasMoreTokens()) {
            String line = lines.nextToken();
            if (!isPropertyLine(line, "AAL")) {
                continue;
            }
            StringTokenizer stAAL = new StringTokenizer(line);
            stAAL.nextToken();
            stAAL.nextToken();
            IAtom rGroup = atomsByLinePosition.get(Integer.parseInt(stAAL.nextToken()));
            stAAL.nextToken();
            Map<Integer, IBond> bondMap = new HashMap<Integer, IBond>();
            while (stAAL.hasMoreTokens()) {
                IAtom partner = atomsByLinePosition.get(Integer.parseInt(stAAL.nextToken()));
                IBond bond = root.getBond(rGroup, partner);
                int order = Integer.parseInt(stAAL.nextToken());
                bondMap.put(order, bond);
                logger.info("AAL " + order + " " + ((IPseudoAtom)rGroup).getLabel() +
                    "-" + partner.getSymbol());
            }
            if (bondMap.size() != 0) {
                attachmentPoints.put(rGroup, bondMap);
            }
        }
    }

    /**
     * Assigns attachment orders to the numbered R# atoms of the root that
     * have no entry yet (no AAL line). Bonds are numbered 1, 2, ... in the
     * order in which the bonded partner atoms appear in atomsByLinePosition.
     *
     * @param root the root structure
     * @param atomsByLinePosition atoms of root in atom block order
     * @param attachmentPoints map to add the result to
     * @throws CDKException declared for the CDK calls made here
     */
    private static void addDefaultAttachmentPoints(IMolecule root,
            List<IAtom> atomsByLinePosition,
            Map<IAtom, Map<Integer, IBond>> attachmentPoints) throws CDKException {
        for (IAtom atom : root.atoms()) {
            if (!(atom instanceof IPseudoAtom)) {
                continue;
            }
            IPseudoAtom rGroup = (IPseudoAtom) atom;
            String label = rGroup.getLabel();
            // only numbered R atoms that were not covered by an AAL line
            if (!label.startsWith("R") || label.equals("R") || attachmentPoints.containsKey(rGroup)) {
                continue;
            }
            //Order reflects the order of atoms in the Atom Block
            int order = 0;
            Map<Integer, IBond> bondMap = new HashMap<Integer, IBond>();
            for (IAtom atom2 : atomsByLinePosition) {
                if (atom.equals(atom2)) {
                    continue;
                }
                for (IBond bond : root.bonds()) {
                    if (bond.contains(atom) && bond.contains(atom2)) {
                        bondMap.put(++order, bond);
                        logger.info("Def " + order + " " + rGroup.getLabel() + "-" +
                            atom2.getSymbol());
                        break;
                    }
                }
            }
            if (bondMap.size() != 0) {
                attachmentPoints.put(rGroup, bondMap);
            }
        }
    }

    /**
     * Sets up the RGroupLists, one for each unique R# found in the root.
     * A list takes its settings from the matching LOG line; without one it
     * gets restH=false, occurrence ">0" and required Rgroup number 0.
     *
     * @param root the root structure
     * @param logicDefinitions parsed LOG lines keyed by Rgroup number
     * @return map of Rgroup number to its (still empty) RGroupList
     * @throws CDKException declared for the CDK calls made here
     */
    private static Map<Integer, RGroupList> defineRGroupLists(IMolecule root,
            Map<Integer, RGroupLogic> logicDefinitions) throws CDKException {
        Map<Integer, RGroupList> rGroupDefinitions = new HashMap<Integer, RGroupList>();
        for (IAtom atom : root.atoms()) {
            if (!(atom instanceof IPseudoAtom)) {
                continue;
            }
            IPseudoAtom rGroup = (IPseudoAtom) atom;
            if (!RGroupQuery.isValidRgroupQueryLabel(rGroup.getLabel())) {
                continue;
            }
            int rgroupNum = Integer.parseInt(rGroup.getLabel().substring(1));
            if (rGroupDefinitions.containsKey(rgroupNum)) {
                continue;
            }
            logger.info("Define Rgroup R" + rgroupNum);
            RGroupList rgroupList = new RGroupList(rgroupNum);
            RGroupLogic logic = logicDefinitions.get(rgroupNum);
            if (logic != null) {
                rgroupList.setRestH(logic.restH);
                rgroupList.setOccurrence(logic.occurrence);
                rgroupList.setRequiredRGroupNumber(logic.requiredRGroupNumber);
            } else {
                rgroupList.setRestH(false);
                rgroupList.setOccurrence(">0");
                rgroupList.setRequiredRGroupNumber(0);
            }
            rgroupList.setRGroups(new ArrayList<RGroup>());
            rGroupDefinitions.put(rgroupNum, rgroupList);
        }
        return rGroupDefinitions;
    }

    /**
     * Reads the APO lines of an Rgroup member CTAB and sets its attachment
     * points. Tokens, in order: "M", "APO", one skipped token, then pairs of
     * (atom position, value); value 1 sets the first attachment point,
     * 2 the second, 3 both. Other values are ignored.
     *
     * @param groupStr the member CTAB text
     * @param atomsByLinePosition atoms of the member, looked up by the
     *        positions written in the file
     * @param rGroup the Rgroup member to update
     * @throws CDKException declared for the CDK calls made here
     */
    private static void parseGroupAttachmentPoints(String groupStr,
            List<IAtom> atomsByLinePosition, RGroup rGroup) throws CDKException {
        StringTokenizer lines = new StringTokenizer(groupStr, LINE_BREAKS);
        while (lines.hasMoreTokens()) {
            String line = lines.nextToken();
            if (!isPropertyLine(line, "APO")) {
                continue;
            }
            StringTokenizer stAPO = new StringTokenizer(line);
            stAPO.nextToken();
            stAPO.nextToken();
            stAPO.nextToken();
            while (stAPO.hasMoreTokens()) {
                int pos = Integer.parseInt(stAPO.nextToken());
                int apo = Integer.parseInt(stAPO.nextToken());
                IAtom at = atomsByLinePosition.get(pos);
                switch (apo) {
                    case 1:
                        rGroup.setFirstAttachmentPoint(at);
                        break;
                    case 2:
                        rGroup.setSecondAttachmentPoint(at);
                        break;
                    case 3:
                        rGroup.setFirstAttachmentPoint(at);
                        rGroup.setSecondAttachmentPoint(at);
                        break;
                    default:
                        break;
                }
            }
        }
    }

    /**
     * Fails if the end of the input was reached where a line is required.
     * @param line the line just read, null at end of input
     * @param lineCount number of that line, used in the message
     * @throws CDKException if line is null
     */
    private void requireLine(String line, int lineCount) throws CDKException {
        if (line == null) {
            throw new CDKException("RGFile invalid, empty/null line at #" + lineCount);
        }
    }

    /**
     * Checks that a given line starts as expected, according to RGFile format.
     * @param line the line just read, null at end of input
     * @param expect prefix the line has to start with
     * @param lineCount number of that line, used in the message
     * @throws CDKException if the line is null or does not start with expect
     */
    private void checkLineBeginsWith(String line, String expect, int lineCount) throws CDKException {
        requireLine(line, lineCount);
        if (!line.startsWith(expect)) {
            throw new CDKException("RGFile invalid, line #" + lineCount + " should start with:" + expect + ".");
        }
    }
}