/* $Revision$ $Author$ $Date$
*
* Copyright (C) 2003-2007 The Chemistry Development Kit (CDK) project
*
* Contact: cdk-devel@lists.sourceforge.net
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
* All we ask is that proper credit is given for our work, which includes
* - but is not limited to - adding the above copyright notice to the beginning
* of your source code files, and to any copyright notice that you may distribute
* with programs based on this work.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package org.openscience.cdk.io.iterator;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.NoSuchElementException;
import org.openscience.cdk.annotations.TestMethod;
import org.openscience.cdk.exception.CDKException;
import org.openscience.cdk.interfaces.IChemObject;
import org.openscience.cdk.interfaces.IChemObjectBuilder;
import org.openscience.cdk.interfaces.IMolecule;
import org.openscience.cdk.io.ISimpleChemObjectReader;
import org.openscience.cdk.io.MDLV2000Reader;
import org.openscience.cdk.io.ReaderFactory;
import org.openscience.cdk.io.formats.IChemFormat;
import org.openscience.cdk.io.formats.IResourceFormat;
import org.openscience.cdk.io.formats.MDLFormat;
import org.openscience.cdk.io.formats.MDLV2000Format;
import org.openscience.cdk.io.formats.MDLV3000Format;
import org.openscience.cdk.io.listener.IChemObjectIOListener;
import org.openscience.cdk.io.setting.BooleanIOSetting;
import org.openscience.cdk.io.setting.IOSetting;
import org.openscience.cdk.tools.ILoggingTool;
import org.openscience.cdk.tools.LoggingToolFactory;
/**
* Iterating MDL SDF reader. It iterates over the molecules in an SD
* file one at a time, without reading the whole file into memory first.
* Suitable for (very) large SDF files. For parsing the molecules in the
* SD file, it uses the <code>MDLV2000Reader</code> or
* <code>MDLV3000Reader</code> reader; it does <b>not</b> work
* for SDF files with MDL formats prior to the V2000 format.
*
* <p>Example use:
* <pre>
* File sdfFile = new File("../zinc-structures/ZINC_subset3_3D_charged_wH_maxmin1000.sdf");
* IteratingMDLReader reader = new IteratingMDLReader(
* new FileInputStream(sdfFile), DefaultChemObjectBuilder.getInstance()
* );
* while (reader.hasNext()) {
* IMolecule molecule = (IMolecule)reader.next();
* }
* </pre>
*
* @cdk.module io
* @cdk.githash
*
* @see org.openscience.cdk.io.MDLV2000Reader
* @see org.openscience.cdk.io.MDLV3000Reader
*
* @author Egon Willighagen <egonw@sci.kun.nl>
* @cdk.created 2003-10-19
*
* @cdk.keyword file format, MDL molfile
* @cdk.keyword file format, SDF
*/
public class IteratingMDLReader extends DefaultIteratingChemObjectReader implements IChemObjectIOListener {

    /** Line that terminates the connection table of a molfile (CTfile format: two spaces). */
    private static final String MOLFILE_END = "M  END";

    /** Single-space spelling of the terminator, accepted as well for input with collapsed whitespace. */
    private static final String MOLFILE_END_COLLAPSED = "M END";

    /** Line that terminates one record (molfile plus data block) of an SD file. */
    private static final String RECORD_SEPARATOR = "$$$$";

    /** Prefix that marks a data header line inside the data block. */
    private static final String DATA_HEADER_PREFIX = "> ";

    private static final ILoggingTool logger =
        LoggingToolFactory.createLoggingTool(IteratingMDLReader.class);

    private BufferedReader input;
    /** Last line read from {@link #input}; <code>null</code> once the end of input was reached. */
    private String currentLine;
    private IChemFormat currentFormat;
    private final ReaderFactory factory = new ReaderFactory();

    /** True when {@link #hasNext} already holds the answer for the upcoming record. */
    private boolean nextAvailableIsKnown;
    private boolean hasNext;
    private IChemObjectBuilder builder;
    private IMolecule nextMolecule;

    private BooleanIOSetting forceReadAs3DCoords;

    /**
     * Constructs a new IteratingMDLReader that can read Molecule from a given Reader.
     *
     * @param  in       The Reader to read from
     * @param  builder  The builder
     */
    public IteratingMDLReader(Reader in, IChemObjectBuilder builder) {
        this.builder = builder;
        setReader(in);
        initIOSettings();
    }

    /**
     * Constructs a new IteratingMDLReader that can read Molecule from a given InputStream.
     *
     * @param  in       The InputStream to read from
     * @param  builder  The builder
     */
    public IteratingMDLReader(InputStream in, IChemObjectBuilder builder) {
        this(new InputStreamReader(in), builder);
    }

    /**
     * Returns the format detected for the most recently read molfile block.
     *
     * @return the current format, or <code>null</code> if nothing has been read yet
     */
    @TestMethod("testGetFormat")
    public IResourceFormat getFormat() {
        return currentFormat;
    }

    /**
     * Returns true if another IMolecule can be read.
     *
     * <p>The next record is read and parsed on the first call; the result is cached
     * until {@link #next()} is called. Any failure while reading or parsing is logged
     * and reported as <code>false</code>.
     *
     * @return true if a molecule is available from {@link #next()}
     */
    public boolean hasNext() {
        if (!nextAvailableIsKnown) {
            try {
                hasNext = readNextRecord();
            } catch (Exception exception) {
                logger.error("Error while reading next molecule: " +
                    exception.getMessage());
                logger.debug(exception);
                hasNext = false;
            }
            if (!hasNext) {
                nextMolecule = null;
            }
            nextAvailableIsKnown = true;
        }
        return hasNext;
    }

    /**
     * Reads one record: the molfile block, parsed into {@link #nextMolecule}, followed
     * by the data block, whose fields are stored as properties on that molecule.
     *
     * @return true if a molecule was parsed
     * @throws IOException  if reading from the input fails
     * @throws CDKException if the format specific reader fails
     */
    private boolean readNextRecord() throws IOException, CDKException {
        String molfile = readMolfileBlock();
        if (molfile == null) {
            return false;
        }
        logger.debug("MDL file part read: ", molfile);

        ISimpleChemObjectReader reader = factory.createReader(currentFormat);
        if (reader == null) {
            throw new IllegalStateException("No reader available for format: " + currentFormat);
        }
        reader.setReader(new StringReader(molfile));
        if (currentFormat instanceof MDLV2000Format) {
            // let the delegate ask its IO setting questions; they are answered
            // by processIOSettingQuestion() of this class
            reader.addChemObjectIOListener(this);
            ((MDLV2000Reader)reader).customizeJob();
        }
        nextMolecule = (IMolecule)reader.read(builder.newMolecule());

        // now read the data part; it is consumed even without a molecule so the
        // input stays positioned at the record separator
        currentLine = input.readLine();
        readDataBlockInto(nextMolecule);

        // note that a molecule may have 0 atoms, but still
        // be useful (by having SD tags for example), so just
        // check for null'ness rather than atom count
        return nextMolecule != null;
    }

    /**
     * Collects the lines of the next molfile block, up to and including its terminator
     * line, and updates {@link #currentFormat} from the version tag found on the way.
     *
     * @return the text of the block, or <code>null</code> if the input ended before a
     *         complete block was read
     * @throws IOException if reading from the input fails
     */
    private String readMolfileBlock() throws IOException {
        // readLine() returning null is the end-of-input signal; Reader.ready() only
        // tells whether a read would block
        currentLine = input.readLine();
        if (currentLine == null) {
            return null;
        }
        currentFormat = (IChemFormat)MDLFormat.getInstance();
        String eol = System.getProperty("line.separator");
        StringBuilder block = new StringBuilder();
        while (currentLine != null && !isMolfileEnd(currentLine)) {
            // still in a molecule
            block.append(currentLine).append(eol);
            currentLine = input.readLine();
            if (currentLine != null) {
                updateFormatFromLine(currentLine);
            }
        }
        if (currentLine == null) {
            // input ended without a terminator: nothing complete to parse
            if (block.toString().trim().length() > 0) {
                logger.error("Unexpected end of input while reading a molfile block");
            }
            return null;
        }
        block.append(currentLine).append(eol);
        return block.toString();
    }

    /**
     * Does MDL molfile version checking on a single line.
     *
     * @param line line of the molfile block, not null
     */
    private void updateFormatFromLine(String line) {
        if (line.contains("V2000") || line.contains("v2000")) {
            currentFormat = (IChemFormat)MDLV2000Format.getInstance();
        } else if (line.contains("V3000") || line.contains("v3000")) {
            currentFormat = (IChemFormat)MDLV3000Format.getInstance();
        }
    }

    /** Tests whether the line terminates a molfile block. */
    private static boolean isMolfileEnd(String line) {
        return MOLFILE_END.equals(line) || MOLFILE_END_COLLAPSED.equals(line);
    }

    /** Tests whether the line, ignoring surrounding whitespace, is the record separator. */
    private static boolean isRecordSeparator(String line) {
        return RECORD_SEPARATOR.equals(line.trim());
    }

    /**
     * Reads the data block starting at {@link #currentLine} and stores each field as a
     * property on the molecule. Stops at the record separator or at the end of input.
     *
     * @param  m molecule to store the fields on; when <code>null</code> the block is
     *           consumed but nothing is stored
     * @throws IOException if reading from the input fails
     */
    private void readDataBlockInto(IMolecule m) throws IOException {
        // kept across headers: a header without a <name> reuses the previous field name
        String fieldName = null;
        while (currentLine != null && !isRecordSeparator(currentLine)) {
            logger.debug("looking for data header: ", currentLine);
            String str = currentLine;
            if (str.startsWith(DATA_HEADER_PREFIX)) {
                fieldName = extractFieldName(fieldName, str);
                str = skipOtherFieldHeaderLines(str);
                String data = extractFieldData(str);
                if (fieldName != null && m != null) {
                    logger.info("fieldName, data: ", fieldName, ", ", data);
                    m.setProperty(fieldName, data);
                }
                // the value may have been ended by the separator or the end of
                // input rather than by a blank line; do not read past either
                if (currentLine == null || isRecordSeparator(currentLine)) {
                    break;
                }
            }
            currentLine = input.readLine();
        }
    }

    /**
     * Concatenates the lines of a field value. The value ends at the first blank line,
     * at the record separator, or at the end of input; {@link #currentLine} is left on
     * the line that ended it.
     *
     * @param  str first line of the value, may be <code>null</code> at the end of input
     * @return the lines of the value, joined without separators
     * @throws IOException if reading from the input fails
     */
    private String extractFieldData(String str) throws IOException {
        StringBuilder data = new StringBuilder();
        while (str != null && str.trim().length() > 0 && !isRecordSeparator(str)) {
            logger.debug("data line: ", currentLine);
            data.append(str);
            currentLine = input.readLine();
            str = (currentLine == null) ? null : currentLine.trim();
        }
        return data.toString();
    }

    /**
     * Skips the given header line and any header lines directly following it.
     *
     * @param  str a data header line
     * @return the first line that is not a data header, or <code>null</code> if the
     *         input ended
     * @throws IOException if reading from the input fails
     */
    private String skipOtherFieldHeaderLines(String str) throws IOException {
        while (str != null && str.startsWith(DATA_HEADER_PREFIX)) {
            logger.debug("data header line: ", currentLine);
            currentLine = input.readLine();
            str = currentLine;
        }
        return str;
    }

    /**
     * Extracts the text between the first '&lt;' and the next '&gt;' of a data header.
     *
     * @param  fieldName name to fall back to when the header carries no such name
     * @param  str       the data header line
     * @return the extracted name, or <code>fieldName</code> if none was found
     */
    private String extractFieldName(String fieldName, String str) {
        int open = str.indexOf('<');
        if (open != -1) {
            int close = str.indexOf('>', open);
            if (close != -1) {
                fieldName = str.substring(open + 1, close);
            }
        }
        return fieldName;
    }

    /**
     * Returns the next IMolecule.
     *
     * @return the molecule read by the preceding (possibly implicit) {@link #hasNext()}
     * @throws NoSuchElementException if no further molecule is available
     */
    public IChemObject next() {
        if (!nextAvailableIsKnown) {
            hasNext();
        }
        // the cached answer is consumed, the next hasNext() reads a new record
        nextAvailableIsKnown = false;
        if (!hasNext) {
            throw new NoSuchElementException();
        }
        return nextMolecule;
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException if closing fails
     */
    @TestMethod("testClose")
    public void close() throws IOException {
        input.close();
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Sets the source to read from and resets the iteration state.
     *
     * @param reader the new source; wrapped in a BufferedReader unless it already is one
     */
    @TestMethod("testSetReader_Reader")
    public void setReader(Reader reader) {
        if (reader instanceof BufferedReader) {
            input = (BufferedReader)reader;
        } else {
            input = new BufferedReader(reader);
        }
        nextMolecule = null;
        nextAvailableIsKnown = false;
        hasNext = false;
    }

    /**
     * Sets the source to read from and resets the iteration state.
     *
     * @param reader the new source
     */
    @TestMethod("testSetReader_InputStream")
    public void setReader(InputStream reader) {
        setReader(new InputStreamReader(reader));
    }

    /** Creates the IO settings offered by this reader. */
    private void initIOSettings() {
        forceReadAs3DCoords = new BooleanIOSetting("ForceReadAs3DCoordinates", IOSetting.LOW,
            "Should coordinates always be read as 3D?",
            "false");
    }

    /** Puts the IO setting questions of this reader to the registered listeners. */
    public void customizeJob() {
        fireIOSettingQuestion(forceReadAs3DCoords);
    }

    /**
     * Returns the IO settings of this reader.
     *
     * @return a new array holding the single setting of this reader
     */
    public IOSetting[] getIOSettings() {
        return new IOSetting[] { forceReadAs3DCoords };
    }

    /**
     * Answers a setting question of the delegate reader: a setting with the same name
     * as this reader's 3D coordinate setting receives this reader's value.
     *
     * @param setting the setting the delegate reader asks about
     */
    public void processIOSettingQuestion(IOSetting setting) {
        if (setting.getName().equals(forceReadAs3DCoords.getName())) {
            try {
                setting.setSetting(forceReadAs3DCoords.getSetting());
            } catch (CDKException e) {
                logger.debug("Could not propagate forceReadAs3DCoords setting");
                logger.debug(e);
            }
        }
    }
}