/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.biointerpretation.l2expansion;
import chemaxon.formats.MolFormatException;
import chemaxon.struc.Molecule;
import com.act.analysis.chemicals.molecules.MoleculeFormat;
import com.act.analysis.chemicals.molecules.MoleculeFormat.MoleculeFormatType;
import com.act.analysis.chemicals.molecules.MoleculeFormat$;
import com.act.analysis.chemicals.molecules.MoleculeImporter;
import com.act.jobs.FileChecker;
import com.act.jobs.JavaRunnable;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
 * Represents a set of inchis.
 *
 * <p>The corpus is an ordered, mutable list of molecule strings. It can be populated from a
 * collection or from a file containing one molecule string per line, filtered by molecular mass,
 * converted to {@link Molecule} objects, and written back out to a file.
 */
public class L2InchiCorpus {

  private static final Logger LOGGER = LogManager.getFormatterLogger(L2InchiCorpus.class);

  private final List<String> corpus;

  /**
   * Creates an empty corpus.
   */
  public L2InchiCorpus() {
    corpus = new ArrayList<>();
  }

  /**
   * Creates a corpus holding a copy of the supplied molecule strings.
   *
   * @param inchiList The molecule strings to copy into the corpus.
   */
  public L2InchiCorpus(Collection<String> inchiList) {
    corpus = new ArrayList<>(inchiList);
  }

  /**
   * Removes from the corpus, in place, every entry whose mass is strictly greater than the cutoff,
   * as well as every entry that cannot be imported as a molecule.
   *
   * @param massCutoff The maximum mass to keep; entries with mass above this value are removed.
   */
  public void filterByMass(Integer massCutoff) {
    corpus.removeIf(inchi -> {
      try {
        // Defaults to "inchi"
        Molecule mol = MoleculeImporter.importMolecule(inchi);
        // Read the mass once so the comparison and the log message use the same value.
        double mass = mol.getMass();
        if (mass > massCutoff) {
          LOGGER.warn("Throwing out molecule %s because of mass %f and %d atoms.",
              inchi, mass, mol.getAtomCount());
          return true;
        }
        return false;
      } catch (MolFormatException e) {
        // Unparseable entries are dropped from the corpus rather than aborting the whole filter.
        LOGGER.error("MolFormatException on metabolite %s. %s", inchi, e.getMessage());
        return true;
      }
    });
  }

  /**
   * Imports every entry of the corpus as a molecule, using the standard InChI format.
   *
   * @return The successfully imported molecules; entries that fail to import are skipped.
   */
  public List<Molecule> getMolecules() {
    List<MoleculeFormat.MoleculeFormatType> wrappedInchi = new ArrayList<>();
    wrappedInchi.add(MoleculeFormat.stdInchi$.MODULE$);
    return getMolecules(wrappedInchi);
  }

  /**
   * Imports every entry of the corpus as a molecule, using the supplied formats.
   *
   * @param formats The molecule formats handed to the importer for each entry.
   * @return The successfully imported molecules, in corpus order; entries that raise a
   *         {@link MolFormatException} are logged and skipped, so the result may be shorter than
   *         the corpus.
   */
  public List<Molecule> getMolecules(List<MoleculeFormat.MoleculeFormatType> formats) {
    // Hand the importer a private copy of the format list rather than the caller's instance.
    List<MoleculeFormatType> formatList = new ArrayList<>(formats);

    List<String> inchis = getInchiList();
    List<Molecule> results = new ArrayList<>(inchis.size());
    for (String inchi : inchis) {
      try {
        results.add(MoleculeImporter.importMolecule(inchi, formatList));
      } catch (MolFormatException e) {
        LOGGER.error("MolFormatException on metabolite %s. %s", inchi, e.getMessage());
      }
    }
    return results;
  }

  /**
   * Write inchi list to file, one entry per line, encoded as UTF-8.
   *
   * @param inchisFile The file to write to.
   * @throws IOException If the file cannot be opened or written.
   */
  public void writeToFile(File inchisFile) throws IOException {
    // An explicit charset keeps the output independent of the JVM's platform default encoding.
    try (BufferedWriter writer = new BufferedWriter(
        new OutputStreamWriter(new FileOutputStream(inchisFile), StandardCharsets.UTF_8))) {
      for (String inchi : getInchiList()) {
        writer.write(inchi);
        writer.newLine();
      }
    }
  }

  /**
   * Add the chemicals in the inchis file to the corpus.
   *
   * <p>Each line is trimmed before being added. Lines that are blank after trimming are skipped
   * with a warning, and a warning is also logged for any line that had surrounding whitespace.
   *
   * @param inchisFile The file to read, one molecule string per line.
   * @throws IOException If the file cannot be opened or read.
   */
  public void loadCorpus(File inchisFile) throws IOException {
    try (BufferedReader inchiReader = getInchiReader(inchisFile)) {
      String moleculeString;
      while ((moleculeString = inchiReader.readLine()) != null) {
        String trimmedMolecule = moleculeString.trim();
        if (!trimmedMolecule.equals(moleculeString)) {
          LOGGER.warn("Leading or trailing whitespace found in molecule string file.");
        }
        if (trimmedMolecule.isEmpty()) {
          LOGGER.warn("Blank line detected in molecule string file and ignored.");
          continue;
        }
        corpus.add(trimmedMolecule);
      }
    }
    LOGGER.info("Loaded %d molecules into corpus.", corpus.size());
  }

  /**
   * @return A reader for the list of inchis, decoding the file as UTF-8.
   */
  private BufferedReader getInchiReader(File inchiFile) throws FileNotFoundException {
    // An explicit charset keeps decoding independent of the JVM's platform default encoding.
    return new BufferedReader(
        new InputStreamReader(new FileInputStream(inchiFile), StandardCharsets.UTF_8));
  }

  /**
   * @return The corpus's backing list itself (not a copy), so changes made through the returned
   *         list are visible in this corpus.
   */
  public List<String> getInchiList() {
    return corpus;
  }

  /**
   * Wraps mass filtering so that it can be used as a step in a workflow
   *
   * @param inputSubstrates The initial list of substrates.
   * @param outputFile      The file to which to write the output.
   * @param massThreshold   The maximum mass to allow, in Daltons.
   * @return A JavaRunnable that can be used in a workflow.
   */
  public static JavaRunnable getRunnableSubstrateFilterer(File inputSubstrates,
                                                          File outputFile,
                                                          Integer massThreshold) {
    return new JavaRunnable() {
      @Override
      public void run() throws IOException {
        // Verify files
        FileChecker.verifyInputFile(inputSubstrates);
        FileChecker.verifyAndCreateOutputFile(outputFile);

        // Build input corpus
        L2InchiCorpus inchis = new L2InchiCorpus();
        inchis.loadCorpus(inputSubstrates);

        // Apply filter
        inchis.filterByMass(massThreshold);

        // Write to output file
        inchis.writeToFile(outputFile);
      }

      @Override
      public String toString() {
        return "mass_filterer_" + massThreshold;
      }
    };
  }
}