/*************************************************************************
* *
* This file is part of the 20n/act project. *
* 20n/act enables DNA prediction for synthetic biology/bioengineering. *
* Copyright (C) 2017 20n Labs, Inc. *
* *
* Please direct all queries to act@20n.com. *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* *
*************************************************************************/
package com.act.lcms.v2;
import chemaxon.formats.MolFormatException;
import chemaxon.formats.MolImporter;
import com.act.analysis.chemicals.molecules.MoleculeImporter$;
import com.act.jobs.FileChecker;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.List;
/**
 * Parses a tab-separated metabolite file into a {@link MassToRawMetaboliteMap}.
 *
 * <p>The first line of the file is a header line. It must contain either an {@code inchi} or a
 * {@code formula} column (the {@code inchi} column wins if both are present), and may contain a
 * {@code mass} and a {@code name} column. When no {@code mass} column is present, masses are
 * computed from the structure; this is only allowed for {@code inchi} input.
 *
 * <p>Typical usage: construct the parser with the input file (this reads and validates the
 * headers), call {@link #parse()}, then retrieve the result with {@link #getMassToMoleculeMap()}.
 */
public class MassToRawMetaboliteMapParser {

  private static final Logger LOGGER = LogManager.getFormatterLogger(MassToRawMetaboliteMapParser.class);

  // Default headers
  private static final String DEFAULT_STRUCTURE_HEADER = "inchi";
  private static final String DEFAULT_FORMULA_HEADER = "formula";
  private static final String DEFAULT_MASS_HEADER = "mass";
  private static final String DEFAULT_NAME_HEADER = "name";
  private static final String TSV_SEPARATOR = "\t";

  // Instance variables
  private String metaboliteHeader;
  private Integer metaboliteIndex;
  private Integer massIndex;
  private Integer nameIndex;
  private File inputFile;
  private MassToRawMetaboliteMap massToMetaboliteMap;

  // Basic getters

  /** @return the column index of the metabolite (inchi or formula) column, or -1 if not set */
  public Integer getMetaboliteIndex() {
    return metaboliteIndex;
  }

  /** @return the column index of the mass column, or -1 if the file has no mass column */
  public Integer getMassIndex() {
    return massIndex;
  }

  /** @return the column index of the name column, or -1 if the file has no name column */
  public Integer getNameIndex() {
    return nameIndex;
  }

  /** @return the map being populated by this parser; null until headers have been validated */
  public MassToRawMetaboliteMap getMassToMoleculeMap() {
    return massToMetaboliteMap;
  }

  /**
   * Package-private constructor that does not bind the parser to any input file.
   * All column indices start at -1; callers are expected to invoke
   * {@link #validateHeaders(List)} themselves (presumably intended for tests).
   */
  MassToRawMetaboliteMapParser() {
    this.metaboliteHeader = null;
    // Initialize all indices to -1.
    this.metaboliteIndex = -1;
    this.massIndex = -1;
    this.nameIndex = -1;
  }

  /**
   * Creates a parser for the given file, reading and validating its header line.
   *
   * @param inputFile tab-separated metabolite file whose first line holds the column headers
   * @throws RuntimeException if the file cannot be read, is empty, or has invalid headers
   */
  public MassToRawMetaboliteMapParser(File inputFile) {
    this.inputFile = inputFile;
    try {
      FileChecker.verifyInputFile(inputFile);
      String headerLine;
      // Close the reader as soon as the header line has been read.
      try (BufferedReader headerReader = getMetabolitesReader(inputFile)) {
        headerLine = headerReader.readLine();
      }
      if (headerLine == null) {
        // readLine() returns null on an empty file; fail with a clear message instead of an NPE.
        String msg = String.format("Input file is empty, could not read headers: %s",
            inputFile.getAbsolutePath());
        LOGGER.error(msg);
        throw new RuntimeException(msg);
      }
      List<String> headers = Arrays.asList(headerLine.split(TSV_SEPARATOR));
      validateHeaders(headers);
    } catch (IOException e) {
      String msg = String.format("An I/O exception occured when trying to parse input file: %s",
          inputFile.getAbsolutePath());
      // Keep the original exception as the cause so the root problem is not lost.
      throw new RuntimeException(msg, e);
    }
  }

  /**
   * Validates headers and assigns the corresponding column indices and metabolite kind.
   *
   * @param headers headers parsed from the input file
   * @throws RuntimeException if neither a structure nor a formula header is present, or if
   *         formulae are provided without a mass column
   */
  void validateHeaders(List<String> headers) {
    if (headers.contains(DEFAULT_STRUCTURE_HEADER)) {
      this.metaboliteHeader = DEFAULT_STRUCTURE_HEADER;
      this.massToMetaboliteMap = new MassToRawMetaboliteMap(MassToRawMetaboliteMap.RawMetaboliteKind.INCHI);
    } else if (headers.contains(DEFAULT_FORMULA_HEADER)) {
      this.metaboliteHeader = DEFAULT_FORMULA_HEADER;
      this.massToMetaboliteMap = new MassToRawMetaboliteMap(MassToRawMetaboliteMap.RawMetaboliteKind.FORMULA);
    } else {
      String msg = String.format("Input file did not contain expected metabolite headers: %s or %s", DEFAULT_FORMULA_HEADER, DEFAULT_STRUCTURE_HEADER);
      LOGGER.error(msg);
      throw new RuntimeException(msg);
    }
    LOGGER.info("The parser will use the following metabolite header: %s", this.metaboliteHeader);
    this.metaboliteIndex = headers.indexOf(this.metaboliteHeader);

    // indexOf returns -1 when the header is absent, which is also the "no such column" marker.
    int detectedMassIndex = headers.indexOf(DEFAULT_MASS_HEADER);
    if (detectedMassIndex >= 0) {
      this.massIndex = detectedMassIndex;
      LOGGER.info("The parser detected the following mass header: %s", DEFAULT_MASS_HEADER);
    } else {
      // Without a mass column the mass is derived from the structure, so formulae cannot be used.
      if (this.metaboliteHeader.equals(DEFAULT_FORMULA_HEADER)) {
        throw new RuntimeException("Masses should be provided if parsing metabolites from formulae.");
      }
      this.massIndex = -1;
      LOGGER.warn("The parser did not detect any mass header. Masses will be computed.");
    }

    int detectedNameIndex = headers.indexOf(DEFAULT_NAME_HEADER);
    if (detectedNameIndex >= 0) {
      this.nameIndex = detectedNameIndex;
      LOGGER.info("The parser detected the following name header: %s", DEFAULT_NAME_HEADER);
    } else {
      this.nameIndex = -1;
      LOGGER.info("The parser did not detect any name header");
    }
  }

  /**
   * Parses one data line and adds the resulting RawMetabolite to the map.
   *
   * <p>If the file has no mass column, the mass is computed from the structure; a line whose
   * structure cannot be parsed is logged and skipped.
   *
   * @param line raw line from the input file to parse
   */
  void addRawMetabolite(String line) {
    // A negative limit keeps trailing empty columns, so a row ending with an empty field
    // (e.g. an empty name in the last column) does not cause an out-of-bounds access.
    String[] splitLine = line.split(TSV_SEPARATOR, -1);
    String metabolite = splitLine[metaboliteIndex];
    Double mass;
    String name = null;
    if (massIndex < 0) {
      assert metaboliteHeader.equals(DEFAULT_STRUCTURE_HEADER);
      try {
        mass = MoleculeImporter$.MODULE$.importMolecule(metabolite).getExactMass();
      } catch (MolFormatException e) {
        LOGGER.error("Could not parse molecule %s, skipping.", metabolite);
        return;
      }
    } else {
      mass = Double.parseDouble(splitLine[massIndex]);
    }
    if (nameIndex >= 0) {
      name = splitLine[nameIndex];
    }
    massToMetaboliteMap.add(new RawMetabolite(mass, metabolite, name));
  }

  /**
   * Reads the input file line by line, skipping the header line, and adds every metabolite
   * to the map. Blank lines are ignored.
   *
   * @throws IOException if the input file cannot be opened or read
   */
  public void parse() throws IOException {
    try (BufferedReader metabolitesReader = getMetabolitesReader(inputFile)) {
      int i = 0;
      // Skip headers
      metabolitesReader.readLine();
      String line;
      // readLine() returns null at end of stream; ready() only says whether a read would block.
      while ((line = metabolitesReader.readLine()) != null) {
        if (line.isEmpty()) {
          // Tolerate blank lines, e.g. a trailing newline at the end of the file.
          continue;
        }
        addRawMetabolite(line);
        if (++i % 1000000 == 0) {
          LOGGER.info("Metabolites processed so far: %d", i);
        }
      }
    }
  }

  /**
   * Opens a buffered reader on the given file. The caller is responsible for closing it.
   *
   * @param metaboliteFile file to read
   * @return a new reader positioned at the start of the file
   * @throws FileNotFoundException if the file cannot be opened for reading
   */
  private BufferedReader getMetabolitesReader(File metaboliteFile) throws FileNotFoundException {
    FileInputStream metabolitesInputStream = new FileInputStream(metaboliteFile);
    return new BufferedReader(new InputStreamReader(metabolitesInputStream));
  }
}