/************************************************************************* * * * This file is part of the 20n/act project. * * 20n/act enables DNA prediction for synthetic biology/bioengineering. * * Copyright (C) 2017 20n Labs, Inc. * * * * Please direct all queries to act@20n.com. * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU General Public License * * along with this program. If not, see <http://www.gnu.org/licenses/>. * * * *************************************************************************/ package com.act.lcms.v2; import chemaxon.formats.MolFormatException; import chemaxon.formats.MolImporter; import com.act.analysis.chemicals.molecules.MoleculeImporter$; import com.act.jobs.FileChecker; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.util.Arrays; import java.util.List; public class MassToRawMetaboliteMapParser { private static final Logger LOGGER = LogManager.getFormatterLogger(MassToRawMetaboliteMapParser.class); // Default headers private static final String DEFAULT_STRUCTURE_HEADER = "inchi"; private static final String DEFAULT_FORMULA_HEADER = "formula"; private static final String DEFAULT_MASS_HEADER = "mass"; private static final String DEFAULT_NAME_HEADER = "name"; private static final String TSV_SEPARATOR = "\t"; // Instance variables private String metaboliteHeader; private Integer metaboliteIndex; private Integer massIndex; private Integer nameIndex; private File inputFile; private MassToRawMetaboliteMap massToMetaboliteMap; // Basic getters public Integer getMetaboliteIndex() { return metaboliteIndex; } public Integer getMassIndex() { return massIndex; } public Integer getNameIndex() { return nameIndex; } public MassToRawMetaboliteMap getMassToMoleculeMap() { return massToMetaboliteMap; } MassToRawMetaboliteMapParser() { this.metaboliteHeader = null; // Initialize all indices to -1. this.metaboliteIndex = -1; this.massIndex = -1; this.nameIndex = -1; } public MassToRawMetaboliteMapParser(File inputFile) { this.inputFile = inputFile; try { FileChecker.verifyInputFile(inputFile); String headerLine = getMetabolitesReader(inputFile).readLine(); List<String> headers = Arrays.asList(headerLine.split(TSV_SEPARATOR)); validateHeaders(headers); } catch (IOException e) { String msg = String.format("An I/O exception occured when trying to parse input file: %s", inputFile.getAbsolutePath()); throw new RuntimeException(msg); } } /** * Validates headers and assign the corresponding indices / metabolite kind * @param headers headers parsed from the input file */ void validateHeaders(List<String> headers) { if (headers.contains(DEFAULT_STRUCTURE_HEADER)) { this.metaboliteHeader = DEFAULT_STRUCTURE_HEADER; this.massToMetaboliteMap = new MassToRawMetaboliteMap(MassToRawMetaboliteMap.RawMetaboliteKind.INCHI); } else if (headers.contains(DEFAULT_FORMULA_HEADER)) { this.metaboliteHeader = DEFAULT_FORMULA_HEADER; this.massToMetaboliteMap = new MassToRawMetaboliteMap(MassToRawMetaboliteMap.RawMetaboliteKind.FORMULA); } else { String msg = String.format("Input file did not contain expected metabolite headers: %s or %s", DEFAULT_FORMULA_HEADER, DEFAULT_STRUCTURE_HEADER); LOGGER.error(msg); throw new RuntimeException(msg); } LOGGER.info("The parser will use the following metabolite header: %s", this.metaboliteHeader); this.metaboliteIndex = headers.indexOf(this.metaboliteHeader); if (headers.contains(DEFAULT_MASS_HEADER)) { String massHeader = DEFAULT_MASS_HEADER; this.massIndex = headers.indexOf(massHeader); LOGGER.info("The parser detected the following mass header: %s", massHeader); } else { if (this.metaboliteHeader.equals(DEFAULT_FORMULA_HEADER)) { throw new RuntimeException("Masses should be provided if parsing metabolites from formulae."); } this.massIndex = -1; LOGGER.warn("The parser did not detect any mass header. Masses will be computed."); } if (headers.contains(DEFAULT_NAME_HEADER)) { String namesHeader = DEFAULT_NAME_HEADER; this.nameIndex = headers.indexOf(namesHeader); LOGGER.info("The parser detected the following name header: %s", namesHeader); } else { this.nameIndex = -1; LOGGER.info("The parser did not detect any name header"); } } /** * Parses and adds the parsed RawMetabolite to the map * @param line raw line from the input file to parse */ void addRawMetabolite(String line) { String[] splitLine = line.split(TSV_SEPARATOR); String metabolite = splitLine[metaboliteIndex]; Double mass; String name = null; if (massIndex < 0) { assert metaboliteHeader.equals(DEFAULT_STRUCTURE_HEADER); try { mass = MoleculeImporter$.MODULE$.importMolecule(metabolite).getExactMass(); } catch (MolFormatException e) { LOGGER.error("Could not parse molecule %s, skipping.", metabolite); return; } } else { mass = Double.parseDouble(splitLine[massIndex]); } if (nameIndex >= 0) { name = splitLine[nameIndex]; } massToMetaboliteMap.add(new RawMetabolite(mass, metabolite, name)); } public void parse() throws IOException { try (BufferedReader metabolitesReader = getMetabolitesReader(inputFile)) { int i = 0; // Skip headers metabolitesReader.readLine(); while (metabolitesReader.ready()) { String line = metabolitesReader.readLine(); addRawMetabolite(line); if (++i % 1000000 == 0) { LOGGER.info("Metabolites processed so far: %d", i); } } } } private BufferedReader getMetabolitesReader(File metaboliteFile) throws FileNotFoundException { FileInputStream metabolitesInputStream = new FileInputStream(metaboliteFile); return new BufferedReader(new InputStreamReader(metabolitesInputStream)); } }