MatchArrayMetMassesCLM.java example

Explorer
msInspect-master
/*
 * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.fhcrc.cpl.viewer.metabologna.commandline;

import org.fhcrc.cpl.toolbox.commandline.arguments.*;
import org.fhcrc.cpl.toolbox.filehandler.TabLoader;
import org.fhcrc.cpl.toolbox.ApplicationContext;
import org.fhcrc.cpl.toolbox.Rounder;
import org.fhcrc.cpl.toolbox.proteomics.MassUtilities;
import org.fhcrc.cpl.toolbox.proteomics.feature.Feature;
import org.fhcrc.cpl.toolbox.chem.*;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModule;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModuleUtilities;
import org.fhcrc.cpl.toolbox.proteomics.feature.Spectrum;
import org.fhcrc.cpl.viewer.align.PeptideArrayAnalyzer;
import org.fhcrc.cpl.viewer.commandline.modules.BaseViewerCommandLineModuleImpl;
import org.fhcrc.cpl.viewer.metabologna.*;
import org.apache.log4j.Logger;

import java.io.*;
import java.util.*;

/**
 * Module for matching the masses in a 'peptide' array to a metabolite database. Uses the mass in the middle
 * of each cell.
 */
public class MatchArrayMetMassesCLM extends BaseViewerCommandLineModuleImpl
        implements CommandLineModule
{
    protected static Logger _log = Logger.getLogger(MatchArrayMetMassesCLM.class);

    protected File arrayFile;
    protected MetaboliteDatabaseMatcher metDBMatcher;
    protected File outFile;

    protected float massTolerancePPM = 2f;

    protected List<ChemicalCompound> databaseCompoundsByMass = null;

    protected boolean shouldFixArrayMasses = true;

    protected boolean shouldUseBaseMod = false;

    protected String[] annotationColumnNames =
            new String[] { "class","Accession_code","cas_number","kegg_id","subclass","species","pathway1","pathway2"};

    Map<String,Map<String,Object>> annotations = null;


    public MatchArrayMetMassesCLM()
    {
        init();
    }

    protected void init()
    {
        mCommandName = "matcharrmetmasses";
        mShortDescription = "Match array metabolite masses to a metabolite database";
        mHelpMessage = "Metabolite database should be a .tsv file with (at least) the following columns " +
                "(all but the first 3 of which may be empty): " + 
          "name, formula, smiles, class, Accession_code, cas_number, kegg_id, subclass, species, pathway1, pathway2";
        CommandLineArgumentDefinition[] argDefs =
                {
                        this.createUnnamedFileArgumentDefinition(true, "Input array file"),
                        new FileToReadArgumentDefinition("metdb", true, "Metabolite database file"),
                        new FileToWriteArgumentDefinition("out", false, "out"),
                        new DirectoryToWriteArgumentDefinition("outdir", false, "outdir"),
                        new BooleanArgumentDefinition("fixarraymasses", false, "Should fix array masses so that " +
                                "they are mz * z? (this is a 'correction' for arrays whose masses represent " +
                                "M+H ion masses)", shouldFixArrayMasses),
                        new DecimalArgumentDefinition("deltappm", false,
                                "Delta mass (ppm) between the center of the array cell and the mass of the metabolite",
                                massTolerancePPM),
                        new BooleanArgumentDefinition("usebasemod", false, "Include base mass, i.e., [M]?",
                                shouldUseBaseMod),
                        new StringListArgumentDefinition("mods", true, "modifications to use on every compound"),
                        new BooleanArgumentDefinition("showcharts", false, "show charts?", false),
                };
        addArgumentDefinitions(argDefs);
    }

    public static ChemicalModification createModForString(String modString) {
        if (modString.equals("M+H"))
            return new UnknownHAdditionMod();
        if (modString.equals("M+Na"))
            return new UnknownFormulaAdditionMod(new ChemicalFormula("Na1"), "M+Na","UnknownNaAdditionMod");
        if (modString.equals("2M+H"))
            return new DoubleFormulaPlusSomethingMod(new ChemicalFormula("H"), "2M+H","DoubleMassPlusHMod");
        if (modString.equals("2M+Na"))
            return new DoubleFormulaPlusSomethingMod(new ChemicalFormula("Na"), "2M+Na","DoubleMassPlusNaMod");
        return null;
    }

    public void assignArgumentValues()
            throws ArgumentValidationException
    {
        arrayFile = this.getUnnamedFileArgumentValue();

        shouldFixArrayMasses = getBooleanArgumentValue("fixarraymasses");
        if (shouldFixArrayMasses)
            System.err.println("'Fixing' feature masses by making them 1Da higher than would be the case in proteomics.");
        shouldUseBaseMod = getBooleanArgumentValue("usebasemod");

        metDBMatcher = new MetaboliteDatabaseMatcher();
        List<ChemicalModification> mods = new ArrayList<ChemicalModification>();


        for (String modString : getStringListArgumentValue("mods")) {
            mods.add(createModForString(modString));
        }

        metDBMatcher.setChemicalModifications(mods);
        try
        {
            databaseCompoundsByMass = ChemicalCompound.loadCompoundsFromFile(getFileArgumentValue("metdb"), 2);

            Collections.sort(databaseCompoundsByMass, new ChemicalCompound.ComparatorMassAsc());
            annotations = loadAnnotations(getFileArgumentValue("metdb"));
        }
        catch (IOException e)
        {
            throw new ArgumentValidationException("Failed to load metabolite database",e);
        }
        metDBMatcher.setDatabaseCompounds(databaseCompoundsByMass);
        metDBMatcher.setUseUnmodifiedAdduct(shouldUseBaseMod);




        massTolerancePPM = getFloatArgumentValue("deltappm");

        outFile = getFileArgumentValue("out");

        List<ChemicalModification> chemicalModsToAdd = new ArrayList<ChemicalModification>();

        for (String modString : getStringListArgumentValue("mods")) {
            ChemicalModification mod = createModForString(modString);
            if (mod == null)
                throw new ArgumentValidationException("Failed to process modification string: " + modString);
            chemicalModsToAdd.add(mod);
            ApplicationContext.infoMessage("Adding mod: " + mod.getSymbol());
        }
        metDBMatcher.setChemicalModifications(chemicalModsToAdd);
        metDBMatcher.setShowCharts(getBooleanArgumentValue("showcharts"));
    }

    public void execute() throws CommandLineModuleExecutionException
    {
//        PeptideArrayAnalyzer arrayAnalyzer = null;
        Map<String, Object>[] arrayRows;
        List<Feature> rowFeatures = new ArrayList<Feature>();
        TabLoader loader ;
        String headerLine;
        try {
//            arrayAnalyzer = new PeptideArrayAnalyzer(arrayFile);
//            Map<Integer, Map<String, List<Feature>>> detailsRowMap = arrayAnalyzer.loadDetailsRowMapsById();
             loader = new TabLoader(arrayFile);

            StringBuffer headerLineBuf = new StringBuffer("");
            try {
                for (TabLoader.ColumnDescriptor column : loader.getColumns())
                    headerLineBuf.append(column.name + "\t");
                headerLineBuf.append("formula\tmatchcount\tcompound\tppm\thmdb_id\tcas_number\tkegg_id\tiontype\tdeltamass\tSMILES\tclass\tsubclass\tspecies\tpathway1\tpathway2");
            } catch (IOException e) {
                throw new CommandLineModuleExecutionException(e);
            }
            headerLine = headerLineBuf.toString();


            arrayRows = (Map<String, Object>[]) loader.load();
            for (Map<String, Object> row : arrayRows) {
                int id = Integer.parseInt(row.get("id").toString());
//                Map<String, List<Feature>> runFeaturesMap = detailsRowMap.get(id);
                int charge = (Integer) row.get("charge");
                double minMass = (Double) row.get("minMass");
                double maxMass = (Double) row.get("maxMass");
                if (shouldFixArrayMasses) {
                    minMass += (charge * Spectrum.HYDROGEN_ION_MASS);
                    maxMass += (charge * Spectrum.HYDROGEN_ION_MASS);
                    row.put("minMass", minMass);
                    row.put("maxMass", minMass);
                }
                double meanMass = (minMass + maxMass) / 2;

                Feature feature = new Feature(0, (float) meanMass, 200);
                feature.setMass((float) meanMass);
                feature.setMz((float) meanMass / charge);
                feature.setCharge(charge);
                rowFeatures.add(feature);
            }
        }
        catch (IOException e) {
            throw new CommandLineModuleExecutionException(e);
        }


        ApplicationContext.infoMessage("Matching " + rowFeatures.size() + " masses...");
        Map<Feature, Map<ChemicalFormula, List<Adduct>>> matchingResult =
                metDBMatcher.massMatchFull(rowFeatures.toArray(new Feature[rowFeatures.size()]), massTolerancePPM, 1);
        ApplicationContext.infoMessage("Done matching");

        try
        {
            PrintWriter outPW = new PrintWriter(outFile);
            outPW.println(headerLine);
            outPW.flush();

            int matchCount = 0;
            ApplicationContext.infoMessage("Writing output array...");
            for (int i=0; i<arrayRows.length; i++)
            {
                int matchCountThisRow = 0;
                StringBuffer lineBuf = new StringBuffer();
                for (TabLoader.ColumnDescriptor column : loader.getColumns()) {
                    Object val = arrayRows[i].get(column.name);
                    if (val != null)
                        lineBuf.append(val.toString());
                    lineBuf.append( "\t");
                }
                Feature feature = rowFeatures.get(i);
                Map<ChemicalFormula, List<Adduct>> featureMatchingResult = matchingResult.get(feature);
                                    StringBuffer formulasBuf = new StringBuffer();
                StringBuffer namesBuf = new StringBuffer();
                StringBuffer ionTypesBuf = new StringBuffer();
                StringBuffer massDiffsBuf = new StringBuffer();
                StringBuffer smilesBuf = new StringBuffer();
                StringBuffer classBuf = new StringBuffer();
                StringBuffer subclassBuf = new StringBuffer();
                StringBuffer speciesBuf = new StringBuffer();
                StringBuffer pathway1Buf = new StringBuffer();
                StringBuffer pathway2Buf = new StringBuffer();
                StringBuffer hmdbIdBuf = new StringBuffer();
                StringBuffer casNumberBuf = new StringBuffer();
                StringBuffer ppmBuf = new StringBuffer();
                StringBuffer keggIdBuf = new StringBuffer();



                List<StringBuffer> allBufs = Arrays.asList(
                        new StringBuffer[] {formulasBuf, namesBuf, ppmBuf, hmdbIdBuf, casNumberBuf, keggIdBuf,
                                ionTypesBuf,massDiffsBuf,smilesBuf,classBuf,subclassBuf,
                                speciesBuf,pathway1Buf,pathway2Buf});


                if (featureMatchingResult != null) {
                    matchCount++;
                    boolean first = true;
                    for (ChemicalFormula formula : featureMatchingResult.keySet())
                    {
                        for (Adduct adduct : featureMatchingResult.get(formula))
                        {
                            matchCountThisRow++;
                            if (!first) {
                                for (StringBuffer buf : allBufs)
                                    buf.append(";");
                            }
                            formulasBuf.append(formula);
                            namesBuf.append(adduct.getCompound().getName());
                            ionTypesBuf.append(adduct.getIonTypeString());
                            massDiffsBuf.append(Rounder.round(feature.getMass() - adduct.getCommonestIsotopeMass(), 5));
                            if (adduct.getMolecule() != null)
                                smilesBuf.append(ChemCalcs.createSMILESString(adduct.getMolecule()));
                            String name = adduct.getCompound().getName();
                            classBuf.append(annotations.get("class").get(name));
                            subclassBuf.append(annotations.get("subclass").get(name));
                            speciesBuf.append(annotations.get("species").get(name));
                            pathway1Buf.append(annotations.get("pathway1").get(name));
                            pathway2Buf.append(annotations.get("pathway2").get(name));
                            hmdbIdBuf.append(annotations.get("Accession_code").get(name));
                            casNumberBuf.append(annotations.get("cas_number").get(name));
                            keggIdBuf.append(annotations.get("kegg_id").get(name));
                            float ppmDiff = (float) MassUtilities.convertDaToPPM(
                                    feature.getMz() - (float) adduct.getCommonestIsotopeMass(),
                                    (float) adduct.getCommonestIsotopeMass());
                            ppmBuf.append(Rounder.round(ppmDiff,1));
                            first = false;
                        }
                    }
                }
                lineBuf.append("\t" + matchCountThisRow);
                boolean firstBuf = true;
                for (StringBuffer buf : allBufs) {
                    if (!firstBuf)
                        lineBuf.append("\t" + buf.toString());
                    firstBuf = false;
                }
                outPW.println(lineBuf.toString());
                outPW.flush();
            }
            ApplicationContext.infoMessage("Matched " + matchCount + " out of " + arrayRows.length + " array rows");
            outPW.close();
        }
        catch (Exception e)
        {
            throw new CommandLineModuleExecutionException("Failed writing output file",e);
        }
    }



    protected Map<String,Map<String,Object>> loadAnnotations(File file)
            throws IOException
    {

        ApplicationContext.infoMessage("Loading annotations from file " + file.getAbsolutePath());
        TabLoader loader = new TabLoader(new FileReader(file),true);

ApplicationContext.infoMessage("Annotation file columns:");
for (TabLoader.ColumnDescriptor col : loader.getColumns()) ApplicationContext.infoMessage("\t" + col.name);
        //a separate map for each annotation column
        Map<String,Map<String,Object>> result = new HashMap<String,Map<String,Object>>();
//System.err.println("Loading annotations...");
//Set<String> allElements = new HashSet<String>();
        for (Map row : (Map[]) loader.load())
        {
//try{
//allElements.addAll(ChemCalcs.emp2atomCount((String)row.get("formula")).keySet());
//}catch(Exception e){}

//System.err.println("ROW");
//for (Object key1 : row.keySet()) System.err.println("\t" + key1.toString());
            Object key = row.get("name");
            if (key != null)
            {
//System.err.println("CHECK");
                String keyString = key.toString();

                    for (String annotationColumnName : annotationColumnNames)
                    {
//System.err.println("Checking " + annotationColumnName + "...");
                        if (row.get(annotationColumnName) == null)
                            continue;
                        Map<String, Object> mapThisAnnotColumn = result.get(annotationColumnName);
                        if (mapThisAnnotColumn == null)
                        {
                            mapThisAnnotColumn = new HashMap<String, Object>();
                            result.put(annotationColumnName, mapThisAnnotColumn);
                        }
                        mapThisAnnotColumn.put(keyString,row.get(annotationColumnName));
//System.err.println("\t found " +row.get(annotationColumnName) );
                    }

            }
        }
        ApplicationContext.infoMessage("Loaded annotation values for " + result.size() + " columns:");
        for (String annot : result.keySet())
        {
            ApplicationContext.infoMessage("\t" + annot + ": " + result.get(annot).size());
        }
//for (String element : allElements) System.err.println(element);
        return result;
    }

}