/* * Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.fhcrc.cpl.viewer.commandline.modules; import org.fhcrc.cpl.toolbox.commandline.arguments.*; import org.fhcrc.cpl.toolbox.proteomics.feature.FeatureSet; import org.fhcrc.cpl.toolbox.proteomics.feature.Feature; import org.fhcrc.cpl.toolbox.proteomics.feature.FeaturePepXmlWriter; import org.fhcrc.cpl.toolbox.proteomics.feature.Spectrum; import org.fhcrc.cpl.toolbox.proteomics.feature.filehandler.APMLFeatureFileHandler; import org.fhcrc.cpl.toolbox.proteomics.feature.extraInfo.MS2ExtraInfoDef; import org.fhcrc.cpl.toolbox.proteomics.MSRun; import org.fhcrc.cpl.toolbox.proteomics.filehandler.BasePepXmlWriter; import org.fhcrc.cpl.toolbox.proteomics.feature.filehandler.HardklorFeatureFileHandler; import org.fhcrc.cpl.toolbox.proteomics.feature.filehandler.PepXMLFeatureFileHandler; import org.fhcrc.cpl.viewer.feature.extraction.SpectrumResampler; import org.fhcrc.cpl.toolbox.filehandler.TempFileManager; import org.fhcrc.cpl.toolbox.ApplicationContext; import org.fhcrc.cpl.toolbox.datastructure.FloatRange; import org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException; import org.fhcrc.cpl.toolbox.commandline.CommandLineModule; import org.fhcrc.cpl.toolbox.commandline.CommandLineModuleUtilities; import org.apache.log4j.Logger; import java.util.*; import java.io.*; /** */ public class ConvertFeatureFileCommandLineModule extends BaseViewerCommandLineModuleImpl implements CommandLineModule { protected static Logger _log = Logger.getLogger(FilterFeaturesCommandLineModule.class); protected File[] inFeatureFiles = null; protected File outFeatureFile = null; protected File outDir = null; protected MSRun run = null; protected int inFileFormat = FILE_FORMAT_MSINSPECT; protected int outFileFormat = 0; protected boolean dumpWindow = false; protected File fastaFile = null; protected boolean forcePeptideProphet = false; protected double forcePeptideProphetValue = 0; protected String pepXmlSearchEngine = BasePepXmlWriter.DEFAULT_SEARCH_ENGINE; protected static final int FILE_FORMAT_MSINSPECT=0; protected static final int FILE_FORMAT_PEPXML=1; protected static final int FILE_FORMAT_SPECARRAY=2; protected static final int FILE_FORMAT_APML=3; protected static final int FILE_FORMAT_HARDKLOR=4; protected static final int FILE_FORMAT_MULTI_MSINSPECT=5; protected static final String[] formatStrings = { "msinspect", "pepxml", "specarraytsv", "apml", "hardklor", "multimsinspect" }; protected static final String[] formatDescriptions = { "msInspect tab-separated values", "PepXML", "SpecArray tab-separated values", "APML v2.0 (XML)", "Hardklor text format", "Multiple msInspect feature sets in a single file, with a column indicating run" }; protected static final int SPECARRAY_VERSION_1_0 = 0; protected static final int SPECARRAY_VERSION_1_2_0 = 1; protected int specArrayVersion = SPECARRAY_VERSION_1_2_0; protected final static String[] specArrayVersionValues = {"1.0","1.2"}; public ConvertFeatureFileCommandLineModule() { init(); } protected void init() { mCommandName = "convertfeaturefile"; mUsageMessage = CommandLineModule.MODULE_USAGE_AUTOMATIC; StringBuffer helpMessageBuf = new StringBuffer("This command converts between different formats of feature files. " + "Allowed formats:\n"); for (int i=0; i<formatStrings.length; i++) helpMessageBuf.append("\t" + formatStrings[i] + ": " + formatDescriptions[i] + "\n"); mHelpMessage = helpMessageBuf.toString(); mShortDescription = "Convert between formats of feature files"; CommandLineArgumentDefinition[] argDefs = { createUnnamedSeriesFileArgumentDefinition(true, "Input feature file(s)"), new FileToWriteArgumentDefinition("out",false,"output file"), new DirectoryToWriteArgumentDefinition("outdir",false,"output directory (for multiple inputs)"), new EnumeratedValuesArgumentDefinition("informat", false, "input file format. To force loading of an ambiguous file as a specific file type", formatStrings, "msinspect"), new EnumeratedValuesArgumentDefinition("outformat", true, "output file format",formatStrings), new FileToReadArgumentDefinition("mzxml",false,null), new BooleanArgumentDefinition("dumpwindow",false, null), new EnumeratedValuesArgumentDefinition("specarrayversion",false, "specArray version, for specArray conversions (default 1.2)", specArrayVersionValues), new DecimalArgumentDefinition("forcepeptideprophetvalue",false, "Set the PeptideProphet probability of all features to this (for pepxml output)"), new FileToReadArgumentDefinition("fasta", false, "FASTA filepath to include in pepXML file (for outformat=pepxml only)"), new StringArgumentDefinition("searchengine", false, "Search engine to store in pepXML file (for outformat=pepxml ony)", pepXmlSearchEngine) }; addArgumentDefinitions(argDefs); } protected int translateFormatString(String formatString) { int result = -1; for (int i = 0; i < formatStrings.length; i++) { String modeString = formatStrings[i]; if (modeString.equalsIgnoreCase(formatString)) { result = i; break; } } return result; } public void assignArgumentValues() throws ArgumentValidationException { if (hasArgumentValue("informat")) inFileFormat = translateFormatString(getStringArgumentValue("informat")); if (inFileFormat == FILE_FORMAT_MULTI_MSINSPECT) throw new ArgumentValidationException("multimsinspect is not a valid input format, only output. " + "For now, anyway."); outFileFormat = translateFormatString(getStringArgumentValue("outformat")); inFeatureFiles = this.getUnnamedSeriesFileArgumentValues(); outFeatureFile = getFileArgumentValue("out"); outDir = getFileArgumentValue("outdir"); pepXmlSearchEngine = getStringArgumentValue("searchengine"); if (hasArgumentValue("searchengine") && outFileFormat != FILE_FORMAT_PEPXML) throw new ArgumentValidationException("Argument searchengine is only for pepXML output mode"); if (hasArgumentValue("outdir")) assertArgumentAbsent("out"); else assertArgumentPresent("out"); if (inFeatureFiles.length > 1) assertArgumentPresent("outdir"); if (hasArgumentValue("mzxml")) { try { run = MSRun.load(getFileArgumentValue("mzxml").getAbsolutePath()); } catch (Exception e) { throw new ArgumentValidationException(e); } } if (hasArgumentValue("dumpwindow")) { dumpWindow = getBooleanArgumentValue("dumpwindow"); if (run == null) throw new ArgumentValidationException("if dumpwindow is specified, must provide an mzxml file"); } if (hasArgumentValue("specarrayversion")) specArrayVersion = ((EnumeratedValuesArgumentDefinition) getArgumentDefinition("specarrayversion")).getIndexForArgumentValue( getStringArgumentValue("specarrayversion")); if (inFileFormat == FILE_FORMAT_SPECARRAY) assertArgumentPresent("mzxml"); if (hasArgumentValue("forcepeptideprophetvalue")) { if (!(outFileFormat == FILE_FORMAT_PEPXML)) throw new ArgumentValidationException("Argument forcepeptideprophetvalue is only for pepXML output mode"); forcePeptideProphet = true; forcePeptideProphetValue = getDoubleArgumentValue("forcepeptideprophetvalue"); } if (outFileFormat != FILE_FORMAT_PEPXML) assertArgumentAbsent("fasta"); fastaFile = getFileArgumentValue("fasta"); } /** * do the actual work */ public void execute() throws CommandLineModuleExecutionException { for (File file : inFeatureFiles) { File outputFile = outFeatureFile; if (outputFile == null) { String outputSuffix = ""; switch (outFileFormat) { case FILE_FORMAT_MSINSPECT: case FILE_FORMAT_MULTI_MSINSPECT: case FILE_FORMAT_SPECARRAY: case FILE_FORMAT_HARDKLOR: outputSuffix = "tsv"; break; case FILE_FORMAT_PEPXML: outputSuffix = "pep.xml"; break; case FILE_FORMAT_APML: outputSuffix = "apml.xml"; break; } outputFile = CommandLineModuleUtilities.createOutputFile(file, outputSuffix, outDir); } handleFile(file, outputFile); } } protected void handleFile(File inputFile, File outputFile) throws CommandLineModuleExecutionException { ApplicationContext.infoMessage("Loading features from " + inputFile.getAbsolutePath() + "..."); FeatureSet featureSet = null; //pepxml files can have multiple FeatureSets in them List<FeatureSet> featureSets = null; switch (inFileFormat) { case FILE_FORMAT_SPECARRAY: featureSet = loadFeatureSetFromSpecArrayTSV(inputFile, specArrayVersion); break; case FILE_FORMAT_PEPXML: try { featureSets = PepXMLFeatureFileHandler.getSingletonInstance().loadAllFeatureSets(inputFile); //The only valid format for the output if there are multiple sets is multi_tsv. //If using a different format, ignore featureSets and use the first one, featureSet featureSet = featureSets.get(0); } catch (IOException e) { throw new CommandLineModuleExecutionException( "Failed to load feature sets from PepXML file",e); } break; case FILE_FORMAT_APML: case FILE_FORMAT_MSINSPECT: case FILE_FORMAT_HARDKLOR: try { featureSet = new FeatureSet(inputFile); } catch (Exception e) { throw new CommandLineModuleExecutionException("Problems opening feature file"); } break; default: throw new CommandLineModuleExecutionException( "Don't know how to support the specified input file format yet"); } PrintWriter pw = null; try { switch (outFileFormat) { case FILE_FORMAT_MSINSPECT: if (dumpWindow) createIntensityWindows(featureSet.getFeatures(), run); pw = new PrintWriter(outputFile); featureSet.save(pw,dumpWindow); break; case FILE_FORMAT_MULTI_MSINSPECT: if (featureSets == null || featureSets.size() == 1) { if (dumpWindow) createIntensityWindows(featureSet.getFeatures(), run); pw = new PrintWriter(outputFile); featureSet.save(pw,dumpWindow); } else { pw = new PrintWriter(outputFile); //check if all featuresets have basenames. If so, use those for //run identifier. Otherwise, use a number boolean allHaveBaseNames = true; for (FeatureSet fSet : featureSets) { String baseName = MS2ExtraInfoDef.getFeatureSetBaseName(fSet); if (baseName == null || baseName.length()<1) allHaveBaseNames = false; } for (int i=0; i<featureSets.size(); i++) { ApplicationContext.setMessage("Writing FeatureSet " + (i+1) + "..."); FeatureSet fSet = featureSets.get(i); File tempFile = TempFileManager.createTempFile("fset" + i + ".tsv", this); fSet.save(tempFile); FileReader fr = new FileReader(tempFile); BufferedReader br = new BufferedReader(fr); String line = null; if (i==0) { while ((line = br.readLine()).startsWith("#")) pw.println(line); pw.println("run\t" + line); } else { while ((line = br.readLine()).startsWith("#")) continue; //now at header line } pw.flush(); //next read is past the header line while ((line = br.readLine()) != null) { String baseName = MS2ExtraInfoDef.getFeatureSetBaseName(fSet); if (allHaveBaseNames) pw.println(baseName + "\t" + line); else pw.println((i+1) + "\t" + line); pw.flush(); } } TempFileManager.deleteTempFiles(this); } break; case FILE_FORMAT_PEPXML: if (forcePeptideProphet) { for(Feature feature : featureSet.getFeatures()) MS2ExtraInfoDef.setPeptideProphet(feature, forcePeptideProphetValue); } FeaturePepXmlWriter pepXmlWriter = new FeaturePepXmlWriter(featureSet); pepXmlWriter.set_searchEngine(pepXmlSearchEngine); if (fastaFile != null) pepXmlWriter.setSearchDatabase(fastaFile.getAbsolutePath()); pepXmlWriter.write(outputFile); break; case FILE_FORMAT_APML: APMLFeatureFileHandler.getSingletonInstance().saveFeatureSet(featureSet, outputFile); break; case FILE_FORMAT_HARDKLOR: featureSet.save(outputFile,dumpWindow, HardklorFeatureFileHandler.FILE_TYPE_NAME); break; default: throw new CommandLineModuleExecutionException("Don't know how to support the specified output file format yet"); } ApplicationContext.infoMessage("Successfully wrote feature file " + outputFile.getAbsolutePath()); } catch (Exception e) { throw new CommandLineModuleExecutionException(e); } finally { if (pw != null) pw.close(); } } protected FeatureSet loadFeatureSetFromSpecArrayTSV(File featureFile, int specArrayVersion) throws CommandLineModuleExecutionException { FeatureSet result = null; try { FileInputStream fis = new FileInputStream(featureFile); ArrayList<Feature> featureList = new ArrayList<Feature>(); String fileLine; double[] timesForScans = new double[run.getScanCount()]; for (int i=0; i<timesForScans.length; i++) { timesForScans[i] = run.getScan(i).getDoubleRetentionTime(); } while ((fileLine = readLine(fis)) != null) { if (fileLine.startsWith("index") || fileLine.contains("intensity")) continue; featureList.add(loadFeatureFromSpecArrayLine(fileLine, timesForScans, specArrayVersion)); } result = new FeatureSet(featureList.toArray(new Feature[0])); } catch (Exception e) { throw new CommandLineModuleExecutionException(e); } return result; } protected Feature loadFeatureFromSpecArrayLine(String specArrayLine, double[] timesForScans, int specArrayVersion) { Feature result = new Feature(); int specArrayCharge = 0; double specArrayTime = 0; double specArrayIntensity = 0; String[] specArrayLineArray = specArrayLine.split(" "); switch (specArrayVersion) { case SPECARRAY_VERSION_1_0: double specArrayMass = Double.parseDouble(specArrayLineArray[1]); specArrayCharge = Integer.parseInt(specArrayLineArray[2]); specArrayTime = Double.parseDouble(specArrayLineArray[11]); specArrayIntensity = Double.parseDouble(specArrayLineArray[9]); result.setMass((float) specArrayMass); break; case SPECARRAY_VERSION_1_2_0: double specArrayMz = Double.parseDouble(specArrayLineArray[0]); specArrayTime = Double.parseDouble(specArrayLineArray[1]); specArrayCharge = Integer.parseInt(specArrayLineArray[3]); specArrayIntensity = Double.parseDouble(specArrayLineArray[4]); result.setMz((float) specArrayMz); break; } result.setCharge(specArrayCharge); result.afterPopulate(); float time = (float) specArrayTime * 60; result.setTime(time); float totalIntensity = (float) specArrayIntensity; result.setIntensity(totalIntensity); int scan = Arrays.binarySearch(timesForScans,time); if (scan < 0) { scan = -scan; if (Math.abs(time - timesForScans[scan-1]) < Math.abs(time - timesForScans[scan])) scan -= 1; } result.setScan(scan); return result; } protected String readLine(FileInputStream fis) throws IOException { String result = null; StringBuffer resultBuf = new StringBuffer(); int charread; while ((charread = fis.read()) != -1 && charread != '\n') { resultBuf.append((char)charread); } if (resultBuf.length() > 0) { result = resultBuf.toString(); } return result; } /** * Create the set of intensity windows around a given set of features that allow us to * dump those intensity windows to a feature file * * TODO: can we get rid of this completely? */ public static void createIntensityWindows(Feature[] features, MSRun run) { Feature[] f = new Feature[features.length]; System.arraycopy(features, 0, f, 0, f.length); Feature.ScanAscComparator sac = new Feature.ScanAscComparator(); Arrays.sort(f, sac); MSRun.MSScan scan = null; float[][] spectrum = null; for (int i = 0; i < f.length; i++) { // If feature intensity window was already extracted, just skip. if (f[i].intensityLeadingPeaks == 3 && f[i].intensityTrailingPeaks == 3) continue; int n = run.getIndexForScanNum(f[i].scan); // This can happen if user, in error, applies feature set from one run to another. if (n >= run.getScanCount() || n < 0) continue; scan = run.getScan(n); spectrum = scan.getSpectrum(); if (null == spectrum) { _log.error("Failed to get spectrum for scan " + f[i].scan); ApplicationContext.setMessage("Failed to get spectrum for scan " + f[i].scan); return; } f[i].intensityWindow = Spectrum.Resample(spectrum, new FloatRange(f[i].mz - 3, f[i].mz + 3), SpectrumResampler.getResampleFrequency()); f[i].intensityLeadingPeaks = 3; f[i].intensityTrailingPeaks = 3; } ApplicationContext.setMessage(""); } }