/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.viewer.ms2.commandline;
import org.fhcrc.cpl.toolbox.ApplicationContext;
import org.fhcrc.cpl.toolbox.commandline.arguments.*;
import org.fhcrc.cpl.toolbox.proteomics.commandline.arguments.FastaFileArgumentDefinition;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModule;
import org.fhcrc.cpl.toolbox.commandline.CommandLineModuleExecutionException;
import org.fhcrc.cpl.viewer.commandline.modules.BaseViewerCommandLineModuleImpl;
import org.apache.log4j.Logger;
import org.fhcrc.cpl.toolbox.proteomics.Protein;
import java.io.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Command-line module that filters the proteins of a FASTA file against a list of protein IDs.
 * Depending on the "keeponlist" argument, the output file contains either only the proteins
 * whose lookup appears in the list (the default), or only the proteins whose lookup does not.
 */
public class RestrictFastaProteinsCLM extends BaseViewerCommandLineModuleImpl
        implements CommandLineModule
{
    protected static Logger _log = Logger.getLogger(RestrictFastaProteinsCLM.class);

    /** Number of sequence characters written per line of FASTA output. */
    private static final int FASTA_LINE_WIDTH = 80;

    /** File the filtered FASTA records are written to. */
    protected File outFile = null;
    /** Proteins read from the input FASTA file. */
    protected Protein[] fastaProteins = null;
    /** If true, keep proteins that are on the list; if false, keep those that are not. */
    protected boolean keepProteinsOnList = true;
    /** Protein IDs loaded from the protein list file, one per non-blank line. */
    protected List<String> proteinIDList = new ArrayList<String>();

    public RestrictFastaProteinsCLM()
    {
        init();
    }

    /**
     * Sets the command name, help text and argument definitions for this module.
     */
    protected void init()
    {
        mCommandName = "restrictfasta";
        mShortDescription = "Remove proteins not on (or on) a list from a fasta file";
        mHelpMessage = mShortDescription;
        CommandLineArgumentDefinition[] argDefs =
                {
                        new FastaFileArgumentDefinition(
                                CommandLineArgumentDefinition.UNNAMED_PARAMETER_VALUE_ARGUMENT,true, null),
                        new FileToWriteArgumentDefinition("out", true, null),
                        new FileToReadArgumentDefinition("proteinfile", true, "Protein list file, one line per protein ID"),
                        new BooleanArgumentDefinition("keeponlist", false, "Keep the proteins on the list? (if false, strip them)", keepProteinsOnList),
                };
        addArgumentDefinitions(argDefs);
    }

    /**
     * Reads the argument values and loads the protein ID list from the "proteinfile" argument.
     * Each line is trimmed and blank lines are skipped, so stray whitespace or a trailing
     * empty line does not end up being treated as a protein ID.
     *
     * @throws ArgumentValidationException if the protein list file cannot be read, or if it
     *         contains no protein IDs
     */
    public void assignArgumentValues()
            throws ArgumentValidationException
    {
        fastaProteins = (Protein[]) this.getUnnamedArgumentValue();
        outFile = getFileArgumentValue("out");
        keepProteinsOnList = getBooleanArgumentValue("keeponlist");
        File proteinFile = getFileArgumentValue("proteinfile");

        // Build into a fresh list so that repeated calls do not accumulate IDs
        List<String> loadedIDs = new ArrayList<String>();
        try
        {
            BufferedReader br = new BufferedReader(new FileReader(proteinFile));
            try
            {
                String line;
                while ((line = br.readLine()) != null)
                {
                    String id = line.trim();
                    if (id.length() > 0)
                        loadedIDs.add(id);
                }
            }
            finally
            {
                // Always release the file handle, even if reading fails partway through
                br.close();
            }
        }
        catch (IOException e)
        {
            throw new ArgumentValidationException("Failed to process proteinfile", e);
        }

        // Checked outside the try block so this message is not re-wrapped by the I/O handler
        if (loadedIDs.isEmpty())
            throw new ArgumentValidationException("No proteins in file.");

        proteinIDList = loadedIDs;
        ApplicationContext.infoMessage("Loaded " + proteinIDList.size() + " protein IDs from file.");
    }

    /**
     * Writes every protein that passes the list filter to the output file, reports how many
     * proteins were written and, when keeping proteins on the list, reports any listed IDs
     * that were never encountered in the input.
     *
     * @throws CommandLineModuleExecutionException wrapping any failure while opening or
     *         writing the output file or while processing the proteins
     */
    public void execute() throws CommandLineModuleExecutionException
    {
        // Hash-based copy of the list for constant-time membership checks
        Set<String> idsOnList = new HashSet<String>(proteinIDList);
        // Listed IDs not yet seen in the input; only maintained when keeping listed proteins
        Set<String> idsLeft = new HashSet<String>(idsOnList);
        try
        {
            PrintWriter outPW = new PrintWriter(outFile);
            try
            {
                int count = 0;
                for (Protein protein : fastaProteins)
                {
                    String lookup = protein.getLookup();
                    //This if handles both cases we want -- keeping and it's there, tossing and it's not
                    if (idsOnList.contains(lookup) == keepProteinsOnList)
                    {
                        printProtein(protein, outPW);
                        count++;
                        if (keepProteinsOnList)
                            idsLeft.remove(lookup);
                    }
                }

                // PrintWriter never throws on write failure; checkError() flushes and reports it
                if (outPW.checkError())
                    throw new IOException("Error writing to " + outFile.getAbsolutePath());

                ApplicationContext.infoMessage("Wrote " + count + " proteins.");
                if (keepProteinsOnList && !idsLeft.isEmpty())
                {
                    ApplicationContext.infoMessage("Specified IDs not encountered: ");
                    // Reported the same way as the header line above so the two stay together
                    for (String missingID : idsLeft)
                        ApplicationContext.infoMessage("\t" + missingID);
                }
            }
            finally
            {
                // Close on every path so the output file handle is not leaked on failure
                outPW.close();
            }
        }
        catch (Exception e)
        {
            throw new CommandLineModuleExecutionException(e);
        }
    }

    /**
     * Prints the protein in FASTA format: a '>' followed by the protein's header on one line,
     * then the sequence broken into lines of at most {@link #FASTA_LINE_WIDTH} characters.
     * Nothing is written after the header line if the sequence is empty.
     *
     * @param protein protein whose header and sequence are written
     * @param outPW writer that receives the FASTA record
     */
    protected void printProtein(Protein protein, PrintWriter outPW)
    {
        outPW.print(">");
        outPW.println(protein.getHeader());
        String sequence = protein.getSequenceAsString();
        int length = sequence.length();
        for (int start = 0; start < length; start += FASTA_LINE_WIDTH)
        {
            int end = Math.min(start + FASTA_LINE_WIDTH, length);
            outPW.println(sequence.substring(start, end));
        }
    }
}